## Imports and Data Loading

In [1]:
import pandas as pd
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import keras
import keras.layers as layers
from keras.models import Model
import mlflow
import mlflow.tensorflow

# Load train and test datasets
df_train = pd.read_csv('stackoverflow_questions_cleaned_train.csv')
df_test = pd.read_csv('stackoverflow_questions_cleaned_test.csv')

# Print available columns in train dataset
print(df_train.columns)

# Extract 'sentence_use' column
X_train_brut = df_train['sentence_use']
X_test_brut = df_test['sentence_use']

Index(['date', 'title', 'tags', 'score', 'answer_count', 'sentence_bow',
       'sentence_bow_lem', 'sentence_dl', 'sentence_use'],
      dtype='object')


## Load Universal Sentence Encoder and Encode Texts

In [2]:
# Load the Universal Sentence Encoder
module_url = "https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/universal-sentence-encoder/2"
embed = hub.load(module_url)

# Function to encode texts using USE
def encode_texts(texts):
    return embed(texts)

# Encode train and test texts
X_train = encode_texts(X_train_brut)
X_test = encode_texts(X_test_brut)

2024-08-02 15:16:10.801337: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-08-02 15:16:10.801361: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-08-02 15:16:10.801367: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-08-02 15:16:10.801413: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-02 15:16:10.801427: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-08-02 15:16:14.504613: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


## Process Tags

In [3]:
# Number of top tags to consider
number_of_tags = 50

# Create a list of all tags in the training set
all_tags = [tag for tags in df_train['tags'].apply(eval) for tag in tags]

# Limit tags to the top most frequent
top_tags = [tag for tag, count in Counter(all_tags).most_common(number_of_tags)]

# Filter tags to keep only the top tags
df_train['filtered_tags'] = df_train['tags'].apply(lambda tags: [tag for tag in eval(tags) if tag in top_tags])
df_test['filtered_tags'] = df_test['tags'].apply(lambda tags: [tag for tag in eval(tags) if tag in top_tags])

# Remove rows without tags in the training set
df_train = df_train[df_train['filtered_tags'].map(len) > 0]

# Encode tags with MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=top_tags)
y_train = mlb.fit_transform(df_train['filtered_tags'])
y_test = mlb.transform(df_test['filtered_tags'])

## Display Embeddings

In [4]:
# Display embeddings of test texts
for i, message_embedding in enumerate(np.array(X_test).tolist()):
    print("Message: {}".format(X_test_brut[i]))
    print("Embedding size: {}".format(len(message_embedding)))
    message_embedding_snippet = ", ".join((str(x) for x in message_embedding[:3]))
    print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

Message: Marquee doesn't scroll everything
Embedding size: 512
Embedding: [-0.05534021556377411, -0.015510442666709423, 0.07882224768400192, ...]

Message: Junit 5's @Testcontainers doesn't work with cucumber and spring boot 3
Embedding size: 512
Embedding: [-0.054960642009973526, -0.030519818887114525, -0.027384620159864426, ...]

Message: How does Java handle memory with regards to homonymous local variables declared inside different not-nested code blocks inside a method?
Embedding size: 512
Embedding: [0.04648120328783989, -0.0038503389805555344, -0.07326491922140121, ...]

Message: Turn a list of tuples into pandas dataframe with single column
Embedding size: 512
Embedding: [-0.01979111135005951, -0.07007542252540588, 0.030770618468523026, ...]

Message: Why does the Rust compiler drop unused variables in the reverse order they were declared?
Embedding size: 512
Embedding: [0.07779866456985474, -0.051991548389196396, -0.01265705469995737, ...]

Message: How to insert a pagebreak a

## Define UniversalEmbedding Layer

In [5]:
# Define a custom embedding layer using the Universal Sentence Encoder
class UniversalEmbedding(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(UniversalEmbedding, self).__init__(**kwargs)
        self.embed = embed

    def call(self, inputs):
        return self.embed(inputs)

## Set Parameters and Log with MLflow

In [20]:
# Parameters for the model
units_per_layer = [512]  # Units in each layer
number_of_layers = len(units_per_layer)  # Number of dense layers
activation = 'relu'
output_activation = 'sigmoid'
optimizer = 'adam'
loss = 'binary_crossentropy'
epochs = 10
batch_size = 32

# Log the model and results in MLflow
mlflow.set_experiment("stackoverflow_multilabel_classification")

mlflow.start_run(run_name="USE_multilabel_classification")
description = f"Training with USE for multilabel classification with {number_of_layers} dense layers and units {units_per_layer}"
mlflow.set_tag("mlflow.note.content", description)

# Log parameters
mlflow.log_param("number_of_tags", number_of_tags)
mlflow.log_param("embedder", "Universal Sentence Encoder")
mlflow.log_param("number_of_layers", number_of_layers)
mlflow.log_param("units_per_layer", units_per_layer)
mlflow.log_param("activation", activation)
mlflow.log_param("output_activation", output_activation)
mlflow.log_param("optimizer", optimizer)
mlflow.log_param("loss", loss)
mlflow.log_param("epochs", epochs)
mlflow.log_param("batch_size", batch_size)

32

## Build and Compile Model

In [21]:
# Create the model dynamically based on parameters
input_text = layers.Input(shape=[], dtype=tf.string)
embedding = UniversalEmbedding()(input_text)
x = embedding
for units in units_per_layer:
    x = layers.Dense(units, activation=activation)(x)
pred = layers.Dense(number_of_tags, activation=output_activation)(x)
model = Model(inputs=[input_text], outputs=pred)
model.compile(loss=loss, optimizer=optimizer,
                metrics=[keras.metrics.F1Score(average='micro', threshold=None, name="f1_score_micro", dtype=None),
                        keras.metrics.F1Score(average='weighted', threshold=None, name="f1_score_weighted", dtype=None),
                        keras.metrics.MeanIoU(num_classes=number_of_tags),
                        'accuracy'
                        ])
model.summary()

## Prepare Data for Training

In [22]:
# Prepare training and test data
train_text = df_train['sentence_use'].tolist()
train_text = np.array(train_text, dtype=object)[:, np.newaxis]
train_tags = np.asarray(y_train, dtype=np.int8)

test_text = df_test['sentence_use'].tolist()
test_text = np.array(test_text, dtype=object)[:, np.newaxis]
test_tags = np.asarray(y_test, dtype=np.int8)

## Train Model and Log Metrics

In [23]:
# Train the model
history = model.fit(train_text,
                    train_tags,
                    validation_data=(test_text, test_tags),
                    epochs=epochs,
                    batch_size=batch_size
                    )

# Save model weights
model.save_weights('./model.weights.h5')
mlflow.log_artifact('./model.weights.h5')

# Log metrics
for metric in history.history.keys():
    for epoch, value in enumerate(history.history[metric]):
        mlflow.log_metric(metric, value, step=epoch)
        
mlflow.end_run()

Epoch 1/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 969ms/step - accuracy: 0.1536 - f1_score_micro: 0.1300 - f1_score_weighted: 0.0726 - loss: 0.2877 - mean_io_u_3: 0.4857 - val_accuracy: 0.4184 - val_f1_score_micro: 0.3090 - val_f1_score_weighted: 0.2267 - val_loss: 0.0845 - val_mean_io_u_3: 0.4888
Epoch 2/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 1s/step - accuracy: 0.4424 - f1_score_micro: 0.3868 - f1_score_weighted: 0.2984 - loss: 0.0909 - mean_io_u_3: 0.4856 - val_accuracy: 0.4597 - val_f1_score_micro: 0.3991 - val_f1_score_weighted: 0.3319 - val_loss: 0.0675 - val_mean_io_u_3: 0.4888
Epoch 3/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 963ms/step - accuracy: 0.5068 - f1_score_micro: 0.4794 - f1_score_weighted: 0.4170 - loss: 0.0722 - mean_io_u_3: 0.4857 - val_accuracy: 0.4791 - val_f1_score_micro: 0.4408 - val_f1_score_weighted: 0.3958 - val_loss: 0.0625 - val_mean_io_u_3: 0.4888
Epoch 4/10
[1m197/

## Load Model Weights and Predict

In [19]:
# Load the model weights
model.load_weights('./model.weights.h5')

# New text data for prediction
new_text = ["How to read a csv file with pandas?",
            "How to read a csv file in python?",
            "What is the best metric for multilabel classification with a neural network?", 
            "What is the capital of Paris?"]

# Ensure new_text is in the correct format
new_text = np.array(new_text, dtype=object)[:, np.newaxis]

# Predict
predicts = model.predict(new_text, batch_size=32)

# Display predictions
# Threshold to determine the tags
threshold = 0.5  # You can adjust this threshold

# Get the predicted tags
predicted_tags = (predicts > threshold).astype(int)

# Transform predicted tags back to the original form
predicted_tag_names = mlb.inverse_transform(predicted_tags)

for i, text in enumerate(new_text):
    print(f"Question: {text[0]}")
    print(f"Predicted Tags: {predicted_tag_names[i]}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Question: How to read a csv file with pandas?
Predicted Tags: ('python', 'pandas')
Question: How to read a csv file in python?
Predicted Tags: ('python',)
Question: What is the best metric for multilabel classification with a neural network?
Predicted Tags: ('python',)
Question: What is the capital of Paris?
Predicted Tags: ()
