# Word2Vec pretrained

## Cell 1: Imports and Data Loading

In [1]:
import pandas as pd
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
import tensorflow as tf
import numpy as np
import keras
import keras.layers as layers
from keras.models import Model
import mlflow
import mlflow.tensorflow
from gensim.models import Word2Vec
import gensim.downloader as api

# Load train and test datasets
df_train = pd.read_csv('stackoverflow_questions_cleaned_train.csv')
df_test = pd.read_csv('stackoverflow_questions_cleaned_test.csv')

# Print available columns in train dataset
print(df_train.columns)

# Extract 'sentence_use' column for training and testing
X_train_brut = df_train['sentence_bow_lem']
X_test_brut = df_test['sentence_bow_lem']

Index(['date', 'title', 'tags', 'score', 'answer_count', 'sentence_bow',
       'sentence_bow_lem', 'sentence_dl', 'sentence_use'],
      dtype='object')


## Cell 2: Load or Train Word2Vec Model and Encode Texts

In [19]:
# Load a pre-trained Word2Vec model
w2v_model = api.load('word2vec-google-news-300')
vector_size = 300

# Function to encode texts using Word2Vec
def encode_texts_w2v(texts, model, vector_size=vector_size):
    encoded_texts = []
    for text in texts:
        tokens = text.split()  # Simple tokenization
        vectors = [model[word] for word in tokens if word in model]
        if vectors:
            encoded_texts.append(np.mean(vectors, axis=0))
        else:
            encoded_texts.append(np.zeros(vector_size))
    return np.array(encoded_texts)

## Cell 3: Process Tags (same as USE)

In [20]:
# Number of top tags to consider
number_of_tags = 50

# Create a list of all tags in the training set
all_tags = [tag for tags in df_train['tags'].apply(eval) for tag in tags]

# Limit tags to the top most frequent
top_tags = [tag for tag, count in Counter(all_tags).most_common(number_of_tags)]

# Filter tags to keep only the top tags
df_train['filtered_tags'] = df_train['tags'].apply(lambda tags: [tag for tag in eval(tags) if tag in top_tags])
df_test['filtered_tags'] = df_test['tags'].apply(lambda tags: [tag for tag in eval(tags) if tag in top_tags])

# Extract 'sentence_bow_lem' column for training and testing
X_train_brut = df_train['sentence_bow_lem']
X_test_brut = df_test['sentence_bow_lem']
# Encode train and test texts
X_train = encode_texts_w2v(X_train_brut, w2v_model)
X_test = encode_texts_w2v(X_test_brut, w2v_model)

# Remove rows without tags in the training set
df_train = df_train[df_train['filtered_tags'].map(len) > 0]

# Ensure the lengths are consistent
assert X_train.shape[0] == df_train.shape[0], "Mismatch in number of training samples after filtering"

# Encode tags with MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=top_tags)
y_train = mlb.fit_transform(df_train['filtered_tags'])
y_test = mlb.transform(df_test['filtered_tags'])

# Ensure the lengths are consistent
assert y_train.shape[0] == X_train.shape[0], "Mismatch in number of training samples after encoding"
assert y_test.shape[0] == X_test.shape[0], "Mismatch in number of testing samples after encoding"

## Cell 4: Define Model

In [21]:
def create_model(input_shape, number_of_tags, layer_units=[256], activation='relu'):
    input_text = layers.Input(shape=input_shape)
    
    x = input_text
    for units in layer_units:
        x = layers.Dense(units, activation=activation)(x)
    
    pred = layers.Dense(number_of_tags, activation='sigmoid')(x)
    model = Model(inputs=[input_text], outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=[keras.metrics.F1Score(average='micro', threshold=None, name="f1_score_micro", dtype=None),
                           keras.metrics.F1Score(average='weighted', threshold=None, name="f1_score_weighted", dtype=None),
                           keras.metrics.MeanIoU(num_classes=number_of_tags),
                           'accuracy'])
    return model

## Cell 5: Train Model

In [22]:
def train_model(model, train_data, train_tags, test_data, test_tags, epochs=10, batch_size=32):
    history = model.fit(train_data,
                        train_tags,
                        validation_data=(test_data, test_tags),
                        epochs=epochs,
                        batch_size=batch_size
                        )
    return history

## Cell 6: Log Model with MLflow

In [35]:
layer_units = [256]
number_of_layers = len(layer_units)
epochs = 10
batch_size = 32
activation = 'relu'


# Log the model and results in MLflow
mlflow.set_experiment("stackoverflow_multilabel_classification")

mlflow.start_run(run_name="Word2Vec_model")
description = f"Training with Word2Vec for multilabel classification with {number_of_layers} dense layers and units {layer_units}"
mlflow.set_tag("mlflow.note.content", description)

# Log parameters
mlflow.log_param("number_of_tags", number_of_tags)
mlflow.log_param("embedder", "Universal Sentence Encoder")
mlflow.log_param("number_of_layers", number_of_layers)
mlflow.log_param("units_per_layer", layer_units)
mlflow.log_param("activation", activation)
mlflow.log_param("output_activation", 'sigmoid')
mlflow.log_param("optimizer", 'adam')
mlflow.log_param("loss", 'binary_crossentropy')
mlflow.log_param("epochs", epochs)
mlflow.log_param("batch_size", batch_size)
mlflow.log_param("w2v_model", 'word2vec-google-news-300')
mlflow.log_param("vector_size", vector_size)

model = create_model(input_shape=(vector_size,), number_of_tags=number_of_tags, layer_units=layer_units)
history = train_model(model, X_train, y_train, X_test, y_test, epochs=epochs, batch_size=batch_size)

# # Save model weights
model.save_weights('./w2v_model.weights.h5')
mlflow.log_artifact('./w2v_model.weights.h5')

# mlflow.keras.log_model(model, "model")

for metric, values in history.history.items():
    for epoch, value in enumerate(values):
        mlflow.log_metric(metric, value, step=epoch)
        
mlflow.end_run()

Epoch 1/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - accuracy: 0.1577 - f1_score_micro: 0.1354 - f1_score_weighted: 0.0504 - loss: 0.2913 - mean_io_u_2: 0.4857 - val_accuracy: 0.3637 - val_f1_score_micro: 0.1454 - val_f1_score_weighted: 0.0452 - val_loss: 0.1006 - val_mean_io_u_2: 0.4888
Epoch 2/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.2278 - f1_score_micro: 0.1871 - f1_score_weighted: 0.0788 - loss: 0.1175 - mean_io_u_2: 0.4855 - val_accuracy: 0.3697 - val_f1_score_micro: 0.1880 - val_f1_score_weighted: 0.1075 - val_loss: 0.0938 - val_mean_io_u_2: 0.4888
Epoch 3/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.2842 - f1_score_micro: 0.2478 - f1_score_weighted: 0.1636 - loss: 0.1063 - mean_io_u_2: 0.4857 - val_accuracy: 0.3761 - val_f1_score_micro: 0.2190 - val_f1_score_weighted: 0.1582 - val_loss: 0.0885 - val_mean_io_u_2: 0.4888
Epoch 4/10
[1m197/197[0

## Cell 7: Save Model Weights

In [27]:
# Save model weights
model.save_weights('./model.weights.h5')

## Cell 8: Load Model Weights and Predict

In [34]:
# Load the model weights
model.load_weights('./model.weights.h5')

# New text data for prediction
new_text = ["How to read a csv file with pandas?",
            "How to read a csv file in python?",
            "What is the best metric for multilabel classification with a neural network?", 
            "What is the capital of Paris?"]

# Encode new texts using Word2Vec
new_text_encoded = encode_texts_w2v(new_text, w2v_model)

# Predict
predicts = model.predict(new_text_encoded, batch_size=32)

# Display predictions
threshold = 0.25  # You can adjust this threshold

# Get the predicted tags
predicted_tags = (predicts > threshold).astype(int)

# Transform predicted tags back to the original form
predicted_tag_names = mlb.inverse_transform(predicted_tags)

for i, text in enumerate(new_text):
    print(f"Question: {text}")
    print(f"Predicted Tags: {predicted_tag_names[i]}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Question: How to read a csv file with pandas?
Predicted Tags: ('python',)
Question: How to read a csv file in python?
Predicted Tags: ()
Question: What is the best metric for multilabel classification with a neural network?
Predicted Tags: ('python',)
Question: What is the capital of Paris?
Predicted Tags: ()


## Visualize Embeddings