<a href="https://colab.research.google.com/github/amaye15/stackoverflow-question-classifier/blob/main/code/N5_Model_Performance_Mlflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task

**Définir et mettre en œuvre un pipeline d’entraînement des modèles, avec centralisation du stockage des modèles et formalisation des résultats et mesures des différentes expérimentations réalisées, afin d’industrialiser le projet de Machine Learning.**

- CE1 Vous avez mis en œuvre un pipeline d’entraînement des modèles reproductible.
- CE2 Vous avez sérialisé et stocké les modèles créés dans un registre centralisé afin de pouvoir facilement les réutiliser.
- CE3 Vous avez formalisé des mesures et résultats de chaque expérimentation, afin de les analyser et de les comparer.

In [1]:
# Install MLflow (experiment tracking) and Hugging Face `datasets` into the kernel's
# environment. The other libraries imported below are presumably preinstalled on
# Colab — TODO confirm, and consider pinning versions for reproducibility.
%pip install mlflow datasets --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.3/150.3 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.6/230.6 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.1/148.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.2/80.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━

In [2]:
import os
from getpass import getpass

import torch
import mlflow
import joblib

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

from tqdm.notebook import trange, tqdm
from transformers import BertTokenizer, BertModel

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score

from datasets import load_dataset

# MLflow reads its HTTP credentials from these environment variables.
# The access token must never be stored in the notebook: it is taken from the
# environment when already set, otherwise requested interactively (not echoed).
# NOTE(review): the token that used to be committed on this line should be revoked.
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', "andrewmayes14")
os.environ.setdefault('MLFLOW_TRACKING_PROJECTNAME', "mlflow")
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass("DagsHub access token: ")

# Point MLflow at the DagsHub-hosted tracking server of this user/project.
tracking_user = os.environ['MLFLOW_TRACKING_USERNAME']
tracking_project = os.environ['MLFLOW_TRACKING_PROJECTNAME']
mlflow.set_tracking_uri(f"https://dagshub.com/{tracking_user}/{tracking_project}.mlflow")

def is_top_k(row, y_col, y_pred_col, k):
    """
    Tell whether a row's actual value appears among its first 'k' predictions.

    Intended to be applied row-wise to a pandas DataFrame: the value stored
    under 'y_col' is looked up in the first 'k' entries of the sequence stored
    under 'y_pred_col'.

    Parameters:
    row (pd.Series): A row from a pandas DataFrame (any mapping-like object works).
    y_col (str): Key of the field holding the actual value.
    y_pred_col (str): Key of the field holding the sequence of predicted values.
    k (int): How many leading predictions to consider.

    Returns:
    bool: True if the actual value is one of the first 'k' predictions, False otherwise.
    """
    actual_value = row[y_col]
    leading_predictions = row[y_pred_col][:k]
    return actual_value in leading_predictions

# Enable MLflow autologging; per this cell's output it activates the
# transformers, tensorflow and sklearn integrations.
mlflow.autolog()


2023/12/15 19:42:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.
2023/12/15 19:42:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
2023/12/15 19:42:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


# Setup

In [3]:
# Constants
NAME = "amaye15/Stack-Overflow-Zero-Shot-Classification"
REPOSITORY = NAME
RESPOSITORY = REPOSITORY  # misspelled name kept as an alias for backward compatibility
# API keys are read from the environment instead of being committed to the notebook;
# they default to an empty string when unset (neither is used further down in this notebook).
STACK_KEY = os.environ.get("STACK_KEY", "")
HF_KEY = os.environ.get("HF_TOKEN", "")
K = 20             # how many leading predicted tags may contain the main tag
COMPONENTS = 2
RANDOM_STATE = 42  # seed shared by every train/test split
TEST_SIZE = 0.3    # fraction of samples held out for evaluation
N_TOP_TAGS = 10    # number of most frequent main tags kept as classes

# Load the dataset from the Hugging Face Hub and convert its train split to pandas
ds = load_dataset(NAME)
df_raw = ds["train"].to_pandas()

# Dataframe manipulation: tag columns are comma-separated strings; strip the
# spaces, split them, and take the first entry as the "main" tag.
predicted_tags = df_raw["Predicted_Tags"].str.replace(" ", "", regex=False).str.split(",")
df = df_raw.assign(
    Main_Tag=df_raw["Tags"].str.replace(" ", "", regex=False).str.split(",").str[0],
    Predicted_Main_Tag=predicted_tags.str[0],
    Predicted_Tags=predicted_tags,
)

# Keep only the rows whose main tag is among the first K predicted tags
in_top_k = df.apply(lambda row: is_top_k(row, y_col="Main_Tag", y_pred_col="Predicted_Tags", k=K), axis=1)
df = df[in_top_k].copy()

# Most frequent main tags. value_counts() is sorted by descending frequency, so the
# first N_TOP_TAGS index entries are the top tags; this does not depend on the column
# names produced by reset_index(), which changed in pandas 2.0.
top_ten = df["Main_Tag"].value_counts().head(N_TOP_TAGS).index.to_list()

# Masking: restrict the dataset to those tags
mask = df["Main_Tag"].isin(top_ten).to_list()

df = df[mask].copy()

Downloading readme:   0%|          | 0.00/602 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/16.6M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/111030 [00:00<?, ? examples/s]

# BERT

In [4]:

# BERT experiment: embed every title with a frozen BERT encoder, then tune and
# evaluate a cosine KNN classifier on those embeddings.
# The run is opened as a context manager so it is always closed, even when a
# step raises; a bare start_run()/end_run() pair would leave the run active and
# make the next start_run() fail.
with mlflow.start_run():

    name = 'bert-base-uncased'
    mlflow.log_param("model_name", name)

    # Check if GPU is available and set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load tokenizer and model
    tokenizer = BertTokenizer.from_pretrained(name)
    model = BertModel.from_pretrained(name)

    # Move model to the chosen device
    model.to(device)

    # Inputs (titles) and targets (main tags) from the filtered dataframe
    text = df["Title"].to_list()
    targets = df["Main_Tag"].tolist()

    # Define batch size
    batch_size = 128  # Adjust batch size based on your GPU memory
    mlflow.log_param("batch_size", batch_size)

    # One embedding (list of floats) per title, in the same order as `text`
    batch_encoded_inputs = []

    # Batch encode in a loop
    for start_idx in tqdm(range(0, len(text), batch_size), desc="Encoding"):
        # Get the batch
        batch = text[start_idx:start_idx + batch_size]

        # Encode the batch and move to the same device as model
        batch_encoded = tokenizer(batch, padding="longest", truncation=True, return_tensors='pt').to(device)

        # Process with the model (inference only, no gradients)
        with torch.no_grad():
            encoded_results = model(**batch_encoded)

        # Mean-pool over real tokens only. Titles are padded to the longest one in
        # the batch, so a plain mean over dim=1 would also average the padding
        # positions and make an embedding depend on its batch neighbours.
        hidden_states = encoded_results.last_hidden_state
        token_mask = batch_encoded["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        pooled = (hidden_states * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1)

        # Move results to CPU and store the processed batch
        batch_encoded_inputs.extend(pooled.cpu().tolist())

    # Stratified split so every tag keeps the same proportion in train and test
    x_train, x_test, y_train, y_test = train_test_split(batch_encoded_inputs,
                                                        targets,
                                                        test_size=TEST_SIZE,
                                                        random_state=RANDOM_STATE,
                                                        stratify=targets)

    # Perform Grid Search
    param_grid = {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance']
    }
    grid_search = GridSearchCV(KNeighborsClassifier(metric='cosine'), param_grid, cv=5, scoring='accuracy', verbose=1)
    grid_search.fit(x_train, y_train)

    # Log the best parameters and score
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("best_grid_score", grid_search.best_score_)

    # Log the complete results of the grid search
    cv_results = pd.DataFrame(grid_search.cv_results_)
    cv_results.to_csv("grid_search_results.csv")
    mlflow.log_artifact("grid_search_results.csv")

    # Predict and evaluate using the best estimator
    best_knn = grid_search.best_estimator_
    y_pred = best_knn.predict(x_test)

    # Log additional metrics; accuracy reuses y_pred rather than running a
    # second prediction pass through best_knn.score()
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    # Log the confusion matrix with the class names as row (true) and column
    # (predicted) labels so the CSV is readable on its own
    class_names = best_knn.classes_
    cm = confusion_matrix(y_test, y_pred, labels=class_names)
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)
    cm_df.to_csv("confusion_matrix.csv")
    mlflow.log_artifact("confusion_matrix.csv")

    # Save and log the best KNN model
    joblib.dump(best_knn, "best_knn_model.pkl")
    mlflow.log_artifact("best_knn_model.pkl")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Encoding:   0%|          | 0/475 [00:00<?, ?it/s]



Fitting 5 folds for each of 8 candidates, totalling 40 fits


2023/12/15 14:53:59 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


# USE

In [5]:
# USE experiment: embed every title with the Universal Sentence Encoder, then
# tune and evaluate a cosine KNN classifier on those embeddings.
# The run is opened as a context manager so it is always closed, even when a
# step raises; a bare start_run()/end_run() pair would leave the run active and
# make the next start_run() fail.
with mlflow.start_run():

    # Check if GPU is available and set memory growth
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as e:
            # Best effort: report the problem and carry on with the default setting
            print(e)

    # Load the Universal Sentence Encoder model
    model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
    model = hub.load(model_url)
    mlflow.log_param("model_url", model_url)

    # Inputs (titles) and targets (main tags) from the filtered dataframe
    text = df["Title"].to_list()
    targets = df["Main_Tag"].tolist()

    # Define batch size
    batch_size = 128  # Adjust based on your memory availability
    mlflow.log_param("batch_size", batch_size)

    # One embedding (list of floats) per title, in the same order as `text`
    batch_encoded_inputs = []

    # Batch encode in a loop
    for start_idx in tqdm(range(0, len(text), batch_size), desc="Encoding"):
        # Get the batch
        batch = text[start_idx:start_idx + batch_size]

        # Encode the batch using the model
        encoded_results = model(batch)

        # Store the encoded batch
        batch_encoded_inputs.extend(encoded_results.numpy().tolist())

    # Stratified split so every tag keeps the same proportion in train and test
    x_train, x_test, y_train, y_test = train_test_split(batch_encoded_inputs,
                                                        targets,
                                                        test_size=TEST_SIZE,
                                                        random_state=RANDOM_STATE,
                                                        stratify=targets)

    # Perform Grid Search
    param_grid = {'n_neighbors': [3, 5, 7, 10], 'weights': ['uniform', 'distance']}
    grid_search = GridSearchCV(KNeighborsClassifier(metric='cosine'), param_grid, cv=5, scoring='accuracy', verbose=1)
    grid_search.fit(x_train, y_train)

    # Log the best parameters and score
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("best_grid_score", grid_search.best_score_)

    # Log the complete results of the grid search
    cv_results = pd.DataFrame(grid_search.cv_results_)
    cv_results.to_csv("grid_search_results.csv")
    mlflow.log_artifact("grid_search_results.csv")

    # Predict and evaluate using the best estimator
    best_knn = grid_search.best_estimator_
    y_pred = best_knn.predict(x_test)

    # Log additional metrics; accuracy reuses y_pred rather than running a
    # second prediction pass through best_knn.score()
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    # Log the confusion matrix with the class names as row (true) and column
    # (predicted) labels so the CSV is readable on its own
    class_names = best_knn.classes_
    cm = confusion_matrix(y_test, y_pred, labels=class_names)
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)
    cm_df.to_csv("confusion_matrix.csv")
    mlflow.log_artifact("confusion_matrix.csv")

    # Save and log the best KNN model
    joblib.dump(best_knn, "best_knn_model.pkl")
    mlflow.log_artifact("best_knn_model.pkl")

Encoding:   0%|          | 0/475 [00:00<?, ?it/s]



Fitting 5 folds for each of 8 candidates, totalling 40 fits


2023/12/15 15:13:11 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


# DeBERTa

In [4]:
# DeBERTa experiment: zero-shot classification of the held-out titles.
# The split uses the shared TEST_SIZE / RANDOM_STATE constants and the same
# stratification as the other experiments; only the test part is needed
# because the zero-shot model is not trained here.
main_tags = df["Main_Tag"].tolist()
_, x_test, _, y_test = train_test_split(df["Title"].to_list(),
                                        main_tags,
                                        test_size=TEST_SIZE,
                                        random_state=RANDOM_STATE,
                                        stratify=main_tags)


# Extract top prediction for each instance
def extract_top_prediction(labels, scores):
    """Return the entry of `labels` whose position matches the highest value in `scores`.

    Parameters:
    labels (list): Candidate labels for one instance.
    scores (list): Scores aligned position-by-position with `labels`.

    Returns:
    The label with the maximum score (the first one on ties).
    """
    top_prediction = labels[scores.index(max(scores))]
    return top_prediction


# The run is opened as a context manager so it is always closed, even when a
# step raises; a bare start_run()/end_run() pair would leave the run active and
# make the next start_run() fail.
with mlflow.start_run():

    model_name = "MoritzLaurer/deberta-v3-large-zeroshot-v1"
    mlflow.log_param("model_name", model_name)

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True,)
    # Restrict the tokenizer outputs handed to the model to these two inputs
    tokenizer.model_input_names = ['input_ids', 'attention_mask']

    # Titles to classify
    text = x_test

    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Labels
    candidate_labels = df["Main_Tag"].unique().tolist()
    mlflow.log_param("Candidate labels", candidate_labels)

    # Define batch size
    batch_size = 128
    mlflow.log_param("batch_size", batch_size)

    # Pipeline device index: first GPU when available, otherwise CPU (-1)
    device = 0 if torch.cuda.is_available() else -1

    # Initialize the classifier pipeline
    classifier = pipeline(
        task="zero-shot-classification",
        model=model,
        tokenizer=tokenizer,
        use_fast=True,
        batch_size=batch_size,
        framework="pt",
        device=device)

    batch_encoded_inputs = classifier(text, candidate_labels, multi_label=False, batch_size=batch_size)

    results_df = pd.DataFrame(batch_encoded_inputs)
    results_df["target"] = y_test

    results_df['top_prediction'] = [
        extract_top_prediction(labels, scores)
        for labels, scores in zip(results_df['labels'], results_df['scores'])
    ]

    # Calculate and log metrics
    y_true = results_df['target']
    y_pred = results_df['top_prediction']

    # Metrics are averaged over every class present in the data. Restricting
    # `labels` to the predicted classes would drop classes the model never
    # predicts and inflate the weighted scores compared with the other
    # experiments. zero_division=0 scores such classes as 0 without a warning.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)



tokenizer_config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/870M [00:00<?, ?B/s]

# MLflow

In [3]:
# Embed the DagsHub experiments table of this project directly in the notebook.
import IPython

# The page address is built from the user and project names stored in the environment
dagshub_user = os.environ['MLFLOW_TRACKING_USERNAME']
dagshub_project = os.environ['MLFLOW_TRACKING_PROJECTNAME']
experiments_url = "https://dagshub.com/" + dagshub_user + '/' + dagshub_project + "/experiments/#/"

# Full-width frame, 600 px high
experiments_frame = IPython.display.IFrame(experiments_url, '100%', 600)
display(experiments_frame)