<a href="https://colab.research.google.com/github/amaye15/stackoverflow-question-classifier/blob/main/code/N4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task

**Définir et mettre en œuvre un pipeline d’entraînement des modèles, avec centralisation du stockage des modèles et formalisation des résultats et mesures des différentes expérimentations réalisées, afin d’industrialiser le projet de Machine Learning.**

- CE1 Vous avez mis en oeuvre un pipeline d’entraînement des modèles reproductible
- CE2 Vous avez sérialisé et stocké les modèles créés dans un registre centralisé afin de pouvoir facilement les réutiliser.
- CE3 Vous avez formalisé des mesures et résultats de chaque expérimentation, afin de les analyser et de les comparer

In [1]:
%pip install mlflow datasets --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.3/150.3 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.6/230.6 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.1/148.1 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.2/80.2 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━

In [2]:
import os
import torch
import mlflow
import joblib

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

from tqdm.notebook import trange, tqdm
from transformers import BertTokenizer, BertModel

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

from datasets import load_dataset

def is_top_k(row, y_col, y_pred_col, k):
    """
    Check if the actual value in a specified column is within the top 'k' predicted values in another column.

    This function is designed to operate on a row of a pandas DataFrame. It compares the actual value from one column
    ('y_col') with a list of predicted values in another column ('y_pred_col'), and checks if the actual value is within
    the top 'k' elements of the predicted list.

    Parameters:
    row (pd.Series): A row from a pandas DataFrame.
    y_col (str): The name of the column containing the actual value.
    y_pred_col (str): The name of the column containing the list of predicted values.
    k (int): The number of top elements from the predicted values list to consider.

    Returns:
    bool: True if the actual value is within the top 'k' predicted values, False otherwise.
    """
    return row[y_col] in row[y_pred_col][:k]

mlflow.autolog()


2023/12/14 16:34:14 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
2023/12/14 16:34:14 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/12/14 16:34:15 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.


# Setup

In [3]:
# Constants
NAME = "amaye15/Stack-Overflow-Zero-Shot-Classification"
RESPOSITORY = "amaye15/Stack-Overflow-Zero-Shot-Classification"
STACK_KEY = "ub*oRqta6kWgck7l2tG5ng(("
HF_KEY = "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"
K = 20
COMPONENTS = 2
RANDOM_STATE = 42
TEST_SIZE = 0.3

# Load dataset (assuming load_dataset is a defined function)
ds = load_dataset(NAME)
df = ds["train"].to_pandas()

# Dataframe Manipulation
df["Main_Tag"] = df["Tags"].str.replace(" ", "").apply(lambda x: next(iter(x.split(","))))
df["Predicted_Main_Tag"] = df["Predicted_Tags"].str.replace(" ", "").apply(lambda x: next(iter(x.split(","))))
df["Predicted_Tags"] = df["Predicted_Tags"].str.replace(" ", "").str.split(",")

# Assuming is_top_k is a defined function
df = df[df.apply(lambda row: is_top_k(row, y_col = "Main_Tag", y_pred_col = "Predicted_Tags", k = K), axis=1)].copy()

# Text Processing
top_ten = df["Main_Tag"].value_counts().to_frame().reset_index().rename(columns={"index":"Main_Tag", "Main_Tag":"index"}).loc[:9, "Main_Tag"].to_list()

# Masking
mask = df["Main_Tag"].isin(top_ten).to_list()

df = df[mask].copy()

Downloading readme:   0%|          | 0.00/602 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/16.6M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/111030 [00:00<?, ? examples/s]

# BERT

In [4]:

# Start MLflow run
mlflow.start_run()

name = 'bert-base-uncased'
mlflow.log_param("model_name", name)

# Check if GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained(name)
model = BertModel.from_pretrained(name)

# Move model to the chosen device
model.to(device)

# Assuming df["Title"] and df["Main_Tag"] are your dataframe columns
text = df["Title"].to_list()

# Define batch size
batch_size = 128  # Adjust batch size based on your GPU memory
mlflow.log_param("batch_size", batch_size)

# Placeholder for batch encoded inputs
batch_encoded_inputs = []

# Batch encode in a loop
for start_idx in tqdm(range(0, len(text), batch_size), desc="Encoding"):
    # Get the batch
    batch = text[start_idx:start_idx + batch_size]

    # Encode the batch and move to the same device as model
    batch_encoded = tokenizer(batch, padding="longest", truncation=True, return_tensors='pt').to(device)

    # Process with the model
    with torch.no_grad():
        encoded_results = model(**batch_encoded)

    # Move results to CPU for further processing/storage
    batch_results = encoded_results.last_hidden_state.mean(dim=1).cpu().tolist()

    # Store the processed batch
    batch_encoded_inputs.extend(batch_results)

knn = KNeighborsClassifier(metric="cosine")

x_train, x_test, y_train, y_test = train_test_split(batch_encoded_inputs,
                                                    df["Main_Tag"].tolist(),
                                                    test_size=0.3,
                                                    random_state=RANDOM_STATE ,  # Assuming RANDOM_STATE is defined earlier
                                                    stratify=df["Main_Tag"].tolist())

knn.fit(x_train, y_train)

# Evaluate and log model accuracy
accuracy = knn.score(x_test, y_test)
mlflow.log_metric("accuracy", accuracy)

# Save and log the KNN model
joblib.dump(knn, "knn_model.pkl")
mlflow.log_artifact("knn_model.pkl")

# End the MLflow run
mlflow.end_run()

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Encoding:   0%|          | 0/475 [00:00<?, ?it/s]



# USE

In [None]:
# Check if GPU is available and set memory growth to avoid memory allocation errors
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

# Load the Universal Sentence Encoder model
model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Assuming df["Title"] is your dataframe and text processing functions are defined
text = df["Title"].to_list()

# Define batch size
batch_size = 128  # Adjust based on your memory availability

# Placeholder for batch encoded inputs
batch_encoded_inputs = []

# Batch encode in a loop
for start_idx in tqdm(range(0, len(text), batch_size), desc="Encoding"):
    # Get the batch
    batch = text[start_idx:start_idx + batch_size]

    # Encode the batch using the model
    encoded_results = model(batch)

    # Store the encoded batch
    for result in encoded_results.numpy().tolist():
        batch_encoded_inputs.append(result)

Encoding:   0%|          | 0/475 [00:00<?, ?it/s]

In [None]:
knn = KNeighborsClassifier(metric = "cosine")

x_train, x_test, y_train, y_test = train_test_split(batch_encoded_inputs,
                                                    df["Main_Tag"].tolist(),
                                                    test_size = 0.3,
                                                    random_state = RANDOM_STATE,
                                                    stratify = df["Main_Tag"].tolist())

knn.fit(x_train, y_train)

knn.score(x_test, y_test)

0.7703748422150266

DeBERTa

In [None]:
_, x_test, _, y_test = train_test_split(df["Title"].to_list(),
                                        df["Main_Tag"].tolist(),
                                        test_size = 0.3,
                                        random_state = RANDOM_STATE,
                                        stratify = df["Main_Tag"].tolist())

In [None]:
model_name = "MoritzLaurer/deberta-v3-large-zeroshot-v1"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True,)
tokenizer.model_input_names = ['input_ids', 'attention_mask']


# Assuming df["Title"] is your dataframe and text processing functions are defined
text = x_test

model =  AutoModelForSequenceClassification.from_pretrained(model_name)

candidate_labels = df["Main_Tag"].unique().tolist()

# Define batch size
batch_size = 128

# Initialize the classifier pipeline
classifier = pipeline(
    task="zero-shot-classification",
    model=model,
    tokenizer= tokenizer ,
    use_fast = True,
    batch_size = batch_size,
    framework = "pt",
    device = device)

batch_encoded_inputs = classifier(text, candidate_labels, multi_label = False, batch_size=batch_size)

results_df = pd.DataFrame(batch_encoded_inputs)

results_df["target"] = y_test

k = 1
accuracy = results_df.apply(lambda row: is_top_k(row, y_col = "target", y_pred_col = "labels", k = k), axis=1).mean()
print(f"Top-{k} Accuracy: {accuracy}")


# Mlflow

In [6]:
import os

os.environ['MLFLOW_TRACKING_USERNAME'] = "andrewmayes14"
os.environ['MLFLOW_TRACKING_PASSWORD'] = "ccb096afadd26486a787461f3495219662998c4b"
os.environ['MLFLOW_TRACKING_PROJECTNAME'] = "mlflow"

mlflow.set_tracking_uri(f'https://dagshub.com/' + os.environ['MLFLOW_TRACKING_USERNAME']
                        + '/' + os.environ['MLFLOW_TRACKING_PROJECTNAME'] + '.mlflow')
# See your experiments table inside Colab!
import IPython
display(IPython.display.IFrame("https://dagshub.com/"+ os.environ['MLFLOW_TRACKING_USERNAME']
                        + '/' + os.environ['MLFLOW_TRACKING_PROJECTNAME'] + "/experiments/#/",'100%',600))

In [None]:
import mlflow

# Start an MLflow experiment
mlflow.set_experiment('GridSearchCV_Experiment')


* 'schema_extra' has been renamed to 'json_schema_extra'


<Experiment: artifact_location='file:///Users/typhaine/Documents/Doc_Gorilla/OpenClassroom--Machine-Learning-Engineer/P5/mlruns/992333753801301528', creation_time=1699984116641, experiment_id='992333753801301528', last_update_time=1699984116641, lifecycle_stage='active', name='GridSearchCV_Experiment', tags={}>

In [None]:
import os
import ast
import numpy as np
import pandas as pd


ROOT_PATH = os.getcwd()
DATA_LOAD_FILE = os.path.join(ROOT_PATH, "data/StackOverFlowEmbedded.csv.gz")

df = pd.read_csv(DATA_LOAD_FILE)
#df["BertEmbeddingsMean"] = df["BertEmbeddingsMean"].apply(ast.literal_eval)
#df["BertEmbeddingsSum"] = df["BertEmbeddingsSum"].apply(ast.literal_eval)

#df["FastTextEmbeddings"] = df["FastTextEmbeddings"].apply(ast.literal_eval)
#df["UseEmbeddings"] = df["UseEmbeddings"].apply(ast.literal_eval)
df

Unnamed: 0,Date,Title,Tags,Score,Label,TitleClean,TitleCleanTokenised,BertEmbeddingsMean,BertEmbeddingsSum
0,2016-02-04 12:49:19,Java contains vs anyMatch behaviour,"java, java-stream, equality",56,java,java contain v anymatch behaviour,"['java', 'contain', 'v', 'anymatch', 'behaviour']","[-0.23472099006175995, -0.24564139544963837, 0...","[-9.154118537902832, -9.5800142288208, 6.10894..."
1,2011-05-04 19:52:46,Getting random numbers in Java,"java, random",557,java,get random number in java,"['get', 'random', 'number', 'in', 'java']","[-0.2763572931289673, -0.42447564005851746, 0....","[-10.777934074401855, -16.554550170898438, 0.3..."
2,2012-03-25 17:38:01,Hibernate generates negative id values when us...,"java, hibernate, jpa, jboss7.x, jpa-2.0",53,java,hibern gener neg id valu when use a sequenc,"['hibern', 'gener', 'neg', 'id', 'valu', 'when...","[-0.349016398191452, -0.19550490379333496, -0....","[-13.611639976501465, -7.624691009521484, -0.3..."
3,2010-08-02 17:35:16,How to get the separate digits of an int number?,"java, integer, modulo",196,java,how to get the separ digit of an int number,"['how', 'to', 'get', 'the', 'separ', 'digit', ...","[-0.25365835428237915, -0.3813159167766571, -0...","[-9.892675399780273, -14.871320724487305, -7.0..."
4,2011-06-10 07:52:16,Eclipse plugin for generating a class diagram,"java, eclipse, plugins, uml, class-diagram",112,java,eclips plugin for gener a class diagram,"['eclips', 'plugin', 'for', 'gener', 'a', 'cla...","[-0.251219242811203, -0.4725949764251709, -0.0...","[-9.797550201416016, -18.431203842163086, -1.6..."
...,...,...,...,...,...,...,...,...,...
9995,2014-06-29 03:25:18,How to call Type Methods within an instance me...,"ios, class, methods, types, swift",61,ios,how to call type method within an instanc method,"['how', 'to', 'call', 'type', 'method', 'withi...","[-0.17384220659732819, -0.3111932873725891, -0...","[-6.953688144683838, -12.447731018066406, -8.0..."
9996,2014-07-31 14:55:53,What is NSLayoutConstraint &quot;UIView-Encaps...,"ios, uitableview, cocoa-touch, autolayout, ios...",306,ios,what is nslayoutconstraint quotuiviewencapsula...,"['what', 'is', 'nslayoutconstraint', 'quotuivi...","[-0.1488007754087448, 0.2860782742500305, 0.34...","[-5.952031135559082, 11.443131446838379, 13.78..."
9997,2015-03-17 20:37:23,How do I fix the xcrun unable to find simctl e...,"ios, xcode, xcrun",359,ios,how do i fix the xcrun unabl to find simctl error,"['how', 'do', 'i', 'fix', 'the', 'xcrun', 'una...","[-0.2691449224948883, -0.272336483001709, 0.01...","[-10.765796661376953, -10.89345932006836, 0.45..."
9998,2013-05-03 17:23:43,NSAttributedString background color and rounde...,"ios, objective-c, uiview, quartz-graphics, nsa...",76,ios,nsattributedstr background color and round corner,"['nsattributedstr', 'background', 'color', 'an...","[-0.3197779357433319, -0.11719034612178802, 0....","[-12.791117668151855, -4.6876139640808105, 7.8..."


In [None]:
df["Title"]

0                     Java contains vs anyMatch behaviour
1                          Getting random numbers in Java
2       Hibernate generates negative id values when us...
3        How to get the separate digits of an int number?
4           Eclipse plugin for generating a class diagram
                              ...                        
9995    How to call Type Methods within an instance me...
9996    What is NSLayoutConstraint &quot;UIView-Encaps...
9997    How do I fix the xcrun unable to find simctl e...
9998    NSAttributedString background color and rounde...
9999    &#39;Project Name&#39; was compiled with optim...
Name: Title, Length: 10000, dtype: object

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
df["LabelEncoded"] = LabelEncoder().fit_transform(df["Label"].values)
TrainDF, TestDF = train_test_split(df,train_size=0.7, random_state=42, stratify=df["LabelEncoded"].values)

In [None]:

with mlflow.start_run():
    logistic_regression_model = LogisticRegression()
    parameter_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': [None, 'l2']}
    grid_search = GridSearchCV(estimator=logistic_regression_model,
                            param_grid=parameter_grid,
                            scoring='accuracy',
                            cv=10)
    grid_search.fit(np.array(TrainDF["BertEmbeddings"].values.tolist()), np.array(TrainDF["LabelEncoded"].values.tolist()))

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    for param_name in best_params:
        mlflow.log_param(param_name, best_params[param_name])
    mlflow.log_metric("best_score", grid_search.best_score_)

    y_pred = best_model.predict(np.array(TestDF["BertEmbeddings"].values.tolist()))
    accuracy = accuracy_score(np.array(TestDF["LabelEncoded"].values.tolist()), y_pred)
    print(f"Accuracy of the best model: {accuracy}")

    # Log the best model
    mlflow.sklearn.log_model(grid_search.best_estimator_, "best_model")

    # End the MLflow run
    mlflow.end_run()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy of the best model: 0.805


In [None]:
#!mlflow ui

In [None]:
random_forest_model = RandomForestClassifier()
parameter_grid = {
    'n_estimators': [200, 300, 400, 500, 1000],
    'max_depth': [None],
    #'min_samples_split': [2, 5, 10],
    #'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=random_forest_model,
                           param_grid=parameter_grid,
                           scoring='accuracy',
                           cv=10,
                           n_jobs=-1, verbose=2)  # n_jobs=-1 to use all processors
grid_search.fit(np.array(TrainDF["BertEmbeddingsSum"].values.tolist()), np.array(TrainDF["LabelEncoded"].values.tolist()))

best_model = grid_search.best_estimator_
y_pred = best_model.predict(np.array(TestDF["BertEmbeddingsSum"].values.tolist()))
accuracy = accuracy_score(np.array(TestDF["LabelEncoded"].values.tolist()), y_pred)
print(f"Accuracy of the best model: {accuracy}")

Fitting 10 folds for each of 5 candidates, totalling 50 fits
[CV] END ...................max_depth=None, n_estimators=200; total time= 1.6min
[CV] END ...................max_depth=None, n_estimators=200; total time= 1.6min
[CV] END ...................max_depth=None, n_estimators=200; total time= 1.6min
[CV] END ...................max_depth=None, n_estimators=200; total time= 1.6min
[CV] END ...................max_depth=None, n_estimators=200; total time= 2.0min
[CV] END ...................max_depth=None, n_estimators=200; total time= 2.0min
[CV] END ...................max_depth=None, n_estimators=200; total time= 2.0min
[CV] END ...................max_depth=None, n_estimators=200; total time= 2.0min
[CV] END ...................max_depth=None, n_estimators=200; total time= 1.8min
[CV] END ...................max_depth=None, n_estimators=200; total time= 1.8min
[CV] END ...................max_depth=None, n_estimators=300; total time= 2.7min
[CV] END ...................max_depth=None, n_es

In [None]:
decision_tree_model = DecisionTreeClassifier()
parameter_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=decision_tree_model,
                           param_grid=parameter_grid,
                           scoring='accuracy',
                           cv=10,
                           n_jobs=-1)  # n_jobs=-1 to use all processors
grid_search.fit(np.array(TrainDF["BertEmbeddings"].values.tolist()), np.array(TrainDF["LabelEncoded"].values.tolist()))

best_model = grid_search.best_estimator_
y_pred = best_model.predict(np.array(TestDF["BertEmbeddings"].values.tolist()))
accuracy = accuracy_score(np.array(TestDF["LabelEncoded"].values.tolist()), y_pred)
print(f"Accuracy of the best model: {accuracy}")

Accuracy of the best model: 0.645
