In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')

# Silence library warnings to keep the notebook output readable
import warnings
warnings.simplefilter("ignore")

# Load the pre-cleaned dataset (path is relative to the notebook's working directory)
data = pd.read_csv("data/cleaned_dataframe.csv")
data.head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/laurent/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,Tags,Merged_doc
0,['c#'],convert decimal double c# want assign decimal ...
1,"['html', 'css']",width collapse percentage width child element ...
2,"['c#', '.net']",calculate someone age base datetime type birth...
3,"['javascript', 'jquery', 'html', 'css']",jquery javascript find left inner edge element...
4,['c#'],calculate relative time c# give specific datet...


In [2]:
from sklearn.model_selection import train_test_split

X = data.drop('Tags', axis=1)  # features: every column except 'Tags'
y = data['Tags']               # labels: the raw 'Tags' column

# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [3]:
# Tokenise the TRAINING documents only. The LDA dictionary/corpus built from this
# list is later indexed positionally against y_train, and the test split must not
# leak into topic modelling. fillna("") (rather than dropna) keeps exactly one
# token list per training row so positions stay aligned with y_train.
corpus_tokenized = X_train['Merged_doc'].fillna("").apply(word_tokenize).tolist()

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer
import ast

# Binarise the tags into a multi-hot indicator matrix.
# ast.literal_eval parses each stored tag string into a Python list first.
# The binarizer is fitted on the training labels only and reused for the test labels.
mlb = MultiLabelBinarizer()
Y_train_bin = mlb.fit_transform(y_train.apply(ast.literal_eval))
Y_test_bin = mlb.transform(y_test.apply(ast.literal_eval))

In [5]:
# Sanity check: container type and shape of the binarised test labels
print("Y_test_bin:", type(Y_test_bin), Y_test_bin.shape)

Y_test_bin: <class 'numpy.ndarray'> (15000, 50)


In [6]:
from sklearn.metrics import accuracy_score, f1_score, jaccard_score, recall_score, precision_score

# NOTE(review): not referenced anywhere else in the visible notebook; kept for compatibility.
model_comparaison_metrics = pd.DataFrame()

def metric_score(model_name, Y_test_bin, Y_pred_bin, metrics_df=None):
    """Score one model's multilabel predictions and add them as a column.

    Parameters
    ----------
    model_name : str
        Column name under which the scores are stored.
    Y_test_bin : array-like
        Binarised ground-truth labels.
    Y_pred_bin : array-like
        Binarised predicted labels, same shape as ``Y_test_bin``.
    metrics_df : pandas.DataFrame, optional
        Comparison table from previous calls (rows = metrics, columns = models).
        ``None`` or an empty frame starts a new table.

    Returns
    -------
    pandas.DataFrame
        A new table containing the previous columns plus ``model_name``.
        The frame passed in is left untouched, so re-running a cell cannot
        silently alter a table that an earlier cell displayed.
    """
    scores = pd.Series(
        {
            'Accuracy': accuracy_score(Y_test_bin, Y_pred_bin),
            'F1 score': f1_score(Y_test_bin, Y_pred_bin, average='macro'),
            'Jaccard': jaccard_score(Y_test_bin, Y_pred_bin, average='macro'),
            'Recall': recall_score(Y_test_bin, Y_pred_bin, average='macro'),
            'Precision': precision_score(Y_test_bin, Y_pred_bin, average='macro'),
        },
        name=model_name,
    )

    # First model: the scores alone form the table
    if metrics_df is None or metrics_df.empty:
        return scores.to_frame()

    # Work on a copy so the caller's frame is not mutated as a side effect
    updated = metrics_df.copy()
    updated[model_name] = scores
    return updated

# COUNT-VECTORIZER

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=1000, max_df=0.8, min_df=2)

# Learn the vocabulary on X_train only
X_train_vect = vectorizer.fit_transform(X_train['Merged_doc'])

# Transform X_test with the same fitted vectorizer
X_test_vect = vectorizer.transform(X_test['Merged_doc'])

# Display the learned vocabulary and the dense training count matrix
print("Vocabulaire: ", vectorizer.vocabulary_)
print("Feature Matrix (train):\n", X_train_vect.toarray())

Vocabulaire:  {'create': 213, 'use': 939, 'new': 589, 'go': 396, 'lot': 521, 'question': 709, 'com': 167, 'find': 355, 'solution': 813, 'requirement': 744, 'post': 674, 'dynamic': 287, 'property': 696, 'column': 165, 'name': 579, 'set': 792, 'value': 949, 'bind': 94, 'data': 226, 'follow': 368, 'step': 839, 'achieve': 5, 'correct': 203, 'wrong': 991, 'contains': 195, 'row': 759, 'class': 150, 'add': 12, 'make': 526, 'list': 506, 'do': 273, 'public': 702, 'foreach': 372, 'col': 162, 'var': 950, 'string': 846, 'object': 608, 'tostring': 907, 'else': 295, 'execute': 325, 'loop': 520, 'contain': 193, 'last': 485, 'please': 662, 'help': 415, 'fix': 361, 'center': 136, 'image': 432, 'look': 519, 'like': 501, 'document': 276, 'function': 382, 'customer': 223, 'true': 918, 'false': 344, 'setting': 793, 'body': 99, 'background': 80, 'color': 164, 'text': 884, 'align': 21, 'pad': 629, 'px': 705, 'margin': 532, 'img': 433, 'width': 976, 'position': 672, 'display': 268, 'block': 97, 'box': 108, 's

# TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Word-level TF-IDF over unigrams and bigrams
tfidf_vectorizer =  TfidfVectorizer(analyzer="word", 
    min_df=5,  # More permissive
    max_df=0.8,  # Ignore overly frequent terms
    ngram_range=(1, 2),
    )

# Create TF-IDF features: fit on the training documents, reuse the fitted vectorizer for test
X_tfidf_train = tfidf_vectorizer.fit_transform(X_train['Merged_doc'])
X_tfidf_test = tfidf_vectorizer.transform(X_test['Merged_doc'])
feature_names_tfidf = tfidf_vectorizer.get_feature_names_out()

# Persist the fitted vectorizer to disk
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer_2.pkl')

# Report (n_documents, n_features) for each split
print("Le jeu d'entrainement comporte {} documents pour {} mots".format(X_tfidf_train.shape[0], X_tfidf_train.shape[1]))
print("Le jeu de Test comporte {} documents pour {} mots".format(X_tfidf_test.shape[0], X_tfidf_test.shape[1]))

Le jeu d'entrainement comporte 35000 documents pour 120018 mots
Le jeu de Test comporte 15000 documents pour 120018 mots


In [9]:
# Convert the TF-IDF matrices to dense DataFrames for inspection.
# NOTE(review): per the shapes printed by the TF-IDF cell this materialises a
# 35000 x 120018 dense matrix for the training split alone (tens of GB if the
# values are 8-byte floats) — confirm the machine has enough memory.
X_tfidf_train_dense = pd.DataFrame(X_tfidf_train.todense(), columns=feature_names_tfidf)
X_tfidf_test_dense = pd.DataFrame(X_tfidf_test.todense(), columns=feature_names_tfidf)

X_tfidf_train_dense.head()

Unnamed: 0,aa,aa aa,aa ab,aa bb,aa ce,aa dd,aaa,aaa aaa,aaa bbb,aaaa,...,zx,zxing,zygoteinit,zygoteinit java,zygoteinit main,zygoteinit methodandargscaller,zz,zza,zzb,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## PCA

In [None]:
from sklearn.decomposition import PCA
import numpy as np

def display_scree_plot(pca):
    """Draw the scree plot of a fitted PCA.

    Bars show the percentage of variance explained by each component; the red
    line shows the cumulative percentage.

    Parameters
    ----------
    pca : fitted estimator exposing ``explained_variance_ratio_``
    """
    variance_pct = 100 * pca.explained_variance_ratio_
    ranks = np.arange(1, len(variance_pct) + 1)  # 1-based component rank

    plt.figure(figsize=(8, 8))
    plt.bar(ranks, variance_pct)
    plt.plot(ranks, variance_pct.cumsum(), c="red", marker='o')
    plt.xlabel("rang de l'axe d'inertie")
    plt.ylabel("pourcentage d'inertie")
    plt.title("Eboulis des valeurs propres")
    plt.show()

: 

In [None]:
from sklearn.preprocessing import StandardScaler

def pca_transformation(train , test):
    """Standardise the data, show the scree plot and keep 80% of the variance.

    Parameters
    ----------
    train : array-like of shape (n_samples, n_features)
        Training features; the scaler and both PCA models are fitted on it.
    test : array-like with the same number of features
        Test features; only transformed.

    Returns
    -------
    train_pca, test_pca : numpy.ndarray
        Projections of the scaled train / test data on the retained components.
    pca : PCA
        The fitted PCA retaining 80% of the variance.
    """
    scaler = StandardScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)

    # A PCA can extract at most min(n_samples, n_features) components; asking for
    # n_features raises a ValueError whenever there are more features than samples.
    n_comp = min(train.shape)
    pca = PCA(n_components=n_comp)
    pca.fit(train)
    display_scree_plot(pca)

    # Second fit: keep just enough components to explain 80% of the variance
    pca = PCA(n_components=0.8, random_state=42)
    pca.fit(train)
    train_pca = pca.transform(train)
    test_pca = pca.transform(test)
    print("\nNous conservons {} composantes principales pour garder 80% d'inertie".format(pca.components_.shape[0]))
    return train_pca, test_pca, pca

X_train_tfidf_pca, X_test_tfidf_pca, pca_tfidf = pca_transformation(X_tfidf_train_dense, X_tfidf_test_dense)

## NMF

In [None]:
from sklearn.decomposition import NMF

def plot_top_words(model, feature_names,
                   n_top_words, nb_topic_plot, title):
    """Plot, for each of the first topics, its most heavily weighted words.

    Parameters
    ----------------------------------------
    model : fitted decomposition model
        Model exposing ``components_`` (one row of word weights per topic).
    feature_names : array
        Vocabulary of the vectorizer, aligned with the columns of
        ``model.components_``.
    n_top_words : int
        Number of words displayed for each topic.
    nb_topic_plot : int
        Number of topics to draw (capped at the number of topics in the
        model). Any value >= 1 is accepted; the grid has 6 columns and as
        many rows as needed.
    title : string
        Title of the whole figure.

    Raises
    ----------------------------------------
    ValueError
        If there is no topic to plot.
    ----------------------------------------
    """
    n_cols = 6
    n_plots = min(nb_topic_plot, len(model.components_))
    if n_plots < 1:
        raise ValueError("nb_topic_plot must be >= 1 and the model must contain at least one topic")

    # Ceiling division: a partially filled last row still needs its axes
    rows = -(-n_plots // n_cols)
    # squeeze=False always yields a 2-D array of axes, whatever the grid shape
    fig, axes = plt.subplots(rows, n_cols,
                             figsize=(30, rows*10),
                             sharex=True,
                             squeeze=False)
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_[:n_plots]):
        # Indices of the n_top_words largest weights, in decreasing order
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        bartopic = ax.barh(top_features, weights, height=0.7)
        if len(bartopic) > 0:
            # Highlight the single most important word of the topic
            bartopic[0].set_color('#f48023')
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in 'top right left'.split():
            ax.spines[side].set_visible(False)

    # Hide the grid cells that received no topic
    for unused_ax in axes[n_plots:]:
        unused_ax.set_visible(False)

    fig.suptitle(title, fontsize=36, color="#641E16")
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

  # Define number of topics to test
n_topics = 12

print("-"*50)
print("Start NMF fitting on Full_doc ...")
print("-" * 50)
# Initialise the NMF model (NNDSVD initialisation, fixed seed)
full_nmf = NMF(n_components=n_topics,
               init='nndsvd',
               random_state=8)

# Fit directly on the sparse TF-IDF training matrix: NMF accepts sparse input,
# so the dense copy of the matrix is not needed here.
full_nmf.fit(X_tfidf_train)

print("End of training :")
print("-" * 50)

# Plot the first 6 topics
ff_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(full_nmf, ff_feature_names, 20, 6,
               'Topics in NMF model for Full_doc')

# LDA

In [None]:
from gensim import corpora
from gensim.models import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore

# Dictionary and bag-of-words corpus built from the tokenised documents
common_dictionary = corpora.Dictionary(corpus_tokenized)
# Drop tokens that appear in fewer than 1000 documents
common_dictionary.filter_extremes(no_below=1000)
common_corpus = [common_dictionary.doc2bow(text) for text in corpus_tokenized]

# One coherence score per candidate number of topics (2..19)
coherences = []
topic_range = list(range(2, 20))

for n_topics in topic_range:
    lda_model = LdaMulticore(corpus=common_corpus,
                             id2word=common_dictionary,
                             num_topics=n_topics,
                             passes=5,
                             chunksize=2000,
                             random_state=42,
                             workers=7,
                             per_word_topics=False)

    # c_v coherence computed from the fitted model against the tokenised texts
    cm = CoherenceModel(model=lda_model, texts=corpus_tokenized, dictionary=common_dictionary, coherence='c_v')
    coherence_score = cm.get_coherence()
    coherences.append(coherence_score)
    print(f"{n_topics} topics → coherence: {coherence_score:.4f}")

In [None]:
# Pick the number of topics with the highest c_v coherence
best_index = coherences.index(max(coherences))
best_num_topics = topic_range[best_index]

fig, ax = plt.subplots(figsize=(12,8))
ax.plot(topic_range, coherences, marker='o')
ax.axvline(x=best_num_topics, color='g', alpha=.8,
           linestyle='dashdot', label='Best param')
ax.set_xlabel("Nombre de topics")
ax.set_ylabel("Score de cohérence (c_v)")
ax.set_title("Évaluation de la cohérence des topics LDA")
ax.grid(True)
# Without a legend the 'Best param' label of the vertical line is never rendered
ax.legend()
plt.show()

In [None]:
from gensim.models.ldamulticore import LdaMulticore
import gensim.corpora as corpora
from gensim import models
from gensim.utils import simple_preprocess
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis


# Retrain the LDA with the selected number of topics.
# random_state is fixed (as for the models of the coherence sweep) so that the
# topics, the topic->tag mapping and the logged metrics are reproducible.
best_lda_model = LdaMulticore(corpus=common_corpus,
                              id2word=common_dictionary,
                              num_topics=best_num_topics,
                              per_word_topics=True,
                              passes=10,
                              random_state=42,
                              workers=7)


# Interactive topic visualisation rendered inline in the notebook
pyLDAvis.enable_notebook()
gensimvis.prepare(best_lda_model, common_corpus, common_dictionary)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

def map_topics_to_tags(lda_model, y_train, corpus, top_n_docs=10):
    """Associate each LDA topic with the tags of its most representative documents.

    For every topic, the ``top_n_docs`` documents with the highest probability
    for that topic are selected and the union of their tags is returned.

    Parameters
    ----------
    lda_model : object
        Fitted topic model exposing ``num_topics`` and
        ``get_document_topics(bow, minimum_probability=...)`` returning
        ``(topic_id, probability)`` pairs.
    y_train : pandas.Series
        Tags of the documents, positionally aligned with ``corpus``. Each entry
        is either a list of tags or its string representation.
    corpus : iterable
        Bag-of-words documents, in the same order as ``y_train``.
    top_n_docs : int, default 10
        Number of top documents considered per topic.

    Returns
    -------
    dict[int, set]
        Topic id -> set of tags. A topic no document is assigned to maps to an
        empty set.
    """
    import ast
    from collections import defaultdict

    # Group (document position, probability) by the topic id carried in each pair.
    # The returned list may omit topics, so the position of a pair inside the
    # list must NOT be used as its topic id.
    docs_by_topic = defaultdict(list)
    for doc_idx, doc in enumerate(corpus):
        for topic_id, prob in lda_model.get_document_topics(doc, minimum_probability=0.0):
            docs_by_topic[topic_id].append((doc_idx, prob))

    n_labels = len(y_train)
    topic_to_tags = {}

    for topic_idx in range(lda_model.num_topics):
        ranked = sorted(docs_by_topic[topic_idx], key=lambda pair: pair[1], reverse=True)

        tags = set()
        for doc_idx, _ in ranked[:top_n_docs]:
            if doc_idx >= n_labels:
                continue  # no label available for this position

            doc_tags = y_train.iloc[doc_idx]
            if isinstance(doc_tags, str):
                # Tags stored as the string form of a list, e.g. "['c#', '.net']"
                doc_tags = ast.literal_eval(doc_tags)
            tags.update(doc_tags)

        topic_to_tags[topic_idx] = tags

    return topic_to_tags

# Build the topic -> tags mapping from the fitted LDA model and the training labels
topic_to_tags = map_topics_to_tags(best_lda_model, y_train, common_corpus)

# Show the tags associated with each topic
for topic, tags in topic_to_tags.items():
    print(f"Topic {topic}: {tags}")

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import MultiLabelBinarizer
import ast
import pandas as pd

# Start a fresh comparison table for this series of experiments
metrics_df = pd.DataFrame()

mlflow.end_run()  # close any run left open by a previous cell

mlflow.set_experiment("LDA_Topic_Modeling")

with mlflow.start_run(run_name=f"LDA_{best_num_topics}_topics"):

    mlflow.log_param("num_topics", best_num_topics)
    mlflow.log_param("passes", 10)
    mlflow.log_param("workers", 7)
    mlflow.log_param("top_n_docs", 10)
    mlflow.log_param("top_n_topics", 3)

    # --- 1. Prepare the test corpus for the LDA ---
    # Tokenise the test documents (same method as for train). fillna("") keeps one
    # entry per row so predictions stay aligned with y_test.
    corpus_test_tokenized = X_test['Merged_doc'].fillna("").apply(word_tokenize).tolist()

    # Bag-of-words encoding with the existing dictionary
    common_corpus_test = [common_dictionary.doc2bow(text) for text in corpus_test_tokenized]

    # --- 2. Tag prediction from the LDA and the topic -> tags mapping ---

    def predict_tags_from_lda(lda_model, corpus_bow_test, topic_to_tags, top_n_topics=3):
        """Predict, for each document, the union of the tags of its top topics."""
        all_predicted_tags = []
        for bow in corpus_bow_test:
            # Topic distribution of the document
            doc_topics = lda_model.get_document_topics(bow, minimum_probability=0)
            # Sort by decreasing probability
            doc_topics_sorted = sorted(doc_topics, key=lambda x: x[1], reverse=True)
            # Keep the N most probable topics
            top_topics = [topic for topic, prob in doc_topics_sorted[:top_n_topics]]

            # Collect every tag associated with those topics
            predicted_tags = set()
            for topic in top_topics:
                predicted_tags.update(topic_to_tags.get(topic, []))
            all_predicted_tags.append(list(predicted_tags))
        return all_predicted_tags

    # --- 3. Parse the tags (stored as the string form of a list) ---
    def _parse_tags(value):
        return ast.literal_eval(value) if isinstance(value, str) else value

    y_train_parsed = y_train.apply(_parse_tags)
    y_test_parsed = y_test.apply(_parse_tags)

    # --- 4. Predictions ---
    y_pred_tags = predict_tags_from_lda(best_lda_model, common_corpus_test, topic_to_tags)

    # --- 5. Binarise the tags ---
    # Fit on every tag observed in train + test. Both sides must be PARSED lists:
    # fitting on the raw strings would make the binarizer treat each character
    # ("[", "'", "c", ...) as a class and distort the macro-averaged metrics.
    all_tags = list(y_train_parsed) + list(y_test_parsed)
    mlb.fit(all_tags)

    Y_test_bin = mlb.transform(y_test_parsed)
    Y_pred_bin = mlb.transform(y_pred_tags)

    metrics_df = metric_score("LDA", Y_test_bin, Y_pred_bin, metrics_df)

    mlflow.log_metric("accuracy", metrics_df.loc["Accuracy", "LDA"])
    mlflow.log_metric("f1_score", metrics_df.loc["F1 score", "LDA"])
    mlflow.log_metric("jaccard", metrics_df.loc["Jaccard", "LDA"])
    mlflow.log_metric("recall", metrics_df.loc["Recall", "LDA"])
    mlflow.log_metric("precision", metrics_df.loc["Precision", "LDA"])

mlflow.end_run()

metrics_df

In [None]:
import tempfile
import os

# Attach the fitted LDA model to the run created by the LDA evaluation cell.
# That run has already ended: logging without re-opening it would make MLflow
# silently start a new, unnamed run and leave it active.
_lda_run = mlflow.last_active_run()
with mlflow.start_run(run_id=_lda_run.info.run_id if _lda_run is not None else None,
                      run_name=None if _lda_run is not None else "LDA_model_artifacts"):
    with tempfile.TemporaryDirectory() as tmp_dir:
        model_path = os.path.join(tmp_dir, "lda_model")
        best_lda_model.save(model_path)
        # gensim can write companion files next to model_path (state, dictionary,
        # large arrays); log the whole directory so the saved model stays loadable.
        mlflow.log_artifacts(tmp_dir, artifact_path="lda_model")

# LOGISTIC REGRESSION

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import ast
import pandas as pd
import mlflow
import mlflow.sklearn

mlflow.end_run()  # close any run left open by a previous cell

# Binarise the labels (binarizer fitted on the training labels only)
mlb = MultiLabelBinarizer()
Y_train_bin = mlb.fit_transform(y_train.apply(ast.literal_eval))
Y_test_bin = mlb.transform(y_test.apply(ast.literal_eval))

print(f"Nombre de classes dans mlb: {len(mlb.classes_)}")
print(f"Shape Y_train_bin: {Y_train_bin.shape}")
print(f"Shape Y_test_bin: {Y_test_bin.shape}")

# Hyper-parameter grid; the "estimator__" prefix targets the LogisticRegression
# wrapped by OneVsRestClassifier
param_logit = {
    "estimator__C": [100, 10, 1.0, 0.1],
    "estimator__penalty": ["l1", "l2"],
    "estimator__dual": [False],
    "estimator__solver": ["liblinear"]
}

# Start of the MLflow run
mlflow.set_experiment("Multilabel_TFIDF_LogReg")

with mlflow.start_run(run_name="GridSearch_LogReg", nested=True):

    # Grid search with 5-fold cross-validation
    multi_logit_cv = GridSearchCV(
        OneVsRestClassifier(LogisticRegression()),
        param_grid=param_logit,
        n_jobs=-1,
        cv=5,
        scoring="f1_weighted",
        return_train_score=True,
        refit=True,
        verbose=3
    )
    # Fit on the SPARSE TF-IDF matrix: liblinear handles sparse input natively,
    # which avoids shipping a multi-gigabyte dense matrix to every worker.
    multi_logit_cv.fit(X_tfidf_train, Y_train_bin)

    # Log the best hyper-parameters
    mlflow.log_params(multi_logit_cv.best_params_)

    # Predictions
    y_test_predicted_labels_tfidf = multi_logit_cv.predict(X_tfidf_test)

    # Quick look at the predictions
    print(f"Shape des prédictions: {y_test_predicted_labels_tfidf.shape}")
    print(f"Nombre de 1 dans les prédictions: {y_test_predicted_labels_tfidf.sum()}")
    print(f"Pourcentage de 1: {y_test_predicted_labels_tfidf.mean()*100:.2f}%")

    # Metrics
    metrics_df = metric_score("Log_Reg", Y_test_bin, y_test_predicted_labels_tfidf, metrics_df)
    mlflow.log_metric("accuracy", metrics_df.loc["Accuracy", "Log_Reg"])
    mlflow.log_metric("f1_score", metrics_df.loc["F1 score", "Log_Reg"])
    mlflow.log_metric("jaccard", metrics_df.loc["Jaccard", "Log_Reg"])
    mlflow.log_metric("recall", metrics_df.loc["Recall", "Log_Reg"])
    mlflow.log_metric("precision", metrics_df.loc["Precision", "Log_Reg"])

    # Log the model
    mlflow.sklearn.log_model(multi_logit_cv.best_estimator_, "model")

    # Optional: log the cross-validation results as a CSV artifact
    logit_cv_results = pd.DataFrame.from_dict(multi_logit_cv.cv_results_)
    logit_cv_results.to_csv("logit_cv_results.csv", index=False)
    mlflow.log_artifact("logit_cv_results.csv")

    # Compare the first predicted tag sets with the true ones
    y_test_pred_inversed = mlb.inverse_transform(y_test_predicted_labels_tfidf)
    y_test_inversed = mlb.inverse_transform(Y_test_bin)
    print("Predicted:", y_test_pred_inversed[:10])
    print("True:", y_test_inversed[:10])

mlflow.end_run()


In [None]:
metrics_df

# RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
import mlflow
import mlflow.sklearn
import pandas as pd

mlflow.end_run()  # close any run left open by a previous cell

# Hyper-parameter grid; the "estimator__" prefix targets the RandomForestClassifier
# wrapped by OneVsRestClassifier
param_rfc = {
    "estimator__max_depth": [5, 25, 50],
    "estimator__min_samples_leaf": [1, 5, 10],
    "estimator__class_weight": ["balanced"]
}

mlflow.set_experiment("Multilabel_TFIDF_RF")

# A single run for the whole search (the cell previously opened a first, empty
# run whose body only re-declared the grid)
with mlflow.start_run(run_name="GridSearch_RF", nested=True):

    multi_rfc_cv = GridSearchCV(
        # Fixed seed so that the search and the refitted forest are reproducible
        OneVsRestClassifier(RandomForestClassifier(random_state=42)),
        param_grid=param_rfc,
        n_jobs=-1,
        cv=2,
        scoring="f1_weighted",
        return_train_score=True,
        refit=True,
        verbose=3
    )

    # Fit on the sparse TF-IDF matrix (random forests accept sparse input),
    # avoiding the multi-gigabyte dense copy
    multi_rfc_cv.fit(X_tfidf_train, Y_train_bin)

    # Log the best hyper-parameters (fitted attributes end with an underscore)
    mlflow.log_params(multi_rfc_cv.best_params_)

    # Cross-validation results
    rfc_cv_results = pd.DataFrame.from_dict(multi_rfc_cv.cv_results_)
    rfc_cv_results.to_csv("rfc_cv_results.csv", index=False)
    mlflow.log_artifact("rfc_cv_results.csv")

    print("-"*50)
    print("Best params for RandomForestClassifier")
    print("-"*50)
    print(multi_rfc_cv.best_params_)

    y_test_predicted_labels_tfidf_rfc = multi_rfc_cv.predict(X_tfidf_test)

    print(f"Shape des prédictions: {y_test_predicted_labels_tfidf_rfc.shape}")
    print(f"Nombre de 1 dans les prédictions: {y_test_predicted_labels_tfidf_rfc.sum()}")
    print(f"Pourcentage de 1: {y_test_predicted_labels_tfidf_rfc.mean()*100:.2f}%")

    # Metrics
    metrics_df = metric_score("RF", Y_test_bin, y_test_predicted_labels_tfidf_rfc, metrics_df)
    mlflow.log_metric("accuracy", metrics_df.loc["Accuracy", "RF"])
    mlflow.log_metric("f1_score", metrics_df.loc["F1 score", "RF"])
    mlflow.log_metric("jaccard", metrics_df.loc["Jaccard", "RF"])
    mlflow.log_metric("recall", metrics_df.loc["Recall", "RF"])
    mlflow.log_metric("precision", metrics_df.loc["Precision", "RF"])

    # Log the model
    mlflow.sklearn.log_model(multi_rfc_cv.best_estimator_, "model")

    # Show the first 10 predictions
    y_test_pred_inversed = mlb.inverse_transform(y_test_predicted_labels_tfidf_rfc)
    y_test_inversed = mlb.inverse_transform(Y_test_bin)

    print("-"*50)
    print("Print 10 first predicted Tags vs true Tags")
    print("-"*50)
    print("Predicted:", y_test_pred_inversed[:10])
    print("True:", y_test_inversed[:10])

mlflow.end_run()


Fitting 2 folds for each of 9 candidates, totalling 18 fits


In [None]:
metrics_df

# WORD2VEC

In [None]:
import mlflow
import mlflow.sklearn
import numpy as np
import tempfile
import os
import json
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Tokenisation. fillna("") (rather than dropna) keeps exactly one token list per
# row, so the document vectors stay aligned with Y_train_bin / Y_test_bin.
X_train_tok = X_train['Merged_doc'].fillna("").apply(word_tokenize).tolist()
X_test_tok = X_test['Merged_doc'].fillna("").apply(word_tokenize).tolist()

# === Start of the MLflow run ===
mlflow.set_experiment("Word2Vec_Classifier")

with mlflow.start_run(run_name="W2V + LogisticRegression", nested=True):

    # Hyper-parameters
    vector_size = 100
    window = 5
    min_count = 2
    sg = 1  # skip-gram

    # Log the parameters
    mlflow.log_params({
        "vector_size": vector_size,
        "window": window,
        "min_count": min_count,
        "sg": sg,
        "classifier": "LogisticRegression"
    })

    # Train Word2Vec on the training documents only
    model_w2v = Word2Vec(sentences=X_train_tok, vector_size=vector_size,
                         window=window, min_count=min_count, workers=4, sg=sg)

    # Save the Word2Vec model
    with tempfile.TemporaryDirectory() as tmpdir:
        w2v_path = os.path.join(tmpdir, "word2vec.model")
        model_w2v.save(w2v_path)
        # gensim may store large arrays in separate files next to w2v_path;
        # log the whole directory so the saved model stays loadable.
        mlflow.log_artifacts(tmpdir, artifact_path="w2v_model")

    # Document vectorisation
    def vectorize_doc(doc_tokens, model):
        """Average the vectors of the in-vocabulary tokens (zeros if there are none)."""
        vectors = [model.wv[word] for word in doc_tokens if word in model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

    X_train_vec = np.array([vectorize_doc(doc, model_w2v) for doc in X_train_tok])
    X_test_vec = np.array([vectorize_doc(doc, model_w2v) for doc in X_test_tok])

    # Classification
    clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
    clf.fit(X_train_vec, Y_train_bin)

    Y_pred_bin = clf.predict(X_test_vec)

    # Log the scikit-learn model
    mlflow.sklearn.log_model(clf, "logistic_model")

    # Metrics
    metrics_df = metric_score("Word2Vec", Y_test_bin, Y_pred_bin, metrics_df)

    mlflow.log_metric("accuracy", metrics_df.loc["Accuracy", "Word2Vec"])
    mlflow.log_metric("f1_score", metrics_df.loc["F1 score", "Word2Vec"])
    mlflow.log_metric("jaccard", metrics_df.loc["Jaccard", "Word2Vec"])
    mlflow.log_metric("recall", metrics_df.loc["Recall", "Word2Vec"])
    mlflow.log_metric("precision", metrics_df.loc["Precision", "Word2Vec"])

mlflow.end_run()


In [None]:
metrics_df

# BERT

In [None]:
import os
import tensorflow_hub as hub
import tensorflow as tf
import transformers
from transformers import AutoTokenizer, AutoConfig, TFAutoModel
import time

In [None]:
# Sentence preparation helper
def bert_inp_fct(sentences, bert_tokenizer, max_length) :
    """Encode sentences one by one with ``bert_tokenizer.encode_plus``.

    Each sentence is padded / truncated to ``max_length``.

    Returns
    -------
    input_ids, token_type_ids, attention_mask : numpy.ndarray
        One row per sentence.
    bert_inp_tot : list of tuple
        ``(input_ids, token_type_ids, attention_mask)`` for each sentence.
    """
    fields = ('input_ids', 'token_type_ids', 'attention_mask')
    bert_inp_tot = []

    for sent in sentences:
        encoded = bert_tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,
            truncation=True,
            return_tensors="tf",
        )
        # The tokenizer returns a batch of one: keep its single row for each field
        bert_inp_tot.append(tuple(encoded[field][0] for field in fields))

    # Stack each field across sentences
    input_ids = np.asarray([triple[0] for triple in bert_inp_tot])
    token_type_ids = np.asarray([triple[1] for triple in bert_inp_tot])
    attention_mask = np.array([triple[2] for triple in bert_inp_tot])

    return input_ids, token_type_ids, attention_mask, bert_inp_tot


# Feature creation helper
def feature_BERT_fct(model, model_type, sentences, max_length, b_size, mode='HF') :
    """Compute one BERT feature vector per sentence.

    Sentences are tokenised and run through ``model`` in batches of ``b_size``;
    the last hidden states are averaged over the token axis.

    Parameters
    ----------
    model : BERT model
        A HuggingFace TF model (``mode='HF'``) or a TensorFlow Hub model
        (``mode='TFhub'``).
    model_type : str
        Name passed to ``AutoTokenizer.from_pretrained``.
    sentences : list, pandas.Series or pandas.DataFrame
        Texts to encode. For a DataFrame the ``'Merged_doc'`` column is used.
    max_length : int
        Padding / truncation length in tokens.
    b_size : int
        Batch size used for tokenisation and prediction.
    mode : {'HF', 'TFhub'}, default 'HF'

    Returns
    -------
    features_bert : numpy.ndarray
        Mean of the last hidden states over the token axis, one row per sentence.
    last_hidden_states_tot : numpy.ndarray
        Last hidden states of every sentence.

    Raises
    ------
    ValueError
        If ``mode`` is unknown or ``sentences`` is empty.
    TypeError
        If ``sentences`` is not a list, Series or DataFrame.
    """
    if mode not in ('HF', 'TFhub'):
        raise ValueError("Unknown mode {!r}: expected 'HF' or 'TFhub'.".format(mode))

    batch_size = b_size
    batch_size_pred = b_size
    bert_tokenizer = AutoTokenizer.from_pretrained(model_type)
    time1 = time.time()

    # Ensure sentences is a list of strings
    if isinstance(sentences, pd.DataFrame):
        sentences = sentences['Merged_doc'].tolist()
    elif isinstance(sentences, pd.Series):
        sentences = sentences.tolist()
    elif not isinstance(sentences, list):
        raise TypeError("Input 'sentences' must be a list, a pandas Series or a pandas DataFrame.")

    if not sentences:
        raise ValueError("Input 'sentences' is empty: nothing to encode.")

    # Collect the batches and concatenate once at the end, instead of
    # re-copying the growing result at every step
    hidden_state_batches = []

    for idx in range(0, len(sentences), batch_size):
        # The slice handles the last, possibly smaller, batch
        current_batch_sentences = sentences[idx:idx + batch_size]

        input_ids, token_type_ids, attention_mask, _ = bert_inp_fct(current_batch_sentences,
                                                                    bert_tokenizer, max_length)

        if mode == 'HF':    # Bert HuggingFace
            outputs = model.predict([input_ids, attention_mask, token_type_ids], batch_size=batch_size_pred)
            last_hidden_states = outputs.last_hidden_state
        else:               # Bert Tensorflow Hub
            text_preprocessed = {"input_word_ids" : input_ids,
                                 "input_mask" : attention_mask,
                                 "input_type_ids" : token_type_ids}
            outputs = model(text_preprocessed)
            last_hidden_states = outputs['sequence_output']

        hidden_state_batches.append(np.asarray(last_hidden_states))

    last_hidden_states_tot = np.concatenate(hidden_state_batches)
    # Mean pooling over the token axis
    features_bert = last_hidden_states_tot.mean(axis=1)

    time2 = np.round(time.time() - time1,0)
    print("temps traitement : ", time2)

    return features_bert, last_hidden_states_tot

In [None]:
import tensorflow as tf
from transformers import TFAutoModel

# BERT settings: token length per document, batch size and pretrained checkpoint
max_length = 32
batch_size = 10
model_type = 'bert-base-uncased'

# Load the pretrained TensorFlow model for the chosen checkpoint
model = TFAutoModel.from_pretrained(model_type)
# NOTE(review): these two aliases are not used by the visible cells below, which pass X_train / X_test directly
sentences_train = X_train
sentences_test = X_test

In [None]:
print(len(X_train))

In [None]:
import numpy as np

# BERT features for the training documents (pooled features + raw last hidden states)
X_bert_train, last_hidden_hf_train = feature_BERT_fct(model, model_type,
                                                   X_train, max_length,
                                                   batch_size, mode='HF')

In [None]:
X_bert_train.shape

In [None]:
# BERT features for the test documents, computed with the same model and settings
X_bert_test, last_hidden_hf_test = feature_BERT_fct(model, model_type,
                                                   X_test, max_length,
                                                   batch_size, mode='HF')

In [None]:
from sklearn.decomposition import PCA

# Aliases of the BERT feature matrices
X_train_bert_dense = X_bert_train
X_test_bert_dense = X_bert_test

# Reduce the BERT features to 100 principal components; the PCA is fitted on
# the training features only and reused to project the test features
pca = PCA(n_components=100, random_state=42)
X_train_bert_pca = pca.fit_transform(X_train_bert_dense)
X_test_bert_pca = pca.transform(X_test_bert_dense)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

mlflow.set_experiment("Multilabel_BERT")

with mlflow.start_run(run_name="LogReg_BERT", nested=True):
    # One-vs-rest logistic regression on the PCA-reduced BERT features
    classifier = OneVsRestClassifier(LogisticRegression(max_iter=1000))
    classifier.fit(X_train_bert_pca, Y_train_bin)

    # Predict
    Y_pred_bin = classifier.predict(X_test_bert_pca)

    # Metrics
    metrics_df = metric_score("Bert", Y_test_bin, Y_pred_bin, metrics_df)

    # Log params
    mlflow.log_param("BERT_model", model_type)
    mlflow.log_param("max_length", max_length)
    mlflow.log_param("pca_components", 100)

    # Log metrics (right after computing them)
    mlflow.log_metric("accuracy", metrics_df.loc["Accuracy", "Bert"])
    mlflow.log_metric("f1_score", metrics_df.loc["F1 score", "Bert"])
    mlflow.log_metric("jaccard", metrics_df.loc["Jaccard", "Bert"])
    mlflow.log_metric("recall", metrics_df.loc["Recall", "Bert"])
    mlflow.log_metric("precision", metrics_df.loc["Precision", "Bert"])

    # Log model
    mlflow.sklearn.log_model(classifier, "model")

mlflow.end_run()

# Final display
metrics_df


In [None]:
metrics_df.to_csv('data/metrics_df_2.csv', index=True)

# USE (Universal Sentence Encoder)

In [None]:
import os
import time
import numpy as np
import joblib

import tensorflow as tf
import tensorflow_hub as hub
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Set the temporary cache directory used by TensorFlow Hub
os.environ["TFHUB_CACHE_DIR"] = "/tmp/tfhub_cache"

# Load the USE (Universal Sentence Encoder) model
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")



In [None]:
def feature_USE_fct(sentences, batch_size):
    """Embed ``sentences`` in batches with the module-level ``embed`` model.

    Parameters
    ----------
    sentences : sequence supporting ``len`` and slicing
        Texts to encode. NumPy array batches are flattened to a plain list.
    batch_size : int
        Number of sentences sent to the model per call.

    Returns
    -------
    numpy.ndarray
        Embeddings of all batches stacked row-wise.
    """
    started_at = time.time()

    def _as_model_input(chunk):
        # Flatten array batches (e.g. 2-D) into a plain list
        return chunk.ravel().tolist() if isinstance(chunk, np.ndarray) else chunk

    embedded_chunks = [
        embed(_as_model_input(sentences[offset:offset + batch_size])).numpy()
        for offset in range(0, len(sentences), batch_size)
    ]

    features = np.vstack(embedded_chunks)
    print(f"Temps d'encodage : {np.round(time.time() - started_at, 2)} sec")

    return features


In [None]:
print(type(X_train))
print(np.array(X_train).shape)

In [None]:
# Extract the raw documents as flat lists of strings for the sentence encoder.
# Selecting the text column by name (instead of np.ravel on the whole frame)
# prevents any other column from being interleaved with the documents, and the
# isinstance guards make the cell safe to re-run after the conversion.
if isinstance(X_train, pd.DataFrame):
    X_train = X_train['Merged_doc'].tolist()
if isinstance(X_test, pd.DataFrame):
    X_test = X_test['Merged_doc'].tolist()

In [None]:
from sklearn.preprocessing import StandardScaler

def display_scree_plot(pca):
    """Draw the scree plot of a fitted PCA.

    Bars show the percentage of variance explained by each component; the red
    line shows the cumulative percentage.

    Parameters
    ----------
    pca : fitted estimator exposing ``explained_variance_ratio_``
    """
    variance_pct = 100 * pca.explained_variance_ratio_
    ranks = np.arange(1, len(variance_pct) + 1)  # 1-based component rank

    plt.figure(figsize=(8, 8))
    plt.bar(ranks, variance_pct)
    plt.plot(ranks, variance_pct.cumsum(), c="red", marker='o')
    plt.xlabel("rang de l'axe d'inertie")
    plt.ylabel("pourcentage d'inertie")
    plt.title("Eboulis des valeurs propres")
    plt.show()

def pca_transformation(train , test):
    """Standardise the data, show the scree plot and keep 80% of the variance.

    Parameters
    ----------
    train : array-like of shape (n_samples, n_features)
        Training features; the scaler and both PCA models are fitted on it.
    test : array-like with the same number of features
        Test features; only transformed.

    Returns
    -------
    train_pca, test_pca : numpy.ndarray
        Projections of the scaled train / test data on the retained components.
    pca : PCA
        The fitted PCA retaining 80% of the variance.
    """
    scaler = StandardScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)

    # A PCA can extract at most min(n_samples, n_features) components; asking for
    # n_features raises a ValueError whenever there are more features than samples.
    n_comp = min(train.shape)
    pca = PCA(n_components=n_comp)
    pca.fit(train)
    display_scree_plot(pca)

    # Second fit: keep just enough components to explain 80% of the variance
    pca = PCA(n_components=0.8, random_state=42)
    pca.fit(train)
    train_pca = pca.transform(train)
    test_pca = pca.transform(test)
    print("\nNous conservons {} composantes principales pour garder 80% d'inertie".format(pca.components_.shape[0]))
    return train_pca, test_pca, pca

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

import ast

# Binarise the multilabel targets.
# The tags are stored as the string form of a list, so they must be parsed first:
# fitting the binarizer on the raw strings would turn every CHARACTER of the
# string into a class instead of every tag.
def _parse_tags(value):
    return ast.literal_eval(value) if isinstance(value, str) else value

mlb = MultiLabelBinarizer()
Y_train_bin = mlb.fit_transform(y_train.apply(_parse_tags))
Y_test_bin = mlb.transform(y_test.apply(_parse_tags))

batch_size = 10

In [None]:
mlflow.set_experiment("Multilabel_USE")

with mlflow.start_run(run_name="LogReg_USE", nested=True):
    # Encode the documents with USE
    X_use_train = feature_USE_fct(X_train, batch_size)
    X_use_test = feature_USE_fct(X_test, batch_size)

    # PCA (scaler and PCA fitted on train, applied to test)
    X_train_use_pca, X_test_use_pca, pca_use = pca_transformation(X_use_train, X_use_test)
    n_components = pca_use.n_components_

    # Log the parameters
    mlflow.log_param("embedding", "USE")
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("pca_components", n_components)
    mlflow.log_param("C", 10)
    mlflow.log_param("penalty", "l1")
    mlflow.log_param("solver", "liblinear")

    # Model: one-vs-rest logistic regression with the hyper-parameters logged above
    logit_use_pca = OneVsRestClassifier(
        LogisticRegression(C=10, penalty="l1", dual=False, solver="liblinear"),
        n_jobs=-1
    )
    logit_use_pca.fit(X_train_use_pca, Y_train_bin)

    # Predictions
    Y_pred_use = logit_use_pca.predict(X_test_use_pca)

    metrics_df = metric_score("USE", Y_test_bin, Y_pred_use, metrics_df)

    # Log the metrics
    mlflow.log_metric("accuracy", metrics_df.loc["Accuracy", "USE"])
    mlflow.log_metric("f1_score", metrics_df.loc["F1 score", "USE"])
    mlflow.log_metric("jaccard", metrics_df.loc["Jaccard", "USE"])
    mlflow.log_metric("recall", metrics_df.loc["Recall", "USE"])
    mlflow.log_metric("precision", metrics_df.loc["Precision", "USE"])

    # Log the model
    mlflow.sklearn.log_model(logit_use_pca, "model")

mlflow.end_run()

metrics_df

In [None]:
metrics_df.to_csv('data/metrics_df_3.csv', index=True)

In [None]:
# Launch the MLflow tracking UI on port 5000.
# NOTE(review): this presumably keeps running until interrupted and so blocks the
# kernel — confirm, or start it from a terminal instead.
!mlflow ui --port 5000