In [1]:
import ast
from collections import Counter

import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, jaccard_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load the cleaned train and test question sets from the working directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'stackoverflow_questions_cleaned_train.csv',
        'stackoverflow_questions_cleaned_test.csv',
    )
)

# Show which columns are available in the training frame.
print(df_train.columns)

Index(['date', 'title', 'tags', 'score', 'answer_count', 'sentence_bow',
       'sentence_bow_lem', 'sentence_dl'],
      dtype='object')


In [11]:
number_of_tags = 50

# The 'tags' column holds each tag list as a string. Parse it once per frame
# with ast.literal_eval, which only accepts Python literals, rather than eval,
# which would execute any expression found in the CSV.
train_tag_lists = df_train['tags'].apply(ast.literal_eval)
test_tag_lists = df_test['tags'].apply(ast.literal_eval)

# Flatten every training tag into one list so frequencies can be counted.
all_tags = [tag for tags in train_tag_lists for tag in tags]

# Keep only the most frequent tags (ordered from most to least common).
top_tags = [tag for tag, count in Counter(all_tags).most_common(number_of_tags)]
top_tag_set = set(top_tags)  # set membership instead of scanning the list per tag


def _keep_top_tags(tags):
    """Return the tags of one question that belong to top_tags, in original order."""
    return [tag for tag in tags if tag in top_tag_set]


# Restrict each question's tags to the retained vocabulary.
df_train['filtered_tags'] = train_tag_lists.apply(_keep_top_tags)
df_test['filtered_tags'] = test_tag_lists.apply(_keep_top_tags)

# Drop training rows left without any tag. The explicit copy keeps later column
# assignments (e.g. re-running this cell) from raising SettingWithCopyWarning.
# NOTE(review): df_test is not filtered the same way, so y_test can contain
# all-zero rows -- confirm this is the intended evaluation setup.
df_train = df_train[df_train['filtered_tags'].map(len) > 0].copy()

# Encode the tag lists as a binary indicator matrix, one column per top tag.
mlb = MultiLabelBinarizer(classes=top_tags)
y_train = mlb.fit_transform(df_train['filtered_tags'])
y_test = mlb.transform(df_test['filtered_tags'])

tfidf_max_features = 500
# TF-IDF vectorizer shared by the training runs; it is fitted later, per column.
tfidf_vectorizer = TfidfVectorizer(max_features=tfidf_max_features)


In [12]:
# Train, evaluate and log (to MLflow) one model for a given text column
def train_and_log_model(column_name, svd_components=300, random_state=42):
    """Fit TF-IDF + SVD + one-vs-rest logistic regression on one column and log it.

    Reads the module-level ``df_train``, ``df_test``, ``y_train``, ``y_test``,
    ``tfidf_vectorizer``, ``number_of_tags`` and ``tfidf_max_features``. The
    shared ``tfidf_vectorizer`` is re-fitted on every call.

    Parameters
    ----------
    column_name : str
        Name of the text column of ``df_train`` / ``df_test`` to vectorize.
    svd_components : int, default 300
        Number of components kept by ``TruncatedSVD``.
    random_state : int, default 42
        Seed passed to ``TruncatedSVD`` so that the reduced features, and
        therefore the reported metrics, are reproducible between runs.

    Returns
    -------
    None
        Scores are printed; parameters, metrics and the fitted objects are
        logged to a new MLflow run named ``"<column_name>_model"``.
    """
    print(f"Training model for column: {column_name}")

    X_tfidf_train = tfidf_vectorizer.fit_transform(df_train[column_name])
    X_tfidf_test = tfidf_vectorizer.transform(df_test[column_name])

    # Dimensionality reduction with SVD, seeded for reproducible results.
    svd = TruncatedSVD(n_components=svd_components, random_state=random_state)
    X_train = svd.fit_transform(X_tfidf_train)
    X_test = svd.transform(X_tfidf_test)

    var_explained = svd.explained_variance_ratio_.sum()
    print(f'var_explained = {var_explained}')

    # One binary logistic regression per tag.
    model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
    model.fit(X_train, y_train)

    # Predict on the test set.
    y_pred = model.predict(X_test)

    # Compute and display the scores.
    f1_micro = f1_score(y_test, y_pred, average='micro')
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    hamming = hamming_loss(y_test, y_pred)
    jaccard = jaccard_score(y_test, y_pred, average='samples')
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Score F1 (micro) for {column_name}: {f1_micro}")
    print(f"Score F1 (weighted) for {column_name}: {f1_weighted}")
    print(f"Hamming Loss for {column_name}: {hamming}")
    print(f"Jaccard Score for {column_name}: {jaccard}")
    print(f"Accuracy for {column_name}: {accuracy}")
    print('-------------------------------------------------------------')

    # Log parameters, metrics and fitted objects to MLflow.
    with mlflow.start_run(run_name=f"{column_name}_model"):
        mlflow.log_params({
            "number_of_tags": number_of_tags,
            "max_features": tfidf_max_features,
            "n_components": svd_components,
            "var_explained": var_explained,
            # NOTE(review): hard-coded; no split happens in this function --
            # confirm it matches how the train/test CSVs were produced.
            "test_size": 0.2,
            "model": "LogisticRegression - TF-IDF + SVD",
            "column": column_name,
            "random_state": random_state,
        })
        mlflow.log_metrics({
            "f1_score_micro": f1_micro,
            "f1_score_weighted": f1_weighted,
            "hamming_loss": hamming,
            "jaccard_score": jaccard,
            "accuracy": accuracy,
        })

        # Log the classifier and both fitted transformers as separate artifacts.
        mlflow.sklearn.log_model(model, "model")
        mlflow.sklearn.log_model(tfidf_vectorizer, "tfidf_vectorizer")
        mlflow.sklearn.log_model(svd, "svd")

In [13]:
# Entraîner et logguer les modèles pour chaque colonne
# Text columns to compare: one model is trained and logged for each of them.
columns_to_train = [
    'title',
    'sentence_bow',
    'sentence_bow_lem',
]

for column in columns_to_train:
    train_and_log_model(column)

Training model for column: title
var_explained = 0.8236330557880823


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Score F1 (micro) for title: 0.31049250535331907
Score F1 (weighted) for title: 0.2890029192490611
Hamming Loss for title: 0.019223880597014926
Jaccard Score for title: 0.1472636815920398
Accuracy for title: 0.31094527363184077
-------------------------------------------------------------




Training model for column: sentence_bow
var_explained = 0.8091276245443009


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Score F1 (micro) for sentence_bow: 0.312701252236136
Score F1 (weighted) for sentence_bow: 0.2884291126844779
Hamming Loss for sentence_bow: 0.019114427860696517
Jaccard Score for sentence_bow: 0.1468905472636816
Accuracy for sentence_bow: 0.3074626865671642
-------------------------------------------------------------




Training model for column: sentence_bow_lem
var_explained = 0.8148789405804975


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Score F1 (micro) for sentence_bow_lem: 0.3142857142857143
Score F1 (weighted) for sentence_bow_lem: 0.29032875335314046
Hamming Loss for sentence_bow_lem: 0.0191044776119403
Jaccard Score for sentence_bow_lem: 0.14937810945273633
Accuracy for sentence_bow_lem: 0.30845771144278605
-------------------------------------------------------------


