In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, hamming_loss, jaccard_score, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
import mlflow
import mlflow.sklearn
from collections import Counter

In [4]:
df_train = pd.read_csv('stackoverflow_questions_cleaned_train.csv')
df_test = pd.read_csv('stackoverflow_questions_cleaned_test.csv')

# Afficher les colonnes disponibles
print(df_train.columns)

Index(['date', 'title', 'tags', 'score', 'answer_count', 'sentence_bow',
       'sentence_bow_lem', 'sentence_dl'],
      dtype='object')


## Using splitted train/test

In [20]:
number_of_tags = 30

# Créer une liste de tous les tags
all_tags = [tag for tags in df_train['tags'].apply(eval) for tag in tags]  # Utiliser eval pour convertir les chaînes de listes en listes

# Limiter les tags aux plus fréquents
top_tags = [tag for tag, count in Counter(all_tags).most_common(number_of_tags)]

# Filtrer les tags pour ne garder que les top 
df_train['filtered_tags'] = df_train['tags'].apply(lambda tags: [tag for tag in eval(tags) if tag in top_tags])
# # Filtrer les tags pour ne garder que les top 
# df_test['filtered_tags'] = df_test['tags'].apply(lambda tags: [tag for tag in eval(tags) if tag in top_tags])

print(df_test.info())

# Supprimer les lignes sans tags pour df_train
df_train = df_train[df_train['filtered_tags'].map(len) > 0]
df_test = df_test[df_test['filtered_tags'].map(len) > 0]
print(df_test.info())


# Encoder les tags avec MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=top_tags)
y_train = mlb.fit_transform(df_train['filtered_tags'])
y_test = mlb.fit_transform(df_test['filtered_tags'])

tfidf_max_features = 2000
# Vectorisation TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=tfidf_max_features)


<class 'pandas.core.frame.DataFrame'>
Index: 1408 entries, 0 to 2008
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   date              1408 non-null   object
 1   title             1408 non-null   object
 2   tags              1408 non-null   object
 3   score             1408 non-null   int64 
 4   answer_count      1408 non-null   int64 
 5   sentence_bow      1408 non-null   object
 6   sentence_bow_lem  1408 non-null   object
 7   sentence_dl       1408 non-null   object
 8   filtered_tags     1408 non-null   object
dtypes: int64(2), object(7)
memory usage: 110.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 1408 entries, 0 to 2008
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   date              1408 non-null   object
 1   title             1408 non-null   object
 2   tags              1408 non-null   object
 3   

In [21]:
# Fonction pour entraîner, évaluer et logguer le modèle
def train_and_log_model(column_name, svd_components=300):
    print(f"Training model for column: {column_name}")

    X_tfidf_train = tfidf_vectorizer.fit_transform(df_train[column_name])
    X_tfidf_test = tfidf_vectorizer.fit_transform(df_test[column_name])

    # Réduction dimensionnelle avec SVD
    svd = TruncatedSVD(n_components=svd_components)
    X_train = svd.fit_transform(X_tfidf_train)
    X_test = svd.transform(X_tfidf_test)

    # Entraîner le modèle OneVsRestClassifier avec LogisticRegression
    model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
    model.fit(X_train, y_train)

    # Prédire sur l'ensemble de test
    y_pred = model.predict(X_test)

    # Calculer et afficher les scores
    f1_micro = f1_score(y_test, y_pred, average='micro')
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    hamming = hamming_loss(y_test, y_pred)
    jaccard = jaccard_score(y_test, y_pred, average='samples')
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Score F1 (micro) for {column_name}: {f1_micro}")
    print(f"Score F1 (weighted) for {column_name}: {f1_weighted}")
    print(f"Hamming Loss for {column_name}: {hamming}")
    print(f"Jaccard Score for {column_name}: {jaccard}")
    print(f"Accuracy for {column_name}: {accuracy}")
    print('-------------------------------------------------------------')

    # Logger le modèle et les métriques sur MLflow
    with mlflow.start_run(run_name=f"{column_name}_model") as run:
    # Loguer les paramètres
        mlflow.log_param("number_of_tags", number_of_tags)
        mlflow.log_param("max_features", tfidf_max_features)
        mlflow.log_param("n_components", svd_components)
        mlflow.log_param("test_size", 0.2)
        mlflow.log_param("model", "LogisticRegression - TF-IDF + SVD")
        
        mlflow.log_param("column", column_name)
        mlflow.log_metric("f1_score_micro", f1_micro)
        mlflow.log_metric("f1_score_weighted", f1_weighted)
        mlflow.log_metric("hamming_loss", hamming)
        mlflow.log_metric("jaccard_score", jaccard)
        mlflow.log_metric("accuracy", accuracy)

        # Logger les modèles
        mlflow.sklearn.log_model(model, "model")
        mlflow.sklearn.log_model(tfidf_vectorizer, "tfidf_vectorizer")
        mlflow.sklearn.log_model(svd, "svd")

In [22]:
# Entraîner et logguer les modèles pour chaque colonne
columns_to_train = ['title', 'sentence_bow', 'sentence_bow_lem']

for column in columns_to_train:
    train_and_log_model(column)

Training model for column: title
Score F1 (micro) for title: 0.0031965903036760787
Score F1 (weighted) for title: 0.0031159511519576986
Hamming Loss for title: 0.04429450757575758
Jaccard Score for title: 0.0014204545454545455
Accuracy for title: 0.0007102272727272727
-------------------------------------------------------------




Training model for column: sentence_bow
Score F1 (micro) for sentence_bow: 0.004289544235924933
Score F1 (weighted) for sentence_bow: 0.004280664206153051
Hamming Loss for sentence_bow: 0.04396306818181818
Jaccard Score for sentence_bow: 0.0024857954545454545
Accuracy for sentence_bow: 0.002130681818181818
-------------------------------------------------------------




Training model for column: sentence_bow_lem
Score F1 (micro) for sentence_bow_lem: 0.0021413276231263384
Score F1 (weighted) for sentence_bow_lem: 0.0021111330280364866
Hamming Loss for sentence_bow_lem: 0.04412878787878788
Jaccard Score for sentence_bow_lem: 0.0014204545454545455
Accuracy for sentence_bow_lem: 0.0014204545454545455
-------------------------------------------------------------




## Using full df

In [23]:
# Charger le DataFrame nettoyé
df = pd.read_csv('stackoverflow_questions_cleaned.csv')

In [45]:
number_of_tags = 30

# Créer une liste de tous les tags
all_tags = [tag for tags in df['tags'].apply(eval) for tag in tags]  # Utiliser eval pour convertir les chaînes de listes en listes

# Limiter les tags aux plus fréquents
top_tags = [tag for tag, count in Counter(all_tags).most_common(number_of_tags)]

# Filtrer les tags pour ne garder que les top 
df['filtered_tags'] = df['tags'].apply(lambda tags: [tag for tag in eval(tags) if tag in top_tags])

# Supprimer les lignes sans tags pour df_train
df = df[df['filtered_tags'].map(len) > 0]

# Encoder les tags avec MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=top_tags)
y = mlb.fit_transform(df['filtered_tags'])

tfidf_max_features = 150
# Vectorisation TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=tfidf_max_features)


In [46]:
# Fonction pour entraîner, évaluer et logguer le modèle
def train_and_log_model(column_name, svd_components=100):
    print(f"Training model for column: {column_name}")

    X_tfidf = tfidf_vectorizer.fit_transform(df[column_name])

    # Réduction dimensionnelle avec SVD
    svd = TruncatedSVD(n_components=svd_components)
    X_reduced = svd.fit_transform(X_tfidf)
    
    X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

    # Entraîner le modèle OneVsRestClassifier avec LogisticRegression
    model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
    model.fit(X_train, y_train)

    # Prédire sur l'ensemble de test
    y_pred = model.predict(X_test)

    # Calculer et afficher les scores
    f1_micro = f1_score(y_test, y_pred, average='micro')
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    hamming = hamming_loss(y_test, y_pred)
    jaccard = jaccard_score(y_test, y_pred, average='samples')
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Score F1 (micro) for {column_name}: {f1_micro}")
    print(f"Score F1 (weighted) for {column_name}: {f1_weighted}")
    print(f"Hamming Loss for {column_name}: {hamming}")
    print(f"Jaccard Score for {column_name}: {jaccard}")
    print(f"Accuracy for {column_name}: {accuracy}")
    print('-------------------------------------------------------------')

    # Logger le modèle et les métriques sur MLflow
    with mlflow.start_run(run_name=f"{column_name}_model") as run:
    # Loguer les paramètres
        mlflow.log_param("number_of_tags", number_of_tags)
        mlflow.log_param("max_features", tfidf_max_features)
        mlflow.log_param("n_components", svd_components)
        mlflow.log_param("test_size", 0.2)
        mlflow.log_param("model", "LogisticRegression - TF-IDF + SVD")
        
        mlflow.log_param("column", column_name)
        mlflow.log_metric("f1_score_micro", f1_micro)
        mlflow.log_metric("f1_score_weighted", f1_weighted)
        mlflow.log_metric("hamming_loss", hamming)
        mlflow.log_metric("jaccard_score", jaccard)
        mlflow.log_metric("accuracy", accuracy)

        # Logger les modèles
        mlflow.sklearn.log_model(model, "model")
        mlflow.sklearn.log_model(tfidf_vectorizer, "tfidf_vectorizer")
        mlflow.sklearn.log_model(svd, "svd")

In [47]:
# Entraîner et logguer les modèles pour chaque colonne
columns_to_train = ['title', 'sentence_bow', 'sentence_bow_lem']

for column in columns_to_train:
    train_and_log_model(column)

Training model for column: title
Score F1 (micro) for title: 0.32220738762794837
Score F1 (weighted) for title: 0.2882674419163026
Hamming Loss for title: 0.035700890764181904
Jaccard Score for title: 0.19503047351148617
Accuracy for title: 0.16385372714486637
-------------------------------------------------------------




Training model for column: sentence_bow
Score F1 (micro) for sentence_bow: 0.3405002193944713
Score F1 (weighted) for sentence_bow: 0.3040585052380162
Hamming Loss for sentence_bow: 0.035232067510548526
Jaccard Score for sentence_bow: 0.20264885138302857
Accuracy for sentence_bow: 0.1659634317862166
-------------------------------------------------------------




Training model for column: sentence_bow_lem
Score F1 (micro) for sentence_bow_lem: 0.3244916003536693
Score F1 (weighted) for sentence_bow_lem: 0.28704729588855626
Hamming Loss for sentence_bow_lem: 0.03581809657759025
Jaccard Score for sentence_bow_lem: 0.19157290201593996
Accuracy for sentence_bow_lem: 0.15611814345991562
-------------------------------------------------------------




## Using full df and removing questions without tags only in train data

In [48]:
# Charger le DataFrame nettoyé
df = pd.read_csv('stackoverflow_questions_cleaned.csv')

In [49]:
number_of_tags = 30

# Créer une liste de tous les tags
all_tags = [tag for tags in df['tags'].apply(eval) for tag in tags]  # Utiliser eval pour convertir les chaînes de listes en listes

# Limiter les tags aux plus fréquents
top_tags = [tag for tag, count in Counter(all_tags).most_common(number_of_tags)]

# Filtrer les tags pour ne garder que les top 
df['filtered_tags'] = df['tags'].apply(lambda tags: [tag for tag in eval(tags) if tag in top_tags])

# Séparer les questions avec et sans tags
df_with_tags = df[df['filtered_tags'].map(len) > 0]
df_without_tags = df[df['filtered_tags'].map(len) == 0]

# Encoder les tags avec MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=top_tags)
y = mlb.fit_transform(df_with_tags['filtered_tags'])

tfidf_max_features = 150
# Vectorisation TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=tfidf_max_features)


In [50]:
# Fonction pour entraîner, évaluer et logguer le modèle
def train_and_log_model(column_name, svd_components=100):
    print(f"Training model for column: {column_name}")

    X_tfidf = tfidf_vectorizer.fit_transform(df_with_tags[column_name])

    # Réduction dimensionnelle avec SVD
    svd = TruncatedSVD(n_components=svd_components)
    X_reduced = svd.fit_transform(X_tfidf)
    
    X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

    # Ajouter les questions sans tags au jeu de test
    X_test_no_tags = tfidf_vectorizer.transform(df_without_tags[column_name])
    X_test_no_tags_reduced = svd.transform(X_test_no_tags)
    y_test_no_tags = [[0]*len(top_tags)] * len(df_without_tags)

    X_test = pd.concat([pd.DataFrame(X_test), pd.DataFrame(X_test_no_tags_reduced)], ignore_index=True)
    y_test = pd.concat([pd.DataFrame(y_test), pd.DataFrame(y_test_no_tags)], ignore_index=True)

    # Entraîner le modèle OneVsRestClassifier avec LogisticRegression
    model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
    model.fit(X_train, y_train)

    # Prédire sur l'ensemble de test
    y_pred = model.predict(X_test)

    # Calculer et afficher les scores
    f1_micro = f1_score(y_test, y_pred, average='micro')
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    hamming = hamming_loss(y_test, y_pred)
    jaccard = jaccard_score(y_test, y_pred, average='samples')
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Score F1 (micro) for {column_name}: {f1_micro}")
    print(f"Score F1 (weighted) for {column_name}: {f1_weighted}")
    print(f"Hamming Loss for {column_name}: {hamming}")
    print(f"Jaccard Score for {column_name}: {jaccard}")
    print(f"Accuracy for {column_name}: {accuracy}")
    print('-------------------------------------------------------------')

    # Logger le modèle et les métriques sur MLflow
    with mlflow.start_run(run_name=f"{column_name}_model") as run:
        # Loguer les paramètres
        mlflow.log_param("number_of_tags", number_of_tags)
        mlflow.log_param("max_features", tfidf_max_features)
        mlflow.log_param("n_components", svd_components)
        mlflow.log_param("test_size", 0.2)
        mlflow.log_param("model", "LogisticRegression - TF-IDF + SVD")
        
        mlflow.log_param("column", column_name)
        mlflow.log_metric("f1_score_micro", f1_micro)
        mlflow.log_metric("f1_score_weighted", f1_weighted)
        mlflow.log_metric("hamming_loss", hamming)
        mlflow.log_metric("jaccard_score", jaccard)
        mlflow.log_metric("accuracy", accuracy)

        # Logger les modèles
        mlflow.sklearn.log_model(model, "model")
        mlflow.sklearn.log_model(tfidf_vectorizer, "tfidf_vectorizer")
        mlflow.sklearn.log_model(svd, "svd")


In [51]:
# Entraîner et logguer les modèles pour chaque colonne
columns_to_train = ['title', 'sentence_bow', 'sentence_bow_lem']

for column in columns_to_train:
    train_and_log_model(column)

Training model for column: title


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Score F1 (micro) for title: 0.3004577611319184
Score F1 (weighted) for title: 0.26985365036968845
Hamming Loss for title: 0.012854630266880784
Jaccard Score for title: 0.06350844995029441
Accuracy for title: 0.6930488644184446
-------------------------------------------------------------




Training model for column: sentence_bow


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Score F1 (micro) for sentence_bow: 0.31840796019900497
Score F1 (weighted) for sentence_bow: 0.28283383289969255
Hamming Loss for sentence_bow: 0.012571690754760266
Jaccard Score for sentence_bow: 0.06509520532232163
Accuracy for sentence_bow: 0.6987841247992659
-------------------------------------------------------------




Training model for column: sentence_bow_lem


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Score F1 (micro) for sentence_bow_lem: 0.3078838174273859
Score F1 (weighted) for sentence_bow_lem: 0.2726941156918363
Hamming Loss for sentence_bow_lem: 0.012755219086946547
Jaccard Score for sentence_bow_lem: 0.06324080446585607
Accuracy for sentence_bow_lem: 0.6960311998164717
-------------------------------------------------------------




In [52]:
# Sauvegarder les DataFrames d'entraînement et de test dans des fichiers CSV
df_train, df_test_with_tags = train_test_split(df_with_tags, test_size=0.2, random_state=42)
df_test = pd.concat([df_test_with_tags, df_without_tags], ignore_index=True)
df_train.to_csv('stackoverflow_train_tags_only.csv', index=False)
df_test.to_csv('stackoverflow_test_tags_and_no_tags.csv', index=False)

## Using splitted train/test v2

In [11]:
number_of_tags = 30

# Créer une liste de tous les tags
all_tags = [tag for tags in df_train['tags'].apply(eval) for tag in tags]  # Utiliser eval pour convertir les chaînes de listes en listes

# Limiter les tags aux plus fréquents
top_tags = [tag for tag, count in Counter(all_tags).most_common(number_of_tags)]

# Filtrer les tags pour ne garder que les top 
df_train['filtered_tags'] = df_train['tags'].apply(lambda tags: [tag for tag in eval(tags) if tag in top_tags])
df_test['filtered_tags'] = df_test['tags'].apply(lambda tags: [tag for tag in eval(tags) if tag in top_tags])

# Supprimer les lignes sans tags pour df_train
df_train = df_train[df_train['filtered_tags'].map(len) > 0]

# Encoder les tags avec MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=top_tags)
y_train = mlb.fit_transform(df_train['filtered_tags'])
y_test = mlb.transform(df_test['filtered_tags'])

tfidf_max_features = 2000
# Vectorisation TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=tfidf_max_features)


In [12]:
# Fonction pour entraîner, évaluer et logguer le modèle
def train_and_log_model(column_name, svd_components=300):
    print(f"Training model for column: {column_name}")

    X_tfidf_train = tfidf_vectorizer.fit_transform(df_train[column_name])
    X_tfidf_test = tfidf_vectorizer.transform(df_test[column_name])

    # Réduction dimensionnelle avec SVD
    svd = TruncatedSVD(n_components=svd_components)
    X_train = svd.fit_transform(X_tfidf_train)
    X_test = svd.transform(X_tfidf_test)

    # Entraîner le modèle OneVsRestClassifier avec LogisticRegression
    model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
    model.fit(X_train, y_train)

    # Prédire sur l'ensemble de test
    y_pred = model.predict(X_test)

    # Calculer et afficher les scores
    f1_micro = f1_score(y_test, y_pred, average='micro')
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    hamming = hamming_loss(y_test, y_pred)
    jaccard = jaccard_score(y_test, y_pred, average='samples')
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Score F1 (micro) for {column_name}: {f1_micro}")
    print(f"Score F1 (weighted) for {column_name}: {f1_weighted}")
    print(f"Hamming Loss for {column_name}: {hamming}")
    print(f"Jaccard Score for {column_name}: {jaccard}")
    print(f"Accuracy for {column_name}: {accuracy}")
    print('-------------------------------------------------------------')

    # Logger le modèle et les métriques sur MLflow
    with mlflow.start_run(run_name=f"{column_name}_model") as run:
    # Loguer les paramètres
        mlflow.log_param("number_of_tags", number_of_tags)
        mlflow.log_param("max_features", tfidf_max_features)
        mlflow.log_param("n_components", svd_components)
        mlflow.log_param("test_size", 0.2)
        mlflow.log_param("model", "LogisticRegression - TF-IDF + SVD")
        
        mlflow.log_param("column", column_name)
        mlflow.log_metric("f1_score_micro", f1_micro)
        mlflow.log_metric("f1_score_weighted", f1_weighted)
        mlflow.log_metric("hamming_loss", hamming)
        mlflow.log_metric("jaccard_score", jaccard)
        mlflow.log_metric("accuracy", accuracy)

        # Logger les modèles
        mlflow.sklearn.log_model(model, "model")
        mlflow.sklearn.log_model(tfidf_vectorizer, "tfidf_vectorizer")
        mlflow.sklearn.log_model(svd, "svd")

In [13]:
# Entraîner et logguer les modèles pour chaque colonne
columns_to_train = ['title', 'sentence_bow', 'sentence_bow_lem']

for column in columns_to_train:
    train_and_log_model(column)

Training model for column: title


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Score F1 (micro) for title: 0.2991150442477876
Score F1 (weighted) for title: 0.27419273149543383
Hamming Loss for title: 0.02626865671641791
Jaccard Score for title: 0.12723880597014925
Accuracy for title: 0.38308457711442784
-------------------------------------------------------------




Training model for column: sentence_bow


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Score F1 (micro) for sentence_bow: 0.28660994178235555
Score F1 (weighted) for sentence_bow: 0.2612148145754262
Hamming Loss for sentence_bow: 0.026417910447761195
Jaccard Score for sentence_bow: 0.11977611940298508
Accuracy for sentence_bow: 0.37860696517412934
-------------------------------------------------------------




Training model for column: sentence_bow_lem


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Score F1 (micro) for sentence_bow_lem: 0.29159212880143115
Score F1 (weighted) for sentence_bow_lem: 0.2661763450249109
Hamming Loss for sentence_bow_lem: 0.02626865671641791
Jaccard Score for sentence_bow_lem: 0.12375621890547264
Accuracy for sentence_bow_lem: 0.3850746268656716
-------------------------------------------------------------


