### Packages

In [1]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model  import LogisticRegression
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import numpy as np
from tqdm import tqdm
import time
from sklearn.model_selection import StratifiedKFold
from joblib import dump

### Chargement du dataset

In [2]:
df = pd.read_csv("../data/ecommerceDataset_clean.csv")
df.head(3)

Unnamed: 0,labels,descriptions,desc_clean
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,paper plane design frame wall hang motivationa...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",saf floral frame paint wood inch inch special ...
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,saf texture modern art print frame paint synth...


In [3]:
descriptions = df["descriptions"].map(str)
labels = df["labels"]

### Classifieur

In [4]:
def init_model(model, params):
    model = Pipeline([("tfidf", TfidfVectorizer(**params)),
                      ("clf", model)
                    ])

    return model

In [5]:
def get_score_model(model, X, y, skfold, verbose=False):

    metrics = []
    for train_index, test_index in tqdm(skfold.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        metrics.append(f1_score(y_test, y_pred, average="macro"))

    metrics = np.array(metrics)
    mean_score = np.mean(metrics, axis=0)
    std_score = np.std(metrics, axis=0)

    if verbose:
        print('Mean F1 score: ', mean_score)
        print('Std F1 score: ', std_score)
        
    return mean_score, std_score 

In [6]:
params = {
            "lowercase": False, 
            "max_df": 0.9,
            "min_df": 3, 
            "ngram_range": (1, 2)
        }

nb = init_model(MultinomialNB(), params)
lr = init_model(LogisticRegression(max_iter=500), params)
lsvc = init_model(LinearSVC(random_state=42), params)

In [7]:
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
entries = []

start_time = time.time()
mean_score, std_score = get_score_model(nb, descriptions, labels, skfold)
entries.append(("MultinomialNB", mean_score, std_score, time.time() - start_time))

start_time = time.time()
mean_score, std_score = get_score_model(lr, descriptions, labels, skfold)
entries.append(("LogisticRegression", mean_score, std_score, time.time() - start_time))

start_time = time.time()
mean_score, std_score = get_score_model(lsvc, descriptions, labels, skfold)
entries.append(("LinearSVC", mean_score, std_score, time.time() - start_time))

5it [00:56, 11.38s/it]
5it [04:41, 56.32s/it]
5it [01:08, 13.69s/it]


In [8]:
entries_df = pd.DataFrame(entries, columns=["Model name", "Mean F1 score", "Standard deviation", "Time execution(seconds)"])
entries_df.set_index('Model name', inplace=True)
entries_df

Unnamed: 0_level_0,Mean F1 score,Standard deviation,Time execution(seconds)
Model name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MultinomialNB,0.942345,0.001469,56.890371
LogisticRegression,0.967854,0.001422,281.605662
LinearSVC,0.982501,0.00084,68.450801
