### Chargement de donnÃ©es

In [19]:
import pandas as pd

df = pd.read_csv('../data/processed/data_stemmed.csv')

### 1. Vectoriser le texte Ã  lâ€™aide de TfidfVectorizer()

- Label et label_text

In [29]:
df[["label", "label_text"]].drop_duplicates()

Unnamed: 0,label,label_text
0,1,spam
1,0,ham


- SÃ©parer features et labels

In [20]:
y = df["label"]
X = df["stemmed_text"]

- Split train/test

In [21]:
from sklearn.model_selection import train_test_split

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("X - Train", len(X_train_text))
print("X - Test", len(X_test_text))
print("y - Train", len(y_train))
print("y - Test", len(y_test))

X - Train 23040
X - Test 5761
y - Train 23040
y - Test 5761


- Vectorisation du texte

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=8000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

print(X_train.shape, X_test.shape)

(23040, 8000) (5761, 8000)


### 2. EntraÃ®ner plusieurs modÃ¨les de classification.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

# ModÃ¨les
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

# 3. Liste des modÃ¨les Ã  tester
models = {
    "Logistic Regression": LogisticRegression(max_iter=300),
    "Linear SVM": LinearSVC(),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=200),
    "SGDClassifier": SGDClassifier(loss="hinge"),
}

# 4. Stocker les rÃ©sultats
results = []

for name, model in models.items():
    print(f"ðŸ”¹ Training {name}...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    accuracy = accuracy_score(y_test, preds)
    precision = precision_score(y_test, preds, pos_label=1)
    recall = recall_score(y_test, preds, pos_label=1)
    f1 = f1_score(y_test, preds, pos_label=1)

    results.append([name, accuracy, precision, recall, f1])

# 5. Afficher les rÃ©sultats
import pandas as pd

results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-score"])


ðŸ”¹ Training Logistic Regression...
ðŸ”¹ Training Linear SVM...
ðŸ”¹ Training Naive Bayes...
ðŸ”¹ Training Random Forest...
ðŸ”¹ Training SGDClassifier (SVM-like)...


In [42]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score
0,Logistic Regression,0.985072,0.976915,0.991447,0.984127
1,Linear SVM,0.988196,0.984832,0.989959,0.987389
2,Naive Bayes,0.981427,0.976032,0.984381,0.980189
3,Random Forest,0.984378,0.977933,0.988843,0.983358
4,SGDClassifier,0.988544,0.983413,0.99219,0.987782


SGDClassifier est le modÃ©le le plus performant

### 3. Optimiser les modÃ¨les (GridSearch).

- Logistic Regression

In [74]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

params_lr = {
    "C": [5, 10, 15, 20, 30, 40, 50],
    "max_iter": [40, 50, 70, 100]
}

grid_lr = GridSearchCV(LogisticRegression(), params_lr, cv=5, scoring="f1")
grid_lr.fit(X_train, y_train)

print(grid_lr.best_params_)
print(grid_lr.best_score_)

{'C': 30, 'max_iter': 40}
0.9881564982517437


- Linear SVC

In [None]:
from sklearn.svm import LinearSVC

params_svm = {
    "C": [0.01, 0.1, 0.5, 1, 2, 3, 5],
    "loss": ["hinge", "squared_hinge"],
    "max_iter": [2000, 3000, 5000]
}

grid_svm = GridSearchCV(LinearSVC(), params_svm, cv=5, scoring="accuracy")
grid_svm.fit(X_train, y_train)

print(grid_svm.best_params_)
print(grid_svm.best_score_)



{'C': 0.5, 'loss': 'squared_hinge', 'max_iter': 2000}
0.9884114583333332


- Naive Bayes

In [53]:
from sklearn.naive_bayes import MultinomialNB

params_nb = {
    "alpha": [0.1, 0.5, 1.0, 2.0, 5.0]
}

grid_nb = GridSearchCV(MultinomialNB(), params_nb, cv=5, scoring="f1")
grid_nb.fit(X_train, y_train)

print(grid_nb.best_params_)
print(grid_nb.best_score_)

{'alpha': 0.1}
0.9824198952223366


- Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

params_rf = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20, 40],
    "max_features": ["sqrt", "log2"],
    "min_samples_split": [2, 5, 10],
}

rnd_rf = GridSearchCV(
    RandomForestClassifier(),
    params_rf,
    cv=2,
    scoring="f1")

rnd_rf.fit(X_train, y_train)

print(rnd_rf.best_params_)
print(rnd_rf.best_score_)

- SGDClassifier

In [61]:
from sklearn.linear_model import SGDClassifier

params_sgd = {
    "loss": ["hinge", "log_loss", "modified_huber"],
    "alpha": [1e-4, 1e-3, 1e-2],
    "penalty": ["l2", "l1", "elasticnet"],
    "max_iter": [2000, 3000, 4000, 5000]
}

grid_sgd = GridSearchCV(
    SGDClassifier(),
    params_sgd,
    cv=5,
    scoring="f1"
)

grid_sgd.fit(X_train, y_train)

print(grid_sgd.best_params_)
print(grid_sgd.best_score_)


{'alpha': 0.0001, 'loss': 'modified_huber', 'max_iter': 3000, 'penalty': 'l2'}
0.9880216474178318


In [81]:
preds = grid_lr.best_estimator_.predict(X_test)
accuracy_lr = accuracy_score(y_test, preds)
preds = grid_svm.best_estimator_.predict(X_test)
accuracy_svm = accuracy_score(y_test, preds)
preds = grid_nb.best_estimator_.predict(X_test)
accuracy_nb = accuracy_score(y_test, preds)
preds = grid_sgd.best_estimator_.predict(X_test)
accuracy_sgd = accuracy_score(y_test, preds)

results = pd.DataFrame({
    "Model": ["LogReg", "SVM", "NB", "SGD"],
    "Best F1": [
        grid_lr.best_score_,
        grid_svm.best_score_,
        grid_nb.best_score_,
        grid_sgd.best_score_,
    ],
    "Best Accuracy": [
        accuracy_lr,
        accuracy_svm,
        accuracy_nb,
        accuracy_sgd,
    ]
})

print(results)


    Model   Best F1  Best Accuracy
0  LogReg  0.988156       0.988196
1     SVM  0.988411       0.988370
2      NB  0.982420       0.981774
3     SGD  0.988022       0.988891


D'aprÃ©s les resultats de cross validation, on constate que SVM est le modÃ©le le plus performant

### 5. Sauvegarder le modÃ¨le final.

In [82]:
import joblib

# Sauvegarde du modÃ¨le final
joblib.dump(grid_svm.best_estimator_, "../models/spam_classifier_model.pkl")

# Sauvegarde du TF-IDF vectorizer
joblib.dump(vectorizer, "../models/tfidf_vectorizer.pkl")

['../models/tfidf_vectorizer.pkl']