In [2]:
import os

import numpy as np
import pandas as pd

In [3]:
# Location of the spam CSV. The default is the original author's local path; set the
# SPAM_DATASET_PATH environment variable to run the notebook on another machine
# without editing this cell.
DATASET_PATH = os.environ.get("SPAM_DATASET_PATH", "/Users/baptiste/cours/course_mlops/data/spam.csv")

# Fraction of rows held out for the test split (several values were tried; 0.2 worked well).
TEST_SIZE = 0.2

In [4]:
# Read the CSV, decoding it as ISO-8859-1.
df = pd.read_csv(DATASET_PATH, encoding="ISO-8859-1")

# Preview the first rows, then the (rows, columns) shape.
print(df.head(), df.shape, sep="\n")

  label                                            message Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
(5572, 5)


In [5]:
# Number of rows per label value.
label_counts = df["label"].value_counts()
print("Label distribution :", label_counts, sep="\n")

# First two raw rows of each class, shown as plain arrays.
is_spam = df["label"] == "spam"
print("Example of 'spam' message :")
print(df[is_spam].head(2).values)

is_ham = df["label"] == "ham"
print("Example of 'ham' message:")
print(df[is_ham].head(2).values)

Label distribution :
label
ham     4825
spam     747
Name: count, dtype: int64
Example of 'spam' message :
[['spam'
  "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
  nan nan nan]
 ['spam'
  "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, Ã¯Â¿Â½1.50 to rcv"
  nan nan nan]]
Example of 'ham' message:
[['ham'
  'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
  nan nan nan]
 ['ham' 'Ok lar... Joking wif u oni...' nan nan nan]]


In [6]:
##### Remove duplicates and null values #####

print(f"Before: {len(df)}")

# Drop rows missing a label or a message first (the heading promises null removal, and
# the string cleaning further down calls re.sub on every message), then keep the first
# occurrence of each distinct message text.
df = df.dropna(subset=["label", "message"]).drop_duplicates(subset=["message"])

print(f"After: {len(df)}")

Before: 5572
After: 5169


In [7]:
import re
import string

# Cleaning pipeline: lowercase, then remove URLs, e-mail-like tokens and digit runs,
# then delete ASCII punctuation and collapse whitespace runs into single spaces.
cleaned = df["message"].str.lower()
for pattern in (r"http\S+|www\S+|https\S+", r"\S+@\S+", r"\d+"):
    cleaned = cleaned.apply(lambda text, pattern=pattern: re.sub(pattern, "", text))

# Translation table that deletes every character listed in string.punctuation.
punct_table = str.maketrans("", "", string.punctuation)
cleaned = cleaned.apply(lambda text: text.translate(punct_table))
df["message_clean"] = cleaned.apply(lambda text: " ".join(text.split()))

# Before/after check on the first row.
print(f"Before: {df['message'].iloc[0]}")
print(f"After: {df['message_clean'].iloc[0]}")

Before: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
After: go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat


In [8]:
# Hand-crafted features computed on the raw (uncleaned) message text.
df["message_length"] = df["message"].apply(len)
df["word_count"] = df["message"].apply(lambda x: len(x.split()))
# Guard both ratios against a zero denominator (empty or whitespace-only message): the
# masked denominator turns the division into NaN, which is then replaced by 0. This
# mirrors the `... if x > 0 else 0` guards that predict_spam applies at inference time.
df["avg_word_length"] = (df["message_length"] / df["word_count"].where(df["word_count"] > 0)).fillna(0.0)

df["caps_count"] = df["message"].apply(lambda x: sum(1 for c in x if c.isupper()))
df["caps_ratio"] = (df["caps_count"] / df["message_length"].where(df["message_length"] > 0)).fillna(0.0)

# Count of characters that belong to the fixed set in the literal below.
df["special_chars"] = df["message"].apply(lambda x: sum(1 for c in x if c in "!?$â‚¬Â£%"))

print("features added :")
print(df[["message_length", "word_count", "caps_ratio", "special_chars"]].describe())

features added :
       message_length   word_count   caps_ratio  special_chars
count      5169.00000  5169.000000  5169.000000    5169.000000
mean         79.23196    15.340685     0.063230       0.533372
std          58.33921    11.068488     0.110651       0.958554
min           2.00000     1.000000     0.000000       0.000000
25%          36.00000     7.000000     0.025000       0.000000
50%          61.00000    12.000000     0.035971       0.000000
75%         117.00000    22.000000     0.056250       1.000000
max         910.00000   171.000000     1.000000      13.000000


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Settings kept after trying several values: a 5000-term cap and unigrams + bigrams
# worked well.
# NOTE(review): the vectorizer is fitted on every row of df before the train/test split
# performed further down, so the vocabulary and IDF weights also see held-out messages.
tfidf_settings = {
    "max_features": 5000,
    "ngram_range": (1, 2),
    "min_df": 2,
    "max_df": 0.95,
    "stop_words": "english",
}
vectorizer = TfidfVectorizer(**tfidf_settings)

X_tfidf = vectorizer.fit_transform(df["message_clean"])
print(f"Shape TF-IDF: {X_tfidf.shape}")

Shape TF-IDF: (5169, 5000)


In [10]:
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler

# Numeric columns appended after the TF-IDF block; this list order is the column order
# the scaler is fitted on.
numerical_features = ["message_length", "word_count", "avg_word_length", "caps_ratio", "special_chars"]

# Scale the numeric columns, then wrap them as CSR so they can be stacked with TF-IDF.
# NOTE(review): like the vectorizer, the scaler is fitted on all rows before the split.
scaler = StandardScaler()
X_numerical = df[numerical_features].values
X_numerical_scaled = scaler.fit_transform(X_numerical)
X_numerical_sparse = csr_matrix(X_numerical_scaled)

# Column layout of the final matrix: TF-IDF terms first, numeric features last.
X_combined = hstack([X_tfidf, X_numerical_sparse])

print(f"Shape finale: {X_combined.shape}")

Shape finale: (5169, 5005)


In [11]:
from sklearn.preprocessing import LabelEncoder

# Map the string labels to integer codes; the fitted encoder is kept so predictions can
# be decoded back to label names later.
le = LabelEncoder()
le.fit(df["label"])
y = le.transform(df["label"])

print(f"Classes: {le.classes_}")
print(f"Distribution: {np.bincount(y)}")

Classes: ['ham' 'spam']
Distribution: [4516  653]


In [12]:
from sklearn.model_selection import train_test_split

# Hold-out split stratified on y, with a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=TEST_SIZE, random_state=42, stratify=y)

print(f"Train: {X_train.shape[0]}, Test: {X_test.shape[0]}")

# Distinct label codes present in the training split. numpy is already imported as np at
# the top of the notebook, so it is not re-imported here; the per-class counts that were
# computed but never used have been dropped.
classes = np.unique(y_train)

print("Classes :", classes)

Train: 4135, Test: 1034
Classes : [0 1]


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Baseline model: logistic regression with only the iteration cap raised.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Metrics on the held-out split, reported with the original label names.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr, target_names=le.classes_)

print("=== Logistic Regression ===")
print(f"Accuracy: {lr_accuracy:.4f}")
print(lr_report)

=== Logistic Regression ===
Accuracy: 0.9574
              precision    recall  f1-score   support

         ham       0.96      0.99      0.98       903
        spam       0.91      0.73      0.81       131

    accuracy                           0.96      1034
   macro avg       0.94      0.86      0.89      1034
weighted avg       0.96      0.96      0.96      1034



In [15]:
from xgboost import XGBClassifier

# Gradient-boosted trees as a second candidate model.
# `use_label_encoder` is no longer passed: the recorded run of this cell warned
# 'Parameters: { "use_label_encoder" } are not used.', so the argument had no effect.
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    eval_metric="logloss",
)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

print("=== XGBoost ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(classification_report(y_test, y_pred_xgb, target_names=le.classes_))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== XGBoost ===
Accuracy: 0.9720
              precision    recall  f1-score   support

         ham       0.98      0.99      0.98       903
        spam       0.92      0.85      0.89       131

    accuracy                           0.97      1034
   macro avg       0.95      0.92      0.93      1034
weighted avg       0.97      0.97      0.97      1034



In [16]:
from sklearn.model_selection import GridSearchCV

# Search space: four regularisation strengths x two penalties, all with liblinear.
param_grid = {
    "C": [0.1, 0.5, 1.0, 5.0],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"],
}

# 5-fold cross-validated search scored with "f1", using all available cores.
base_estimator = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(
    base_estimator,
    param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(f"Best params: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Estimator refitted with the best parameter combination; used by the cells below.
best_model = grid_search.best_estimator_

Best params: {'C': 5.0, 'penalty': 'l2', 'solver': 'liblinear'}
Best CV score: 0.8859




In [21]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Hard predictions plus the probability column at index 1 from the tuned model.
y_pred_final = best_model.predict(X_test)
y_proba_final = best_model.predict_proba(X_test)[:, 1]

final_accuracy = accuracy_score(y_test, y_pred_final)
final_roc_auc = roc_auc_score(y_test, y_proba_final)
print(f"Accuracy: {final_accuracy:.4f}")
print(f"ROC-AUC: {final_roc_auc:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_final))

print("Classification Report:")
print(classification_report(y_test, y_pred_final, target_names=le.classes_))

Accuracy: 0.9700
ROC-AUC: 0.9899

Confusion Matrix:
[[898   5]
 [ 26 105]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.97      0.99      0.98       903
        spam       0.95      0.80      0.87       131

    accuracy                           0.97      1034
   macro avg       0.96      0.90      0.93      1034
weighted avg       0.97      0.97      0.97      1034



In [22]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Curve data computed from the tuned model's probabilities on the test split.
fpr, tpr, _ = roc_curve(y_test, y_proba_final)
auc_score = roc_auc_score(y_test, y_proba_final)
precision, recall, _ = precision_recall_curve(y_test, y_proba_final)

fig = make_subplots(rows=1, cols=2, subplot_titles=("ROC Curve", "Precision-Recall Curve"))

# One (trace, x-axis title, y-axis title) entry per subplot column, left to right.
panels = [
    (
        go.Scatter(x=fpr, y=tpr, name=f"ROC (AUC = {auc_score:.3f})", mode="lines"),
        "False Positive Rate",
        "True Positive Rate",
    ),
    (
        go.Scatter(x=recall, y=precision, name="Precision-Recall", mode="lines"),
        "Recall",
        "Precision",
    ),
]
for col, (trace, x_title, y_title) in enumerate(panels, start=1):
    fig.add_trace(trace, row=1, col=col)
    fig.update_xaxes(title_text=x_title, row=1, col=col)
    fig.update_yaxes(title_text=y_title, row=1, col=col)

fig.update_layout(height=500, width=1000, title_text="Model Evaluation Metrics", showlegend=True)

# NOTE(review): the recorded output of this cell is a ValueError stating that mime type
# rendering requires nbformat>=4.2.0, so the figure was not rendered in the saved run.
fig.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [23]:
# Names in the same order as the model's input columns: TF-IDF terms, then numeric features.
feature_names = [*vectorizer.get_feature_names_out().tolist(), *numerical_features]

# Coefficient row of the tuned model, ranked once in ascending order.
coefs = best_model.coef_[0]
ranking = np.argsort(coefs)
top_negative = ranking[:10]
top_positive = ranking[-10:]

print("Top 10 features SPAM:")
for idx in top_positive[::-1]:  # largest coefficient first
    print(f"  {feature_names[idx]}: {coefs[idx]:.4f}")

print("Top 10 features HAM:")
for idx in top_negative:  # most negative coefficient first
    print(f"  {feature_names[idx]}: {coefs[idx]:.4f}")

Top 10 features SPAM:
  txt: 6.3152
  text: 5.5486
  mobile: 4.8120
  won: 4.6221
  reply: 4.4612
  claim: 4.3529
  stop: 3.9335
  new: 3.9217
  prize: 3.9010
  chat: 3.4649
Top 10 features HAM:
  ltgt: -7.0832
  happy: -4.3668
  word_count: -2.6398
  amp: -2.3746
  im: -2.3254
  Â½Ã¯: -2.2796
  ill: -2.2202
  went: -1.8840
  mail: -1.8190
  day: -1.8123


In [24]:
import os
import pickle

os.makedirs("models", exist_ok=True)

# Every fitted object that predict_spam relies on, keyed by its destination file.
artifacts = {
    "models/model.pkl": best_model,
    "models/vectorizer.pkl": vectorizer,
    "models/scaler.pkl": scaler,
    "models/label_encoder.pkl": le,
}
for path, obj in artifacts.items():
    with open(path, "wb") as f:
        pickle.dump(obj, f)

print("OK")

OK


In [25]:
def _clean_message(text: str) -> str:
    """Normalise one message with the same steps as the training-time cleaning cell."""
    text_clean = text.lower()
    text_clean = re.sub(r"http\S+|www\S+|https\S+", "", text_clean)
    text_clean = re.sub(r"\S+@\S+", "", text_clean)
    text_clean = re.sub(r"\d+", "", text_clean)
    text_clean = text_clean.translate(str.maketrans("", "", string.punctuation))
    return " ".join(text_clean.split())


def _numeric_features(text: str) -> list[float]:
    """Return the hand-crafted features of the raw text, in `numerical_features` order."""
    text_length = len(text)
    word_count = len(text.split())
    # Both ratios fall back to 0 when their denominator is 0 (empty / whitespace-only text).
    avg_word_length = text_length / word_count if word_count > 0 else 0
    caps_count = sum(1 for c in text if c.isupper())
    caps_ratio = caps_count / text_length if text_length > 0 else 0
    special_chars = sum(1 for c in text if c in "!?$â‚¬Â£%")
    return [text_length, word_count, avg_word_length, caps_ratio, special_chars]


def predict_spam(text: str) -> tuple[str, float]:
    """Classify a single raw message with the fitted notebook objects.

    Relies on the notebook-level `vectorizer`, `scaler`, `best_model` and `le` having
    been fitted by the earlier cells.

    Args:
        text: The raw, uncleaned message.

    Returns:
        A ``(label, confidence)`` pair: the label decoded through the label encoder and
        the highest predicted class probability expressed as a percentage.
    """
    # TF-IDF is computed on the cleaned text, numeric features on the raw text, as in training.
    X_tfidf = vectorizer.transform([_clean_message(text)])
    X_numerical_scaled = scaler.transform([_numeric_features(text)])

    # Same column layout as the training matrix: TF-IDF block first, numeric block last.
    X_combined = hstack([X_tfidf, csr_matrix(X_numerical_scaled)])

    pred = best_model.predict(X_combined)[0]
    proba = best_model.predict_proba(X_combined)[0]

    label = le.inverse_transform([pred])[0]
    confidence = max(proba) * 100

    # Convert the numpy scalars to the plain Python types promised by the annotation.
    return str(label), float(confidence)


# Hand-written messages used to try predict_spam on text outside the dataset.
test_messages = [
    "FREE!!! You have won a $1000 Walmart gift card! Click here to claim NOW!!!",
    "Hey, are we still meeting for coffee tomorrow at 3pm?",
    "URGENT: Your account will be suspended. Verify your details immediately.",
    "Thanks for your help with the project yesterday. Really appreciated it!",
    "Congratulations! You've been selected for a FREE iPhone 15! Call 0800-123-456",
    "Can you send me the meeting notes when you get a chance?",
]

# One line per message: marker, padded label, confidence and the first 50 characters.
for msg in test_messages:
    label, confidence = predict_spam(msg)
    if label == "spam":
        emoji = "ðŸš«"
    else:
        emoji = "âœ…"
    preview = msg[:50]
    print(f"{emoji} [{label:4}] ({confidence:.1f}%) {preview}...")

ðŸš« [spam] (93.9%) FREE!!! You have won a $1000 Walmart gift card! Cl...
âœ… [ham ] (99.1%) Hey, are we still meeting for coffee tomorrow at 3...
âœ… [ham ] (66.6%) URGENT: Your account will be suspended. Verify you...
âœ… [ham ] (94.2%) Thanks for your help with the project yesterday. R...
ðŸš« [spam] (59.5%) Congratulations! You've been selected for a FREE i...
âœ… [ham ] (96.7%) Can you send me the meeting notes when you get a c...


In [26]:
# # Old preprocessing code - do not use
# def old_preprocess(text):
#     text = text.lower()
#     text = re.sub(r'[^a-z\s]', '', text)
#     return text

# # SVM experiment - too slow, abandoned
# from sklearn.svm import SVC
# svm_model = SVC(kernel='rbf', probability=True)
# svm_model.fit(X_train, y_train)

# # Experiment with CountVectorizer instead of TF-IDF
# from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer(max_features=5000)
# X_cv = cv.fit_transform(df['text_clean'])