In [7]:
import re
import pickle
import os
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aa2ch\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aa2ch\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\aa2ch\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Remote CSV with the labelled URLs; read with pd.read_csv in the loading cell.
DATA_URL = "https://breathecode.herokuapp.com/asset/internal-link?id=932&path=url_spam.csv"
# Destination file for the pickled best estimator (written in the last cell).
MODEL_PATH = "models/svm_url_spam.pkl"

# English stopword list from NLTK, held as a set for fast membership tests
# inside url_tokenizer.
STOPWORDS = set(stopwords.words("english"))
# Single WordNet lemmatizer instance reused by every url_tokenizer call.
LEMMATIZER = WordNetLemmatizer()

def url_tokenizer(url: str):
    """Split a URL into lowercase, lemmatized alphanumeric tokens.

    The input is coerced to ``str`` and lowercased, then cut on every run of
    characters outside ``a-z0-9``. Fragments shorter than two characters and
    fragments present in ``STOPWORDS`` are discarded; each remaining fragment
    is passed through ``LEMMATIZER.lemmatize``.

    Args:
        url: The URL (or any value; it is converted with ``str``).

    Returns:
        A list of tokens, in the order they appear in the URL.
    """
    fragments = re.split(r"[^a-z0-9]+", str(url).lower())
    return [
        LEMMATIZER.lemmatize(fragment)
        for fragment in fragments
        if len(fragment) >= 2 and fragment not in STOPWORDS
    ]


In [3]:
# Load the labelled URL dataset.
# (The original cell had a bare `df.head()` here. Only the last expression of
# a cell is rendered, so it displayed nothing; to preview the frame, run
# `df.head()` as the final line of its own cell.)
df = pd.read_csv(DATA_URL)

# Prefer the expected column names; otherwise fall back to the first column
# for the URL and the last column for the label.
url_col = "url" if "url" in df.columns else df.columns[0]
y_col = "is_spam" if "is_spam" in df.columns else df.columns[-1]

# Drop incomplete rows before casting: `astype(str)` can turn a missing URL
# into the literal token "nan", and `astype(int)` raises on a missing label.
# When the file has no missing values this is a no-op.
df_valid = df.dropna(subset=[url_col, y_col])

X = df_valid[url_col].astype(str)
y = df_valid[y_col].astype(int)

# Stratified 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [4]:
# Baseline model: TF-IDF over url_tokenizer output (unigrams + bigrams,
# ignoring terms seen in fewer than 2 URLs) feeding a linear SVM.
base_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        tokenizer=url_tokenizer,
        # token_pattern is unused when a custom tokenizer is supplied;
        # None makes that explicit and avoids the sklearn warning.
        token_pattern=None,
        ngram_range=(1, 2),
        min_df=2
    )),
    # Seed the solver: LinearSVC draws random numbers while optimising, so
    # without a fixed random_state the fitted weights (and therefore the
    # metrics below) are not guaranteed to be identical between runs.
    ("svm", LinearSVC(random_state=42))
])

base_pipeline.fit(X_train, y_train)
y_pred = base_pipeline.predict(X_test)

# Held-out evaluation of the untuned baseline.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))


[[452   9]
 [ 15 124]]
              precision    recall  f1-score   support

           0     0.9679    0.9805    0.9741       461
           1     0.9323    0.8921    0.9118       139

    accuracy                         0.9600       600
   macro avg     0.9501    0.9363    0.9430       600
weighted avg     0.9596    0.9600    0.9597       600



In [5]:
# Search space: 2 n-gram ranges x 3 min_df x 4 C x 2 losses x 2 class
# weightings = 96 candidate configurations of the baseline pipeline.
param_grid = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "tfidf__min_df": [1, 2, 5],
    "svm__C": [0.1, 1, 3, 10],
    "svm__loss": ["hinge", "squared_hinge"],
    "svm__class_weight": [None, "balanced"],
}

# 5-fold cross-validated search on the training split, ranked by F1,
# using every available core.
grid = GridSearchCV(
    estimator=base_pipeline,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# fit() returns the search object itself, so the refit winner can be taken
# directly from the call.
best_model = grid.fit(X_train, y_train).best_estimator_
print(grid.best_params_)

# Score the tuned pipeline on the same held-out split as the baseline.
y_pred_best = best_model.predict(X_test)
for report in (
    confusion_matrix(y_test, y_pred_best),
    classification_report(y_test, y_pred_best, digits=4),
):
    print(report)


Fitting 5 folds for each of 96 candidates, totalling 480 fits
{'svm__C': 3, 'svm__class_weight': None, 'svm__loss': 'squared_hinge', 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
[[451  10]
 [ 13 126]]
              precision    recall  f1-score   support

           0     0.9720    0.9783    0.9751       461
           1     0.9265    0.9065    0.9164       139

    accuracy                         0.9617       600
   macro avg     0.9492    0.9424    0.9457       600
weighted avg     0.9614    0.9617    0.9615       600



In [8]:
# Create the directory MODEL_PATH actually points into, instead of a
# separately hard-coded "models" that would silently go stale if MODEL_PATH
# were changed.
model_dir = os.path.dirname(MODEL_PATH)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# NOTE: the pipeline's TfidfVectorizer holds a reference to `url_tokenizer`,
# and pickle stores functions by qualified name, not by value. Any process
# that loads this file must have `url_tokenizer` defined under the same
# module path it had when this cell ran. Only unpickle files you trust.
with open(MODEL_PATH, "wb") as f:
    pickle.dump(best_model, f)

# Smoke test on a single hand-written URL (label 1 is reported as "spam").
test_url = "http://free-gift-cards-now.com/win"
label = "spam" if best_model.predict([test_url])[0] == 1 else "no spam"
print(test_url, "=>", label)


http://free-gift-cards-now.com/win => no spam
