In [1]:
import pandas as pd

df = pd.read_csv("spam.csv", encoding="latin-1")
df = df[["v1", "v2"]]
df.columns = ["label", "message"]

X = df["message"]
y = df["label"]


In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [3]:
from features import get_vectorizer

vectorizer = get_vectorizer()

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec  = vectorizer.transform(X_test)


In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_vec, y_train)

y_pred = model_tfidf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9713004484304932


In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score

nb_results = {
    "Model": "Multinomial Naive Bayes",
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred, pos_label="spam"),
    "Recall": recall_score(y_test, y_pred, pos_label="spam"),
    "F1": f1_score(y_test, y_pred, pos_label="spam")
}

nb_results


{'Model': 'Multinomial Naive Bayes',
 'Accuracy': 0.9713004484304932,
 'Precision': 0.9836065573770492,
 'Recall': 0.8,
 'F1': 0.8823529411764706}

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

log_reg = LogisticRegression(max_iter=1000)

log_reg.fit(X_train_vec, y_train)
y_pred_lr = log_reg.predict(X_test_vec)

lr_results = {
    "Model": "Logistic Regression",
    "Accuracy": accuracy_score(y_test, y_pred_lr),
    "Precision": precision_score(y_test, y_pred_lr, pos_label="spam"),
    "Recall": recall_score(y_test, y_pred_lr, pos_label="spam"),
    "F1": f1_score(y_test, y_pred_lr, pos_label="spam")
}

lr_results


{'Model': 'Logistic Regression',
 'Accuracy': 0.9704035874439462,
 'Precision': 0.9915966386554622,
 'Recall': 0.7866666666666666,
 'F1': 0.8773234200743495}

In [11]:
from sklearn.svm import LinearSVC

svm_model = LinearSVC()

svm_model.fit(X_train_vec, y_train)
y_pred_svm = svm_model.predict(X_test_vec)

svm_results = {
    "Model": "Support Vector Machine",
    "Accuracy": accuracy_score(y_test, y_pred_svm),
    "Precision": precision_score(y_test, y_pred_svm, pos_label="spam"),
    "Recall": recall_score(y_test, y_pred_svm, pos_label="spam"),
    "F1": f1_score(y_test, y_pred_svm, pos_label="spam")
}

svm_results


{'Model': 'Support Vector Machine',
 'Accuracy': 0.97847533632287,
 'Precision': 0.9565217391304348,
 'Recall': 0.88,
 'F1': 0.9166666666666666}

In [12]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train_vec, y_train)
y_pred_rf = rf_model.predict(X_test_vec)

rf_results = {
    "Model": "Random Forest",
    "Accuracy": accuracy_score(y_test, y_pred_rf),
    "Precision": precision_score(y_test, y_pred_rf, pos_label="spam"),
    "Recall": recall_score(y_test, y_pred_rf, pos_label="spam"),
    "F1": f1_score(y_test, y_pred_rf, pos_label="spam")
}

rf_results


{'Model': 'Random Forest',
 'Accuracy': 0.9811659192825112,
 'Precision': 0.9923664122137404,
 'Recall': 0.8666666666666667,
 'F1': 0.9252669039145908}

In [6]:
type(vectorizer)
model_tfidf.n_features_in_


1000

In [7]:

model_tfidf.predict(
    vectorizer.transform(["Congratulations! You have won a free prize"])
)


array(['spam'], dtype='<U4')

In [8]:
import pickle

pickle.dump(model_tfidf, open("spam_model.pkl", "wb"))
pickle.dump(vectorizer, open("tfidf_vectorizer.pkl", "wb"))


In [9]:
model_tfidf.n_features_in_


1000

In [14]:
import pandas as pd

results_df = pd.DataFrame([
    nb_results,
    lr_results,
    svm_results,
    rf_results
])

results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Multinomial Naive Bayes,0.9713,0.983607,0.8,0.882353
1,Logistic Regression,0.970404,0.991597,0.786667,0.877323
2,Support Vector Machine,0.978475,0.956522,0.88,0.916667
3,Random Forest,0.981166,0.992366,0.866667,0.925267


In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

cv_scores = cross_val_score(
    MultinomialNB(),
    X_train_vec,
    y_train,
    cv=5,
    scoring="f1_macro"
)

cv_scores.mean()


np.float64(0.9451503553805676)