In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [13]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [14]:
# Read the data set from a CSV file located relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="email_spam_cleaned.csv")

In [15]:
# Preview the first five rows. Ending the cell with the bare expression lets the
# notebook render the DataFrame with its rich display instead of the plain-text
# dump that print() produces (which wraps wide frames across several blocks).
df.head()

  label                                            message  char_count  \
0  safe  Go until jurong point, crazy.. Available only ...         111   
1  safe                      Ok lar... Joking wif u oni...          29   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...         155   
3  safe  U dun say so early hor... U c already then say...          49   
4  safe  Nah I don't think he goes to usf, he lives aro...          61   

   word_count                                            cleaned  
0          20  go jurong point crazi avail bugi n great world...  
1           6                              ok lar joke wif u oni  
2          28  free entri wkli comp win fa cup final tkt st m...  
3          11                u dun say earli hor u c alreadi say  
4          13          nah dont think goe usf live around though  


In [16]:
# Count the rows per label to see how balanced the classes are.
label_counts = df["label"].value_counts()
print("\nClass Distribution:\n", label_counts)


Class Distribution:
 label
safe    4825
spam     747
Name: count, dtype: int64


In [17]:


# Keep only rows that have both a cleaned message and a label.
# dropna() already guarantees that 'cleaned' contains no NaN afterwards, so the
# former follow-up `df['cleaned'] = df['cleaned'].fillna("")` was a no-op; it was
# also a column assignment into a frame derived from another frame, which is the
# pattern that can trigger pandas' SettingWithCopyWarning. It has been removed.
df = df.dropna(subset=['cleaned', 'label'])

# Features are the cleaned message text; the target is the label column.
X = df['cleaned']
y = df['label']

# Stratified 80/20 split: stratify=y keeps the label proportions the same in the
# train and test partitions, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# TF-IDF over unigrams and bigrams, capped at 3000 features.
# The vectorizer is fitted on the training split only and then applied to the
# test split, so the test messages do not influence the vocabulary or weights.
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [18]:
# Multinomial Naive Bayes on the TF-IDF features (smoothing parameter alpha=0.1).
print("multinomial naive-bayes")
nb = MultinomialNB(alpha=0.1).fit(X_train_tfidf, y_train)  # fit() returns the estimator itself
nb_preds = nb.predict(X_test_tfidf)

# Overall accuracy followed by per-class precision / recall / F1 on the held-out split.
print("accuracy:", accuracy_score(y_true=y_test, y_pred=nb_preds))
print(classification_report(y_true=y_test, y_pred=nb_preds))

multinomial naive-bayes
accuracy: 0.9784560143626571
              precision    recall  f1-score   support

        safe       0.98      0.99      0.99       964
        spam       0.96      0.87      0.92       150

    accuracy                           0.98      1114
   macro avg       0.97      0.93      0.95      1114
weighted avg       0.98      0.98      0.98      1114



In [19]:
# Logistic regression with class_weight='balanced' so both labels carry equal
# total weight despite the class imbalance in the data.
# random_state is fixed (same seed as the train/test split): scikit-learn
# documents that the liblinear solver uses the random number generator, so
# without a seed repeated runs are not guaranteed to give identical results.
print("logistic regression")
lr = LogisticRegression(
    max_iter=1000, class_weight='balanced', solver='liblinear', random_state=42
)
lr.fit(X_train_tfidf, y_train)
lr_preds = lr.predict(X_test_tfidf)

# Overall accuracy followed by per-class precision / recall / F1 on the held-out split.
print("accuracy:", accuracy_score(y_test, lr_preds))
print(classification_report(y_test, lr_preds))

logistic regression
accuracy: 0.973967684021544
              precision    recall  f1-score   support

        safe       0.99      0.98      0.98       964
        spam       0.89      0.92      0.90       150

    accuracy                           0.97      1114
   macro avg       0.94      0.95      0.94      1114
weighted avg       0.97      0.97      0.97      1114



In [20]:
# Linear support vector classifier with balanced class weights.
# random_state is fixed (same seed as the train/test split) because the dual
# coordinate-descent solver shuffles the data; a seed keeps reruns reproducible.
print("linear SVM")
svm = LinearSVC(class_weight='balanced', max_iter=1000, random_state=42)
svm.fit(X_train_tfidf, y_train)
svm_preds = svm.predict(X_test_tfidf)

# Overall accuracy followed by per-class precision / recall / F1 on the held-out split.
print("accuracy:", accuracy_score(y_test, svm_preds))
print(classification_report(y_test, svm_preds))

linear SVM
accuracy: 0.9775583482944344
              precision    recall  f1-score   support

        safe       0.99      0.99      0.99       964
        spam       0.93      0.91      0.92       150

    accuracy                           0.98      1114
   macro avg       0.96      0.95      0.95      1114
weighted avg       0.98      0.98      0.98      1114





In [21]:
# Ridge classifier with balanced class weights, trained on the TF-IDF features.
print("ridge classifier")
ridge = RidgeClassifier(class_weight='balanced').fit(X_train_tfidf, y_train)  # fit() returns the estimator
ridge_preds = ridge.predict(X_test_tfidf)

# Overall accuracy followed by per-class precision / recall / F1 on the held-out split.
print("accuracy:", accuracy_score(y_true=y_test, y_pred=ridge_preds))
print(classification_report(y_true=y_test, y_pred=ridge_preds))

ridge classifier
accuracy: 0.9793536804308797
              precision    recall  f1-score   support

        safe       0.99      0.99      0.99       964
        spam       0.93      0.91      0.92       150

    accuracy                           0.98      1114
   macro avg       0.96      0.95      0.96      1114
weighted avg       0.98      0.98      0.98      1114



In [22]:
# Passive-aggressive linear classifier with balanced class weights.
# The training data is shuffled after every epoch by default, so the result
# depends on the random number generator; random_state is fixed (same seed as
# the train/test split) to make the reported metrics reproducible.
print("passive aggressive classifier")
pa = PassiveAggressiveClassifier(max_iter=1000, class_weight='balanced', random_state=42)
pa.fit(X_train_tfidf, y_train)
pa_preds = pa.predict(X_test_tfidf)

# Overall accuracy followed by per-class precision / recall / F1 on the held-out split.
print("accuracy:", accuracy_score(y_test, pa_preds))
print(classification_report(y_test, pa_preds))

passive aggressive classifier
accuracy: 0.9766606822262118
              precision    recall  f1-score   support

        safe       0.98      0.99      0.99       964
        spam       0.92      0.90      0.91       150

    accuracy                           0.98      1114
   macro avg       0.95      0.94      0.95      1114
weighted avg       0.98      0.98      0.98      1114



In [23]:
# Single decision tree, depth-limited to 30 levels, with balanced class weights.
# Features are randomly permuted at each split, so ties between equally good
# splits are broken at random; random_state is fixed (same seed as the
# train/test split) so the fitted tree and its metrics are reproducible.
print("decision tree classifier")
dt = DecisionTreeClassifier(max_depth=30, class_weight='balanced', random_state=42)
dt.fit(X_train_tfidf, y_train)
dt_preds = dt.predict(X_test_tfidf)

# Overall accuracy followed by per-class precision / recall / F1 on the held-out split.
print("accuracy:", accuracy_score(y_test, dt_preds))
print(classification_report(y_test, dt_preds))

decision tree classifier
accuracy: 0.9560143626570916
              precision    recall  f1-score   support

        safe       0.98      0.97      0.97       964
        spam       0.83      0.85      0.84       150

    accuracy                           0.96      1114
   macro avg       0.90      0.91      0.91      1114
weighted avg       0.96      0.96      0.96      1114



In [24]:
# Random forest of 100 depth-limited trees with balanced class weights.
# Bootstrap sampling and per-split feature sampling are random; random_state is
# fixed (same seed as the train/test split) so reruns give the same forest.
print("random forest classifier")
rf = RandomForestClassifier(
    n_estimators=100, max_depth=30, class_weight='balanced', random_state=42
)
rf.fit(X_train_tfidf, y_train)
rf_preds = rf.predict(X_test_tfidf)

# Overall accuracy followed by per-class precision / recall / F1 on the held-out split.
print("accuracy:", accuracy_score(y_test, rf_preds))
print(classification_report(y_test, rf_preds))

random forest classifier
accuracy: 0.9766606822262118
              precision    recall  f1-score   support

        safe       0.98      1.00      0.99       964
        spam       0.97      0.85      0.91       150

    accuracy                           0.98      1114
   macro avg       0.97      0.92      0.95      1114
weighted avg       0.98      0.98      0.98      1114



In [25]:
# Extremely randomized trees: 100 depth-limited trees with balanced class weights.
# Split thresholds and candidate features are drawn at random; random_state is
# fixed (same seed as the train/test split) so reruns give the same ensemble.
print("extra trees classifier")
et = ExtraTreesClassifier(
    n_estimators=100, max_depth=30, class_weight='balanced', random_state=42
)
et.fit(X_train_tfidf, y_train)
et_preds = et.predict(X_test_tfidf)

# Overall accuracy followed by per-class precision / recall / F1 on the held-out split.
print("accuracy:", accuracy_score(y_test, et_preds))
print(classification_report(y_test, et_preds))

extra trees classifier
accuracy: 0.9748653500897666
              precision    recall  f1-score   support

        safe       0.97      1.00      0.99       964
        spam       0.98      0.83      0.90       150

    accuracy                           0.97      1114
   macro avg       0.98      0.92      0.94      1114
weighted avg       0.97      0.97      0.97      1114



In [26]:
# Gradient boosting with 100 shallow (depth-3) trees and learning rate 0.1.
# No class weighting is passed here, unlike the linear and tree models above.
# random_state seeds the individual trees and the feature permutation at each
# split; it is fixed (same seed as the train/test split) for reproducible metrics.
print("gradient boosting classifier")
gb = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
)
gb.fit(X_train_tfidf, y_train)
gb_preds = gb.predict(X_test_tfidf)

# Overall accuracy followed by per-class precision / recall / F1 on the held-out split.
print("accuracy:", accuracy_score(y_test, gb_preds))
print(classification_report(y_test, gb_preds))

gradient boosting classifier
accuracy: 0.9596050269299821
              precision    recall  f1-score   support

        safe       0.96      0.99      0.98       964
        spam       0.96      0.73      0.83       150

    accuracy                           0.96      1114
   macro avg       0.96      0.86      0.90      1114
weighted avg       0.96      0.96      0.96      1114



In [27]:
# AdaBoost with 100 boosting rounds and learning rate 0.5 (default base estimator).
# random_state seeds the base estimators at each boosting iteration; it is fixed
# (same seed as the train/test split) so the reported metrics are reproducible.
print("adaboost classifier")
ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.5, random_state=42)
ada.fit(X_train_tfidf, y_train)
ada_preds = ada.predict(X_test_tfidf)

# Overall accuracy followed by per-class precision / recall / F1 on the held-out split.
print("accuracy:", accuracy_score(y_test, ada_preds))
print(classification_report(y_test, ada_preds))

adaboost classifier




accuracy: 0.966786355475763
              precision    recall  f1-score   support

        safe       0.97      0.99      0.98       964
        spam       0.93      0.81      0.87       150

    accuracy                           0.97      1114
   macro avg       0.95      0.90      0.92      1114
weighted avg       0.97      0.97      0.97      1114



In [28]:
# Bagging ensemble of 50 estimators (default base estimator).
# Each estimator is trained on a random resample of the training data;
# random_state is fixed (same seed as the train/test split) so the resampling,
# and therefore the reported metrics, are reproducible.
print("bagging classifier")
bag = BaggingClassifier(n_estimators=50, random_state=42)
bag.fit(X_train_tfidf, y_train)
bag_preds = bag.predict(X_test_tfidf)

# Overall accuracy followed by per-class precision / recall / F1 on the held-out split.
print("accuracy:", accuracy_score(y_test, bag_preds))
print(classification_report(y_test, bag_preds))


bagging classifier
accuracy: 0.9640933572710951
              precision    recall  f1-score   support

        safe       0.97      0.99      0.98       964
        spam       0.91      0.81      0.86       150

    accuracy                           0.96      1114
   macro avg       0.94      0.90      0.92      1114
weighted avg       0.96      0.96      0.96      1114



In [29]:
# k-nearest neighbours: 5 neighbours, votes weighted by inverse distance,
# neighbour search parallelised over all available cores (n_jobs=-1).
print("K-nearest neighbors")
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    n_jobs=-1,
).fit(X_train_tfidf, y_train)  # fit() returns the estimator itself
knn_preds = knn.predict(X_test_tfidf)

# Overall accuracy followed by per-class precision / recall / F1 on the held-out split.
print("accuracy:", accuracy_score(y_true=y_test, y_pred=knn_preds))
print(classification_report(y_true=y_test, y_pred=knn_preds))

K-nearest neighbors
accuracy: 0.9416517055655296
              precision    recall  f1-score   support

        safe       0.94      1.00      0.97       964
        spam       0.99      0.57      0.73       150

    accuracy                           0.94      1114
   macro avg       0.96      0.79      0.85      1114
weighted avg       0.94      0.94      0.93      1114

