In [1]:
import pandas as pd
import numpy as np
import sklearn
import nltk

In [2]:
df=pd.read_csv("spam.csv",encoding='ISO-8859-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [5]:
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],inplace=True)

In [6]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
df.rename(columns={"v1":"spam","v2":"label"},inplace=True)

In [8]:
df.head()

Unnamed: 0,spam,label
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
df["spam"]=df["spam"].map({"ham":0,"spam":1})

In [12]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [13]:
lemmatizer=WordNetLemmatizer()

In [14]:
#lower,tokenize, lemmatize, stopwords

In [15]:
english_sw=stopwords.words("english")

In [16]:
def text_preprocessing(text):
    text=text.lower()
    words=word_tokenize(text)
    words=[lemmatizer.lemmatize(word,pos="v") for word in words if word not in set(english_sw)]
    return " ".join(words)

In [17]:
df.head()

Unnamed: 0,spam,label
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
df["label"]=df["label"].apply(text_preprocessing)

In [19]:
df.head()

Unnamed: 0,spam,label
0,0,"go jurong point , crazy .. available bugis n g..."
1,0,ok lar ... joke wif u oni ...
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun say early hor ... u c already say ...
4,0,"nah n't think go usf , live around though"


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
vectorizer=TfidfVectorizer(max_features=400,binary=True,ngram_range=(1,3))

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df["label"], df["spam"], test_size=0.33, random_state=42)

In [23]:
X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)

In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [25]:
log_m=LogisticRegression(max_iter=200)
mnb=MultinomialNB()

In [26]:
log_m.fit(X_train,y_train)
mnb.fit(X_train,y_train)

In [27]:
log_pred=log_m.predict(X_test)
mnb_pred=mnb.predict(X_test)

In [28]:
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix

In [29]:
print(f"Logistic Model Accuracy Score {accuracy_score(log_pred,y_test)}")
print(classification_report(log_pred,y_test))

Logistic Model Accuracy Score 0.9679173463839043
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1626
           1       0.81      0.95      0.87       213

    accuracy                           0.97      1839
   macro avg       0.90      0.96      0.93      1839
weighted avg       0.97      0.97      0.97      1839



In [30]:
print(f"Multinomial Bias Accuracy Score {accuracy_score(mnb_pred,y_test)}")
print(classification_report(mnb_pred,y_test))

Multinomial Bias Accuracy Score 0.9700924415443176
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1618
           1       0.83      0.95      0.88       221

    accuracy                           0.97      1839
   macro avg       0.91      0.96      0.93      1839
weighted avg       0.97      0.97      0.97      1839



In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

parameters = {'alpha': [0.1, 0.5, 1.0, 2.0]}
mnb_model = GridSearchCV(MultinomialNB(), parameters, cv=5,scoring="accuracy")
mnb_model.fit(X_train, y_train)

best_model = mnb_model.best_estimator_
mnb_predicted=mnb_model.predict(X_test)
print(f"MNB model accuracy score {accuracy_score(mnb_predicted,y_test)}")

MNB model accuracy score 0.9711799891245242


In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['liblinear']  
}

log_m = GridSearchCV(LogisticRegression(), param_grid_lr, cv=5, scoring='accuracy')
log_m.fit(X_train, y_train)

print(f'Best parameters for Logistic Regression: {log_m.best_params_}')
print(f'Best cross-validation score: {log_m.best_score_}')
print(f"Logistic Model accuracy score {accuracy_score(log_m.predict(X_test),y_test)}")

Best parameters for Logistic Regression: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Best cross-validation score: 0.982855102267874
Logistic Model accuracy score 0.9728113104948342


In [38]:
from sklearn.svm import SVC

param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']  
}

grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv=5, scoring='accuracy')
grid_search_svm.fit(X_train, y_train)

best_svm = grid_search_svm.best_estimator_
print(f'Best parameters for SVM: {grid_search_svm.best_params_}')
print(f'Best cross-validation score: {grid_search_svm.best_score_}')
print(f"SVC model accuracy score {accuracy_score(grid_search_svm.predict(X_test),y_test)}")

Best parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best cross-validation score: 0.9825870057531286
SVC model accuracy score 0.9766177270255574


In [41]:
new=pd.DataFrame({"Model":"Multinomial_Bias Logistic SVC".split(" "),"Accuracy":[accuracy_score(mnb_model.predict(X_test),y_test),accuracy_score(log_m.predict(X_test),y_test),accuracy_score(grid_search_svm.predict(X_test),y_test)]})

In [44]:
new

Unnamed: 0,Model,Accuracy
0,Multinomial_Bias,0.97118
1,Logistic,0.972811
2,SVC,0.976618


In [45]:
new["accuracy"]=["0.9700924415443176","0.9679173463839043",np.nan]

In [54]:
new

Unnamed: 0,Model,Improved_Accuracy,Accuracy
0,Multinomial_Bias,0.971,0.9700924415443176
1,Logistic,0.973,0.9679173463839044
2,SVC,0.977,


In [52]:
new.rename(columns={"Accuracy":"Improved_Accuracy","accuracy":"Accuracy"},inplace=True)

In [53]:
new.columns

Index(['Model', 'Improved_Accuracy', 'Accuracy'], dtype='object')

In [59]:
new["Accuracy"]=new[["Accuracy"]].round(3)

In [61]:
new

Unnamed: 0,Model,Improved_Accuracy,Accuracy
0,Multinomial_Bias,0.971,0.9700924415443176
1,Logistic,0.973,0.9679173463839044
2,SVC,0.977,
