In [22]:
!pip install scikit-learn




In [36]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the NLTK data packages this notebook requests, in a fixed order.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Load the raw dataset (relative path, resolved against the current working directory).
df = pd.read_csv("spam.csv")

# Choose the message column: prefer a column literally named 'text'; otherwise
# fall back to the object-dtype column whose values are longest on average.
if 'text' in df.columns:
    TEXT_COL = 'text'
else:
    object_cols = [c for c in df.columns if df[c].dtype == 'object']
    TEXT_COL = max(object_cols, key=lambda c: df[c].astype(str).str.len().mean())

# Fill missing values BEFORE casting to str. Casting first turns NaN into the
# literal string "nan" on object-backed columns, after which fillna("") has
# nothing left to fill and "nan" would be treated as a real message.
texts = df[TEXT_COL].fillna("").astype(str)

def basic_clean(t):
    """Return a lower-cased, letters-only version of the string ``t``.

    Steps, applied in order:
      1. lower-case the input;
      2. replace URL-like tokens (``http...`` or ``www. ...``) with a space;
      3. replace tokens containing ``@`` between non-space runs with a space;
      4. replace every character that is not ``a``-``z`` or whitespace with a space;
      5. collapse whitespace runs into single spaces and strip both ends.
    """
    cleaned = t.lower()
    # Order matters: URLs and e-mail-like tokens must be removed while their
    # punctuation is still present, i.e. before the letters-only pass.
    for pattern in (r'http\S+|www\.\S+', r'\S+@\S+', r'[^a-z\s]'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return re.sub(r'\s+', ' ', cleaned).strip()

# Stage 1: regex-based normalisation of every message.
clean_basic = texts.apply(basic_clean)

# Stage 2: split each cleaned message into word tokens.
tokens = clean_basic.apply(word_tokenize)

# Stage 3: drop English stop words and single-character tokens.
sw = set(stopwords.words('english'))


def _drop_stopwords(words):
    """Keep tokens that are longer than one character and not in ``sw``."""
    return [w for w in words if len(w) > 1 and w not in sw]


tokens_nostop = tokens.apply(_drop_stopwords)

# Stage 4a: Porter-stemmed variant, re-joined into space-separated strings.
ps = PorterStemmer()
tokens_stem = tokens_nostop.apply(lambda words: list(map(ps.stem, words)))
clean_text_stem = tokens_stem.apply(" ".join)

# Stage 4b: WordNet-lemmatised variant, re-joined the same way.
wnl = WordNetLemmatizer()
tokens_lemma = tokens_nostop.apply(lambda words: list(map(wnl.lemmatize, words)))
clean_text_lemma = tokens_lemma.apply(" ".join)

# The lemmatised text is the variant stored on the frame.
df['clean_text'] = clean_text_lemma


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [37]:
# NOTE: this replaces the lemmatised `clean_text` assigned by the preprocessing
# cell. Only lower-casing and removal of non-letter characters are applied here,
# so stop words stay in the text.
# (The original cell ran this assignment twice; the duplicate was redundant.)
df['clean_text'] = texts.apply(lambda t: re.sub(r'[^a-zA-Z\s]', '', t.lower()))

# Keep only rows whose cleaned text has at least one non-whitespace character.
# An explicit copy makes the filtered frame independent of the original, so that
# later column assignments do not act on a slice.
df = df[df['clean_text'].str.strip().astype(bool)].copy()


In [38]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Drop rows where clean_text is empty or only spaces. The filter is idempotent,
# so it is harmless if the rows were already removed earlier.
# (The former `replace('', 'emptytext')` fallback was removed: after this filter
# no empty string can remain, so it could never match.)
df = df[df['clean_text'].str.strip().astype(bool)]

# Vocabulary size cap shared by both vectorizers.
MAX_FEATURES = 3000

# Bag-of-words counts.
cv = CountVectorizer(max_features=MAX_FEATURES)
X_bow = cv.fit_transform(df['clean_text'])

# TF-IDF weights over the same text.
tfidf = TfidfVectorizer(max_features=MAX_FEATURES)
X_tfidf = tfidf.fit_transform(df['clean_text'])

print("Bag of Words shape:", X_bow.shape)
print("TF-IDF shape:", X_tfidf.shape)


Bag of Words shape: (5569, 3000)
TF-IDF shape: (5569, 3000)


In [39]:
# Show the column index followed by the first rows, one after the other.
print(df.columns, df.head(), sep="\n")


Index(['Category', 'Message', 'clean_text'], dtype='object')
  Category                                            Message  \
0      ham  Go until jurong point, crazy.. Available only ...   
1      ham                      Ok lar... Joking wif u oni...   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3      ham  U dun say so early hor... U c already then say...   
4      ham  Nah I don't think he goes to usf, he lives aro...   

                                          clean_text  
0  go until jurong point crazy available only in ...  
1                            ok lar joking wif u oni  
2  free entry in  a wkly comp to win fa cup final...  
3        u dun say so early hor u c already then say  
4  nah i dont think he goes to usf he lives aroun...  


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# Features are the TF-IDF matrix; labels come from the Category column.
# NOTE(review): X_tfidf was produced by fitting the vectorizer on every row before
# this split, so test rows contributed to the vocabulary/IDF weights. Consider
# fitting the vectorizer on the training split only - confirm before changing.
X = X_tfidf
y = df['Category']   # use Category as label

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def _fit_and_report(heading, estimator):
    """Fit ``estimator`` on the training split, print its test accuracy, return it."""
    estimator.fit(X_train, y_train)
    print(heading, accuracy_score(y_test, estimator.predict(X_test)))
    return estimator


nb = _fit_and_report("Naive Bayes Accuracy:", MultinomialNB())
lr = _fit_and_report("Logistic Regression Accuracy:", LogisticRegression(max_iter=1000))
svm = _fit_and_report("SVM Accuracy:", LinearSVC())


Naive Bayes Accuracy: 0.9748653500897666
Logistic Regression Accuracy: 0.9721723518850988
SVM Accuracy: 0.9874326750448833


In [42]:
# Evaluation
from sklearn.metrics import classification_report, confusion_matrix

# Fitted classifiers keyed by the display name used in the printed report.
models = {"Naive Bayes": nb, "Logistic Regression": lr, "SVM": svm}

# For each classifier: predict on the held-out split, then print accuracy,
# the per-class report and the confusion matrix, in that order.
for name in models:
    model = models[name]
    y_pred = model.predict(X_test)
    print(f"\n{name} Results:")
    accuracy_value = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy_value)
    report_text = classification_report(y_test, y_pred)
    print("Classification Report:\n", report_text)
    matrix_values = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", matrix_values)



Naive Bayes Results:
Accuracy: 0.9748653500897666
Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.99       955
        spam       1.00      0.82      0.90       159

    accuracy                           0.97      1114
   macro avg       0.99      0.91      0.94      1114
weighted avg       0.98      0.97      0.97      1114

Confusion Matrix:
 [[955   0]
 [ 28 131]]

Logistic Regression Results:
Accuracy: 0.9721723518850988
Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98       955
        spam       0.98      0.82      0.89       159

    accuracy                           0.97      1114
   macro avg       0.98      0.91      0.94      1114
weighted avg       0.97      0.97      0.97      1114

Confusion Matrix:
 [[953   2]
 [ 29 130]]

SVM Results:
Accuracy: 0.9874326750448833
Classification Report:
               precision    recall  f1