In [58]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Read the labelled messages (Category / Message columns, per the preview below)
# and show the first five rows.
df = pd.read_csv(filepath_or_buffer="spam.csv")
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [59]:
# Class balance: number of rows per Category value.
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [None]:
def correct_spelling(text):
    """Return ``text`` with TextBlob's spelling correction applied.

    The original cell read ``text`` before it was ever assigned, so it raised
    ``NameError`` on a fresh kernel. Wrapping the logic in a function keeps the
    step available without breaking Restart & Run All.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The ``string`` attribute of the blob returned by
        ``TextBlob(text).correct()``.
    """
    # Imported lazily: this step is optional, so the notebook still runs
    # when textblob is not installed and the helper is never called.
    from textblob import TextBlob

    return TextBlob(text).correct().string

In [60]:
import spacy

# Load the spaCy pipeline once; preprocess() reuses this object for every message.
nlp = spacy.load(name="en_core_web_sm")

def preprocess(text):
    """Reduce ``text`` to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words or punctuation are dropped, and every remaining
    token contributes its ``lemma_``.

    Parameters
    ----------
    text : str
        Message to normalise.

    Returns
    -------
    str
        The kept lemmas joined by single spaces (empty string if none remain).
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

In [61]:
# Overwrite each message with its preprocess() output (lemmas, no stop words / punctuation).
df['Message'] = df['Message'].map(preprocess)

In [62]:
# Encode the target as integers: ham -> 0, spam -> 1.
# The dtype guard makes this cell safe to re-run: pushing an already-encoded
# column through the dict again would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['Category']):
    df['Category'] = df['Category'].map({
        'ham': 0,
        'spam': 1,
    })

In [63]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation, stratified on the label so both
# splits keep the same class proportions. random_state pins the shuffle so the
# split, and every metric computed from it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.Category,
    test_size=0.2,
    stratify=df.Category,
    random_state=42,
)

In [64]:
from sklearn.pipeline import Pipeline

# Token counts (CountVectorizer with its default settings) fed into multinomial naive Bayes.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [65]:
# Fit the vectorizer and the classifier on the training split.
clf.fit(X=X_train, y=y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [66]:
from sklearn.metrics import classification_report
# Predict on the held-out split and print per-class precision / recall / F1.
y_pred = clf.predict(X=X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.99      0.92      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [67]:
# Same classifier, but the vectorizer is configured with ngram_range=(1, 3).
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('nb', MultinomialNB()),
])

In [68]:
# Refit: `clf` now refers to the n-gram pipeline.
clf.fit(X=X_train, y=y_train)


Pipeline(steps=[('vectorizer', CountVectorizer(ngram_range=(1, 3))),
                ('nb', MultinomialNB())])

In [69]:
# Evaluate the n-gram pipeline on the same held-out split.
y_pred = clf.predict(X=X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.98      0.88      0.93       149

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [70]:
from sklearn.pipeline import Pipeline

# Swap the count features for TF-IDF weights; the classifier is unchanged.
clf = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

In [71]:
# Refit: `clf` now refers to the TF-IDF pipeline.
clf.fit(X=X_train, y=y_train)


Pipeline(steps=[('vectorizer_tfidf', TfidfVectorizer()),
                ('nb', MultinomialNB())])

In [72]:
# Evaluate the TF-IDF pipeline on the same held-out split.
y_pred = clf.predict(X=X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

