In [19]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import re

In [4]:
# Load the tab-separated SMS file; it has no header row, so column names are supplied here.
# NOTE(review): with pandas' default quoting, messages containing a double quote
# may be merged across lines — confirm the row count against the raw file.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)

In [5]:
# Peek at the first five rows to check the label/message columns parsed correctly.
df.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [59]:
# Build `corpus`: one cleaned string per SMS message.
# Cleaning steps: replace every non-letter with a space, lowercase, split on
# whitespace, drop English stopwords, Porter-stem what remains, re-join with spaces.
lemmatizer = WordNetLemmatizer()  # instantiated but not used in this cell; stemming is applied instead
ps = PorterStemmer()

# Build the stopword collection once, as a set. The original re-evaluated
# stopwords.words('english') for every token and searched a list each time.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly so the loop does not depend on the
# DataFrame having a default 0..n-1 index (the original used .loc[i]).
for raw_message in df['message']:
    tokens = re.sub('[^a-zA-Z]', ' ', raw_message).lower().split()
    stems = [ps.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [60]:
# Show the first five cleaned messages.
corpus[:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

In [61]:
# Show the first five raw messages for comparison with the cleaned corpus.
df['message'].head(5)

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: message, dtype: object

In [89]:
# TF-IDF features with the vocabulary capped at 6500 terms, converted to a dense array.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split below, so held-out messages contribute to the vocabulary/IDF.
tf_idf = TfidfVectorizer(max_features=6500)
tfidf_sparse = tf_idf.fit_transform(corpus)
X = tfidf_sparse.toarray()

In [90]:
# One-hot encode the labels and drop the first dummy column, leaving a single
# indicator column; the result is a 2-D array with one column.
y = pd.get_dummies(df['label'], drop_first=True).to_numpy()

In [91]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

In [92]:
# Display the shapes of the training features and training labels.
(X_train.shape, y_train.shape)

((4457, 6296), (4457, 1))

In [93]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features.
classifier = MultinomialNB()
# y_train is a column vector of shape (n_samples, 1); flatten it to 1-D so
# scikit-learn does not emit its column-vector conversion warning.
classifier.fit(X_train, y_train.ravel())

  return f(*args, **kwargs)


MultinomialNB()

In [94]:
# Predict labels for the held-out rows.
y_pred = classifier.predict(X=X_test)

In [95]:
# Print the confusion matrix followed by the per-class precision/recall/F1 report.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

[[964   0]
 [ 35 116]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       964
           1       1.00      0.77      0.87       151

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [96]:
# Bag-of-words run: the same steps as the TF-IDF cells above (vectorize, encode
# labels, split, fit, predict, report), but using raw term counts.
# Note that X, y, the split arrays, classifier and y_pred are all rebound here.
cv = CountVectorizer(max_features=6500)
X = cv.fit_transform(corpus).toarray()

y = pd.get_dummies(df['label'], drop_first=True).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

classifier = MultinomialNB()
# y_train is a column vector of shape (n_samples, 1); flatten it to 1-D so
# scikit-learn does not emit its column-vector conversion warning.
classifier.fit(X_train, y_train.ravel())

y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

  return f(*args, **kwargs)


[[944  20]
 [  8 143]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       964
           1       0.88      0.95      0.91       151

    accuracy                           0.97      1115
   macro avg       0.93      0.96      0.95      1115
weighted avg       0.98      0.97      0.98      1115

