In [51]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")

In [52]:
df = pd.read_csv("https://raw.githubusercontent.com/abhijitpaul0212/DataSets/main/SMSSpamCollection.csv", header=None, sep="\t", names=["label", "messages"])
df.head()

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [53]:
df.shape

(5572, 2)

In [54]:
df['messages'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [55]:
target_count = df['label'].value_counts()
print("NOT SPAM: {}%".format(round((target_count[0]/sum(target_count)) * 100), 2))
print("SPAM: {}%".format(round((target_count[1]/sum(target_count)) * 100), 2))

NOT SPAM: 87%
SPAM: 13%


Data set is imbalanced

In [56]:
import nltk
import re

In [57]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abpaul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [58]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [59]:
ps = PorterStemmer()

* Bag of words | TD - IDF
* Naive-Bayes uses conditional probability
* Stemming means getting the root word i.e. likes, liked, linking, likely --> like (root word)

In [60]:
# stopwords.words('english')

In [61]:
corpus = []

for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['messages'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = " ".join(review)
    corpus.append(review)


In [62]:
corpus[0:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

In [63]:
from sklearn.feature_extraction.text import CountVectorizer

In [64]:
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

In [65]:
X.shape

(5572, 6296)

In [66]:
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

In [67]:
X.shape

(5572, 2500)

In [68]:
df['label']

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [69]:
sm = SMOTE(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = sm.fit_resample(X=X, y=df['label'])
df_resampled = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)], axis=1)

print('Original dataset shape', df.shape[0])
not_spam, spam = df['label'].value_counts()
print("{}% of data are Not Spam \n{}% of data are Spam".format(round(not_spam/(not_spam+spam) * 100, 2), round(spam/(not_spam+spam) * 100, 2)))

print('Resampled dataset shape', df_resampled.shape[0])
not_spam, spam = df_resampled['label'].value_counts()
print("{}% of data are Not Spam \n{}% of data are Spam".format(round(not_spam/(not_spam+spam) * 100, 2), round(spam/(not_spam+spam) * 100, 2)))

Original dataset shape 5572
86.59% of data are Not Spam 
13.41% of data are Spam
Resampled dataset shape 9650
50.0% of data are Not Spam 
50.0% of data are Spam


In [70]:
# y = np.where(df['label']=='spam', 1, 0)
y = pd.get_dummies(df_resampled['label'], drop_first=True).astype(int)
# y = df['label']

In [71]:
df_resampled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2491,2492,2493,2494,2495,2496,2497,2498,2499,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spam
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham


In [72]:
X = df_resampled.iloc[:, :-1]
len(X)

9650

In [73]:
len(y)

9650

In [74]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

In [75]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_pred, y_test))
print(precision_score(y_pred, y_test))
print(recall_score(y_pred, y_test))
print(f1_score(y_pred, y_test))
print(roc_auc_score(y_pred, y_test))

0.9229175300455864
0.9896640826873385
0.8684807256235828
0.9251207729468599
0.9287357756558281


In [77]:
model2 = MultinomialNB()
model2.fit(X_train, y_train)
y_pred2 = model2.predict(X_test)
print(accuracy_score(y_pred2, y_test))
print(precision_score(y_pred2, y_test))
print(recall_score(y_pred2, y_test))
print(f1_score(y_pred2, y_test))
print(roc_auc_score(y_pred2, y_test))


0.9747202652300041
0.9758828596037898
0.9716981132075472
0.9737859905457671
0.9746221119365722
