In [None]:
import nltk
nltk.download('all')

In [54]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

In [55]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [56]:
dataset = '/content/drive/MyDrive/Colab Notebooks/NLP Datasets/sms_spam.csv'
missing_cells = ['na', 'NA', 'N/A', 'n/a', '-', np.nan]
messages = pd.read_csv(dataset, na_values=missing_cells)
messages

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [57]:
messages.rename(columns={'type': 'Category', 'text': 'Text_Messages'}, inplace=True)

In [58]:
messages

Unnamed: 0,Category,Text_Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [59]:
messages.isnull().sum()

Category         0
Text_Messages    0
dtype: int64

In [60]:
messages['Category'].replace(['ham', 'spam'], [0, 1], inplace=True)

In [61]:
messages

Unnamed: 0,Category,Text_Messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,1,This is the 2nd time we have tried 2 contact u...
5570,0,Will ü b going to esplanade fr home?
5571,0,"Pity, * was in mood for that. So...any other s..."
5572,0,The guy did some bitching but I acted like i'd...


In [62]:
messages['Category'].value_counts()

0    4827
1     747
Name: Category, dtype: int64

In [63]:
lemmatization = WordNetLemmatizer()

result = []
for index, row in messages.iterrows():
  sentence = row['Text_Messages']
  sentence = re.sub('[^a-zA-Z]', ' ', sentence)
  sentence = sentence.lower()
  sentence = sentence.split()
  sentence = [lemmatization.lemmatize(word) for word in sentence if word not in set(stopwords.words('english'))]
  sentence = ' '.join(sentence)
  result.append(sentence)

messages['Converted_Text'] = result

In [64]:
messages.drop('Text_Messages', axis=1, inplace=True)

In [65]:
#Creating TFIDF vectors 

form_vectors = TfidfVectorizer()
X = form_vectors.fit_transform(messages['Converted_Text']).toarray()
X.shape

(5574, 7098)

In [66]:
y = messages['Category']

In [67]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [68]:
#The classes in the dataset are heavily imbalanced. There are approximately 6.46x more samples under the 'ham' category. 
#Therefore, oversampling the spam category is necessary.

from imblearn.over_sampling import SMOTE

sampler = SMOTE(sampling_strategy='not majority', random_state=1)
X_resampled, y_resampled = sampler.fit_resample(X, y)
print(X_resampled.shape, y_resampled.shape)

(9654, 7098) (9654,)


In [69]:
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=1)

In [70]:
spamclassifier = MultinomialNB()
spamclassifier.fit(X_train, y_train)

MultinomialNB()

In [71]:
pred = spamclassifier.predict(X_val)
how_accurate = metrics.accuracy_score(y_val, pred)
how_accurate

0.972035214914552

In [72]:
correctly_identified = metrics.confusion_matrix(y_val, pred)
correctly_identified

array([[946,  45],
       [  9, 931]])