In [58]:
import pandas as pd


In [59]:
# Load the SMS spam collection (tab-separated). The column names are supplied
# explicitly, so the first line of the file is read as data rather than a header.
df = pd.read_csv(
    '/content/SMSSpamCollection.txt',
    sep='\t',
    names=['label', 'message'],
)

In [60]:
# Preview the loaded frame (pandas truncates the display of long frames).
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [61]:
import re
import nltk

In [62]:
# Fetch the NLTK 'punkt' tokenizer data.
# NOTE(review): no cell visible here calls an NLTK tokenizer (messages are split
# with str.split) — confirm this download is still needed.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [63]:
# Fetch the NLTK stop-word lists; stopwords.words('english') reads them during preprocessing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [64]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [65]:
# Lemmatizer applied to every remaining word when the corpus is built.
# NOTE(review): PorterStemmer is imported alongside it but is not used in the visible cells.
lemmatizer = WordNetLemmatizer()

In [66]:
# Fetch the WordNet corpus that WordNetLemmatizer looks words up in.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [67]:
# Build the cleaned corpus: one normalised string per SMS message.
# Each message has every non-letter replaced by a space, is lower-cased, split on
# whitespace, filtered against the English stop-word list, lemmatized and re-joined.
#
# The stop-word list is loaded once into a set. Calling stopwords.words('english')
# inside the comprehension rebuilt the list for every single word and then scanned
# it linearly, which dominated the running time of this cell.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of label-indexing with a counter,
# so the loop does not depend on the frame having a 0..n-1 index.
for message in df['message']:
  sentence = re.sub('[^a-zA-Z]', ' ', message)
  sentence = sentence.lower()
  words_arr = sentence.split()
  words = [lemmatizer.lemmatize(word) for word in words_arr if word not in english_stopwords]
  words = ' '.join(words)

  corpus.append(words)

In [69]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

In [70]:
# Bag-of-words model: count how often each vocabulary term occurs in each message.
# fit_transform returns a sparse matrix; .toarray() expands it into a dense NumPy array.
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [71]:
# Feature-matrix dimensions: one row per message, one column per vocabulary term.
X.shape

(5572, 7100)

The second dimension, 7100, is the vocabulary size: the number of distinct words found in the corpus. We don't need all 7100 of them, because many occur only rarely and keeping them makes the feature matrix larger than necessary. Passing `max_features=5000` tells `CountVectorizer` to keep only the 5000 most frequent words as features.

In [72]:
# Rebuild the bag-of-words matrix, keeping only the 5000 most frequent terms.
# NOTE(review): the vocabulary is fitted on the whole corpus, before the train/test
# split performed further down, so the held-out messages influence which terms are
# kept — consider fitting the vectorizer on the training portion only.
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [73]:
# Check the column count after capping the vocabulary with max_features.
X.shape

(5572, 5000)

In [74]:
# One-hot encode the label column into one indicator column per class ('ham', 'spam').
Y = pd.get_dummies(df['label'])
Y

Unnamed: 0,ham,spam
0,True,False
1,True,False
2,False,True
3,True,False
4,True,False
...,...,...
5567,False,True
5568,True,False
5569,True,False
5570,True,False


In [75]:
# Binary target vector: the 'spam' indicator column as a NumPy array
# (truthy for spam messages, falsy for ham).
# It is derived straight from df['label'] rather than from the previous value of Y,
# so the cell can be re-run safely: `Y = Y['spam'].values` replaced the DataFrame
# with an ndarray and therefore raised on a second execution.
Y = pd.get_dummies(df['label'])['spam'].values
Y

array([False, False,  True, ..., False, False, False])

In [76]:
# Target vector length — it should equal the number of rows in X.
Y.shape

(5572,)

In [77]:
# Hold out 20% of the messages for evaluation; the fixed random_state makes the
# split reproducible from run to run.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [78]:
# Train a multinomial Naive Bayes classifier on the word-count features of the training split.
from sklearn.naive_bayes import MultinomialNB
spam_detection_model = MultinomialNB().fit(X_train, Y_train)

In [79]:
# Predict the spam indicator for every held-out message.
Y_pred = spam_detection_model.predict(X_test)

In [80]:
# Inspect the predicted labels (True = spam, False = ham).
Y_pred

array([False, False, False, ..., False, False, False])

In [81]:
# Confusion matrix: rows are the true classes and columns the predicted classes,
# both in sorted label order (False = ham first, True = spam second).
from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(Y_test, Y_pred)
confusion_m

array([[950,  16],
       [  7, 142]])

In [82]:
# Accuracy: the fraction of held-out messages that were classified correctly.
# NOTE(review): the confusion matrix shows far more ham than spam in the test set,
# so precision and recall for the spam class would be a useful complement.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(Y_test, Y_pred)


In [83]:
# Show the accuracy score computed in the previous cell.
accuracy

0.979372197309417