### Load, clean and prepare the data

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#nltk.download()
#nltk.download('punkt')

In [13]:
import nltk
 
# Read the whole raw corpus as one string. The context manager guarantees the
# handle is closed, and the explicit encoding keeps decoding independent of
# the platform's default locale (pandas.read_csv also defaults to UTF-8).
with open('SMSSpamCollection.txt', 'r', encoding='utf-8') as file:
    read_file = file.read()

# Tokenize the raw text and wrap it in an nltk.Text for exploration.
text = nltk.Text(nltk.word_tokenize(read_file))

# concordance() prints the matching contexts itself and returns None, so
# `match` is always None; the name is kept only for backward compatibility.
match = text.concordance('language')

Displaying 2 of 2 matches:
ere are many company . Tell me the language . spam okmail : Dear Dave this is 
hat bill brison book the one about language and words ham Okay , good , no pro


In [14]:
import pandas as pd

# Parse the tab-separated corpus into a two-column frame. Column names are
# supplied explicitly, so the first line of the file is read as data.
df = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    header=None,
    names=['Spam', 'Phrase'],
)

In [15]:
# Preview the first 15 rows of the loaded frame (label and message text).
df.head(15)

Unnamed: 0,Spam,Phrase
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


Turning 'Spam' values into 0 (for 'ham') or 1 (for 'spam'):

In [16]:
# Encode the label column numerically: 'ham' -> 0, 'spam' -> 1.
# The explicit astype() guarantees an integer dtype instead of relying on
# replace() implicitly downcasting the object column (deprecated in recent
# pandas), and fails loudly if any unexpected label is left in the column.
# Re-running the cell is harmless: already-encoded values are left untouched.
df['Spam'] = df['Spam'].replace({'ham': 0, 'spam': 1}).astype('int64')

In [17]:
# Preview the first 15 rows to check the numeric 'Spam' column.
df.head(15)

Unnamed: 0,Spam,Phrase
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


Finding out the 10 most frequent words:

In [50]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Stop-word list handed to the vectorizer. It was previously never defined in
# the notebook, so the cell failed on a fresh kernel.
# NOTE(review): presumably the NLTK English list was intended (the `stopwords`
# corpus reader is imported at the top of the notebook) -- confirm.
# Requires a one-time nltk.download('stopwords').
stop_words = stopwords.words('english')

# Learn the vocabulary and build the document-term count matrix in one pass.
vectorizer = CountVectorizer(stop_words=stop_words)
vector = vectorizer.fit_transform(df['Phrase'])

# Aliases kept so any cell that refers to the older names keeps working.
vec = vectorizer
bag_of_words = vector

# Column sums give the total number of occurrences of each vocabulary term.
sum_words = bag_of_words.sum(axis=0)
word_totals = np.asarray(sum_words).ravel()

# Pair every term with its total as a plain int (so the printed list does not
# depend on the NumPy scalar repr), then sort by descending frequency.
words_freq = sorted(
    ((word, int(word_totals[idx])) for word, idx in vec.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

print(words_freq[:10])

[('call', 593), ('ur', 391), ('get', 391), ('gt', 318), ('lt', 316), ('ok', 293), ('free', 284), ('go', 283), ('know', 262), ('like', 247)]


The least frequent words form a long list of words that each occur only once, as shown by the word distribution.

### Splitting the data into training/test sets

In [52]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the vectorized messages for testing; the fixed random_state
# makes the split reproducible between runs.
corpusTrain, corpusTest, targetTrain, TargetTest = train_test_split(
    vector,
    df['Spam'],
    test_size=0.2,
    random_state=42,
)

### Logistic Regression

In [61]:
from sklearn.linear_model import LogisticRegression
import time

# Fit a logistic-regression classifier with default settings and report the
# training time together with the accuracy on the train and test splits.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
logreg = LogisticRegression().fit(corpusTrain, targetTrain)
print("Timing: ", (time.perf_counter() - start))
print("Training score", logreg.score(corpusTrain, targetTrain))
print("Test score", logreg.score(corpusTest, TargetTest))

Timing:  0.22370576858520508
Training score 0.9966345075162666
Test score 0.9856502242152466


Confusion matrix:

In [55]:
from sklearn.metrics import confusion_matrix
# Predict the held-out labels with the logistic-regression model and tabulate
# them against the true labels (rows: true class, columns: predicted class).
y_prediction = logreg.predict(corpusTest)
confMtx = confusion_matrix(y_true=TargetTest, y_pred=y_prediction)
print(confMtx)

[[966   0]
 [ 16 133]]


### SVM

In [62]:
from sklearn.svm import SVC

# Fit a support-vector classifier with default settings and report the
# training time together with the accuracy on the train and test splits.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
clfSVC = SVC().fit(corpusTrain, targetTrain)
print("Timing: ", (time.perf_counter() - start))
print("Training score", clfSVC.score(corpusTrain, targetTrain))
print("Test score", clfSVC.score(corpusTest, TargetTest))


Timing:  1.249746322631836
Training score 0.9968588736818488
Test score 0.9838565022421525


Confusion matrix:

In [59]:
# Predict the held-out labels with the SVM and tabulate them against the true
# labels (rows: true class, columns: predicted class).
clfSVC_predict = clfSVC.predict(corpusTest)
confMtx2 = confusion_matrix(y_true=TargetTest, y_pred=clfSVC_predict)
print(confMtx2)

[[966   0]
 [ 18 131]]


We can observe that SVM is slower to train than Logistic Regression (about 1.25 s vs. 0.22 s) and slightly less accurate on the test set (0.9839 vs. 0.9857).

In [None]:
from sklearn.metrics import f1_score
# ...