In [1]:
import csv
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Read the raw corpus line by line for a quick sanity check of its size.
# The context manager closes the file handle deterministically instead of
# leaking it until garbage collection, and the encoding is pinned so the read
# does not depend on the platform's default locale (pd.read_csv, used on the
# same file later, also decodes it as UTF-8 by default).
with open("smsspamcollection/SMSSpamCollection", encoding="utf-8") as corpusFile:
    messages = [line.rstrip() for line in corpusFile]

In [3]:
# Number of lines read from the corpus file (one list entry per line).
len(messages)

5574

In [4]:
# Load the tab-separated corpus into a DataFrame. The file has no header row,
# so the two column names are supplied explicitly. QUOTE_NONE makes the parser
# treat quote characters inside a message as ordinary text.
messages = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=["class", "message"],
    quoting=csv.QUOTE_NONE,
)

In [5]:
# Preview the first rows to confirm the label and text columns parsed as expected.
messages.head()

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Count the non-null messages per class label to see how balanced the classes are.
messages.groupby('class').count()

Unnamed: 0_level_0,message
class,Unnamed: 1_level_1
ham,4827
spam,747


In [7]:
# Normalise case and break every message into whitespace-separated tokens.
# NOTE: this overwrites the 'message' column in place, so the cell is meant to
# be run exactly once after loading the data.
messages['message'] = messages['message'].map(lambda text: text.lower().split())
messages.head()

Unnamed: 0,class,message
0,ham,"[go, until, jurong, point,, crazy.., available..."
1,ham,"[ok, lar..., joking, wif, u, oni...]"
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,"[u, dun, say, so, early, hor..., u, c, already..."
4,ham,"[nah, i, don't, think, he, goes, to, usf,, he,..."


In [8]:
# Fetch the WordNet corpus that the WordNetLemmatizer used below relies on.
# nltk.download() returns True, which is why the cell displays that value.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Collapse the inflected forms of a word onto one shared base form.
def WordsIntoBaseForm(message):
    """Lemmatize every token of ``message`` and join the results into one string.

    Parameters
    ----------
    message : iterable of str
        The tokens of a single message, e.g. the result of ``str.split()``.

    Returns
    -------
    str
        The lemmatized tokens separated by single spaces. ``lemmatize`` is
        called without a part-of-speech argument, so its default applies.
    """
    toBaseForm = WordNetLemmatizer().lemmatize
    return ' '.join(map(toBaseForm, message))

In [10]:
# Replace each token list with its lemmatized, space-joined string so the
# column is plain text again for the vectorizer.
messages['message'] = messages['message'].map(WordsIntoBaseForm)
messages.head()

Unnamed: 0,class,message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he go to usf, he life around..."


In [11]:
# Learn the vocabulary of the corpus: the vectorizer is created first and then
# fitted on the preprocessed message strings.
trainingVector = CountVectorizer()
trainingVector.fit(messages['message'].values)

In [12]:
# Encode every message as a row of term counts over the fitted vocabulary.
# The result is kept in matrix form; it is converted with .toarray() further
# down only to inspect it as a DataFrame.
messagesBagOfWords = trainingVector.transform(messages['message'])

In [13]:
# View the bag-of-words matrix as a DataFrame: one row per message, one column
# per vocabulary term.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(). Prefer the new method
# and fall back to the old one so the cell runs on either side of the change.
if hasattr(trainingVector, "get_feature_names_out"):
    vocabularyTerms = trainingVector.get_feature_names_out()
else:
    vocabularyTerms = trainingVector.get_feature_names()
# .toarray() materialises the full dense matrix; this is for inspection only.
words_df = pd.DataFrame(data=messagesBagOfWords.toarray(), columns=vocabularyTerms)
words_df

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5571,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5572,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Normalize: re-weight the raw term counts with TF-IDF.
# The fitted transformer is kept in its own variable instead of being thrown
# away after the chained call: its learned IDF weights are what is needed to
# map unseen messages into the same feature space the classifier is trained on.
tfidfTransformer = TfidfTransformer().fit(messagesBagOfWords)
messagesTfidf = tfidfTransformer.transform(messagesBagOfWords)

In [15]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features, using the
# 'class' column as the target labels.
spamDetector = MultinomialNB()
spamDetector.fit(messagesTfidf, messages['class'].values)

In [16]:
# Ad-hoc message for a manual spot-check of the classifier. It is wrapped in a
# list because the vectorizer's transform() is given a collection of documents.
example1 = ['England v Macedonia - dont miss the goals/team news. Txt ENGLAND to 99999']

In [17]:
# Classify the example with the same pipeline the training data went through.
# spamDetector was fitted on lemmatized, TF-IDF-weighted features; scoring raw
# term counts of unprocessed text would evaluate the message in a different
# feature space than the one the model learned. Fitting a TfidfTransformer on
# the training counts is deterministic, so it reproduces the IDF weights that
# produced messagesTfidf.
example1Prepared = [WordsIntoBaseForm(text.lower().split()) for text in example1]
example1Tfidf = TfidfTransformer().fit(messagesBagOfWords).transform(
    trainingVector.transform(example1Prepared)
)
checkResult = spamDetector.predict(example1Tfidf)[0]
checkResult

'spam'

In [18]:
# Second ad-hoc message for a manual spot-check, again wrapped in a list so it
# can be passed to the vectorizer's transform() as a collection of documents.
example2 = ['Hi, how r u doing?']

In [19]:
# Classify the example with the same pipeline the training data went through.
# spamDetector was fitted on lemmatized, TF-IDF-weighted features; scoring raw
# term counts of unprocessed text would evaluate the message in a different
# feature space than the one the model learned. Fitting a TfidfTransformer on
# the training counts is deterministic, so it reproduces the IDF weights that
# produced messagesTfidf.
example2Prepared = [WordsIntoBaseForm(text.lower().split()) for text in example2]
example2Tfidf = TfidfTransformer().fit(messagesBagOfWords).transform(
    trainingVector.transform(example2Prepared)
)
checkResult = spamDetector.predict(example2Tfidf)[0]
checkResult

'ham'