In [1]:
import re
import pandas as pd
import numpy as np
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix


### Spam Classification
Deciding whether an email is spam or not.

## Step 1 loading the dataset

In [2]:


#load dataset
df=pd.read_csv('datasets/spam.csv', encoding='latin-1')
df=df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis='columns')

#df[v1] is the class variable and df[v2] is the  email
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Step 2 pre-processing: removing stopwords and stemming

In [3]:
stemmer=SnowballStemmer('english')
#A  stemming algorithm reduces words like fishing, fished, and fisher to the stem fish.
#The stem need not be a word, for example  argue, argued, 
#argues, arguing, and argus could be reduced to the stem argu. 

stop=set(stopwords.words('english'))
#Stop words are  the most common words in a language
#and are filtered out before processing of natural language data 


df['v2']=[re.sub('[^a-zA-Z]', ' ', sms) for sms in df['v2']]
word_list=[sms.split() for sms in df['v2']]
def normalize(words):
    current_words=list()
    for word in words:
        if word.lower() not in stop: #remove  the most common words
            updated_word=stemmer.stem(word) #stemming
            current_words.append(updated_word.lower()) #lower case
    return current_words
word_list=[normalize(word) for word in word_list]
df['v2']=[" ".join(word) for word in word_list]


In [4]:
#df[v1] is the class variable and df[v2] is the processed email
df

Unnamed: 0,v1,v2
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri wkli comp win fa cup final tkts st ...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though
...,...,...
5567,spam,nd time tri contact u u pound prize claim easi...
5568,ham,b go esplanad fr home
5569,ham,piti mood suggest
5570,ham,guy bitch act like interest buy someth els nex...


In [5]:
#split in training and testing
x_train, x_test, y_train, y_test=train_test_split(df['v2'], df['v1'], test_size=0.2, random_state=7)



In [6]:
y_train.values.reshape(-1,1)

array([['ham'],
       ['ham'],
       ['ham'],
       ...,
       ['ham'],
       ['spam'],
       ['ham']], dtype=object)

## Step 3: transforming email into counts

In [17]:
#it counts the words
cv=CountVectorizer()
#it returns the number of times a word appears in the i-th email
x_train_df=cv.fit_transform(x_train) #x_train_df is a matrix emails times words
print("number of emails=",x_train_df.shape[0])
print("number of words=",x_train_df.shape[1])
x_test_df=cv.transform(x_test)
#this is a sparse matrix (it means that only non-zeroes elements are stored)
print(x_train_df)
#to get the full matrix
x_train_df.toarray()

number of emails= 4457
number of words= 5595
  (0, 5405)	1
  (0, 1991)	2
  (0, 1140)	1
  (0, 3047)	1
  (0, 4944)	1
  (0, 3328)	2
  (0, 162)	1
  (0, 4483)	1
  (0, 1398)	1
  (0, 1921)	1
  (0, 2676)	1
  (0, 458)	1
  (0, 4620)	1
  (0, 1552)	1
  (0, 3214)	1
  (0, 1790)	1
  (0, 2541)	1
  (0, 4984)	1
  (0, 4456)	1
  (1, 1089)	1
  (1, 1734)	1
  (1, 150)	1
  (1, 156)	1
  (2, 3342)	1
  (2, 4701)	1
  :	:
  (4453, 74)	1
  (4454, 2580)	1
  (4454, 1623)	1
  (4454, 2920)	1
  (4454, 1681)	1
  (4454, 3715)	1
  (4454, 683)	1
  (4455, 1529)	1
  (4455, 5150)	2
  (4455, 1776)	1
  (4455, 5065)	1
  (4455, 1885)	1
  (4455, 4786)	1
  (4455, 1792)	1
  (4455, 3249)	1
  (4455, 3960)	1
  (4455, 2160)	1
  (4455, 2341)	1
  (4455, 5327)	1
  (4455, 4533)	1
  (4455, 4956)	3
  (4455, 3032)	1
  (4455, 5067)	1
  (4455, 3248)	1
  (4456, 1885)	1


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [18]:
#listv = cv.get_feature_names()
#listv.append('Class')
#dataset = pd.DataFrame(data=np.hstack([x_train_df.toarray(),y_train.values.reshape(-1,1)]), columns=listv)
#dataset.to_csv("email_clean.csv",index=False)

### Understanding the input data format

In [9]:
row_index=0 #select one email
print(x_train_df[row_index,:].todense().shape)
print("this is the non-sparse matrix=",x_train_df[row_index,:].todense())
ind=np.where(x_train_df[row_index,:].todense()[0,:]>0)[1]
print()
#original words in the email
print(x_train.values[row_index])
print()
#decoded numerical input 
print(cv.inverse_transform(x_train_df[row_index,:].todense()))
print()
#index of those words in x_train_df[row_index,:].todense()
print(ind)
print()
# number of times those words appears in the email
print(x_train_df[row_index,ind].todense())

(1, 5595)
this is the non-sparse matrix= [[0 0 0 ... 0 0 0]]

wish great day moji told offer alway speechless offer easili go great length behalf stun exam next friday keep touch sorri

[array(['alway', 'behalf', 'day', 'easili', 'exam', 'friday', 'go',
       'great', 'keep', 'length', 'moji', 'next', 'offer', 'sorri',
       'speechless', 'stun', 'told', 'touch', 'wish'], dtype='<U34')]

[ 162  458 1140 1398 1552 1790 1921 1991 2541 2676 3047 3214 3328 4456
 4483 4620 4944 4984 5405]

[[1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1]]


## Step 4: training the classifier and making predictions for the test set

In [19]:

#MultinomialNB
clf=MultinomialNB()
clf.fit(x_train_df,y_train)
prediction_train=clf.predict(x_train_df)
prediction_test=clf.predict(x_test_df)
print(prediction_test)#returns the predictions for the test set

['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [22]:
np.set_printoptions(suppress=True)
proba=clf.predict_proba(x_test_df)#returns the rpedicted probability of Ham and Spam for each email in the test set
proba

array([[0.99224585, 0.00775415],
       [0.99710212, 0.00289788],
       [0.99977543, 0.00022457],
       ...,
       [0.99721402, 0.00278598],
       [0.99999219, 0.00000781],
       [0.99364559, 0.00635441]])

In [28]:
ind=np.where(proba[:,1]>0.5)[0]#emails that are classified as Spam
print(x_train.values[ind])

['break time one come n get stuff fr'
 'dun need use dial juz open da browser n surf' 'make babi yo tho'
 'shower babi'
 'regist sinco paye log icicibank com enter urn lt gt confirm bewar fraud share disclos urn anyon'
 'yes sura sun tv lol'
 'hey guy know breath neck get bud anyway abl get half track usf tonight'
 'leav wif lar wan carri meh heavi da num familiar'
 'feb lt gt love u day send dis ur valu frnds evn come back u gt marri person u luv u ignor dis u lose ur luv evr'
 'gram usual run like lt gt half eighth smarter though get almost whole second gram lt gt'
 'watch tv got new job' 'prabha soryda reali frm heart sori'
 'forward hi mailbox messag sms alert messag match pleas call back retriev messag match'
 'normal hot mail com see' 'oh yes like tortur watch england'
 'hey rite u put evey mnth' 'good good job like entrepreneur'
 'gr see messag r u leav congrat dear school wat r ur plan' 'late'
 'let know detail fri u find cos tom fri mention chines thank'
 'ah see lingo let kno

## Step 5: computing accuracy and confusion matrix

In [29]:
#accuracy training set
print("Accuracy:"+str(accuracy_score(y_train,prediction_train)))
print()

Accuracy:0.9923715503702042



We care about the generalisation error, that is the performance on unseen data.

In [30]:

#accuracy test set
print("Accuracy:"+str(accuracy_score(y_test,prediction_test)))
print()

conf_mat=confusion_matrix(y_test, prediction_test)
print("Confusion Matrix")
print(conf_mat)


Accuracy:0.989237668161435

Confusion Matrix
[[965   5]
 [  7 138]]


#### Note: Where can we find sparse matrices ?
You can manipulate them using scipy.sparse

In [6]:
import scipy.sparse as sc #this is the library

#x_train_df is a scipy sparse matrix, this avoids to store the zeroes
#to access to the non-zero element
i=0# email index
ind=sc.find(x_train_df[i,:]>0)[1]
print("indexes of non-zeroes elements=",ind)
x_train_df[0,ind].todense()


indexes of non-zeroes elements= [ 162  458 1140 1398 1552 1790 1921 1991 2541 2676 3047 3214 3328 4456
 4483 4620 4944 4984 5405]


matrix([[1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]],
       dtype=int64)

In [32]:
#test set
ind=sc.find(x_test_df[i,:]>0)[1]
print("indexes of non-zeroes elements=",ind)
x_test_df[0,ind].todense()


indexes of non-zeroes elements= [2870 3588]


matrix([[1, 1]], dtype=int64)

## Question

We consider Movie Reviews Corpus, a dataset that includes  movie reviews that are categorized as positive or negative.

In [14]:
import random
import nltk

nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
df = pd.DataFrame(columns=['v1', 'v2'])
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        df=df.append({'v1': category, 'v2': movie_reviews.words(fileid)}, ignore_index=True)
        
word_list=[sms for sms in df['v2']]
def normalize(words):
    current_words=list()
    for word in words:
        if word.lower() not in stop: #remove  the most common words
            if word.isalpha(): #remove punctuation
                updated_word=stemmer.stem(word) #stemming
                current_words.append(updated_word.lower()) #lower case
    return current_words
word_list=[normalize(word) for word in word_list]
df['v2']=[" ".join(word) for word in word_list]


[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/benavoli/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [15]:
df

Unnamed: 0,v1,v2
0,neg,plot two teen coupl go church parti drink driv...
1,neg,happi bastard quick movi review damn bug got h...
2,neg,movi like make jade movi viewer thank invent t...
3,neg,quest camelot warner bros first featur length ...
4,neg,synopsi mental unstabl man undergo psychothera...
...,...,...
1995,pos,wow movi everyth movi funni dramat interest we...
1996,pos,richard gere command actor alway great film ev...
1997,pos,glori star matthew broderick denzel washington...
1998,pos,steven spielberg second epic film world war ii...


Using the same steps as in the Spam filter example, apply MultinomialNB to this example