In [152]:
#Import
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from textblob import TextBlob

In [158]:
#Load and print head of DataFrame
df = pd.read_csv("spam-data.tsv", sep="\t", names=["label", "message"])
df.head()

Unnamed: 0,label,message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!


In [159]:
#Print details about quantity(rows, columns)
df.shape

(5567, 2)

In [160]:
#Check and remove duplicates
df.drop_duplicates(inplace = True)

In [161]:
#Veryfy how many duplicates have been deleted (rows, columns)
df.shape

(5164, 2)

In [162]:
#Show the number of invalid/missing data (NAN, Nan ..)
df.isnull().sum()

label      0
message    0
dtype: int64

In [163]:
#Delete punctuation
df['message'] = df['message'].str.replace('[^\w\s]', '')
df.head()

Unnamed: 0,label,message
0,ham,Ive been searching for the right words to than...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,Nah I dont think he goes to usf he lives aroun...
3,ham,Even my brother is not like to speak with me T...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL


In [164]:
#Delete stopwords
stop = stopwords.words('english')
df['message'] = df['message'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df.head()

Unnamed: 0,label,message
0,ham,Ive searching right words thank breather I pro...
1,spam,Free entry 2 wkly comp win FA Cup final tkts 2...
2,ham,Nah I dont think goes usf lives around though
3,ham,Even brother like speak They treat like aids p...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL


In [200]:
#Count frequently used words
freq = pd.Series(' '.join(df['message']).split()).value_counts()[:11]
freq

go      242
know    233
like    220
dont    209
got     199
come    192
time    183
want    164
day     164
lor     157
No      156
dtype: int64

In [201]:
#Delete frequently used words
freq = list(freq)
df['message'] = df['message'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['message'].head()

0    Ive searching right words thank promise wont t...
1    Free entry wkly comp win FA Cup final tkts 21s...
2          Nah dont think goes usf lives around though
3              Even brother like speak They treat like
4                      HAVE A DATE ON SUNDAY WITH WILL
Name: message, dtype: object

In [202]:
#Re-count frequently used words
freq = pd.Series(' '.join(df['message']).split()).value_counts()[:11]
freq

go      242
know    233
like    220
dont    209
got     199
come    192
time    183
want    164
day     164
lor     157
No      156
dtype: int64

In [203]:
#Count rare used words
rare = pd.Series(' '.join(df['message']).split()).value_counts()[-7045:]
rare

go       242
know     233
like     220
dont     209
got      199
        ... 
boo        2
4info      2
jeans      2
GIV        2
saucy      2
Length: 4465, dtype: int64

In [204]:
#Delete rare used words
rare = list(rare)
df['message'] = df['message'].apply(lambda x: " ".join(x for x in x.split() if x not in rare))
df['message'].head()

0    Ive searching right words thank promise wont t...
1    Free entry wkly comp win FA Cup final tkts 21s...
2          Nah dont think goes usf lives around though
3              Even brother like speak They treat like
4                      HAVE A DATE ON SUNDAY WITH WILL
Name: message, dtype: object

In [205]:
#Re-count rare used words
rare = pd.Series(' '.join(df['message']).split()).value_counts()[-1:]
rare

saucy    2
dtype: int64

In [206]:
#Tokenization - split sentence into word list
TextBlob(df['message'][2]).words

WordList(['Nah', 'dont', 'think', 'goes', 'usf', 'lives', 'around', 'though'])

In [207]:
#Vectorization
#Before run download http://nlp.stanford.edu/data/glove.6B.zip
#Before run install: conda install -c conda-forge gensim

from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

In [208]:
#Vectorization continued
from gensim.models import KeyedVectors # load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [209]:
#Vectorization - test word
model['brother']

array([ 0.44172 , -0.42239 ,  0.16875 , -0.73071 ,  0.11421 ,  0.58036 ,
       -0.15996 , -0.35057 , -0.51585 ,  0.049159,  0.21029 ,  0.33813 ,
        0.34015 ,  0.33394 ,  0.43082 , -0.68797 ,  0.016801, -0.4392  ,
       -0.63862 ,  0.99979 , -0.22808 , -0.36173 ,  0.26028 ,  0.33471 ,
        0.19574 ,  0.1889  , -0.65007 , -0.90396 ,  0.87146 ,  1.0389  ,
        0.14813 ,  0.62926 ,  0.45103 ,  0.056848, -0.47635 ,  0.22851 ,
        0.019162,  0.23166 ,  0.31517 , -0.04989 , -0.045153,  0.41631 ,
        1.2553  , -0.93028 , -0.16085 , -0.0195  , -0.52613 , -0.062153,
        0.41316 , -0.23164 , -0.52598 , -0.096949,  0.60631 ,  0.89382 ,
        0.24843 , -1.9425  , -1.0757  ,  0.095841, -0.020964,  0.49486 ,
        0.36509 ,  0.74831 ,  0.38753 , -0.25084 ,  0.81364 , -0.30012 ,
        0.46068 ,  0.76646 , -0.15263 ,  0.83083 , -0.06191 , -0.070126,
        0.031228, -0.63841 , -0.15574 ,  0.14927 , -0.11447 , -0.30875 ,
       -0.38419 ,  0.12946 ,  0.72202 ,  0.52711 , 

In [210]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(df['message'])

train_vect

<5164x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 24455 stored elements in Compressed Sparse Row format>

In [211]:
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(df['message'])
train_bow

<5164x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 30607 stored elements in Compressed Sparse Row format>

In [216]:
#Sentiment analysis
df['sentiment'] = df['message'].apply(lambda x: TextBlob(x).sentiment[0])
df[['message','sentiment']].head()

Unnamed: 0,message,sentiment
0,Ive searching right words thank promise wont t...,0.642857
1,Free entry wkly comp win FA Cup final tkts 21s...,0.3
2,Nah dont think goes usf lives around though,0.0
3,Even brother like speak They treat like,0.0
4,HAVE A DATE ON SUNDAY WITH WILL,0.0


In [217]:
#Split the data into 75% training and 25% testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_bow, df['message'], test_size=0.25, random_state=0)

In [218]:
#Train the Model - Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(X_train, y_train)

In [219]:
#Print the prediction
print(classifier.predict(X_train))
print(y_train.values)

['Huh means Y like dat one push n'
 'sexy sexy cum text im wet warm ready fun THIS MSG IS FREE TEXT STOP' ''
 ... '' ''
 'Idk keep saying youre since moved keep heads freedom responsibility And im tired much shit deal im barely keeping together gets added']
['Huh means Y like dat one push n'
 'sexy sexy cum text im wet warm ready fun THIS MSG IS FREE TEXT STOP'
 'They released vday shirts put makes bottom half naked instead white underwear'
 ... 'turns love phone unknown album' 'Can mag meeting point'
 'Idk keep saying youre since moved keep heads freedom responsibility And im tired much shit deal im barely keeping together gets added']


In [220]:
#Evaluate the model on the training data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
print(classification_report(y_train, pred))

  _warn_prf(average, modifier, msg_start, len(result))


                                                                                                                                                                                                                                                                                                                                                                                                                                               precision    recall  f1-score   support

                                                                                                                                                                                                                                                                                                                                                                                                                                                    0.02      1.00      0.03        26
                                                         