In [454]:
#Import
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob

In [455]:
#Load and print head of DataFrame
df = pd.read_csv("spam-data.tsv", sep="\t", names=["label", "message"])
df.head()

Unnamed: 0,label,message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!


In [456]:
#Print details about quantity(rows, columns)
df.shape

(5567, 2)

In [457]:
#Check and remove duplicates
df.drop_duplicates(inplace = True)

In [458]:
#Veryfy how many duplicates have been deleted (rows, columns)
df.shape

(5164, 2)

In [459]:
#Show the number of invalid/missing data (NAN, Nan ..)
df.isnull().sum()

label      0
message    0
dtype: int64

In [460]:
#Delete punctuation
df['message'] = df['message'].str.replace('[^\w\s]', '')
df.head()

Unnamed: 0,label,message
0,ham,Ive been searching for the right words to than...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,Nah I dont think he goes to usf he lives aroun...
3,ham,Even my brother is not like to speak with me T...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL


In [461]:
#Delete stopwords
stop = stopwords.words('english')
df['message'] = df['message'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df.head()

Unnamed: 0,label,message
0,ham,Ive searching right words thank breather I pro...
1,spam,Free entry 2 wkly comp win FA Cup final tkts 2...
2,ham,Nah I dont think goes usf lives around though
3,ham,Even brother like speak They treat like aids p...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL


In [462]:
#Count frequently used words
freq = pd.Series(' '.join(df['message']).split()).value_counts()[:11]
freq

I       1395
u        696
2        447
call     326
get      322
U        315
Im       303
4        263
ur       255
ltgt     254
You      246
dtype: int64

In [463]:
#Delete frequently used words
df['message'] = df['message'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['message'].head()

0    Ive searching right words thank breather promi...
1    Free entry wkly comp win FA Cup final tkts 21s...
2          Nah dont think goes usf lives around though
3    Even brother like speak They treat like aids p...
4                      HAVE A DATE ON SUNDAY WITH WILL
Name: message, dtype: object

In [464]:
#Re-count frequently used words
freq = pd.Series(' '.join(df['message']).split()).value_counts()[:11]
freq

go      242
know    233
like    220
dont    209
got     199
come    192
time    183
day     164
want    164
lor     157
No      156
dtype: int64

In [465]:
#Count rare used words
rare = pd.Series(' '.join(df['message']).split()).value_counts()[-7045:]
rare

bear           1
Caught         1
belt           1
Aaooooright    1
Building       1
              ..
fgkslpo        1
AD             1
DIS            1
98321561       1
DontCha        1
Length: 7045, dtype: int64

In [467]:
#Delete rare used words
#rare = list(rare)
df['message'] = df['message'].apply(lambda x: " ".join(x for x in x.split() if x not in rare))
df['message'].head()

0    Ive searching right words thank promise wont t...
1    Free entry wkly comp win FA Cup final tkts 21s...
2          Nah dont think goes usf lives around though
3              Even brother like speak They treat like
4                      HAVE A DATE ON SUNDAY WITH WILL
Name: message, dtype: object

In [471]:
#Re-count rare used words
rare = pd.Series(' '.join(df['message']).split()).value_counts()[-1]
rare

2

In [472]:
#Change labels spam/ham into 0/1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df['label'].head()

0    0
1    1
2    0
3    0
4    0
Name: label, dtype: int64

In [473]:
#Split the data into 75% training and 25% testing
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size = 0.25, random_state = 0)

print('Rows in test set: ' + str(x_test.shape))
print('Rows in train set: ' + str(x_train.shape))
type(x_train)
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorize email text into tfidf matrix
# TfidfVectorizer converts collection of raw documents to a matrix of TF-IDF features.
# It's equivalent to CountVectorizer followed by TfidfTransformer.
list = x_train.tolist()
vectorizer = TfidfVectorizer(
    input= list ,  # input is actual text
    lowercase=True,      # convert to lower case before tokenizing
    #stop_words='english' # remove stop words
)
features_train_transformed = vectorizer.fit_transform(list) #gives tf idf vector
features_test_transformed  = vectorizer.transform(x_test) #gives tf idf vector

Rows in test set: (1291,)
Rows in train set: (3873,)


In [474]:
from sklearn.naive_bayes import MultinomialNB

# train a classifier
classifier = MultinomialNB()
classifier.fit(features_train_transformed, y_train)

MultinomialNB()

In [475]:
# review the classifier accuracy
print("classifier accuracy {:.2f}%".format(classifier.score(features_test_transformed, y_test) * 100))

classifier accuracy 97.13%


In [485]:
labels = classifier.predict(features_test_transformed)
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
  
actual = y_test.tolist() 
predicted = labels 
results = confusion_matrix(actual, predicted) 
  
print('True Possitive: ' + str(results[0][0]))
print('False Positive: ' + str(results[0][1]))
print('False Negative: ' + str(results[1][0]))
print('True Negative: ' + str(results[1][1]))

print ('Accuracy Score: %.3f' % accuracy_score(actual, predicted)) 
print ('Report: ')
print (classification_report(actual, predicted) )

score_2 = f1_score(actual, predicted, average = 'binary')
print('F-Measure: %.3f' % score_2)

True Possitive: 1133
False Positive: 2
False Negative: 35
True Negative: 121
Accuracy Score: 0.971
Report: 
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1135
           1       0.98      0.78      0.87       156

    accuracy                           0.97      1291
   macro avg       0.98      0.89      0.93      1291
weighted avg       0.97      0.97      0.97      1291

F-Measure: 0.867
