In [26]:
#Import
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob

In [27]:
#Load and print head of DataFrame
df = pd.read_csv("spam-data.tsv", sep="\t", names=["label", "message"])
df.head()

Unnamed: 0,label,message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!


In [28]:
#Print details about quantity(rows, columns)
df.shape

(5567, 2)

In [29]:
#Check and remove duplicates
df.drop_duplicates(inplace = True)

In [30]:
#Veryfy how many duplicates have been deleted (rows, columns)
df.shape

(5164, 2)

In [31]:
#Show the number of invalid/missing data (NAN, Nan ..)
df.isnull().sum()

label      0
message    0
dtype: int64

In [32]:
#Delete punctuation
df['message'] = df['message'].str.replace('[^\w\s]', '')
df.head()

Unnamed: 0,label,message
0,ham,Ive been searching for the right words to than...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,Nah I dont think he goes to usf he lives aroun...
3,ham,Even my brother is not like to speak with me T...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL


In [33]:
#Delete stopwords
stop = stopwords.words('english')
df['message'] = df['message'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df.head()

Unnamed: 0,label,message
0,ham,Ive searching right words thank breather I pro...
1,spam,Free entry 2 wkly comp win FA Cup final tkts 2...
2,ham,Nah I dont think goes usf lives around though
3,ham,Even brother like speak They treat like aids p...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL


In [34]:
#Count frequently used words
freq = pd.Series(' '.join(df['message']).split()).value_counts()[:11]
freq

I       1395
u        696
2        447
call     326
get      322
U        315
Im       303
4        263
ur       255
ltgt     254
You      246
dtype: int64

In [35]:
#Delete frequently used words
df['message'] = df['message'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['message'].head()

0    Ive searching right words thank breather promi...
1    Free entry wkly comp win FA Cup final tkts 21s...
2          Nah dont think goes usf lives around though
3    Even brother like speak They treat like aids p...
4                      HAVE A DATE ON SUNDAY WITH WILL
Name: message, dtype: object

In [36]:
#Re-count frequently used words
freq = pd.Series(' '.join(df['message']).split()).value_counts()[:11]
freq

go      242
know    233
like    220
dont    209
got     199
come    192
time    183
want    164
day     164
lor     157
No      156
dtype: int64

In [37]:
#Count rare used words
rare = pd.Series(' '.join(df['message']).split()).value_counts()[-7045:]
rare

legitimat     1
individual    1
731           1
minimum       1
goodnite      1
             ..
Lancaster     1
Yesbut        1
europe        1
NOK           1
URFEELING     1
Length: 7045, dtype: int64

In [38]:
#Delete rare used words
#rare = list(rare)
df['message'] = df['message'].apply(lambda x: " ".join(x for x in x.split() if x not in rare))
df['message'].head()

0    Ive searching right words thank promise wont t...
1    Free entry wkly comp win FA Cup final tkts 21s...
2          Nah dont think goes usf lives around though
3              Even brother like speak They treat like
4                      HAVE A DATE ON SUNDAY WITH WILL
Name: message, dtype: object

In [39]:
#Re-count rare used words
rare = pd.Series(' '.join(df['message']).split()).value_counts()[-1]
rare

2

In [40]:
#Change labels spam/ham into 0/1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df['label'].head()

0    0
1    1
2    0
3    0
4    0
Name: label, dtype: int64

In [41]:
#Split the data into 75% training and 25% testing
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size = 0.25, random_state = 0)

print('Rows in test set: ' + str(x_test.shape))
print('Rows in train set: ' + str(x_train.shape))

Rows in test set: (1291,)
Rows in train set: (3873,)


In [42]:
#Vectorize text's into TF-IDF matrix (converting collection of raw text to a matrix of TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

list = x_train.tolist()

vectorizer = TfidfVectorizer(
    input = list ,    #raw text
    lowercase = True  #convert to lower case before tokenizing
)

features_train_transformed = vectorizer.fit_transform(list) #Returns TF-IDF vector 
features_test_transformed  = vectorizer.transform(x_test)   #Returns TF-IDF vector

In [43]:
#Train a classifier using train dataset
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(features_train_transformed, y_train)

MultinomialNB()

In [44]:
#Test model on test dataset
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
  
actual = y_test.tolist() 
predicted = classifier.predict(features_test_transformed)
results = confusion_matrix(actual, predicted) 
  
print('True Possitive: ' + str(results[0][0]))
print('False Positive: ' + str(results[0][1]))
print('False Negative: ' + str(results[1][0]))
print('True Negative: ' + str(results[1][1]))

print ('Accuracy Score: %.3f' % accuracy_score(actual, predicted)) 
print ('Report: ')
print (classification_report(actual, predicted) )

True Possitive: 1133
False Positive: 2
False Negative: 35
True Negative: 121
Accuracy Score: 0.971
Report: 
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1135
           1       0.98      0.78      0.87       156

    accuracy                           0.97      1291
   macro avg       0.98      0.89      0.93      1291
weighted avg       0.97      0.97      0.97      1291



In [45]:
#Test solution

sentences = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entr...",
    "Nah I don't think he goes to usf, he lives around here though"
]

# Core functions below
# Verify is given sentences are spam, 
# return 1 if function working good, 
# return 0 if error has occured
def veryfy_spam_function(texts):
    
    #Count number of sentences to verify 
    number_of_sentences = len(texts)
    
    if number_of_sentences == 0:
        print('No sentence to verify, give at least one!')
        return False
    
    #Convert to dataframe
    df = pd.DataFrame(texts, columns=['message'])
    
    #Remove punctuation
    df['message'] = df['message'].str.replace('[^\w\s]', '')
    
    #Remove stopwords
    stop = stopwords.words('english')
    df['message'] = df['message'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    
    #Vectorize
    vect = vectorizer.transform(df['message'])
    
    #Predict result
    result = classifier.predict(vect)
    
    if number_of_sentences == 1:  #Do it only if one sentence has been verified
        print(get_message(result[0]))
    elif number_of_sentences > 1: #Do it only if more than one sentences have been verified
        i = 0;
        for x in texts:
            print(get_message(result[i]) + " - sentence -> " + x + "\n")
            i = i+1
            
    return True
    
def get_message(result):
    if result == 1:
        message = 'STOP - This sentence is a spam!';
    else:
        message = 'OK - This sentence is not a spam.';
    return message

#Function to verify if sentences are spam
if veryfy_spam_function(sentences):
    print("\nProcessed ok.")
else:
    print("\nSome errors occurred!")

STOP - This sentence is a spam! - sentence -> Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entr...

OK - This sentence is not a spam. - sentence -> Nah I don't think he goes to usf, he lives around here though


Processed ok.
