In [4]:
from google.colab import drive 
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [14]:
import pandas as pd 
#dataframe import
df_spam=pd.read_csv('gdrive/My Drive/Colab Notebooks/spam.csv', header=None,  encoding='latin-1')
df_ham=pd.read_csv('gdrive/My Drive/Colab Notebooks/ham.csv', header=None, encoding='latin-1')

#dataframe row column selector
df_spam = df_spam.loc[:, 0:0]
df_ham = df_ham.loc[:, 0:0]

print(df_spam)
print(df_ham)

                                                      0
0     Free entry in 2 a wkly comp to win FA Cup fina...
1     FreeMsg Hey there darling it's been 3 week's n...
2     WINNER!! As a valued network customer you have...
3     Had your mobile 11 months or more? U R entitle...
4     SIX chances to win CASH! From 100 to 20,000 po...
...                                                 ...
5044                                                NaN
5045                                                NaN
5046                                                NaN
5047                                                NaN
5048                                                NaN

[5049 rows x 1 columns]
                                                      0
0     Go until jurong point, crazy.. Available only ...
1                         Ok lar... Joking wif u oni...
2     U dun say so early hor... U c already then say...
3     Nah I don't think he goes to usf, he lives aro...
4     Even my brother i

In [35]:
from __future__ import print_function, division
import nltk
import os
import random
from collections import Counter
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier, classify
 
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stoplist = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
def init_lists(folder):
    key_list = []
    file_content = os.listdir(folder)
    for a_file in file_content:
        f = open(folder + a_file, 'r')
        key_list.append(f.read())
    f.close()
    return key_list

In [0]:
def preprocess(sentence):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(str(sentence, errors='ignore'))]

In [0]:
def get_features(text, setting):
    if setting=='bow':
        return {word: count for word, count in Counter(preprocess(text)).items() if not word in stoplist}
    else:
        return {word: True for word in preprocess(text) if not word in stoplist}

In [0]:
def train(features, samples_proportion):
    train_size = int(len(features) * samples_proportion)
    # initialise the training and test sets
    train_set, test_set = features[:train_size], features[train_size:]
    print ('Training set of size= ' + str(len(train_set)) + ' mails')
    print ('Test set of size = ' + str(len(test_set)) + ' mails')
    # train the classifier
    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier

In [0]:
def evaluate(train_set, test_set, classifier):
    # test accuracy of classifier on training and test set
    print ('Training set accuracy = ' + str(classify.accuracy(classifier, train_set)))
    print ('Test set accuracy = ' + str(classify.accuracy(classifier, test_set)))
    # check most informative words for the classifier
    classifier.show_most_informative_features(20)

In [36]:
if __name__ == "__main__":
    # initialise the data
    #spam = df_spam.values.tolist()
    #ham = df_ham.values.tolist()
    spam = df_spam.to_records(index=False)
    ham = df_ham.to_records(index=False)
    all_mails = [(mail, 'spam') for mail in spam]
    all_mails += [(mail, 'ham') for mail in ham]
    random.shuffle(all_mails)
    print ('Corpus of size = ' + str(len(all_mails)) + ' mails')
 
    # extract the features
    all_features = [(get_features(mail, ''), label) for (mail, label) in all_mails]
    print ('Fetched ' + str(len(all_features)) + ' feature sets')
 
    # train the classifier
    train_set, test_set, classifier = train(all_features, 0.8)
 
    # evaluate performance
    evaluate(train_set, test_set, classifier)

Corpus of size = 10098 mails
Fetched 10098 feature sets
Training set of size= 8078 mails
Test set of size = 2020 mails
Training set accuracy = 0.9226293637038872
Test set accuracy = 0.8138613861386138
Most Informative Features
                   8̀   = True             spam : ham    =     19.8 : 1.0
                       < = True              ham : spam   =     17.4 : 1.0
                       > = True              ham : spam   =     17.4 : 1.0
                       & = True              ham : spam   =     15.3 : 1.0
                    c   = True              ham : spam   =     15.3 : 1.0
                    8   = True              ham : spam   =     13.7 : 1.0
                       x = True              ham : spam   =     11.9 : 1.0
                       : = True              ham : spam   =     10.6 : 1.0
                       = True              ham : spam   =     10.1 : 1.0
                    =   = True              ham : spam   =      9.9 : 1.0
                    z 