In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from os.path import join
from os import walk
from bs4 import BeautifulSoup

from nltk import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

### Gather Data

In [2]:
spam1 = '/home/fajrin/ML_29/Classifier/Project 1/SpamData/01_Processing/spam_assassin_corpus/spam_1'
spam2 = '/home/fajrin/ML_29/Classifier/Project 1/SpamData/01_Processing/spam_assassin_corpus/spam_2'
ham1 = '/home/fajrin/ML_29/Classifier/Project 1/SpamData/01_Processing/spam_assassin_corpus/easy_ham_1'

ham2 = '/home/fajrin/ML_29/Classifier/Project 1/SpamData/01_Processing/spam_assassin_corpus/easy_ham_2'

In [3]:
# extract email in path 

def email_generator (path):
    for root , dirnames , filenames in walk(path):
        for filename in filenames :
            filepath = join(root , filename)
            email_body = open(filepath , encoding ='latin-1')
            
            _body = False
            lines=[]
           
            for line in email_body :
                if _body :
                    word = word_tokenize(line.lower())
                    lines.append(line)
                elif line =='\n':
                    _body = True
            email_body.close()
            email_text = '\n'.join(lines)
            yield filename , email_text
            


In [4]:
def df_email_generator(path,category):
    row=[]
    filename_=[]
    for filename , email_text in email_generator(path):
        row.append({'category':category ,'message':email_text })
        filename_.append(filename)
    return pd.DataFrame(data = row , index=filename_ )

In [5]:
spam = df_email_generator(spam1,1)
spam = spam.append(df_email_generator(spam2,1))

ham = df_email_generator(ham1,0)
ham = spam.append(df_email_generator(ham2,0))

data = pd.concat([spam ,ham])
index_ = range(0,len(data))
data['filename']=data.index
data.index=index_
data.index.name ='email_id'
data

Unnamed: 0_level_0,category,message,filename
email_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,"<html>\n\n<TABLE id=AutoNumber2 style=""BORDER-...",00477.24ef7a042f97482f884387c75249380c
1,1,"<HR>\n\n<html>\n\n<div bgcolor=3D""#FFFFCC"">\n\...",00369.845eeb9573484bd88a6a6224c7068d81
2,1,<html>\n\n\n\n\n\n\n\n<body>\n\n\n\n<div align...,00201.00020fc9911604f6cae7ae0f598ad29d
3,1,"Dear Partner to be,\n\n\n\nFirst, I must apolo...",00218.917ed95f5c90c1d9d15d2528b0bd1e79
4,1,This is a Multipart MIME message. Since your m...,00260.c75ce8b8d8bfc55723426979d260bf61
...,...,...,...
5191,0,"(pace Giraudoux)\n\n\n\nTo be more precise, I ...",00959.a24f34fb3d342beaf6fc1df54b00e5a2
5192,0,"On Wednesday 31 Jul 2002 9:53 am, Ralf Ertzing...",01183.a6c69ac786145115a2ad4f06c986bc3d
5193,0,-----BEGIN PGP SIGNED MESSAGE-----\n\nHash: SH...,00599.4b9e5d55f5bb001974345a0439e6f93d
5194,0,"On Sun 28 Jul 2002 06:29, ilug-admin@linux.ie ...",00131.4d06fea0c1c9623082010e4f5d9815b1


### Cleaning Data

In [6]:
# check null value 
data.isnull().sum()

# check empty message
drop_index = data.index[data.message.str.len()==0]
data.loc[drop_index]
data.drop(drop_index , inplace = True)
data

Unnamed: 0_level_0,category,message,filename
email_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,"<html>\n\n<TABLE id=AutoNumber2 style=""BORDER-...",00477.24ef7a042f97482f884387c75249380c
1,1,"<HR>\n\n<html>\n\n<div bgcolor=3D""#FFFFCC"">\n\...",00369.845eeb9573484bd88a6a6224c7068d81
2,1,<html>\n\n\n\n\n\n\n\n<body>\n\n\n\n<div align...,00201.00020fc9911604f6cae7ae0f598ad29d
3,1,"Dear Partner to be,\n\n\n\nFirst, I must apolo...",00218.917ed95f5c90c1d9d15d2528b0bd1e79
4,1,This is a Multipart MIME message. Since your m...,00260.c75ce8b8d8bfc55723426979d260bf61
...,...,...,...
5191,0,"(pace Giraudoux)\n\n\n\nTo be more precise, I ...",00959.a24f34fb3d342beaf6fc1df54b00e5a2
5192,0,"On Wednesday 31 Jul 2002 9:53 am, Ralf Ertzing...",01183.a6c69ac786145115a2ad4f06c986bc3d
5193,0,-----BEGIN PGP SIGNED MESSAGE-----\n\nHash: SH...,00599.4b9e5d55f5bb001974345a0439e6f93d
5194,0,"On Sun 28 Jul 2002 06:29, ilug-admin@linux.ie ...",00131.4d06fea0c1c9623082010e4f5d9815b1


In [7]:
data.groupby('category').describe()
data

Unnamed: 0_level_0,category,message,filename
email_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,"<html>\n\n<TABLE id=AutoNumber2 style=""BORDER-...",00477.24ef7a042f97482f884387c75249380c
1,1,"<HR>\n\n<html>\n\n<div bgcolor=3D""#FFFFCC"">\n\...",00369.845eeb9573484bd88a6a6224c7068d81
2,1,<html>\n\n\n\n\n\n\n\n<body>\n\n\n\n<div align...,00201.00020fc9911604f6cae7ae0f598ad29d
3,1,"Dear Partner to be,\n\n\n\nFirst, I must apolo...",00218.917ed95f5c90c1d9d15d2528b0bd1e79
4,1,This is a Multipart MIME message. Since your m...,00260.c75ce8b8d8bfc55723426979d260bf61
...,...,...,...
5191,0,"(pace Giraudoux)\n\n\n\nTo be more precise, I ...",00959.a24f34fb3d342beaf6fc1df54b00e5a2
5192,0,"On Wednesday 31 Jul 2002 9:53 am, Ralf Ertzing...",01183.a6c69ac786145115a2ad4f06c986bc3d
5193,0,-----BEGIN PGP SIGNED MESSAGE-----\n\nHash: SH...,00599.4b9e5d55f5bb001974345a0439e6f93d
5194,0,"On Sun 28 Jul 2002 06:29, ilug-admin@linux.ie ...",00131.4d06fea0c1c9623082010e4f5d9815b1


In [8]:
# cleaninf email body form html tags
# Challenge: Modify function to remove HTML tags. Then test on Email with DOC_ID 2. 
def email_cleaner(message, stemmer=PorterStemmer(), 
                 stop_words=set(stopwords.words('english'))):
    
    # Remove HTML tags
    soup = BeautifulSoup(message, 'html.parser')
    cleaned_text = soup.get_text()
    
    # Converts to Lower Case and splits up the words
    words = word_tokenize(cleaned_text.lower())
    
    filtered_words = []
    
    for word in words:
        # Removes the stop words and punctuation
        if word not in stop_words and word.isalpha():
            filtered_words.append(stemmer.stem(word))
#             filtered_words.append(word) 
    
    return filtered_words   

In [9]:
nested_list=data['message'].apply(email_cleaner)
nested_list

email_id
0       [never, pay, retail, count, save, select, sati...
1       [tremend, save, toner, inkjet, fax, thermal, r...
2       [copi, dvd, burner, dvd, wizard, technolog, ad...
3       [dear, partner, first, must, apologis, use, me...
4       [multipart, mime, messag, sinc, mail, reader, ...
                              ...                        
5191    [pace, giraudoux, precis, believ, war, civil, ...
5192    [wednesday, jul, ralf, ertzing, wrote, hi, joh...
5193    [pgp, sign, messag, hash, ok, time, pipe, hone...
5194    [sun, jul, wrote, messag, john, moran, ilug, s...
5195    [would, forgeri, virus, detect, would, option,...
Name: message, Length: 5192, dtype: object

In [10]:
doc_ids_ham = data[data.category==0].index
doc_ids_spam = data[data.category==1].index

In [11]:
new_data = data.copy()
new_data['clean_message']=nested_list
new_data.drop('message',axis=1 , inplace=True)
new_data

Unnamed: 0_level_0,category,filename,clean_message
email_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,00477.24ef7a042f97482f884387c75249380c,"[never, pay, retail, count, save, select, sati..."
1,1,00369.845eeb9573484bd88a6a6224c7068d81,"[tremend, save, toner, inkjet, fax, thermal, r..."
2,1,00201.00020fc9911604f6cae7ae0f598ad29d,"[copi, dvd, burner, dvd, wizard, technolog, ad..."
3,1,00218.917ed95f5c90c1d9d15d2528b0bd1e79,"[dear, partner, first, must, apologis, use, me..."
4,1,00260.c75ce8b8d8bfc55723426979d260bf61,"[multipart, mime, messag, sinc, mail, reader, ..."
...,...,...,...
5191,0,00959.a24f34fb3d342beaf6fc1df54b00e5a2,"[pace, giraudoux, precis, believ, war, civil, ..."
5192,0,01183.a6c69ac786145115a2ad4f06c986bc3d,"[wednesday, jul, ralf, ertzing, wrote, hi, joh..."
5193,0,00599.4b9e5d55f5bb001974345a0439e6f93d,"[pgp, sign, messag, hash, ok, time, pipe, hone..."
5194,0,00131.4d06fea0c1c9623082010e4f5d9815b1,"[sun, jul, wrote, messag, john, moran, ilug, s..."


In [12]:
strq=new_data.clean_message
type(strq)

pandas.core.series.Series

In [13]:
new_data['liststring']=new_data.clean_message.apply(lambda x:''.join(str(x)))

In [14]:
new_data.drop(['clean_message'],axis =1 ,inplace =True)

In [15]:
new_data

Unnamed: 0_level_0,category,filename,liststring
email_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,00477.24ef7a042f97482f884387c75249380c,"['never', 'pay', 'retail', 'count', 'save', 's..."
1,1,00369.845eeb9573484bd88a6a6224c7068d81,"['tremend', 'save', 'toner', 'inkjet', 'fax', ..."
2,1,00201.00020fc9911604f6cae7ae0f598ad29d,"['copi', 'dvd', 'burner', 'dvd', 'wizard', 'te..."
3,1,00218.917ed95f5c90c1d9d15d2528b0bd1e79,"['dear', 'partner', 'first', 'must', 'apologis..."
4,1,00260.c75ce8b8d8bfc55723426979d260bf61,"['multipart', 'mime', 'messag', 'sinc', 'mail'..."
...,...,...,...
5191,0,00959.a24f34fb3d342beaf6fc1df54b00e5a2,"['pace', 'giraudoux', 'precis', 'believ', 'war..."
5192,0,01183.a6c69ac786145115a2ad4f06c986bc3d,"['wednesday', 'jul', 'ralf', 'ertzing', 'wrote..."
5193,0,00599.4b9e5d55f5bb001974345a0439e6f93d,"['pgp', 'sign', 'messag', 'hash', 'ok', 'time'..."
5194,0,00131.4d06fea0c1c9623082010e4f5d9815b1,"['sun', 'jul', 'wrote', 'messag', 'john', 'mor..."


In [16]:
x_train , x_test , y_train , y_test = train_test_split(new_data.liststring , new_data.category)

In [17]:
v = CountVectorizer()
X_train_count = v.fit_transform(x_train.values)
X_train_count.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [18]:
model = MultinomialNB()
model.fit(X_train_count,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
example = ['get viagra for free now!', 
          'need a mortgage? Reply to arrange a call with a specialist and get a quote', 
          'Could you please help me with the project for tomorrow?', 
          'Hello Jonathan, how about a game of golf tomorrow?', 
          'Ski jumping is a winter sport in which competitors aim to achieve the longest jump after descending from a specially designed ramp on their skis. Along with jump length, competitor\'s style and other factors affect the final score. Ski jumping was first contested in Norway in the late 19th century, and later spread through Europe and North America in the early 20th century. Along with cross-country skiing, it constitutes the traditional group of Nordic skiing disciplines.'
          ]

In [22]:
emails = [
    
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!',
    ' I am a teacher'
]
emails_count = v.transform(example)
model.predict(emails_count)

array([1, 1, 1, 1, 0])

In [20]:
X_test_count = v.transform(x_test)
model.score(X_test_count, y_test)

0.9861325115562404

In [36]:
data.sort_index(inplace=True)


In [37]:
vectorizer = CountVectorizer(stop_words='english')

In [38]:
all_features = vectorizer.fit_transform(data.message)

In [39]:
vectorizer.vocabulary_

{'html': 40956,
 'table': 72663,
 'id': 42204,
 'autonumber2': 17827,
 'style': 71565,
 'border': 20867,
 'collapse': 24855,
 'bordercolor': 20870,
 '111111': 1420,
 'cellspacing': 23243,
 'cellpadding': 23236,
 'width': 80718,
 '715': 9621,
 'tbody': 72930,
 'tr': 74559,
 'td': 73009,
 'valign': 77634,
 '27': 3612,
 'href': 40878,
 'http': 40979,
 'www': 81601,
 'frugaljoe': 36429,
 'com': 24928,
 'img': 43447,
 'src': 70896,
 'logo': 50267,
 'jpg': 46497,
 'bgcolor': 19797,
 'ffcc99': 34983,
 'height': 39861,
 'salealerts': 68005,
 'dot': 29574,
 'gif': 37832,
 'font': 35799,
 'face': 34328,
 'verdana': 77953,
 'size': 69698,
 'center': 23257,
 'pay': 59057,
 'retail': 66125,
 'br': 21058,
 '20': 2742,
 '459': 6906,
 'colspan': 24919,
 'div': 29116,
 'head': 39747,
 'body': 20728,
 '465': 6938,
 'blair': 20343,
 'images': 43408,
 'email': 32413,
 'fshprospecthdrblue_4c': 36476,
 'times': 73878,
 'new': 55074,
 'roman': 66925,
 'color': 24899,
 'black': 20325,
 'nbsp': 54737,
 'count'

In [40]:
X_train , X_test , y_train , y_test = train_test_split(all_features, data.category ,
                                                      test_size=0.3 , random_state= 88)


In [42]:
classifier = MultinomialNB()
classifier.fit(X_train , y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
model = MultinomialNB()
model.fit(X_train_count,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [53]:
example = ['I want to viagra for free now!', 
          'need a mortgage? I need you Reply to arrange a call with a specialist and get a quote', 
          'hello about the project ', 
          'Hello Jonathan, how about a game of golf tomorrow?', 
          'Ski jumping is a winter sport in which competitors aim to achieve the longest jump after descending from a specially designed ramp on their skis. Along with jump length, competitor\'s style and other factors affect the final score. Ski jumping was first contested in Norway in the late 19th century, and later spread through Europe and North America in the early 20th century. Along with cross-country skiing, it constitutes the traditional group of Nordic skiing disciplines.'
          ]

In [54]:
doc_term_matrix = vectorizer.transform(example)

In [55]:
classifier.predict(doc_term_matrix)

array([1, 1, 0, 0, 0])