# Get dataset (Spam Ham Dataset)


In [None]:
! pip install -q kaggle

from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"nakshatra22mai0037","key":"<REDACTED>"}'}

In [None]:
# get data from kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download venky73/spam-mails-dataset

Downloading spam-mails-dataset.zip to /content
  0% 0.00/1.86M [00:00<?, ?B/s]
100% 1.86M/1.86M [00:00<00:00, 147MB/s]


In [None]:
#unzip data
! unzip /content/spam-mails-dataset.zip -d /content/

Archive:  /content/spam-mails-dataset.zip
  inflating: /content/spam_ham_dataset.csv  


In [None]:
import numpy as np
import pandas as pd

# Read the extracted CSV into a DataFrame; the bare name on the last line
# makes the frame the cell's displayed output.
emails = pd.read_csv(filepath_or_buffer="/content/spam_ham_dataset.csv")
emails

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [None]:
def get_email_subject(email):
    """Return the subject line of a raw email without its ``Subject: `` label.

    The subject is everything before the first ``\\r\\n``. When the text has
    no ``\\r\\n`` at all, the whole text is treated as the subject line
    (previously the last character was silently dropped in that case).

    :param email: str - raw email text, expected to start with ``Subject: ``
    :return: str - the subject line with the leading ``Subject: `` removed
    """
    # partition never returns -1-style sentinels: with no separator the whole
    # string comes back as the first element.
    first_line, _sep, _rest = email.partition('\r\n')

    # Only strip the header label at the start of the line; a literal
    # "Subject: " occurring later inside the subject is real content.
    label = 'Subject: '
    if first_line.startswith(label):
        first_line = first_line[len(label):]
    return first_line

def get_email_body(email):
    """Return everything after the first line of a raw email.

    The body starts right after the first ``\\r\\n``. When the text contains
    no ``\\r\\n`` there is no body and an empty string is returned
    (previously the subject line minus its first character was returned).

    :param email: str - raw email text
    :return: str - text following the first ``\\r\\n``, or ``""`` if none
    """
    # With no separator, partition yields (email, "", ""), so the body is "".
    _first_line, _sep, body = email.partition('\r\n')
    return body

In [None]:
# cleaning of columns: drop 'Unnamed: 0' and 'label_num', keep 'label' and 'text'
email_df = emails.drop(columns=['Unnamed: 0', "label_num"])

# get the subject and body of each email (the helpers take the raw text directly,
# so no wrapping lambda is needed)
email_df["subject"] = email_df["text"].apply(get_email_subject)
email_df["body"] = email_df["text"].apply(get_email_body)

# ridding of the text column (unless we need it)
email_df = email_df.drop(columns=["text"])

# expand default pandas display options to make emails more clearly visible when printed
pd.set_option('display.max_colwidth', 200)

# email_df is now our dataframe
email_df.head()

Unnamed: 0,label,subject,body
0,ham,enron methanol ; meter # : 988291,"this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary\r\nflow data provided by daren } .\r\nplease override pop ' s daily volume { presently zero } to reflect daily\r\nac..."
1,ham,"hpl nom for january 9 , 2001",( see attached file : hplnol 09 . xls )\r\n- hplnol 09 . xls
2,ham,neon retreat,"ho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders retreat time !\r\ni know that this time of year is extremely hectic , and that it ' s tough to think about anyt..."
3,spam,"photoshop , windows , office . cheap . main trending",abasements darer prudently fortuitous undergone\r\nlighthearted charm orinoco taster\r\nrailroad affluent pornographic cuvier\r\nirvin parkhouse blameworthy chlorophyll\r\nrobed diagrammatic fogar...
4,ham,re : indian springs,"this deal is to book the teco pvr revenue . it is my understanding that teco\r\njust sends us a check , i haven ' t received an answer as to whether there is a\r\npredermined price associated with..."


# Text/Data Pre-processing

In [None]:
# hyperparameters for tokenization
maxtokens = 200 # the maximum number of tokens per document (tokenize_1 / tokenize_2 truncate to this)
maxtokenlen = 100 # the maximum length of each token; NOTE(review): not referenced by any cell visible in this section

In [None]:
# Tokenization method 1
# this is tokenization split by white space
def tokenize_1(row, max_tokens=None):
    """Split ``row`` on single space characters and keep the first tokens.

    :param row: text to tokenize; non-string values are converted with ``str``
    :param max_tokens: int or None - cap on the number of tokens returned;
        ``None`` (default) uses the notebook-level ``maxtokens`` setting
    :return: list of str - always a list, empty for ``None`` / ``""`` input
    """
    # Compare by value: `row is ''` tests object identity and triggers a
    # SyntaxWarning on modern Python.
    if row is None or row == '':
        return []

    limit = maxtokens if max_tokens is None else max_tokens
    # split(" ") (not split()) is kept on purpose: consecutive spaces yield
    # empty-string tokens, exactly as before.
    return str(row).split(" ")[:limit]

  if row is None or row is '':


In [None]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize

In [None]:
# Tokenization method 2
# splits on white space AND punctuation, e.g. 3.88 --> '3', '.', '88'
def tokenize_2(row):
    """Tokenize ``str(row)`` with ``wordpunct_tokenize`` and keep at most ``maxtokens`` tokens."""
    all_tokens = wordpunct_tokenize(str(row))
    return all_tokens[:maxtokens]

In [None]:
import re
# regex clean-up applied before tokenization (no lower-casing happens here)
def reg_expressions(row):
    """Replace line breaks in ``row`` with single spaces.

    Each run of ``\\r`` / ``\\n`` characters, together with any whitespace
    touching it, becomes one space, and leading/trailing whitespace is
    stripped. Deleting the line breaks outright would glue the last word of
    one line to the first word of the next (e.g. ``preliminary\\r\\nflow``
    turning into ``preliminaryflow``).

    :param row: str - text to clean
    :return: str - the text on a single line
    """
    single_line = re.sub(r'\s*[\r\n]+\s*', " ", row)
    # strip so a leading/trailing line break does not leave a dangling space
    # (which a split on " " would turn into an empty token)
    return single_line.strip()

In [None]:
import nltk
# fetch the NLTK stopwords corpus
nltk.download('stopwords')
# English stopword list; read as a module-level name by stop_word_removal
stopwords = nltk.corpus.stopwords.words('english')
# quick look at the first ten entries
print(stopwords[:10])

def stop_word_removal(row, lst_stopwords=None):
    """Return the tokens of ``row`` that are not stopwords, in original order.

    :param row: iterable of str - tokens to filter
    :param lst_stopwords: iterable of str or None - words to drop; ``None``
        (default) uses the notebook-level ``stopwords`` list
    :return: list of str - tokens that are not in the stopword collection
    """
    # Build a set once per call so each membership test is a hash lookup
    # instead of a scan over the whole stopword list.
    excluded = set(stopwords if lst_stopwords is None else lst_stopwords)
    return [token for token in row if token not in excluded]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def stemming(row):
    """Run NLTK's ``PorterStemmer`` over every token in ``row`` and return the stems as a list."""
    stem = nltk.stem.porter.PorterStemmer().stem
    return list(map(stem, row))

In [None]:
def lemmatization(row):
    """Run NLTK's ``WordNetLemmatizer`` over every token in ``row`` and return the lemmas as a list."""
    lemmatize = nltk.stem.wordnet.WordNetLemmatizer().lemmatize
    return list(map(lemmatize, row))

In [None]:
def utils_preprocess_text(text, flg_tokenize=1,flg_stemm=False, flg_lemm=True, flg_stopwords=True):
    '''
    Preprocess a string.

    Steps, in order: regex clean-up (reg_expressions), tokenization,
    optional stopword removal, optional stemming, optional lemmatisation,
    then the tokens are joined back into one space-separated string.

    :parameter
        :param text: string - the text to preprocess
        :param flg_tokenize: int - 1 for tokenize_1 (split on spaces),
            2 for tokenize_2 (split on spaces and punctuation)
        :param flg_stemm: bool - whether stemming is to be applied
        :param flg_lemm: bool - whether lemmatisation is to be applied
            (on by default, and applied after stemming when both are set)
        :param flg_stopwords: bool - whether stopwords are to be removed
    :return
        cleaned text
    :raises ValueError: if flg_tokenize is neither 1 nor 2
    '''
    # Without tokenization `text` would stay a string and every later stage
    # would iterate over single characters, so reject unknown modes up front.
    if flg_tokenize not in (1, 2):
        raise ValueError(
            "flg_tokenize must be 1 or 2, got {!r}".format(flg_tokenize)
        )

    ## clean the raw string before splitting it
    text = reg_expressions(text)

    ## Tokenize (convert from string to list)
    if flg_tokenize == 1:
        tokens = tokenize_1(text)
    else:
        tokens = tokenize_2(text)

    # remove Stopwords
    if flg_stopwords:
        tokens = stop_word_removal(tokens)

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        tokens = stemming(tokens)

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        tokens = lemmatization(tokens)

    ## back to string from list
    return " ".join(tokens)

In [None]:
# download the WordNet corpora; presumably required by the WordNetLemmatizer
# used in lemmatization() - TODO confirm
nltk.download('wordnet')
nltk.download('omw-1.4') #Open Multilingual Wordnet

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# Tokenization

In [None]:
# Run the pipeline on every body with the punctuation-aware tokenizer
# (tokenize_2); the remaining flags keep their defaults, so stopword removal
# and lemmatization are applied as well.
email_tokenization = email_df["body"].apply(
    lambda body: utils_preprocess_text(body, flg_tokenize=2)
)
email_tokenization.head(5)

0    follow note gave monday , 4 / 3 / 00 { preliminaryflow data provided daren } . please override pop ' daily volume { presently zero } reflect dailyactivity obtain gas control . change needed asap e...
1                                                                                                                                                     ( see attached file : hplnol 09 . xl )- hplnol 09 . xl
2    ho ho ho , ' around wonderful time year - - - neon leader retreat time ! know time year extremely hectic , ' tough think anything past holiday , life go past week december 25 january 1 , ' ' like ...
3    abasement darer prudently fortuitous undergonelighthearted charm orinoco tasterrailroad affluent pornographic cuvierirvin parkhouse blameworthy chlorophyllrobed diagrammatic fogarty clear baydainc...
4                deal book teco pvr revenue . understanding tecojust sends u check , ' received answer whether apredermined price associated deal teco let u know whatwe giving . co

# Stemming

In [None]:
# Run the pipeline with stemming switched on (default tokenizer, tokenize_1).
# NOTE(review): flg_lemm is left at its default of True, so lemmatization runs
# on the stems too; pass flg_lemm=False if this cell is meant to show stemming alone.
email_stemming = email_df["body"].apply(
    lambda body: utils_preprocess_text(body, flg_stemm=True)
)
email_stemming.head(5)

0       follow note gave monday , 4 / 3 / 00 { preliminaryflow data provid daren } .pleas overrid pop ' daili volum { present zero } reflect dailyact obtain ga control .thi chang need asap econom purpos .
1                                                                                                                                                       ( see attach file : hplnol 09 . xl )- hplnol 09 . xl
2    ho ho ho , ' around wonder time year - - - neon leader retreat time !i know time year extrem hectic , ' tough think anyth past holiday , life go past week decemb 25 januari 1 , ' ' like think minu...
3    aba darer prudent fortuit undergonelightheart charm orinoco tasterrailroad affluent pornograph cuvierirvin parkhous blameworthi chlorophyllrob diagrammat fogarti clear baydainconvenienc manag repr...
4                                 deal book teco pvr revenu . understand tecojust send u check , ' receiv answer whether apredermin price associ deal teco let u know whatw give . c

# Lemmatization

In [None]:
# Run the pipeline with lemmatization on (default tokenizer, tokenize_1).
# NOTE(review): flg_lemm=True is already the default of utils_preprocess_text,
# so this is the same as calling it with no flags at all.
email_lemmatization = email_df["body"].apply(
    lambda body: utils_preprocess_text(body, flg_lemm=True)
)
email_lemmatization.head(5)

0    follow note gave monday , 4 / 3 / 00 { preliminaryflow data provided daren } .please override pop ' daily volume { presently zero } reflect dailyactivity obtain gas control .this change needed asa...
1                                                                                                                                                     ( see attached file : hplnol 09 . xl )- hplnol 09 . xl
2    ho ho ho , ' around wonderful time year - - - neon leader retreat time !i know time year extremely hectic , ' tough think anything past holiday , life go past week december 25 january 1 , ' ' like...
3    abasement darer prudently fortuitous undergonelighthearted charm orinoco tasterrailroad affluent pornographic cuvierirvin parkhouse blameworthy chlorophyllrobed diagrammatic fogarty clear baydainc...
4                deal book teco pvr revenue . understanding tecojust sends u check , ' received answer whether apredermined price associated deal teco let u know whatwe giving . co

# Stopwords

In [None]:
# Run the pipeline with stopword removal on (default tokenizer, tokenize_1).
# NOTE(review): flg_stopwords=True is already the default, and lemmatization is
# on by default as well, so this call is equivalent to the all-defaults call.
emailstopwords = email_df["body"].apply(
    lambda body: utils_preprocess_text(body, flg_stopwords=True)
)
emailstopwords.head(5)

0    follow note gave monday , 4 / 3 / 00 { preliminaryflow data provided daren } .please override pop ' daily volume { presently zero } reflect dailyactivity obtain gas control .this change needed asa...
1                                                                                                                                                     ( see attached file : hplnol 09 . xl )- hplnol 09 . xl
2    ho ho ho , ' around wonderful time year - - - neon leader retreat time !i know time year extremely hectic , ' tough think anything past holiday , life go past week december 25 january 1 , ' ' like...
3    abasement darer prudently fortuitous undergonelighthearted charm orinoco tasterrailroad affluent pornographic cuvierirvin parkhouse blameworthy chlorophyllrobed diagrammatic fogarty clear baydainc...
4                deal book teco pvr revenue . understanding tecojust sends u check , ' received answer whether apredermined price associated deal teco let u know whatwe giving . co