### Enron Email Classification 
Before goining to this notebook,please check the data cleaning and preprocessing of the dataset: https://www.kaggle.com/code/sainiamit/data-cleaning-enron-email-classification

In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/data-cleaning-enron-email-classification/cleaned_dataset.csv
/kaggle/input/data-cleaning-enron-email-classification/__results__.html
/kaggle/input/data-cleaning-enron-email-classification/__notebook__.ipynb
/kaggle/input/data-cleaning-enron-email-classification/__output__.json
/kaggle/input/data-cleaning-enron-email-classification/custom.css
/kaggle/input/data-cleaning-enron-email-classification/__results___files/__results___21_0.png
/kaggle/input/data-cleaning-enron-email-classification/__results___files/__results___23_0.png


## Import All required libraries


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import re 
import string 
import time
pd.set_option("display.max_rows",50)
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate

# machine learning model for email classification
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

## load the dataset


In [3]:

df= pd.read_csv("/kaggle/input/data-cleaning-enron-email-classification/cleaned_dataset.csv")

## EDA

In [4]:
df.head()

Unnamed: 0,Subject,X-Folder,body
0,Re:,'sent mail,Traveling to have a business meeting takes the...
1,Re: test,'sent mail,test successful. way to go!!!
2,Re: Hello,'sent mail,Let's shoot for Tuesday at 11:45.
3,Re: Hello,'sent mail,"Greg,\n\n How about either next Tuesday or Thu..."
4,Re: PRC review - phone calls,'sent mail,any morning between 10 and 11:30


In [5]:
df.shape

(489236, 3)

### Remove folder 
remove folder that do not contain enough email because such folders would not be significant for 
training our classifier. Also, we can infer that some folders with very little emails in them were 
created but unused

In [6]:
def remove_folder(emails,n):
    email_count = dict(df['X-Folder'].value_counts())
    small_folder = [key for key, value in email_count.items() if value<=n]
    email=df.loc[~df['X-Folder'].isin(small_folder)]
    return email

In [7]:
n=150
df= remove_folder(df,n)

In [8]:
df.head(3)

Unnamed: 0,Subject,X-Folder,body
0,Re:,'sent mail,Traveling to have a business meeting takes the...
1,Re: test,'sent mail,test successful. way to go!!!
2,Re: Hello,'sent mail,Let's shoot for Tuesday at 11:45.


In [9]:
# check the shape after remove folder
df.shape

(460141, 3)

In [10]:
print("total unique folders :",len(df['X-Folder'].unique()))

total unique folders : 82


In [11]:
# combine subject and body columns
df['text'] = df['Subject'] + " " + df['body']

In [12]:
df.head(3)

Unnamed: 0,Subject,X-Folder,body,text
0,Re:,'sent mail,Traveling to have a business meeting takes the...,Re: Traveling to have a business meeting takes...
1,Re: test,'sent mail,test successful. way to go!!!,Re: test test successful. way to go!!!
2,Re: Hello,'sent mail,Let's shoot for Tuesday at 11:45.,Re: Hello Let's shoot for Tuesday at 11:45.


In [13]:
# remove subject and body column from the dataset
drop_col = ['Subject','body']
df.drop(columns=drop_col,axis=1, inplace=True)


In [14]:
df.head(3)

Unnamed: 0,X-Folder,text
0,'sent mail,Re: Traveling to have a business meeting takes...
1,'sent mail,Re: test test successful. way to go!!!
2,'sent mail,Re: Hello Let's shoot for Tuesday at 11:45.


### cleaning process steps:
    convert text into lowercase
    remove all new line or extra line
    remove tabs,punctuation and commas
    remove extra white spaces
    remove stopwords

In [15]:
stopword = stopwords.words("english")
print(stopword)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [16]:
def preprocess_text(text):
    # lowercase words
    text = text.lower()
    
    # remove new line 
    text = re.sub(r"\n+"," ",text)
    
    #remove tabs,punctuation and commas
    text= re.sub("["+string.punctuation+"]", " ", text)
    
    # remove extra white space
    text = re.sub("\s+"," ",text)
    
    # remove stopwords from text
    text =" ".join([word for word in text.split() if word not in stopword])
    return text

start = time.time()
df['text'] = df['text'].apply(lambda x : preprocess_text(x))

In [17]:
start= time.time()
folder_dict = dict(df['X-Folder'].value_counts().sort_values()[50:70])
data = df[df['X-Folder'].isin(folder_dict.keys())]
end= time.time()
print("total time for exeuation  :",(end-start))


total time for exeuation  : 0.04509472846984863


In [18]:
# check the shape of the dataset
data.shape[0]

13586

In [19]:
# save it into folder
data.to_csv("preprocessed.csv",index=False)

In [20]:
# read the preprocessed data
data = pd.read_csv("preprocessed.csv")
data.head()

Unnamed: 0,X-Folder,text
0,california,caiso notice summer 2001 generation rfb market...
1,california,ca iso cal px information related 2000 market ...
2,california,caiso notification update inter sc trades adju...
3,california,update mif meeting presentations iso website u...
4,california,mif presentations presentations market issues ...


In [21]:
data['X-Folder'].unique()

array(['california', 'calendar', 'resumes', 'archives', 'junk file',
       'ces', 'logistics', 'tufco', 'projects', 'online trading',
       'management', 'esvl', 'ooc', 'corporate', 'tw-commercial group',
       'genco-jv_ipo', 'junk', 'deal discrepancies', 'e-mail bin',
       'bill williams iii'], dtype=object)

In [22]:
def label_encoder(data):
    encoder = LabelEncoder()
    # apply encoder on dependent feature means X-Folder
    y = encoder.fit_transform(data['X-Folder'])
    return y

In [23]:
y = label_encoder(data)
input_data = data['text']

 Before going to the training process we convert the raw data into numerical form because machine 
learning algorithm can not understad the raw text directly so we need to convert the raw text into 
 numerical form or vector representation using Bow , tf-idf.
 
 # 1. Bag-of-word
It is a way of extracting feature from text for use in modeling such as machine learning algorithm.
A bag of word is a represenation of text that describes the occurance of words within  a document.It involves two things.
1. A vocubulary of known words
2. A measure of the presence of known words
 
 

In [24]:
start = time.time()
count_vect = CountVectorizer(min_df=5, max_features=5000)
X= count_vect.fit_transform(input_data)
end = time.time()
print("Execution time :",(end-start))

Execution time : 1.7350153923034668


In [25]:
start= time.time()
X= X.toarray()
print("X shape : ",X.shape)
end= time.time()
print("Execution time :",(end-start))


X shape :  (13586, 5000)
Execution time : 0.29169535636901855


In [26]:
# create dataframe to store results
f1_data = {
    'Algorithm': ['Gaussian NB', 'Multinomial NB','Decision Tree','SVM','AdaBoost','ANN'],
    'BoW': ''
}


jaccard_data = {
    'Algorithm': ['Gaussian NB', 'Multinomial NB', 'Decision Tree','SVM','AdaBoost','ANN'],
    'BoW': ''
}


acc_data = {
    'Algorithm': ['Gaussian NB', 'Multinomial NB','Decision Tree','SVM','AdaBoost','ANN'],
    'BoW': ''
}

f1_df= pd.DataFrame(f1_data)
jac_df= pd.DataFrame(jaccard_data)
acc_df= pd.DataFrame(acc_data)


In [27]:
# Training and Evaluation
models =[GaussianNB(),MultinomialNB(), DecisionTreeClassifier(), LinearSVC(),
         AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),n_estimators=5),
         MLPClassifier(hidden_layer_sizes=(10,))]

names=['GaussianNB','MultinomialNB','DecisionTreeClassifier','LinearSVC','AdaBoostClassifier','MLPClassifier']


jacc_scores = []
acc_scores = []
f1_scores = []
exec_times = []

In [28]:
for model, name in zip(models,names):
    print(model)
    start= time.time()
    scoring ={"acc":'accuracy',
             'f1_mac':'f1_macro',
             'jacc_mac':'jaccard_macro'}
    scores = cross_validate(model,X,y,cv=5, n_jobs=4,scoring=scoring)
    training_time= (time.time()-start)
    print("accuracy: ", scores['test_acc'].mean())
    print("f1_score: ", scores['test_f1_mac'].mean())
    print("Jaccard_index: ", scores['test_jacc_mac'].mean())
    print("time (sec): ", training_time)
    print("\n")
    jacc_scores.append(scores['test_jacc_mac'].mean())
    acc_scores.append(scores['test_acc'].mean())
    f1_scores.append(scores['test_f1_mac'].mean())
    exec_times.append(training_time)

GaussianNB()
accuracy:  0.5668335498589943
f1_score:  0.5441217915003902
Jaccard_index:  0.392009682672538
time (sec):  16.9053316116333


MultinomialNB()
accuracy:  0.7278835760885255
f1_score:  0.6939782155546919
Jaccard_index:  0.5628269573042803
time (sec):  37.283658504486084


DecisionTreeClassifier()
accuracy:  0.6391881655388104
f1_score:  0.6206441383256058
Jaccard_index:  0.47065104482929315
time (sec):  61.29569220542908


LinearSVC()
accuracy:  0.7164733643646157
f1_score:  0.6986398758689013
Jaccard_index:  0.5612043713603372
time (sec):  27.06520104408264


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=5)
accuracy:  0.6478741079995872
f1_score:  0.627994046351007
Jaccard_index:  0.481102532330037
time (sec):  297.92735266685486


MLPClassifier(hidden_layer_sizes=(10,))
accuracy:  0.7266309500886009
f1_score:  0.70805583925509
Jaccard_index:  0.5718086166189493
time (sec):  226.65907287597656




In [29]:
acc_df['BoW'] = acc_scores
f1_df['BoW'] =f1_scores
jac_df['BoW'] =jacc_scores
acc_df['BoW_time'] = training_time

In [30]:
acc_df['BoW_time'] = training_time

In [31]:
acc_df

Unnamed: 0,Algorithm,BoW,BoW_time
0,Gaussian NB,0.566834,226.659073
1,Multinomial NB,0.727884,226.659073
2,Decision Tree,0.639188,226.659073
3,SVM,0.716473,226.659073
4,AdaBoost,0.647874,226.659073
5,ANN,0.726631,226.659073


In [32]:
# save the results
acc_df.to_csv("accuracy.csv", index=False)
f1_df.to_csv("f1_score.csv", index=False)
jac_df.to_csv("jacc_score.csv", index=False)

### Bag Of Words Bigram

In [33]:
start = time.time()
count_vect = CountVectorizer(min_df=5, max_features=5000,ngram_range=(2,2))
X= count_vect.fit_transform(input_data)
X = X.toarray()
print("X shape :",X.shape)
end = time.time()
print("Execution time :",(end-start))

X shape : (13586, 5000)
Execution time : 5.679706811904907


### Training and evaluation

In [34]:
models =[GaussianNB(),MultinomialNB(), DecisionTreeClassifier(), LinearSVC(),
         AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),n_estimators=5),
         MLPClassifier(hidden_layer_sizes=(10,))]

names=['GaussianNB','MultinomialNB','DecisionTreeClassifier','LinearSVC','AdaBoostClassifier','MLPClassifier']


jacc_scores = []
acc_scores = []
f1_scores = []
exec_times = []

for model, name in zip(models,names):
    print(model)
    start= time.time()
    scoring ={"acc":'accuracy',
             'f1_mac':'f1_macro',
             'jacc_mac':'jaccard_macro'}
    scores = cross_validate(model,X,y,cv=5, n_jobs=4,scoring=scoring)
    training_time= (time.time()-start)
    print("accuracy: ", scores['test_acc'].mean())
    print("f1_score: ", scores['test_f1_mac'].mean())
    print("Jaccard_index: ", scores['test_jacc_mac'].mean())
    print("time (sec): ", training_time)
    print("\n")
    jacc_scores.append(scores['test_jacc_mac'].mean())
    acc_scores.append(scores['test_acc'].mean())
    f1_scores.append(scores['test_f1_mac'].mean())
    exec_times.append(training_time)
    
acc_df['BoWBi'] = acc_scores
f1_df['BoWBi'] =f1_scores
jac_df['BoWBi'] =jacc_scores
acc_df['BoWBi_time'] = training_time
acc_df

GaussianNB()
accuracy:  0.5741205388469244
f1_score:  0.5531704529630934
Jaccard_index:  0.39584816767646086
time (sec):  14.482750654220581


MultinomialNB()
accuracy:  0.6235102452251284
f1_score:  0.6041764725559157
Jaccard_index:  0.45575511615901265
time (sec):  37.24501156806946


DecisionTreeClassifier()
accuracy:  0.5753730835989463
f1_score:  0.565168437457654
Jaccard_index:  0.4154682641006149
time (sec):  157.89789056777954


LinearSVC()
accuracy:  0.6185783079474262
f1_score:  0.6056464758193184
Jaccard_index:  0.45476251370483345
time (sec):  19.93900442123413


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=5)
accuracy:  0.5632279033464116
f1_score:  0.5525724432805649
Jaccard_index:  0.4027439390950513
time (sec):  334.44291257858276


MLPClassifier(hidden_layer_sizes=(10,))
accuracy:  0.5970857731401475
f1_score:  0.5828702532160996
Jaccard_index:  0.4313803737892967
time (sec):  336.03135538101196




Unnamed: 0,Algorithm,BoW,BoW_time,BoWBi,BoWBi_time
0,Gaussian NB,0.566834,226.659073,0.574121,336.031355
1,Multinomial NB,0.727884,226.659073,0.62351,336.031355
2,Decision Tree,0.639188,226.659073,0.575373,336.031355
3,SVM,0.716473,226.659073,0.618578,336.031355
4,AdaBoost,0.647874,226.659073,0.563228,336.031355
5,ANN,0.726631,226.659073,0.597086,336.031355


In [35]:
# save the results
acc_df.to_csv("accuracy.csv", index=False)
f1_df.to_csv("f1_score.csv", index=False)
jac_df.to_csv("jacc_score.csv", index=False)

### TFidf (term-frequency inverse document frequency)¶

In [36]:
start= time.time()
tfidf= TfidfVectorizer(min_df=5,max_features=5000)
X = tfidf.fit_transform(input_data)
X= X.toarray()
print("X shape :",X.shape)
end = time.time()
print("execution time :",(end-start))


X shape : (13586, 5000)
execution time : 2.198418617248535


In [37]:
models =[GaussianNB(),MultinomialNB(), DecisionTreeClassifier(), LinearSVC(),
         AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),n_estimators=5),
         MLPClassifier(hidden_layer_sizes=(10,))]

names=['GaussianNB','MultinomialNB','DecisionTreeClassifier','LinearSVC','AdaBoostClassifier','MLPClassifier']


jacc_scores = []
acc_scores = []
f1_scores = []
exec_times = []

for model, name in zip(models,names):
    print(model)
    start= time.time()
    scoring ={"acc":'accuracy',
             'f1_mac':'f1_macro',
             'jacc_mac':'jaccard_macro'}
    scores = cross_validate(model,X,y,cv=5, n_jobs=4,scoring=scoring)
    training_time= (time.time()-start)
    print("accuracy: ", scores['test_acc'].mean())
    print("f1_score: ", scores['test_f1_mac'].mean())
    print("Jaccard_index: ", scores['test_jacc_mac'].mean())
    print("time (sec): ", training_time)
    print("\n")
    jacc_scores.append(scores['test_jacc_mac'].mean())
    acc_scores.append(scores['test_acc'].mean())
    f1_scores.append(scores['test_f1_mac'].mean())
    exec_times.append(training_time)
    
acc_df['tdidf'] = acc_scores
f1_df['tdidf'] =f1_scores
jac_df['tdidf'] =jacc_scores
acc_df['tdidf_timing'] = training_time
acc_df

GaussianNB()
accuracy:  0.5836893751846698
f1_score:  0.5630253882669244
Jaccard_index:  0.4115797560251563
time (sec):  12.949916124343872


MultinomialNB()
accuracy:  0.72302468067543
f1_score:  0.6815725509632343
Jaccard_index:  0.5508293556366095
time (sec):  2.2749176025390625


DecisionTreeClassifier()
accuracy:  0.6386731892483025
f1_score:  0.623598143434159
Jaccard_index:  0.4731872267901906
time (sec):  71.01550006866455


LinearSVC()
accuracy:  0.7836020066065379
f1_score:  0.7668982829267701
Jaccard_index:  0.6454489354776655
time (sec):  6.502036809921265


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=5)
accuracy:  0.6541296548616172
f1_score:  0.6339147693839028
Jaccard_index:  0.4881338835741076
time (sec):  320.3334610462189


MLPClassifier(hidden_layer_sizes=(10,))
accuracy:  0.740837172973806
f1_score:  0.7240256425269747
Jaccard_index:  0.5923751886222164
time (sec):  339.849662065506




Unnamed: 0,Algorithm,BoW,BoW_time,BoWBi,BoWBi_time,tdidf,tdidf_timing
0,Gaussian NB,0.566834,226.659073,0.574121,336.031355,0.583689,339.849662
1,Multinomial NB,0.727884,226.659073,0.62351,336.031355,0.723025,339.849662
2,Decision Tree,0.639188,226.659073,0.575373,336.031355,0.638673,339.849662
3,SVM,0.716473,226.659073,0.618578,336.031355,0.783602,339.849662
4,AdaBoost,0.647874,226.659073,0.563228,336.031355,0.65413,339.849662
5,ANN,0.726631,226.659073,0.597086,336.031355,0.740837,339.849662


In [38]:
# save the results
acc_df.to_csv("accuracy.csv", index=False)
f1_df.to_csv("f1_score.csv", index=False)
jac_df.to_csv("jacc_score.csv", index=False)