## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

## Load Data

In [2]:
# Read the dataset from "emails.csv" (path is relative to the working directory).
Df = pd.read_csv("emails.csv")

In [3]:
# Preview the loaded frame (pandas truncates the rendering of large frames).
Df

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,Email 5168,2,2,2,3,0,0,32,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,35,27,11,2,6,5,151,4,3,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,0,0,1,1,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1


## Data Analysis Part

In [4]:
# (number of rows, number of columns) of the loaded frame.
Df.shape

(5172, 3002)

In [5]:
# Column labels of the loaded frame.
Df.columns

Index(['Email No.', 'the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou',
       ...
       'connevey', 'jay', 'valued', 'lay', 'infrastructure', 'military',
       'allowing', 'ff', 'dry', 'Prediction'],
      dtype='object', length=3002)

In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
Df.describe()

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
count,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,...,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0
mean,6.640565,6.188128,5.143852,3.075599,3.12471,2.62703,55.517401,2.466551,2.024362,10.600155,...,0.005027,0.012568,0.010634,0.098028,0.004254,0.006574,0.00406,0.914733,0.006961,0.290023
std,11.745009,9.534576,14.101142,6.04597,4.680522,6.229845,87.574172,4.314444,6.967878,19.281892,...,0.105788,0.199682,0.116693,0.569532,0.096252,0.138908,0.072145,2.780203,0.098086,0.453817
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,1.0,0.0,12.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,3.0,1.0,1.0,2.0,1.0,28.0,1.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8.0,7.0,4.0,3.0,4.0,2.0,62.25,3.0,1.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
max,210.0,132.0,344.0,89.0,47.0,77.0,1898.0,70.0,167.0,223.0,...,4.0,7.0,2.0,12.0,3.0,4.0,3.0,114.0,4.0,1.0


In [7]:
# Count the missing values in each column.
Df.isna().sum()

Email No.     0
the           0
to            0
ect           0
and           0
             ..
military      0
allowing      0
ff            0
dry           0
Prediction    0
Length: 3002, dtype: int64

In [8]:
# Drop every row that contains a missing value. The result is rebound to `Df`
# instead of mutating the frame in place, so the step stays chainable.
Df = Df.dropna()

In [9]:
# Display the frame again after removing rows with missing values.
Df

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,Email 5168,2,2,2,3,0,0,32,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,35,27,11,2,6,5,151,4,3,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,0,0,1,1,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1


In [10]:
# Remove the 'Email No.' identifier column.
# errors='ignore' makes the cell safe to re-run: without it, a second run
# raises KeyError because the column is already gone.
Df = Df.drop(columns=['Email No.'], errors='ignore')

In [11]:
# Display the frame again now that the 'Email No.' column has been removed.
Df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,0,0
5168,35,27,11,2,6,5,151,4,3,23,...,0,0,0,0,0,0,0,1,0,0
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,0,1
5170,2,7,1,0,2,1,28,2,0,8,...,0,0,0,0,0,0,0,1,0,1


## Train Test Split

In [12]:
# Separate the label column from the remaining feature columns.
y = Df['Prediction']
X = Df.drop(columns='Prediction')

In [13]:
# Feature matrix: every column of Df except 'Prediction'.
X

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,0,0
5168,35,27,11,2,6,5,151,4,3,23,...,0,0,0,0,0,0,0,0,1,0
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5170,2,7,1,0,2,1,28,2,0,8,...,0,0,0,0,0,0,0,0,1,0


In [14]:
# Target vector: the 'Prediction' column of Df.
y

0       0
1       0
2       0
3       0
4       0
       ..
5167    0
5168    0
5169    1
5170    1
5171    0
Name: Prediction, Length: 5172, dtype: int64

In [15]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

## Scale data using StandardScaler

In [16]:
# Learn the per-feature mean and standard deviation from the training split only,
# then standardize the training split with those statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_train

array([[-0.13697531, -0.23313587, -0.28733325, ..., -0.05767098,
        -0.34669127, -0.06847003],
       [-0.31120252, -0.02436609, -0.0832229 , ..., -0.05767098,
         0.03087263, -0.06847003],
       [-0.48542972, -0.65067542, -0.28733325, ..., -0.05767098,
        -0.34669127, -0.06847003],
       ...,
       [ 0.47281991,  1.22825259,  0.18892425, ..., -0.05767098,
         0.40843653, -0.06847003],
       [ 0.2985927 , -0.02436609,  6.17616136, ..., -0.05767098,
         0.03087263, -0.06847003],
       [-0.48542972,  0.18440369, -0.21929647, ..., -0.05767098,
        -0.34669127, -0.06847003]])

In [17]:
# Apply the scaler fitted on the training split to the test split (no refit here).
# NOTE(review): X_test is overwritten with its scaled version, so re-running this
# cell on its own scales the data a second time.
X_test = scaler.transform(X_test)
X_test

array([[ 0.64704711,  0.0800188 , -0.21929647, ..., -0.05767098,
        -0.34669127, -0.06847003],
       [-0.04986171, -0.44190564, -0.28733325, ..., -0.05767098,
        -0.34669127, -0.06847003],
       [ 2.91200078, -0.02436609, -0.0832229 , ..., -0.05767098,
         0.03087263, -0.06847003],
       ...,
       [ 0.1243655 , -0.02436609, -0.21929647, ..., -0.05767098,
        -0.34669127, -0.06847003],
       [-0.48542972, -0.44190564, -0.21929647, ..., -0.05767098,
         0.03087263, -0.06847003],
       [-0.48542972, -0.54629053, -0.28733325, ..., -0.05767098,
        -0.34669127, -0.06847003]])

## Train two models: Logistic Regression and Random Forest

In [18]:
# Logistic regression: only random_state is set, every other hyperparameter is
# left at the library default.
lr_model = LogisticRegression(random_state=25)

In [19]:
# Fit the logistic-regression model on the standardized training split.
lr_model.fit(X_train, y_train)

In [20]:
# Predict labels for the standardized test split.
# NOTE(review): the name `pred` is reused further down for the random-forest
# predictions, so evaluation cells depend on which model ran last.
pred = lr_model.predict(X_test)

In [25]:
# Evaluation helpers. `roc_curve` was previously imported twice; one import
# statement now brings in every metric name.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)

In [23]:
# Evaluate the current `pred` against the held-out labels: confusion matrix,
# per-class report, and accuracy as a whole-number percentage.
lr_confusion = confusion_matrix(y_test, pred)
lr_report = classification_report(y_test, pred)
print(lr_confusion)
print(lr_report)

Logistic_regression_Accuracy = round(accuracy_score(y_test, pred) * 100)
print(Logistic_regression_Accuracy)

[[1079   32]
 [  16  425]]
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1111
           1       0.93      0.96      0.95       441

    accuracy                           0.97      1552
   macro avg       0.96      0.97      0.96      1552
weighted avg       0.97      0.97      0.97      1552

97


In [27]:
 clf = RandomForestClassifier(max_depth=2, random_state=0)

In [28]:
# Fit the random forest on the standardized training split, then predict the
# test split. A stray `RandomForestClassifier(...)` statement (a pasted repr that
# built and discarded an unused estimator) was removed.
# `pred` is overwritten here; the evaluation cell that follows reads it.
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [29]:
# Evaluate the current `pred` against the held-out labels: confusion matrix,
# per-class report, and accuracy as a whole-number percentage.
rf_confusion = confusion_matrix(y_test, pred)
rf_report = classification_report(y_test, pred)
print(rf_confusion)
print(rf_report)

Random_Forest_Accuracy = round(accuracy_score(y_test, pred) * 100)
print(Random_Forest_Accuracy)

[[1111    0]
 [ 410   31]]
              precision    recall  f1-score   support

           0       0.73      1.00      0.84      1111
           1       1.00      0.07      0.13       441

    accuracy                           0.74      1552
   macro avg       0.87      0.54      0.49      1552
weighted avg       0.81      0.74      0.64      1552

74
