In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
data = pd.read_csv("emails.csv")

data.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [36]:
data.describe()

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
count,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,...,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0
mean,6.640565,6.188128,5.143852,3.075599,3.12471,2.62703,55.517401,2.466551,2.024362,10.600155,...,0.005027,0.012568,0.010634,0.098028,0.004254,0.006574,0.00406,0.914733,0.006961,0.290023
std,11.745009,9.534576,14.101142,6.04597,4.680522,6.229845,87.574172,4.314444,6.967878,19.281892,...,0.105788,0.199682,0.116693,0.569532,0.096252,0.138908,0.072145,2.780203,0.098086,0.453817
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,1.0,0.0,12.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,3.0,1.0,1.0,2.0,1.0,28.0,1.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8.0,7.0,4.0,3.0,4.0,2.0,62.25,3.0,1.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
max,210.0,132.0,344.0,89.0,47.0,77.0,1898.0,70.0,167.0,223.0,...,4.0,7.0,2.0,12.0,3.0,4.0,3.0,114.0,4.0,1.0


In [26]:
print(data.shape)

(5172, 3002)


In [27]:
data_drop_col=data.drop('Email No.', axis=1)

x=data.drop(columns=['Email No.', 'Prediction'])
y=data['Prediction']
x_numeric=x.select_dtypes(include=['float64','int64'])
scale=StandardScaler()
x_scale=scale.fit_transform(x_numeric)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=44)

In [28]:
lr = LogisticRegression(max_iter=1000)
svm= SVC(kernel='linear', random_state=44)
dt= DecisionTreeClassifier()
rf=RandomForestClassifier()
gb=GradientBoostingClassifier()

In [29]:
diff_models= [lr, svm, dt, rf, gb]
for model in diff_models:
    model.fit(x_train, y_train)
    Accuracy= model.score(x_test, y_test)
    print(f"{model} Accuracy: {Accuracy}")

LogisticRegression(max_iter=1000) Accuracy: 0.971967133881102
SVC(kernel='linear', random_state=44) Accuracy: 0.9487675205413243
DecisionTreeClassifier() Accuracy: 0.9173513774770421
RandomForestClassifier() Accuracy: 0.9700338327694539
GradientBoostingClassifier() Accuracy: 0.9647172547124214


In [30]:
def model_evaluation(model, x_test, y_test):
    y_pred = model.predict(x_test)
    Accuracy =accuracy_score(y_test, y_pred)
    Precision = precision_score(y_test, y_pred)
    Recall= recall_score(y_test, y_pred)
    f1_value= f1_score(y_test, y_pred)
    print(f"Accuracy: {Accuracy}")
    print(f"Precision: {Precision}")
    print(f"Recall: {Recall}")
    print(f"f1 Score: {f1_value}")

In [31]:
models_evaluation = [lr,svm, dt, rf, gb]
for model in models_evaluation:
    print(f"{model} Evaluation: ")
    model_evaluation(model, x_test,y_test)

LogisticRegression(max_iter=1000) Evaluation: 
Accuracy: 0.971967133881102
Precision: 0.956882255389718
Recall: 0.9474548440065681
f1 Score: 0.9521452145214522
SVC(kernel='linear', random_state=44) Evaluation: 
Accuracy: 0.9487675205413243
Precision: 0.914332784184514
Recall: 0.9113300492610837
f1 Score: 0.912828947368421
DecisionTreeClassifier() Evaluation: 
Accuracy: 0.9173513774770421
Precision: 0.8465189873417721
Recall: 0.8784893267651889
f1 Score: 0.8622078968573731
RandomForestClassifier() Evaluation: 
Accuracy: 0.9700338327694539
Precision: 0.9627749576988156
Recall: 0.9343185550082101
f1 Score: 0.9483333333333334
GradientBoostingClassifier() Evaluation: 
Accuracy: 0.9647172547124214
Precision: 0.958904109589041
Recall: 0.9195402298850575
f1 Score: 0.9388097233864208


In [32]:
#Cross-Validation Score
for model in diff_models:
    cv_score = cross_val_score(model, x_scale, y, cv=5)
    print(f"{model}: Cross-Validation Score: {cv_score}")
    print(f"{model}: Mean of Cross-Validation: {cv_score.mean()}")

LogisticRegression(max_iter=1000): Cross-Validation Score: [0.93816425 0.96038647 0.96131528 0.96324952 0.92843327]
LogisticRegression(max_iter=1000): Mean of Cross-Validation: 0.9503097580803409
SVC(kernel='linear', random_state=44): Cross-Validation Score: [0.9294686  0.94782609 0.94294004 0.93713733 0.91489362]
SVC(kernel='linear', random_state=44): Mean of Cross-Validation: 0.9344531344901373
DecisionTreeClassifier(): Cross-Validation Score: [0.92270531 0.92077295 0.92359768 0.93713733 0.88007737]
DecisionTreeClassifier(): Mean of Cross-Validation: 0.9168581279959632
RandomForestClassifier(): Cross-Validation Score: [0.96231884 0.96811594 0.95261122 0.97001934 0.94294004]
RandomForestClassifier(): Mean of Cross-Validation: 0.9592010764443696
GradientBoostingClassifier(): Cross-Validation Score: [0.95362319 0.95748792 0.95551257 0.96711799 0.94003868]
GradientBoostingClassifier(): Mean of Cross-Validation: 0.954756071351816
