In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

In [13]:
# Data Preparation
df = pd.read_csv('emails.csv')
df.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [14]:
df.isnull().sum()

Email No.     0
the           0
to            0
ect           0
and           0
             ..
military      0
allowing      0
ff            0
dry           0
Prediction    0
Length: 3002, dtype: int64

In [15]:

# Drop 'Email No.' column as it's not a feature for prediction
df = df.drop(columns=['Email No.'])

# Split features and target variable
X = df.drop(columns=['Prediction'])
y = df['Prediction']


In [16]:
# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [17]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:

# Model Selection
models = {
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Decision Tree': DecisionTreeClassifier(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}


In [19]:
# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1

In [20]:
# Model Training and Evaluation
best_model = None
best_accuracy = 0

for model_name, model in models.items():
    model.fit(X_train, y_train)
    accuracy, precision, recall, f1 = evaluate_model(model, X_test, y_test)

    print(f'{model_name} Performance:')
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1-score: {f1:.2f}')

    # Cross-validation
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f'Cross-Validation Accuracy: {scores.mean():.2f}')
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_model_name = model_name

Logistic Regression Performance:
Accuracy: 0.97
Precision: 0.91
Recall: 0.98
F1-score: 0.94
Cross-Validation Accuracy: 0.95
Decision Tree Performance:
Accuracy: 0.92
Precision: 0.88
Recall: 0.86
F1-score: 0.87
Cross-Validation Accuracy: 0.92
Support Vector Machine Performance:
Accuracy: 0.95
Precision: 1.00
Recall: 0.82
F1-score: 0.90
Cross-Validation Accuracy: 0.92
Random Forest Performance:
Accuracy: 0.98
Precision: 0.96
Recall: 0.96
F1-score: 0.96
Cross-Validation Accuracy: 0.96
Gradient Boosting Performance:
Accuracy: 0.97
Precision: 0.94
Recall: 0.96
F1-score: 0.95
Cross-Validation Accuracy: 0.95


In [21]:

# Display the best model
print(f'The best model is {best_model.__class__.__name__} with an accuracy of {best_accuracy:.2f}')


The best model is RandomForestClassifier with an accuracy of 0.98


In [22]:
# Using the best model for predictions
y_pred = best_model.predict(X_test)

# Displaying some example predictions # 1=True ,0=False
predictions = pd.DataFrame({
    'Email': X_test.index,
    'Actual': y_test,
    'Predicted': y_pred
})

print(predictions.head(10))


      Email  Actual  Predicted
1566   1566       0          0
1988   1988       0          0
1235   1235       1          1
3276   3276       0          0
3438   3438       0          0
1471   1471       0          0
1129   1129       1          1
3750   3750       0          0
3049   3049       0          0
530     530       0          0
