In [2]:
import pandas as pd

df = pd.read_csv('emails.csv')
df.head()

# Check for missing values
print("Missing values in each column:\n", df.isnull().sum())
df = df.dropna()  # Drop rows with any missing value (if any exist)

Missing values in each column:
 Email No.     0
the           0
to            0
ect           0
and           0
             ..
military      0
allowing      0
ff            0
dry           0
Prediction    0
Length: 3002, dtype: int64


In [3]:
# Drop non-feature column (Email No.)
df = df.drop(['Email No.'], axis=1)

# Features and labels
X = df.drop('Prediction', axis=1)
y = df['Prediction']

In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [5]:
# using logistic regression classification algorithm
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.9671497584541063
Precision: 0.9119496855345912
Recall: 0.9797297297297297
F1 Score: 0.9446254071661238


In [7]:
# using Decision Tree classification algorithm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.9285024154589372
Precision: 0.87248322147651
Recall: 0.8783783783783784
F1 Score: 0.8754208754208754


In [8]:
# using SVC classification algorithm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model = SVC()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.9468599033816425
Precision: 0.9958847736625515
Recall: 0.8175675675675675
F1 Score: 0.8979591836734694


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()

# Grid search for best parameters
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid = GridSearchCV(rf, param_grid, cv=3, scoring='f1')
grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)

# Evaluate best model
best_model = grid.best_estimator_
y_pred_rf = best_model.predict(X_test)

print("F1 Score (RF):", f1_score(y_test, y_pred_rf))

Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
F1 Score (RF): 0.9613445378151261
