In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
data = pd.read_csv("emails.csv")
data

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,Email 5168,2,2,2,3,0,0,32,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,35,27,11,2,6,5,151,4,3,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,0,0,1,1,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1


In [3]:
X = data.drop(columns=['Email No.', 'Prediction'])
y = data['Prediction']

# Random Forest Classifier

In [4]:
rf = RandomForestClassifier()

In [5]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

In [6]:
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')


In [7]:
grid_search.fit(X, y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 10, 20, 30],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200]},
             scoring='accuracy')

In [8]:
best_params = grid_search.best_params_


In [9]:
print("Best hyperparameters: ", best_params)

Best hyperparameters:  {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}


In [10]:
best_model_grid = grid_search.best_estimator_
y_pred_grid = best_model_grid.predict(X)

In [11]:
print("Grid Search - Classification Report:")
print(classification_report(y, y_pred_grid))

Grid Search - Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3672
           1       1.00      1.00      1.00      1500

    accuracy                           1.00      5172
   macro avg       1.00      1.00      1.00      5172
weighted avg       1.00      1.00      1.00      5172



In [12]:
print("Grid Search - Accuracy:", accuracy_score(y, y_pred_grid))

Grid Search - Accuracy: 1.0


In [13]:
print("Grid Search - AUC-ROC:", roc_auc_score(y, best_model_grid.predict_proba(X)[:, 1]))

Grid Search - AUC-ROC: 1.0


# Random Search

In [14]:
param_dist = {
    'n_estimators': [50, 100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 15, 20]
}



In [15]:
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', random_state=42)

In [16]:
random_search.fit(X, y)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=100,
                   param_distributions={'max_depth': [None, 10, 20, 30, 40, 50],
                                        'min_samples_split': [2, 5, 10, 15, 20],
                                        'n_estimators': [50, 100, 200, 300,
                                                         400]},
                   random_state=42, scoring='accuracy')

In [17]:
best_params_random = random_search.best_params_

In [18]:
print("Best hyperparameters from random search: ", best_params_random)

Best hyperparameters from random search:  {'n_estimators': 300, 'min_samples_split': 2, 'max_depth': 30}


In [19]:
best_model_random = random_search.best_estimator_
y_pred_random = best_model_random.predict(X)

In [21]:
print("Random Search - Classification Report:")
print(classification_report(y, y_pred_random))

Random Search - Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3672
           1       1.00      1.00      1.00      1500

    accuracy                           1.00      5172
   macro avg       1.00      1.00      1.00      5172
weighted avg       1.00      1.00      1.00      5172



In [22]:
print("Random Search - Accuracy:", accuracy_score(y, y_pred_random))

Random Search - Accuracy: 1.0


In [23]:
print("Random Search - AUC-ROC:", roc_auc_score(y, best_model_random.predict_proba(X)[:, 1]))

Random Search - AUC-ROC: 1.0
