# Libraries
---

In [1]:
# Data Prep Libraries
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import warnings

# Model Prep Library
from sklearn.model_selection import train_test_split, GridSearchCV

# Model Evaluation Libraries
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report

# Model Library
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# Settings for display
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
warnings.filterwarnings("ignore")

# Read Data
---

In [2]:
data = pd.read_csv("../data/emails.csv")
df = data.copy()

# Model Evaluation Function
---

In [3]:
model = list()
resample = list()
precision = list()
recall = list()
F1score = list()
AUCROC = list()

In [4]:
def model_eval_(clf_model, X_test, y_test, algo=None, sampling=None):
    # Test set prediction
    y_prob=clf_model.predict_proba(X_test)
    y_pred=clf_model.predict(X_test)

    print('Confusion Matrix')
    print('='*60)
    print(confusion_matrix(y_test,y_pred),"\n")
    print('Classification Report')
    print('='*60)
    print(classification_report(y_test,y_pred),"\n")
    print('AUC-ROC')
    print('='*60)
    print(roc_auc_score(y_test, y_prob[:,1]))
          
    model.append(algo)
    precision.append(precision_score(y_test,y_pred))
    recall.append(recall_score(y_test,y_pred))
    F1score.append(f1_score(y_test,y_pred))
    AUCROC.append(roc_auc_score(y_test, y_prob[:,1]))
    resample.append(sampling)

# Data Prep
---

In [5]:
df.drop("Email No.", axis=1, inplace=True)
print(df.shape)
print()
print(df.isna().sum().sum())

(5172, 3001)

0


In [6]:
y = df["Prediction"]
X = df.drop("Prediction", axis=1)

# Create Train, Validation and Test Datasets
---

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, test_size=0.25, random_state=42)

In [8]:
X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape

((3879, 3000), (969, 3000), (324, 3000), (3879,), (969,), (324,))

# Model
---

In [9]:
cv = StratifiedKFold(n_splits=5, random_state=100, shuffle=False)

In [15]:
param_rf_grid = 

In [16]:
rf_cv = GridSearchCV(estimator=RandomForestClassifier(), 
                     param_grid=param_rf_grid, 
                     cv=cv, 
                     n_jobs=2,
                     verbose=2, 
                     refit="f1")
nn_cv.fit(X_valid, y_valid)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  5.2min
[Parallel(n_jobs=2)]: Done  80 out of  80 | elapsed:  9.9min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=100, shuffle=False),
             estimator=MLPClassifier(), n_jobs=2,
             param_grid={'activation': ['relu'], 'alpha': [0.1, 0.01],
                         'hidden_layer_sizes': [(10, 10), (5, 3)],
                         'learning_rate_init': [0.1, 0.01],
                         'max_iter': [200, 400]},
             refit='f1', verbose=2)

In [17]:
model_eval_(rf_cv, X_valid, y_valid, 'Random Forest', 'Validation Evaluation')

Confusion Matrix
[[689   0]
 [  0 280]] 

Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       689
           1       1.00      1.00      1.00       280

    accuracy                           1.00       969
   macro avg       1.00      1.00      1.00       969
weighted avg       1.00      1.00      1.00       969
 

AUC-ROC
1.0


### Validation results are perfect. We hit the target.

In [18]:
model_eval_(nn_cv, X_test, y_test, 'Random Forest', 'Test Evaluation')

Confusion Matrix
[[220   4]
 [  7  93]] 

Classification Report
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       224
           1       0.96      0.93      0.94       100

    accuracy                           0.97       324
   macro avg       0.96      0.96      0.96       324
weighted avg       0.97      0.97      0.97       324
 

AUC-ROC
0.9886607142857142


### Test results are very good. We have very high accuracy, recall and precision values.