## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import joblib

from scipy.stats import randint, uniform
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import *

In [2]:
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

## Data Reading

In [3]:
train_df = pd.read_csv('scaled_train.csv')
train_df.head()

Unnamed: 0.1,Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label,proto_target_encoded,service_target_encoded,state_target_encoded
0,0,0.179469,0.4,0.2,-0.131902,0.00726,-0.02522,-0.010417,0.892857,-0.009738,0.253129,0.0,0.0,0.435473,0.163928,0.012007,0.102883,1.0,0.324406,1.150945,1.0,0.0,0.0,0.0,-0.697674,-0.011236,0.0,0.0,-0.4,-1.0,-0.166667,0.0,0.0,-0.181818,0,0,0.0,-0.285714,-0.3,0,0,-0.995621,-0.002417,-0.002039
1,1,0.97045,1.2,3.6,0.233129,37.976407,-0.025185,-1.0,0.884921,-0.009803,18.032918,0.666667,8.5,0.900023,0.302174,0.024441,12.068622,1.0,0.739772,1.608104,1.0,0.0,0.0,0.0,-0.488372,11.932584,0.0,0.0,3.8,0.0,-0.166667,0.0,0.0,-0.090909,0,0,0.0,-0.285714,0.2,0,0,-0.995493,-0.00037,-0.002362
2,2,2.427244,0.6,1.4,-0.050613,11.816697,-0.0257,-1.0,0.884921,-0.00988,2.1362,0.333333,3.0,4.199465,2.012246,6.835484,99.320499,1.0,1.104087,1.54839,1.0,1.708847,2.64131,1.296432,-0.627907,8.764045,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0,0,0.0,-0.142857,0.2,0,0,-0.9923,0.0,0.0
3,3,2.51483,1.0,1.0,0.15184,0.549909,-0.025704,-1.0,0.884921,-0.009867,0.068652,0.333333,1.5,2.766997,1.767374,0.103084,43.410362,1.0,0.577632,0.547346,1.0,0.0,0.0,0.0,-0.488372,0.224719,0.0,0.0,-0.4,0.0,0.0,0.0,0.0,0.0,1,1,0.0,-0.142857,-0.3,0,0,-0.994307,0.129674,-0.001643
4,4,0.670406,0.8,0.4,0.079755,0.094374,-0.025546,0.0,0.884921,-0.009801,0.091221,0.666667,0.5,0.860772,1.481864,0.961223,1.007099,1.0,1.271038,1.033172,1.0,1.960584,3.057719,1.471084,-0.465116,0.011236,0.0,0.0,3.8,0.0,0.0,0.25,0.0,3.363636,0,0,0.0,-0.142857,3.5,0,0,-0.998246,-0.001737,-0.004067


In [4]:
train_df.drop('Unnamed: 0', axis = 1, inplace = True)

In [5]:
train_df.shape

(175341, 43)

### Splitting into Train and Validation Sets 

In [6]:
x = train_df.drop('label', axis = 1)
y = train_df['label']

In [7]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [8]:
x_train.shape, y_train.shape

((140272, 42), (140272,))

In [9]:
x_val.shape, y_val.shape

((35069, 42), (35069,))

## Model Building

## I. Random Forest

In [10]:
rf_model = RandomForestClassifier(n_jobs = -1, class_weight = 'balanced', random_state = 42)
rf_model.fit(x_train, y_train)

In [11]:
train_acc = accuracy_score(y_train, rf_model.predict(x_train))
val_acc = accuracy_score(y_val, rf_model.predict(x_val))

In [12]:
train_acc, val_acc

(0.9995580015968974, 0.9594228520915908)

In [13]:
confusion_matrix(y_val, rf_model.predict(x_val))

array([[10182,   987],
       [  436, 23464]], dtype=int64)

In [14]:
print(classification_report(y_val, rf_model.predict(x_val)))

              precision    recall  f1-score   support

           0       0.96      0.91      0.93     11169
           1       0.96      0.98      0.97     23900

    accuracy                           0.96     35069
   macro avg       0.96      0.95      0.95     35069
weighted avg       0.96      0.96      0.96     35069



In [15]:
train_auc = roc_auc_score(y_train, rf_model.predict(x_train))
val_auc = roc_auc_score(y_val, rf_model.predict(x_val))

train_auc, val_auc

(0.9995628229984729, 0.9466938638813123)

## Model Tuning

### 1. Randomized Search

In [19]:
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [None] + list(range(10, 40, 10)),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

In [20]:
random_search = RandomizedSearchCV(estimator = rf_model, param_distributions = param_dist, n_iter = 100, cv = 3, n_jobs = -1, verbose = 2, random_state = 42)
random_search.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [21]:
print("Best parameters found by Random Search:", random_search.best_params_)

Best parameters found by Random Search: {'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 93}


In [11]:
rf_random_model = RandomForestClassifier(bootstrap = False, max_depth = 30, min_samples_leaf = 2,
                                        min_samples_split = 3, n_estimators = 93, n_jobs = -1, random_state = 43)
rf_random_model.fit(x_train, y_train)

### 2. Grid Search CV

In [12]:
param_grid = {
    'n_estimators': [rf_random_model.n_estimators - 50, rf_random_model.n_estimators, rf_random_model.n_estimators + 50],
    'max_depth': [rf_random_model.max_depth - 10, rf_random_model.max_depth, rf_random_model.max_depth + 10],
    'min_samples_split': [rf_random_model.min_samples_split - 1, rf_random_model.min_samples_split, rf_random_model.min_samples_split + 1],
    'min_samples_leaf': [rf_random_model.min_samples_leaf - 1, rf_random_model.min_samples_leaf, rf_random_model.min_samples_leaf + 1],
    'bootstrap': [rf_random_model.bootstrap]
}

In [13]:
grid_search = GridSearchCV(estimator = rf_model, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [27]:
print(grid_search.best_params_)

{'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 229}


In [16]:
tuned_rf_model = RandomForestClassifier(bootstrap = False, max_depth = 20, min_samples_leaf = 1,
                                       min_samples_split = 2, n_estimators = 143, n_jobs = -1, random_state = 100)
tuned_rf_model.fit(x_train, y_train)

### Evaluation

In [17]:
train_acc_rf = accuracy_score(y_train, tuned_rf_model.predict(x_train))
val_acc_rf = accuracy_score(y_val, tuned_rf_model.predict(x_val))

In [18]:
train_acc_rf, val_acc_rf

(0.9922507699327022, 0.959479882517323)

In [19]:
confusion_matrix(y_val, tuned_rf_model.predict(x_val))

array([[10171,   998],
       [  423, 23477]], dtype=int64)

In [20]:
print(classification_report(y_val, tuned_rf_model.predict(x_val)))

              precision    recall  f1-score   support

           0       0.96      0.91      0.93     11169
           1       0.96      0.98      0.97     23900

    accuracy                           0.96     35069
   macro avg       0.96      0.95      0.95     35069
weighted avg       0.96      0.96      0.96     35069



In [21]:
train_auc_rf = roc_auc_score(y_train, tuned_rf_model.predict(x_train))
val_auc_rf = roc_auc_score(y_val, tuned_rf_model.predict(x_val))

train_auc_rf, val_auc_rf

(0.9907509733310799, 0.9464733959918199)

In [22]:
accuracy = accuracy_score(y_val, tuned_rf_model.predict(x_val))
print('Accuracy', accuracy)
precision = precision_score(y_val, tuned_rf_model.predict(x_val))
print('Precision', precision)
recall = recall_score(y_val, tuned_rf_model.predict(x_val))
print('Recall', recall)
f1 = f1_score(y_val, tuned_rf_model.predict(x_val))
print('F1', f1)

Accuracy 0.959479882517323
Precision 0.959223697650664
Recall 0.9823012552301256
F1 0.970625322997416


### Adjusting the Threshold

In [23]:
y_pred_proba = tuned_rf_model.predict_proba(x_val)[:, 1]
y_pred_proba

array([0.        , 0.999967  , 0.95749636, ..., 0.        , 0.        ,
       0.6911867 ])

In [24]:
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)

In [25]:
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]

print("Optimal Threshold:", optimal_threshold)

Optimal Threshold: 0.6585652010546491


In [26]:
#Applying the Threshold Value
y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int)
y_pred_optimal

array([0, 1, 1, ..., 0, 0, 1])

In [27]:
train_auc = roc_auc_score(y_train, tuned_rf_model.predict(x_train))
val_auc = roc_auc_score(y_val, y_pred_optimal)

train_auc, val_auc

(0.9969569099445267, 0.9564576527005598)

In [28]:
accuracy = accuracy_score(y_val, y_pred_optimal)
print('Accuracy', accuracy)
precision = precision_score(y_val, y_pred_optimal)
print('Precision', precision)
recall = recall_score(y_val, y_pred_optimal)
print('Recall', recall)
f1 = f1_score(y_val, y_pred_optimal)
print('F1', f1)

Accuracy 0.957227180700904
Precision 0.9782237403928267
Recall 0.9585774058577405
F1 0.9683009298393914


### Important Features

In [23]:
feature_importances = tuned_rf_model.feature_importances_
feature_names = x_train.columns

len(feature_importances), len(feature_names)

(42, 42)

In [24]:
importance_df = pd.DataFrame({'features': feature_names, 'importances': feature_importances}).sort_values(by = 'importances', ascending = False)

In [25]:
importance_df.head(10)

Unnamed: 0,features,importances
6,sttl,0.182363
28,ct_state_ttl,0.138047
9,dload,0.073691
7,dttl,0.051539
5,rate,0.038495
24,dmean,0.037084
8,sload,0.031864
3,sbytes,0.02897
41,state_target_encoded,0.02827
22,ackdat,0.028167


## II. XGBoost

In [22]:
xgb_model = XGBClassifier(n_jobs = -1, random_state = 42, eval_metric = 'logloss')
xgb_model.fit(x_train, y_train)

In [23]:
train_acc = accuracy_score(y_train, xgb_model.predict(x_train))
val_acc = accuracy_score(y_val, xgb_model.predict(x_val))

train_acc, val_acc

(0.9702221398425915, 0.9584533348541446)

In [24]:
accuracy = accuracy_score(y_val, xgb_model.predict(x_val))
print('Accuracy', accuracy)
precision = precision_score(y_val, xgb_model.predict(x_val))
print('Precision', precision)
recall = recall_score(y_val, xgb_model.predict(x_val))
print('Recall', recall)
f1 = f1_score(y_val, xgb_model.predict(x_val))
print('F1', f1)

Accuracy 0.9584533348541446
Precision 0.9606716203456628
Recall 0.9791213389121339
F1 0.9698087403385897


In [25]:
train_auc = roc_auc_score(y_train, xgb_model.predict(x_train))
val_auc = roc_auc_score(y_val, xgb_model.predict(x_val))

train_auc, val_auc

(0.9620308607569014, 0.946674108438966)

## Model Tuning

### 1. Randomized Search CV

In [71]:
param_dist = {
    'n_estimators': range(50, 500, 50),
    'max_depth': range(3, 15, 2),
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
}

In [72]:
random_search_xgb = RandomizedSearchCV(estimator = xgb_model, param_distributions = param_dist, 
                                                 n_iter = 100, scoring = 'roc_auc', cv = 3, 
                                                 verbose = 3, random_state = 42, n_jobs = -1)
random_search_xgb.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [74]:
print(random_search_xgb.best_params_)

{'subsample': 1.0, 'n_estimators': 400, 'max_depth': 13, 'learning_rate': 0.1, 'colsample_bytree': 0.6}


In [75]:
xgb_random_model = XGBClassifier(subsample = 1.0, n_estimators = 400, max_depth = 13, learning_rate = 0.1,
                                colsample_bytree = 0.6, random_state = 42, n_jobs = -1)
xgb_random_model.fit(x_train, y_train)

### Evaluation

In [76]:
train_acc = accuracy_score(y_train, xgb_random_model.predict(x_train))
val_acc = accuracy_score(y_val, xgb_random_model.predict(x_val))

train_acc, val_acc

(0.9978185240104939, 0.9618466451852064)

In [77]:
accuracy = accuracy_score(y_val, xgb_random_model.predict(x_val))
print('Accuracy', accuracy)
precision = precision_score(y_val, xgb_random_model.predict(x_val))
print('Precision', precision)
recall = recall_score(y_val, xgb_random_model.predict(x_val))
print('Recall', recall)
f1 = f1_score(y_val, xgb_random_model.predict(x_val))
print('F1', f1)

Accuracy 0.9618466451852064
Precision 0.9639332126994572
Recall 0.9807112970711297
F1 0.9722498755599801


In [78]:
train_auc = roc_auc_score(y_train, xgb_random_model.predict(x_train))
val_auc = roc_auc_score(y_val, xgb_random_model.predict(x_val))

train_auc, val_auc

(np.float64(0.9968119209859936), np.float64(0.9510951954959015))

### 2. Grid Search CV

In [79]:
best_random_params = random_search_xgb.best_params_
best_random_params

{'subsample': 1.0,
 'n_estimators': 400,
 'max_depth': 13,
 'learning_rate': 0.1,
 'colsample_bytree': 0.6}

In [80]:
param_grid = {
    'n_estimators': [best_random_params['n_estimators'] - 50, best_random_params['n_estimators'], best_random_params['n_estimators'] + 50],
    'max_depth': [best_random_params['max_depth'] - 2, best_random_params['max_depth'], best_random_params['max_depth'] + 2],
    'learning_rate': [best_random_params['learning_rate'] * 0.8, best_random_params['learning_rate'], best_random_params['learning_rate'] * 1.2],
    'subsample': [best_random_params['subsample'] - 0.1, best_random_params['subsample'], best_random_params['subsample'] + 0.1],
    'colsample_bytree': [best_random_params['colsample_bytree'] - 0.1, best_random_params['colsample_bytree'], best_random_params['colsample_bytree'] + 0.1]
}

In [81]:
grid_search = GridSearchCV(xgb_model, param_grid = param_grid, scoring = 'roc_auc', cv = 3, verbose = 3, n_jobs = -1)
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


In [82]:
grid_search.best_params_

{'colsample_bytree': 0.6,
 'learning_rate': 0.08000000000000002,
 'max_depth': 13,
 'n_estimators': 350,
 'subsample': 1.0}

In [26]:
xgb_tuned_model = XGBClassifier(colsample_bytree = 0.6, learning_rate = 0.08000000000000002, max_depth = 13,
                               n_estimators = 350, subsample = 1.0, n_jobs = -1, random_state = 42)
xgb_tuned_model.fit(x_train, y_train)

### Evaluation

In [27]:
train_acc_xgb = accuracy_score(y_train, xgb_tuned_model.predict(x_train))
val_acc_xgb = accuracy_score(y_val, xgb_tuned_model.predict(x_val))

train_acc_xgb, val_acc_xgb

(0.9961574654956086, 0.9619892212495367)

In [28]:
accuracy = accuracy_score(y_val, xgb_tuned_model.predict(x_val))
print('Accuracy', accuracy)
precision = precision_score(y_val, xgb_tuned_model.predict(x_val))
print('Precision', precision)
recall = recall_score(y_val, xgb_tuned_model.predict(x_val))
print('Recall', recall)
f1 = f1_score(y_val, xgb_tuned_model.predict(x_val))
print('F1', f1)

Accuracy 0.9619892212495367
Precision 0.9637118316689269
Recall 0.9811715481171548
F1 0.9723633197188647


In [30]:
train_auc_xgb = roc_auc_score(y_train, xgb_tuned_model.predict(x_train))
val_auc_xgb = roc_auc_score(y_val, xgb_tuned_model.predict(x_val))

train_auc_xgb, val_auc_xgb

(0.9944794098462267, 0.9510567204279927)

In [33]:
feature_importances = xgb_tuned_model.feature_importances_
feature_names = x_train.columns

In [34]:
importance_df = pd.DataFrame({'features': feature_names, 'importances': feature_importances}).sort_values(by = 'importances', ascending = False)
importance_df.head(10)

Unnamed: 0,features,importances
6,sttl,0.681105
28,ct_state_ttl,0.128607
7,dttl,0.071121
16,swin,0.036071
38,is_sm_ips_ports,0.009569
31,ct_dst_sport_ltm,0.00734
24,dmean,0.006847
32,ct_dst_src_ltm,0.005051
37,ct_srv_dst,0.004514
3,sbytes,0.004038


In [35]:
imp_cols = feature_importances[:10]
#fs_train = x_train[feature

In [36]:
imp_cols

array([6.1562820e-04, 1.7780698e-03, 3.1447809e-03, 4.0379823e-03,
       2.4885780e-03, 9.3917997e-04, 6.8110514e-01, 7.1120553e-02,
       8.2750735e-04, 1.6274909e-03], dtype=float32)

## III. Naive Bayes

In [23]:
gnb_model = GaussianNB()
gnb_model.fit(x_train, y_train)

In [24]:
train_acc = accuracy_score(y_train, gnb_model.predict(x_train))
val_acc = accuracy_score(y_val, gnb_model.predict(x_val))

train_acc, val_acc

(0.8158221170297707, 0.813025749237218)

In [46]:
accuracy = accuracy_score(y_val, gnb_model.predict(x_val))
print('Accuracy', accuracy)
precision = precision_score(y_val, gnb_model.predict(x_val))
print('Precision', precision)
recall = recall_score(y_val, gnb_model.predict(x_val))
print('Recall', recall)
f1 = f1_score(y_val, gnb_model.predict(x_val))
print('F1', f1)

Accuracy 0.813025749237218
Precision 0.8815170047076423
Recall 0.838326359832636
F1 0.8593793561946428


In [47]:
train_auc = roc_auc_score(y_train, gnb_model.predict(x_train))
val_auc = roc_auc_score(y_val, gnb_model.predict(x_val))

train_auc, val_auc

(0.8027806807394976, 0.7986062813578079)

## IV. Decision Trees

In [31]:
dt_model = DecisionTreeClassifier(random_state = 42)
dt_model.fit(x_train, y_train)

In [32]:
train_acc = accuracy_score(y_train, dt_model.predict(x_train))
val_acc = accuracy_score(y_val, dt_model.predict(x_val))

train_acc, val_acc

(0.9995722596099008, 0.9474179474749779)

### 1. Randomized Search

In [54]:
param_dist = {
    'max_depth': [3, 4, 5, 6, None],  
    'min_samples_split': range(2, 11),  
    'min_samples_leaf': range(1, 5),   
    'max_features': ['auto', 'sqrt', 'log2', None],  
    'criterion': ['gini', 'entropy']  
}

In [56]:
random_search = RandomizedSearchCV(dt_model, param_distributions = param_dist, n_iter = 100, scoring = 'roc_auc', 
                                   cv = 3, verbose = 3, random_state = 42, n_jobs = -1)
random_search.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [57]:
random_search.best_params_

{'min_samples_split': 3,
 'min_samples_leaf': 2,
 'max_features': None,
 'max_depth': 6,
 'criterion': 'entropy'}

In [59]:
dt_random_model = DecisionTreeClassifier(min_samples_split = 3, min_samples_leaf = 2, max_features = None,
                                         max_depth = 6, criterion = 'entropy', random_state = 42)
dt_random_model.fit(x_train, y_train)

### Evaluation

In [60]:
train_acc = accuracy_score(y_train, dt_random_model.predict(x_train))
val_acc = accuracy_score(y_val, dt_random_model.predict(x_val))

train_acc, val_acc

(0.9327306946503935, 0.9299096067752146)

In [61]:
accuracy = accuracy_score(y_val, dt_random_model.predict(x_val))
print('Accuracy', accuracy)
precision = precision_score(y_val, dt_random_model.predict(x_val))
print('Precision', precision)
recall = recall_score(y_val, dt_random_model.predict(x_val))
print('Recall', recall)
f1 = f1_score(y_val, dt_random_model.predict(x_val))
print('F1', f1)

Accuracy 0.9299096067752146
Precision 0.9099495258488834
Recall 0.9956903765690377
F1 0.9508910732837849


In [62]:
train_auc = roc_auc_score(y_train, dt_random_model.predict(x_train))
val_auc = roc_auc_score(y_val, dt_random_model.predict(x_val))

train_auc, val_auc

(0.8971023287283223, 0.8924194563479085)

### 2. Grid Search

In [64]:
best_params_random = random_search.best_params_

In [65]:
param_grid = {
    'max_depth': [best_params_random['max_depth'], best_params_random['max_depth'] + 1, best_params_random['max_depth'] + 2],
    'min_samples_split': np.arange(best_params_random['min_samples_split'] - 1, best_params_random['min_samples_split'] + 2),
    'min_samples_leaf': np.arange(best_params_random['min_samples_leaf'] - 1, best_params_random['min_samples_leaf'] + 2),
    'max_features': [best_params_random['max_features']],
    'criterion': [best_params_random['criterion']]
}

In [66]:
grid_search = GridSearchCV(dt_model, param_grid = param_grid, scoring = 'roc_auc', cv = 3, verbose = 1, n_jobs = -1)
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [67]:
grid_search.best_params_

{'criterion': 'entropy',
 'max_depth': 8,
 'max_features': None,
 'min_samples_leaf': 3,
 'min_samples_split': 2}

In [33]:
dt_tuned_model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 8, max_features = None,
                                        min_samples_leaf = 3, min_samples_split = 2, random_state = 42)
dt_tuned_model.fit(x_train, y_train)

### Evaluation

In [34]:
train_acc_dt = accuracy_score(y_train, dt_tuned_model.predict(x_train))
val_acc_dt = accuracy_score(y_val, dt_tuned_model.predict(x_val))

train_acc_dt, val_acc_dt

(0.9347339454773583, 0.9323619150816961)

In [35]:
accuracy = accuracy_score(y_val, dt_tuned_model.predict(x_val))
print('Accuracy', accuracy)
precision = precision_score(y_val, dt_tuned_model.predict(x_val))
print('Precision', precision)
recall = recall_score(y_val, dt_tuned_model.predict(x_val))
print('Recall', recall)
f1 = f1_score(y_val, dt_tuned_model.predict(x_val))
print('F1', f1)

Accuracy 0.9323619150816961
Precision 0.9656112120425643
Recall 0.9340167364016736
F1 0.9495512356969671


In [36]:
train_auc_dt = roc_auc_score(y_train, dt_tuned_model.predict(x_train))
val_auc_dt = roc_auc_score(y_val, dt_tuned_model.predict(x_val))

train_auc_dt, val_auc_dt

(0.9350647982126595, 0.9314187899037645)

## Final Evaluation Dataframe

In [38]:
models = {
    "Random Forest" : tuned_rf_model,
    "XGBoost" : xgb_tuned_model,
    "Decision Tree" : dt_tuned_model
}

In [40]:
results = []
for model_name, model in models.items():
    
    y_pred = model.predict(x_val)
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    auc = roc_auc_score(y_val, model.predict(x_val))

    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "AUC Score": auc
    })

In [42]:
evaluation_df = pd.DataFrame(results)
evaluation_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,AUC Score
0,Random Forest,0.95948,0.959224,0.982301,0.970625,0.946473
1,XGBoost,0.961989,0.963712,0.981172,0.972363,0.951057
2,Decision Tree,0.932362,0.965611,0.934017,0.949551,0.931419


# Final Model

In [32]:
joblib.dump(xgb_tuned_model, 'pretrained_model.pkl')

['pretrained_model.pkl']