## Importing Libraries

In [152]:
import pandas as pd
import numpy as np
import joblib

from scipy.stats import randint, uniform
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import *

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Data Reading

In [3]:
train_df = pd.read_csv('final_train.csv')
train_df.head()

Unnamed: 0.1,Unnamed: 0,dpkts,sttl,smean,ct_srv_src,proto_target_encoded,label
0,0,0.2,-0.010417,-0.697674,-0.4,-0.995621,0
1,1,3.6,-1.0,-0.488372,3.8,-0.995493,0
2,2,1.4,-1.0,-0.627907,0.2,-0.9923,0
3,3,1.0,-1.0,-0.488372,-0.4,-0.994307,0
4,4,0.4,0.0,-0.465116,3.8,-0.998246,0


In [4]:
train_df.drop('Unnamed: 0', axis = 1, inplace = True)

In [5]:
train_df.shape

(175341, 6)

### Splitting into Train and Validation Sets 

In [6]:
x = train_df.drop('label', axis = 1)
y = train_df['label']

In [7]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [8]:
x_train.shape, y_train.shape

((140272, 5), (140272,))

In [9]:
x_val.shape, y_val.shape

((35069, 5), (35069,))

## Model Building

## II. Random Forest

In [10]:
rf_model = RandomForestClassifier(n_jobs = -1, class_weight = 'balanced', random_state = 42)
rf_model.fit(x_train, y_train)

In [11]:
train_acc = accuracy_score(y_train, rf_model.predict(x_train))
val_acc = accuracy_score(y_val, rf_model.predict(x_val))

In [12]:
train_acc, val_acc

(0.970564332154671, 0.9418004505403633)

In [13]:
confusion_matrix(y_val, rf_model.predict(x_val))

array([[10197,   972],
       [ 1069, 22831]], dtype=int64)

In [14]:
print(classification_report(y_val, rf_model.predict(x_val)))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91     11169
           1       0.96      0.96      0.96     23900

    accuracy                           0.94     35069
   macro avg       0.93      0.93      0.93     35069
weighted avg       0.94      0.94      0.94     35069



In [121]:
train_proba = rf_model.predict_proba(x_train)[:, 1]
val_proba = rf_model.predict_proba(x_val)[:, 1]
train_auc = roc_auc_score(y_train, train_proba)
val_auc = roc_auc_score(y_val, val_proba)

train_auc, val_auc

(0.9974413825658193, 0.9855591724854098)

## Model Tuning

### 1. Randomized Search

In [21]:
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': list(range(10, 40, 10)),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

In [22]:
random_search = RandomizedSearchCV(estimator = rf_model, param_distributions = param_dist, n_iter = 100, cv = 3, n_jobs = -1, verbose = 2, random_state = 42)
random_search.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [23]:
print("Best parameters found by Random Search:", random_search.best_params_)

Best parameters found by Random Search: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 57}


In [24]:
rf_random_model = RandomForestClassifier(bootstrap = True, max_depth = 20, min_samples_leaf = 3,
                                        min_samples_split = 6, n_estimators = 57, n_jobs = -1, random_state = 43)
rf_random_model.fit(x_train, y_train)

### 2. Grid Search CV

In [25]:
param_grid = {
    'n_estimators': [rf_random_model.n_estimators - 50, rf_random_model.n_estimators, rf_random_model.n_estimators + 50],
    'max_depth': [rf_random_model.max_depth - 10, rf_random_model.max_depth, rf_random_model.max_depth + 10],
    'min_samples_split': [rf_random_model.min_samples_split - 1, rf_random_model.min_samples_split, rf_random_model.min_samples_split + 1],
    'min_samples_leaf': [rf_random_model.min_samples_leaf - 1, rf_random_model.min_samples_leaf, rf_random_model.min_samples_leaf + 1],
    'bootstrap': [rf_random_model.bootstrap]
}

In [26]:
grid_search = GridSearchCV(estimator = rf_model, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [27]:
print(grid_search.best_params_)

{'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 57}


In [128]:
classes = np.array([0, 1])
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

print(class_weights_dict)

{0: 1.5644531685663938, 1: 0.734862375708553}


In [130]:
tuned_rf_model = RandomForestClassifier(bootstrap = True, max_depth = 20, min_samples_leaf = 3, 
                                       class_weight={0: 1.5644531685663938, 1: 0.734862375708553},
                                       min_samples_split = 5, n_estimators = 57, n_jobs = -1, random_state = 100)
tuned_rf_model.fit(x_train, y_train)

### Evaluation

In [131]:
train_acc_rf = accuracy_score(y_train, tuned_rf_model.predict(x_train))
val_acc_rf = accuracy_score(y_val, tuned_rf_model.predict(x_val))

In [132]:
train_acc_rf, val_acc_rf

(0.9574113151591195, 0.9440531523567823)

In [133]:
confusion_matrix(y_val, tuned_rf_model.predict(x_val))

array([[10385,   784],
       [ 1178, 22722]], dtype=int64)

In [134]:
print(classification_report(y_val, tuned_rf_model.predict(x_val)))

              precision    recall  f1-score   support

           0       0.90      0.93      0.91     11169
           1       0.97      0.95      0.96     23900

    accuracy                           0.94     35069
   macro avg       0.93      0.94      0.94     35069
weighted avg       0.94      0.94      0.94     35069



In [136]:
accuracy = accuracy_score(y_val, tuned_rf_model.predict(x_val))
print('Accuracy', accuracy)
precision = precision_score(y_val, tuned_rf_model.predict(x_val))
print('Precision', precision)
recall = recall_score(y_val, tuned_rf_model.predict(x_val))
print('Recall', recall)
f1 = f1_score(y_val, tuned_rf_model.predict(x_val))
print('F1', f1)

Accuracy 0.9440531523567823
Precision 0.9666468135795117
Recall 0.9507112970711297
F1 0.958612833818504


In [137]:
train_proba = tuned_rf_model.predict_proba(x_train)[:, 1]
val_proba = tuned_rf_model.predict_proba(x_val)[:, 1]
train_auc = roc_auc_score(y_train, train_proba)
val_auc = roc_auc_score(y_val, val_proba)

train_auc, val_auc

(0.9949347726795832, 0.9898538374483168)

## II. XGBoost

In [41]:
xgb_model = XGBClassifier(n_jobs = -1, random_state = 42, eval_metric = 'logloss')
xgb_model.fit(x_train, y_train)

In [42]:
train_acc = accuracy_score(y_train, xgb_model.predict(x_train))
val_acc = accuracy_score(y_val, xgb_model.predict(x_val))

train_acc, val_acc

(0.951280369567697, 0.9476175539650403)

In [43]:
accuracy = accuracy_score(y_val, xgb_model.predict(x_val))
print('Accuracy', accuracy)
precision = precision_score(y_val, xgb_model.predict(x_val))
print('Precision', precision)
recall = recall_score(y_val, xgb_model.predict(x_val))
print('Recall', recall)
f1 = f1_score(y_val, xgb_model.predict(x_val))
print('F1', f1)

Accuracy 0.9476175539650403
Precision 0.9466013521719768
Recall 0.9783263598326359
F1 0.9622024238184399


In [119]:
train_proba = xgb_model.predict_proba(x_train)[:, 1]
val_proba = xgb_model.predict_proba(x_val)[:, 1]
train_auc = roc_auc_score(y_train, train_proba)
val_auc = roc_auc_score(y_val, val_proba)

train_auc, val_auc

(0.9917648799648356, 0.9900617500396157)

## Model Tuning

### 1. Randomized Search CV

In [45]:
param_dist = {
    'n_estimators': range(50, 500, 50),
    'max_depth': range(3, 15, 2),
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
}

In [46]:
random_search_xgb = RandomizedSearchCV(estimator = xgb_model, param_distributions = param_dist, 
                                                 n_iter = 100, scoring = 'roc_auc', cv = 3, 
                                                 verbose = 3, random_state = 42, n_jobs = -1)
random_search_xgb.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [47]:
print(random_search_xgb.best_params_)

{'subsample': 0.8, 'n_estimators': 400, 'max_depth': 7, 'learning_rate': 0.05, 'colsample_bytree': 0.8}


In [48]:
xgb_random_model = XGBClassifier(subsample = 0.8, n_estimators = 400, max_depth = 7, learning_rate = 0.05,
                                colsample_bytree = 0.8, random_state = 42, n_jobs = -1)
xgb_random_model.fit(x_train, y_train)

### 2. Grid Search CV

In [49]:
best_random_params = random_search_xgb.best_params_
best_random_params

{'subsample': 0.8,
 'n_estimators': 400,
 'max_depth': 7,
 'learning_rate': 0.05,
 'colsample_bytree': 0.8}

In [50]:
param_grid = {
    'n_estimators': [best_random_params['n_estimators'] - 50, best_random_params['n_estimators'], best_random_params['n_estimators'] + 50],
    'max_depth': [best_random_params['max_depth'] - 2, best_random_params['max_depth'], best_random_params['max_depth'] + 2],
    'learning_rate': [best_random_params['learning_rate'] * 0.8, best_random_params['learning_rate'], best_random_params['learning_rate'] * 1.2],
    'subsample': [best_random_params['subsample'] - 0.1, best_random_params['subsample'], best_random_params['subsample'] + 0.1],
    'colsample_bytree': [best_random_params['colsample_bytree'] - 0.1, best_random_params['colsample_bytree'], best_random_params['colsample_bytree'] + 0.1]
}

In [51]:
grid_search = GridSearchCV(xgb_model, param_grid = param_grid, scoring = 'roc_auc', cv = 3, verbose = 3, n_jobs = -1)
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


In [52]:
grid_search.best_params_

{'colsample_bytree': 0.8,
 'learning_rate': 0.06,
 'max_depth': 7,
 'n_estimators': 400,
 'subsample': 0.9}

In [138]:
scale_pos_weight = class_weights_dict[1] / class_weights_dict[0]

In [140]:
xgb_tuned_model = XGBClassifier(colsample_bytree = 0.8, learning_rate = 0.06, max_depth = 7, scale_pos_weight = scale_pos_weight,
                               n_estimators = 400, subsample = 0.9, n_jobs = -1, random_state = 42)
xgb_tuned_model.fit(x_train, y_train)

### Evaluation

In [141]:
train_acc_xgb = accuracy_score(y_train, xgb_tuned_model.predict(x_train))
val_acc_xgb = accuracy_score(y_val, xgb_tuned_model.predict(x_val))

train_acc_xgb, val_acc_xgb

(0.9448143606706969, 0.9426273917134791)

In [142]:
accuracy = accuracy_score(y_val, xgb_tuned_model.predict(x_val))
print('Accuracy', accuracy)
precision = precision_score(y_val, xgb_tuned_model.predict(x_val))
print('Precision', precision)
recall = recall_score(y_val, xgb_tuned_model.predict(x_val))
print('Recall', recall)
f1 = f1_score(y_val, xgb_tuned_model.predict(x_val))
print('F1', f1)

Accuracy 0.9426273917134791
Precision 0.9698608964451314
Recall 0.9451882845188284
F1 0.9573656551957959


In [143]:
train_proba = xgb_tuned_model.predict_proba(x_train)[:, 1]
val_proba = xgb_tuned_model.predict_proba(x_val)[:, 1]
train_auc = roc_auc_score(y_train, train_proba)
val_auc = roc_auc_score(y_val, val_proba)

train_auc, val_auc

(0.9919786449385056, 0.9901639418878688)

## III. Cat Boost

In [63]:
cat_model = CatBoostClassifier(verbose=0, random_state=42)
cat_model.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x157546dfe80>

In [64]:
train_acc = accuracy_score(y_train, cat_model.predict(x_train))
val_acc = accuracy_score(y_val, cat_model.predict(x_val))

train_acc, val_acc

(0.9502181475989506, 0.9467620975790584)

In [65]:
accuracy = accuracy_score(y_val, cat_model.predict(x_val))
print('Accuracy', accuracy)
precision = precision_score(y_val, cat_model.predict(x_val))
print('Precision', precision)
recall = recall_score(y_val, cat_model.predict(x_val))
print('Recall', recall)
f1 = f1_score(y_val, cat_model.predict(x_val))
print('F1', f1)

Accuracy 0.9467620975790584
Precision 0.9458136052770021
Recall 0.977907949790795
F1 0.9615930550698402


In [117]:
train_proba = cat_model.predict_proba(x_train)[:, 1]
val_proba = cat_model.predict_proba(x_val)[:, 1]
train_auc = roc_auc_score(y_train, train_proba)
val_auc = roc_auc_score(y_val, val_proba)

train_auc, val_auc

(0.9914006500901074, 0.9900019798523334)

## Model Tuning

### 1. Random Search

In [67]:
param_dist = {
    'iterations': range(50, 500, 50),
    'depth': range(3, 15, 2),
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'border_count': [32, 64, 128, 254],
    'bagging_temperature': [0, 0.5, 1, 2, 3]
}

In [69]:
random_search_cat = RandomizedSearchCV(estimator = cat_model, param_distributions = param_dist, 
                                       n_iter = 100, scoring = 'roc_auc', cv = 3, verbose = 3, 
                                       random_state = 42, n_jobs = -1)
random_search_cat.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [75]:
best_random_params = random_search_cat.best_params_
best_random_params

{'learning_rate': 0.1,
 'l2_leaf_reg': 7,
 'iterations': 300,
 'depth': 11,
 'border_count': 254,
 'bagging_temperature': 2}

In [73]:
cat_random_model = CatBoostClassifier(
    learning_rate = 0.01, l2_leaf_reg = 7, iterations = 300, depth = 11, border_count = 254, bagging_temperature = 2,
    random_state = 42, verbose = 0
)
cat_random_model.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x157546dfbe0>

### 2. Gird Search 

In [76]:
param_grid = {
    'iterations': [best_random_params['iterations'] - 50, best_random_params['iterations'], best_random_params['iterations'] + 50],
    'depth': [best_random_params['depth'] - 2, best_random_params['depth'], best_random_params['depth'] + 2],
    'learning_rate': [best_random_params['learning_rate'] * 0.8, best_random_params['learning_rate'], best_random_params['learning_rate'] * 1.2],
    'l2_leaf_reg': [best_random_params['l2_leaf_reg'] - 2, best_random_params['l2_leaf_reg'], best_random_params['l2_leaf_reg'] + 2],
    'bagging_temperature': [best_random_params['bagging_temperature'] - 1, best_random_params['bagging_temperature'], best_random_params['bagging_temperature'] + 1]
}

In [77]:
grid_search_cat = GridSearchCV(estimator=cat_model, param_grid=param_grid, scoring='roc_auc', cv=3, verbose=3, n_jobs=-1)
grid_search_cat.fit(x_train, y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


In [80]:
best_grid_params = grid_search_cat.best_params_
best_grid_params

{'bagging_temperature': 1,
 'depth': 9,
 'iterations': 350,
 'l2_leaf_reg': 5,
 'learning_rate': 0.1}

In [144]:
cat_tuned_model = CatBoostClassifier(
    bagging_temperature = 1, depth = 9, iterations = 350, l2_leaf_reg = 5, learning_rate = 0.1, 
    class_weights = {0: 1.5644531685663938, 1: 0.734862375708553}, random_state = 42, verbose = 0
)
cat_tuned_model.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x157790c9100>

### Evaluation

In [145]:
train_acc_xgb = accuracy_score(y_train, cat_tuned_model.predict(x_train))
val_acc_xgb = accuracy_score(y_val, cat_tuned_model.predict(x_val))

train_acc_xgb, val_acc_xgb

(0.9468532565301699, 0.9421996635204882)

In [146]:
accuracy = accuracy_score(y_val, cat_tuned_model.predict(x_val))
print('Accuracy', accuracy)
precision = precision_score(y_val, cat_tuned_model.predict(x_val))
print('Precision', precision)
recall = recall_score(y_val, cat_tuned_model.predict(x_val))
print('Recall', recall)
f1 = f1_score(y_val, cat_tuned_model.predict(x_val))
print('F1', f1)

Accuracy 0.9421996635204882
Precision 0.9706907682375726
Recall 0.9436820083682008
F1 0.9569958629468548


In [147]:
train_proba = cat_tuned_model.predict_proba(x_train)[:, 1]
val_proba = cat_tuned_model.predict_proba(x_val)[:, 1]
train_auc = roc_auc_score(y_train, train_proba)
val_auc = roc_auc_score(y_val, val_proba)

train_auc, val_auc

(0.9925203570518045, 0.990392831173852)

## IV. Voting Classifier

In [154]:
voting_clf = VotingClassifier(
    estimators=[('rf', tuned_rf_model), ('xgb', xgb_tuned_model), ('cat', cat_tuned_model)],
    voting='hard', 
    n_jobs=-1
)
voting_clf.fit(x_train, y_train)

In [161]:
accuracy = accuracy_score(y_train, voting_clf.predict(x_train))
print('Accuracy', accuracy)
precision = precision_score(y_train, voting_clf.predict(x_train))
print('Precision', precision)
recall = recall_score(y_train, voting_clf.predict(x_train))
print('Recall', recall)
f1 = f1_score(y_train, voting_clf.predict(x_train))
print('F1', f1)

Accuracy 0.9485998631230752
Precision 0.976445303640664
Recall 0.9473077608155824
F1 0.9616558707467798


In [156]:
accuracy = accuracy_score(y_val, voting_clf.predict(x_val))
print('Accuracy', accuracy)
precision = precision_score(y_val, voting_clf.predict(x_val))
print('Precision', precision)
recall = recall_score(y_val, voting_clf.predict(x_val))
print('Recall', recall)
f1 = f1_score(y_val, voting_clf.predict(x_val))
print('F1', f1)

Accuracy 0.9433973024608628
Precision 0.970258787176516
Recall 0.9459414225941423
F1 0.9579458062329188


## Final Evaluation Dataframe

In [148]:
models = {
    "Random Forest" : tuned_rf_model,
    "XGBoost" : xgb_tuned_model,
    "Cat Boost" : cat_tuned_model,
    "Voting Classifier" : voting_clf
}

In [158]:
results = []
for model_name, model in models.items():
    
    y_pred = model.predict(x_val)
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    val_proba = model.predict_proba(x_val)[:, 1]
    #auc = roc_auc_score(y_val, val_proba)

    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
        #"AUC Score": auc
    })

In [159]:
evaluation_df = pd.DataFrame(results)
evaluation_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.944053,0.966647,0.950711,0.958613
1,XGBoost,0.942627,0.969861,0.945188,0.957366
2,Cat Boost,0.9422,0.970691,0.943682,0.956996
3,Voting Classifier,0.947931,0.947531,0.977741,0.962399


# Final Model

In [160]:
joblib.dump(voting_clf, 'pretrained_voting2_model.pkl')

['pretrained_voting2_model.pkl']