In [1]:
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default size applied to every figure, given as [width, height] in inches.
plt.rcParams['figure.figsize'] = [10, 8]

# Put the parent of the current working directory on the import path
# (presumably where the `custom_scripts` package lives) without adding
# a duplicate entry when the cell is re-run.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from custom_scripts.config import loader
from custom_scripts.evaluate_performance import display_scores
from custom_scripts.prepare_data import prepare_data

from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import KFold

import xgboost as xgb

### Load the data

In [5]:
# Day of the capture being analysed. Defined once here because the cell that
# saves the model also reads `day_of_week` to build its output filename.
day_of_week = 'Tuesday'

# Ask the project loader for the processed, normalized dataset of that day.
# The result is handed straight to pd.read_pickle (presumably a file path).
data_file = loader(day_of_week=day_of_week, data_type='processed', subtype='Normalized')

# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
df = pd.read_pickle(data_file)

## Preparing the Dataset for Classification 

Here we separate the class variable (the "Label" column) from the rest of the dataset, asking `prepare_data` for binary classes with "BENIGN" as the negative class.

In [6]:
# Split the frame into features (X) and target (y). The target is taken from
# the 'Label' column, requested as binary classes with 'BENIGN' as the
# negative class.
X, y = prepare_data(
    data=df,
    class_column='Label',
    classes='binary',
    neg_class='BENIGN',
)

## Training and evaluating XGBoost with cross-validation

We will use 5-fold cross validation to evaluate the performance of the xgboost classifier on our dataset.

Reference: [Using XGBoost with scikit-learn](https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn)

In [10]:
# 5-fold splitter. Rows are shuffled before being divided into folds, and the
# fixed seed makes the fold assignment reproducible across runs.
# (random_state only has an effect when shuffle=True.)
kfold = KFold(n_splits=5, shuffle=True, random_state=55)

# Per-fold results, appended in fold order.
cm = []          # confusion matrices
acc = []         # accuracy scores
auc_score = []   # ROC AUC scores

for train_index, test_index in kfold.split(X):

    # Select rows by integer position.
    # assumes X and y are NumPy arrays (positional indexing) - TODO confirm against prepare_data
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # A fresh classifier is fitted for every fold, so folds do not share state.
    # `xgb_model` is rebound each iteration; after the loop it holds the last fold's model.
    xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
    xgb_model.fit(X_train, y_train)

    y_pred = xgb_model.predict(X_test)

    cm.append(confusion_matrix(y_test, y_pred))
    acc.append(accuracy_score(y_test, y_pred))
    # NOTE(review): the AUC is computed from hard class predictions, which for a
    # binary problem reduces to balanced accuracy. Pass
    # xgb_model.predict_proba(X_test)[:, 1] instead for a threshold-independent AUC.
    auc_score.append(roc_auc_score(y_test, y_pred))

In [15]:
# Show the accuracy of each cross-validation fold, plus the mean and standard deviation.
display_scores(acc)

Scores: [0.9984813641688723, 0.9984596582321504, 0.9983222568537977, 0.9982933302478287, 0.9981559288694759]
Mean: 0.998
Std: 0.000


In [16]:
# Show the ROC AUC of each cross-validation fold (computed from hard class
# predictions), plus the mean and standard deviation.
display_scores(auc_score)

Scores: [0.9987039015025628, 0.9986227694583334, 0.9985249598593898, 0.9984957230306675, 0.9983597266869628]
Mean: 0.999
Std: 0.000


## Save the trained model

Let's serialize the trained model with pickle so it can be loaded later. Note that the model being saved is the one fitted on the last cross-validation fold.

In [13]:
# Relies on `day_of_week` and `xgb_model` being defined by earlier cells.
# `xgb_model` is rebound on every pass of the cross-validation loop, so what is
# persisted here is the classifier from the final fold only.
filename='./models/xgb_model_'+day_of_week+'.pkl'

# Create the target directory if it is missing; open() would otherwise raise
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename,'wb') as file:
    pickle.dump(xgb_model,file)