# 1. Imports

In [3]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import mean
from numpy import std
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, precision_recall_curve, auc
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# 2. Importing the Ranking Files

In [4]:
# Feature-ranking tables, one per attack type. All five files are read the same
# way: the first CSV column becomes the DataFrame index.
portmap_ranking, syn_ranking, udp_ranking, netbios_ranking, ldap_ranking = (
    pd.read_csv(ranking_csv, index_col=0)
    for ranking_csv in (
        "../final_ranking/CSE_CIC_IDS2018/ddos_portmap_2018.csv",
        "../final_ranking/CIC_DDoS2019/ddos_syn_2019.csv",
        "../final_ranking/CIC_DDoS2019/ddos_udp_2019.csv",
        "../final_ranking/CIC_DDoS2019/ddos_netbios_2019.csv",
        "../final_ranking/CIC_DDoS2019/ddos_ldap_2019.csv",
    )
)

# 3. Importing the Training and Test Datasets

In [5]:
# Name of the label column read from the train/test frames by the model code.
target_feature = 'Label'

# Training splits, one per attack type; the first CSV column becomes the index.
portmap_train, ldap_train, netbios_train, syn_train, udp_train = (
    pd.read_csv(train_csv, index_col=0)
    for train_csv in (
        "../data/train_test/CSE_CIC_IDS2018/ddos_portmap_2018_train.csv",
        "../data/train_test/CIC_DDoS2019/ddos_ldap_2019_train.csv",
        "../data/train_test/CIC_DDoS2019/ddos_netbios_2019_train.csv",
        "../data/train_test/CIC_DDoS2019/ddos_syn_2019_train.csv",
        "../data/train_test/CIC_DDoS2019/ddos_udp_2019_train.csv",
    )
)

In [6]:
# Held-out test splits, one per attack type; the first CSV column becomes the index.
portmap_test, ldap_test, netbios_test, syn_test, udp_test = (
    pd.read_csv(test_csv, index_col=0)
    for test_csv in (
        "../data/train_test/CSE_CIC_IDS2018/ddos_portmap_2018_test.csv",
        "../data/train_test/CIC_DDoS2019/ddos_ldap_2019_test.csv",
        "../data/train_test/CIC_DDoS2019/ddos_netbios_2019_test.csv",
        "../data/train_test/CIC_DDoS2019/ddos_syn_2019_test.csv",
        "../data/train_test/CIC_DDoS2019/ddos_udp_2019_test.csv",
    )
)

# 4. Training and Evaluating the Model

In [11]:
def trainTestSaveModel(train_df, test_df, master_ranking, threshold, saveLocation, cv_n, verbose):
    """Train, cross-validate, test and save one XGBoost model per feature-selection method.

    For the 'All Features' baseline and for every method column of
    ``master_ranking``, the function selects the matching feature subset,
    fits an ``XGBClassifier`` on the training data, reports a K-fold
    cross-validated weighted F1 on the training data, evaluates F1 and AUC-PR
    on the test data, and pickles the fitted model.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Training / test data. Both must contain the selected feature columns
        and the label column named by the module-level ``target_feature``.
    master_ranking : pandas.DataFrame
        Must contain a 'feature' column; every other column is treated as a
        selection method. Baseline methods select rows whose value equals 1;
        the 'STEF-Rank' column selects rows whose value is >= ``threshold``.
    threshold : float
        Minimum 'STEF-Rank' value for a feature to be selected.
    saveLocation : str or None
        Directory in which the fitted models are pickled, one file per method
        (``<method>.pkl``, spaces replaced by underscores). The directory is
        created if missing. A falsy value disables saving.
    cv_n : int
        Number of folds for the K-fold cross-validation.
    verbose : bool
        When true, print the method banner, dataset sizes and save paths.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated method with the number of features, CV F1
        mean/std, test F1, test AUC-PR and the saved model path (None when
        saving is disabled).
    """
    cv = KFold(n_splits=cv_n, random_state=1, shuffle=True)
    f1_scorer = make_scorer(f1_score, average='weighted')

    # Identify method columns by name rather than by position, so the result
    # does not depend on 'feature' being the first column of the ranking.
    methods = ['All Features'] + [col for col in master_ranking.columns if col != 'feature']

    # Labels do not depend on the selected features: extract them once.
    y_train = train_df[target_feature].values
    y_test = test_df[target_feature].values

    results = []

    for method in methods:
        if verbose: print("___________________________________________________________________________________ "+method)

        # _____________________________________________________________1. CREATE X BASED ON THE FEATURE SELECTED DURING FEATURE SELECTION

        # BASELINES
        if method == 'All Features':
            selected_features = master_ranking['feature'].tolist()
        elif method != "STEF-Rank":
            selected_features = master_ranking[master_ranking[method] == 1]['feature'].tolist()
        # STEF-RANK
        else:
            selected_features = master_ranking[master_ranking[method] >= threshold]['feature'].tolist()

        # A method (or a threshold that is too high) may select nothing; a model
        # cannot be fitted on zero columns, so report it and move on.
        if not selected_features:
            print("Skipping " + method + ": no features selected")
            continue

        X_train = train_df[selected_features]
        X_test = test_df[selected_features]

        if verbose: print("Training Dataset size: rows =", X_train.shape[0], ", columns =", X_train.shape[1], "+ 'Label'")

        # _____________________________________________________________2. TRAIN THE MODEL AND CROSS-VALIDATE (K=cv_n)
        model = XGBClassifier(random_state=0)
        model.fit(X_train, y_train)
        # The CV scores come from separate per-fold fits on the training data;
        # `model` above is the one used for testing and saving.
        f1s = cross_val_score(model, X_train, y_train, scoring=f1_scorer, cv=cv, n_jobs=-1)

        print('CV F1-Score Mean (std): %.3f (%.3f)' % (mean(f1s), std(f1s)))

        if verbose: print("Testing Dataset size: rows =", X_test.shape[0], ", columns =", X_test.shape[1], "+ 'Label'")

        # _____________________________________________________________3. EVALUATE THE MODEL
        y_preds = model.predict(X_test)
        # NOTE(review): the test F1 uses f1_score's default averaging while the
        # CV scorer uses average='weighted'; kept as-is so reported numbers
        # stay comparable with earlier runs.
        test_f1 = f1_score(y_test, y_preds)

        # Column 1 of predict_proba is used as the score for the PR curve.
        y_pred_prob = model.predict_proba(X_test)[:, 1]
        precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
        auc_pr = auc(recall, precision)

        print("Test F1-Scrore "+str(test_f1))
        print("Test AUC-PR "+str(auc_pr))

        # _____________________________________________________________4. SAVE THE MODEL
        model_path = None
        if saveLocation:
            os.makedirs(saveLocation, exist_ok=True)
            model_path = os.path.join(saveLocation, method.replace(' ', '_') + '.pkl')
            with open(model_path, 'wb') as model_file:
                pickle.dump(model, model_file)
            if verbose: print("Model saved to " + model_path)

        results.append({
            'method': method,
            'n_features': len(selected_features),
            'cv_f1_mean': mean(f1s),
            'cv_f1_std': std(f1s),
            'test_f1': test_f1,
            'test_auc_pr': auc_pr,
            'model_path': model_path,
        })

    return pd.DataFrame(results)

In [None]:
# Portmap (CSE-CIC-IDS2018): STEF-Rank threshold 0.50, 2-fold CV.
trainTestSaveModel(
    train_df=portmap_train,
    test_df=portmap_test,
    master_ranking=portmap_ranking,
    threshold=0.50,
    saveLocation='../model/CSE_CIC_IDS2018/portmap',
    cv_n=2,
    verbose=True,
)

['all', 'BackwardElimination', 'MutualInformation', 'RFE', 'SelectKBest', 'VarianceThreshold', 'VIF', 'STEF-Rank']
___________________________________________________________________________________ all
Training Dataset size: rows = 157997 , columns = 78 + 'Label'
CV F1-Score Mean (std): 1.000 (0.000)
Testing Dataset size: rows = 67714 , columns = 78 + 'Label'
Test F1-Scrore 0.9999607930263863
Test AUC-PR 0.9999999808838231
___________________________________________________________________________________ BackwardElimination
Training Dataset size: rows = 157997 , columns = 50 + 'Label'
CV F1-Score Mean (std): 1.000 (0.000)
Testing Dataset size: rows = 67714 , columns = 50 + 'Label'
Test F1-Scrore 0.9999738616759893
Test AUC-PR 0.9999999890736062
___________________________________________________________________________________ MutualInformation
Training Dataset size: rows = 157997 , columns = 65 + 'Label'
CV F1-Score Mean (std): 1.000 (0.000)
Testing Dataset size: rows = 67714 , colu

In [None]:
# UDP (CIC-DDoS2019): STEF-Rank threshold 0.50, 2-fold CV.
trainTestSaveModel(
    train_df=udp_train,
    test_df=udp_test,
    master_ranking=udp_ranking,
    threshold=0.50,
    saveLocation='../model/CIC_DDoS2019/udp',
    cv_n=2,
    verbose=True,
)