In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import plot_importance

In [3]:
# # Read the sampled dataset
# df=pd.read_csv('./data/CICIDS2017_sample_km.csv')

In [4]:
# X = df.drop(['Label'],axis=1).values
# y = df.iloc[:, -1].values.reshape(-1,1)
# y=np.ravel(y)

In [5]:
# X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.8, test_size = 0.2, random_state = 0,stratify = y)

# Anomaly-based IDS
Generate the port-scan datasets for unknown attack detection

In [6]:
df=pd.read_csv('./data/CICIDS2017_sample_km.csv')

In [7]:
df.Label.value_counts()

Label
0    18225
3     3042
6     2180
1     1966
5     1255
2       96
4       36
Name: count, dtype: int64

In [8]:
df1 = df[df['Label'] != 5]
df1['Label'][df1['Label'] > 0] = 1
df1.to_csv('./data/CICIDS2017_sample_km_without_portscan.csv',index=0)

In [9]:
df2 = df[df['Label'] == 5]
df2['Label'][df2['Label'] == 5] = 1
df2.to_csv('./data/CICIDS2017_sample_km_portscan.csv',index=0)

# Read the generated datasets for unknown attack detection

In [10]:
df1 = pd.read_csv('./data/CICIDS2017_sample_km_without_portscan.csv')
df2 = pd.read_csv('./data/CICIDS2017_sample_km_portscan.csv')

In [11]:
features = df1.drop(['Label'],axis=1).dtypes[df1.dtypes != 'object'].index
df1[features] = df1[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
df2[features] = df2[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
df1 = df1.fillna(0)
df2 = df2.fillna(0)

In [12]:
df1.Label.value_counts()

Label
0    18225
1     7320
Name: count, dtype: int64

In [13]:
df2.Label.value_counts()

Label
1    1255
Name: count, dtype: int64

In [14]:
df2p=df1[df1['Label']==0]
df2pp=df2p.sample(n=None, frac=1255/18225, replace=False, weights=None, random_state=None, axis=0)
df2=pd.concat([df2, df2pp])

In [15]:
df2.Label.value_counts()

Label
1    1255
0    1255
Name: count, dtype: int64

In [16]:
df = pd.concat([df1, df2])

In [17]:
X = df.drop(['Label'],axis=1) .values
y = df.iloc[:, -1].values.reshape(-1,1)
y=np.ravel(y)
pd.Series(y).value_counts()

0    19480
1     8575
Name: count, dtype: int64

### Feature engineering (IG, FCBF, and KPCA)
Feature selection by information gain (IG)

In [18]:
from sklearn.feature_selection import mutual_info_classif
importances = mutual_info_classif(X, y)

In [19]:
# calculate the sum of importance scores
f_list = sorted(zip(map(lambda x: round(x, 4), importances), features), reverse=True)
Sum = 0
fs = []
for i in range(0, len(f_list)):
    Sum = Sum + f_list[i][0]
    fs.append(f_list[i][1])

In [20]:
# select the important features from top to bottom until the accumulated importance reaches 90%
f_list2 = sorted(zip(map(lambda x: round(x, 4), importances/Sum), features), reverse=True)
Sum2 = 0
fs = []
for i in range(0, len(f_list2)):
    Sum2 = Sum2 + f_list2[i][0]
    fs.append(f_list2[i][1])
    if Sum2>=0.9:
        break   

In [21]:
X_fs = df[fs].values

In [22]:
X_fs.shape

(28055, 50)

In [23]:
X_fs

array([[-0.34612159, -0.27787307, -0.44364535, ..., -0.13353417,
        -0.09211243, -0.05349902],
       [-0.3443274 , -0.27787307, -0.44364535, ..., -0.13353417,
        -0.09211243, -0.05349902],
       [-0.3443274 , -0.27787307, -0.44364535, ..., -0.13353417,
        -0.09211243, -0.05349902],
       ...,
       [-0.36859622, -0.27739742, -0.39838515, ..., -0.13353417,
        -0.09211243, -0.05349902],
       [-0.3470659 , -0.27773453, -0.41921935, ..., -0.13353417,
        -0.09211243,  0.08592008],
       [-0.36859622, -0.27787307, -0.44364535, ..., -0.13353417,
        -0.09211243, -0.04188076]])

### Feature selection by Fast Correlation Based Filter (FCBF)

The module is imported from the GitHub repo: https://github.com/SantiagoEG/FCBF_module

In [24]:
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
fcbf = FCBFK(k = 20)

In [25]:
X_fss = fcbf.fit_transform(X_fs,y)

In [26]:
X_fss.shape

(28055, 20)

In [27]:
X_fss

array([[-0.34612159, -0.53319222, -0.34935843, ..., -0.42229765,
        -0.2803002 , -0.41947688],
       [-0.3443274 , -0.54906516, -0.34935843, ..., -0.42229765,
        -0.2803002 , -0.41947688],
       [-0.3443274 , -0.55544206, -0.34935843, ..., -0.42229765,
        -0.2803002 , -0.41947688],
       ...,
       [-0.36859622, -0.56375976, -0.34935843, ..., -0.42229765,
        -0.2803002 , -0.37580056],
       [-0.3470659 ,  1.46028278, -0.34640024, ..., -0.41817787,
        -0.27953283, -0.39575147],
       [-0.36859622, -0.56369044, -0.34935843, ..., -0.42229753,
        -0.28029976, -0.42271216]])

Kernel principal component analysis (KPCA)

In [28]:
from sklearn.decomposition import KernelPCA

In [29]:
kpca = KernelPCA(n_components = 10, kernel = 'rbf')

In [30]:
kpca.fit(X_fss, y)

In [31]:
X_kpca = kpca.transform(X_fss)

Train-test split after feature selection

In [32]:
X_train = X_kpca[:len(df1)]
y_train = y[:len(df1)]
X_test = X_kpca[len(df1):]
y_test = y[len(df1):]

Solve class-imbalance by SMOTE

In [33]:
pd.Series(y_train).value_counts()

0    18225
1     7320
Name: count, dtype: int64

In [34]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(n_jobs=-1,sampling_strategy={1:18225})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [35]:
pd.Series(y_train).value_counts()

0    18225
1    18225
Name: count, dtype: int64

In [36]:

pd.Series(y_test).value_counts()

1    1255
0    1255
Name: count, dtype: int64

## Apply the cluster labeling (CL) agglomerative method

In [82]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN,MeanShift
from sklearn.cluster import SpectralClustering,AgglomerativeClustering,AffinityPropagation,Birch,MiniBatchKMeans,MeanShift,FeatureAgglomeration 
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.metrics import classification_report
from sklearn import metrics

In [87]:
def CL_agglomerative(X_train, X_test, y_train, y_test,n,b=100):
    a_cluster = AgglomerativeClustering(n_clusters=n)
    result = a_cluster.fit_predict(X_train)
    result2 = a_cluster.fit_predict(X_test)

    count=0
    a=np.zeros(n)
    b=np.zeros(n)
    for v in range(0,n):
        for i in range(0,len(y_train)):
            if result[i]==v:
                if y_train[i]==1:
                    a[v]=a[v]+1
                else:
                    b[v]=b[v]+1
    list1=[]
    list2=[]
    for v in range(0,n):
        if a[v]<=b[v]:
            list1.append(v)
        else: 
            list2.append(v)
    for v in range(0,len(y_test)):
        if result2[v] in list1:
            result2[v]=0
        elif result2[v] in list2:
            result2[v]=1
        else:
            print("-1")
    print(classification_report(y_test, result2))
    cm=confusion_matrix(y_test,result2)
    acc=metrics.accuracy_score(y_test,result2)
    print(str(acc))
    print(cm)

### Hyperparameter optimization of CL-agglomerative

In [88]:
#Hyperparameter optimization by BO-GP
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from sklearn import metrics

space  = [Integer(2, 50, name='n_clusters')]
@use_named_args(space)
def objective(**params):
    km_cluster = MiniBatchKMeans(batch_size=100, **params)
    n=params['n_clusters']
    
    result = km_cluster.fit_predict(X_train)
    result2 = km_cluster.predict(X_test)

    count=0
    a=np.zeros(n)
    b=np.zeros(n)
    for v in range(0,n):
        for i in range(0,len(y_train)):
            if result[i]==v:
                if y_train[i]==1:
                    a[v]=a[v]+1
                else:
                    b[v]=b[v]+1
    list1=[]
    list2=[]
    for v in range(0,n):
        if a[v]<=b[v]:
            list1.append(v)
        else: 
            list2.append(v)
    for v in range(0,len(y_test)):
        if result2[v] in list1:
            result2[v]=0
        elif result2[v] in list2:
            result2[v]=1
        else:
            print("-1")
    cm=metrics.accuracy_score(y_test,result2)
    print(str(n)+" "+str(cm))
    return (1-cm)
from skopt import gp_minimize
import time
t1=time.time()
res_gp = gp_minimize(objective, space, n_calls=20, random_state=0)
t2=time.time()
print(t2-t1)
print("Best score=%.4f" % (1-res_gp.fun))
print("""Best parameters: n_clusters=%d""" % (res_gp.x[0]))

30 0.37888446215139443
43 0.40239043824701193
43 0.3916334661354582
43 0.39800796812749006
32 0.4290836653386454
20 0.42270916334661357
16 0.6880478087649402
5 0.4645418326693227
15 0.4784860557768924
25 0.36573705179282867
16 0.702390438247012
16 0.3537848605577689
13 0.4956175298804781
10 0.9254980079681275
9 0.6721115537848605
10 0.4745019920318725
7 0.4701195219123506
2 0.45976095617529883
10 0.47250996015936253
50 0.4362549800796813
17.848207473754883
Best score=0.9255
Best parameters: n_clusters=10


In [89]:
#Hyperparameter optimization by BO-TPE
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.cluster import MiniBatchKMeans
from sklearn import metrics

def objective(params):
    params = {
        'n_clusters': int(params['n_clusters']), 
    }
    km_cluster = MiniBatchKMeans(batch_size=100, **params)
    n=params['n_clusters']
    
    result = km_cluster.fit_predict(X_train)
    result2 = km_cluster.predict(X_test)

    count=0
    a=np.zeros(n)
    b=np.zeros(n)
    for v in range(0,n):
        for i in range(0,len(y_train)):
            if result[i]==v:
                if y_train[i]==1:
                    a[v]=a[v]+1
                else:
                    b[v]=b[v]+1
    list1=[]
    list2=[]
    for v in range(0,n):
        if a[v]<=b[v]:
            list1.append(v)
        else: 
            list2.append(v)
    for v in range(0,len(y_test)):
        if result2[v] in list1:
            result2[v]=0
        elif result2[v] in list2:
            result2[v]=1
        else:
            print("-1")
    score=metrics.accuracy_score(y_test,result2)
    print(str(params['n_clusters'])+" "+str(score))
    return {'loss':1-score, 'status': STATUS_OK }
space = {
    'n_clusters': hp.quniform('n_clusters', 2, 50, 1),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20)
print("Random Forest: Hyperopt estimated optimum {}".format(best))

44 0.4131474103585657                                 
10 0.44860557768924303                                                          
25 0.39681274900398406                                                          
11 0.7350597609561753                                                          
29 0.40398406374501994                                                          
22 0.3665338645418327                                                           
31 0.4338645418326693                                                           
34 0.38366533864541835                                                          
23 0.6099601593625498                                                           
5 0.4358565737051793                                                            
32 0.6163346613545817                                                            
43 0.399601593625498                                                             
48 0.43745019920318723                               

In [90]:
CL_agglomerative(X_train, X_test, y_train, y_test, 16)

              precision    recall  f1-score   support

           0       0.27      0.19      0.23      1255
           1       0.38      0.49      0.43      1255

    accuracy                           0.34      2510
   macro avg       0.33      0.34      0.33      2510
weighted avg       0.33      0.34      0.33      2510

0.34143426294820717
[[ 242 1013]
 [ 640  615]]


### Apply the CL-agglomerative model with biased classifiers

In [95]:
# needs to work on the entire dataset to generate sufficient training samples for biased classifiers
def Anomaly_IDS(X_train, X_test, y_train, y_test,n,b=100):
    # CL-kmeans
    a_cluster = AgglomerativeClustering(n_clusters=n)
    result = a_cluster.fit_predict(X_train)
    result2 = a_cluster.fit_predict(X_test)

    count=0
    a=np.zeros(n)
    b=np.zeros(n)
    for v in range(0,n):
        for i in range(0,len(y_train)):
            if result[i]==v:
                if y_train[i]==1:
                    a[v]=a[v]+1
                else:
                    b[v]=b[v]+1
    list1=[]
    list2=[]
    for v in range(0,n):
        if a[v]<=b[v]:
            list1.append(v)
        else: 
            list2.append(v)
    for v in range(0,len(y_test)):
        if result2[v] in list1:
            result2[v]=0
        elif result2[v] in list2:
            result2[v]=1
        else:
            print("-1")
    print(classification_report(y_test, result2))
    cm=confusion_matrix(y_test,result2)
    acc=metrics.accuracy_score(y_test,result2)
    print(str(acc))
    print(cm)
    
    #Biased classifier construction
    count=0
    print(len(y))
    a=np.zeros(n)
    b=np.zeros(n)
    FNL=[]
    FPL=[]
    for v in range(0,n):
        al=[]
        bl=[]
        for i in range(0,len(y)):   
            if result[i]==v:        
                if y[i]==1:        #label 1
                    a[v]=a[v]+1
                    al.append(i)
                else:             #label 0
                    b[v]=b[v]+1
                    bl.append(i)
        if a[v]<=b[v]:
            FNL.extend(al)
        else:
            FPL.extend(bl)
        #print(str(v)+"="+str(a[v]/(a[v]+b[v])))
        
    dffp=df.iloc[FPL, :]
    dffn=df.iloc[FNL, :]
    dfva0=df[df['Label']==0]
    dfva1=df[df['Label']==1]
    
    dffpp=dfva1.sample(n=None, frac=len(FPL)/dfva1.shape[0], replace=False, weights=None, random_state=None, axis=0)
    dffnp=dfva0.sample(n=None, frac=len(FNL)/dfva0.shape[0], replace=False, weights=None, random_state=None, axis=0)
    
    dffp_f=pd.concat([dffp, dffpp])
    dffn_f=pd.concat([dffn, dffnp])
    
    Xp = dffp_f.drop(['Label'],axis=1)  
    yp = dffp_f.iloc[:, -1].values.reshape(-1,1)
    yp=np.ravel(yp)

    Xn = dffn_f.drop(['Label'],axis=1)  
    yn = dffn_f.iloc[:, -1].values.reshape(-1,1)
    yn=np.ravel(yn)
    
    rfp = RandomForestClassifier(random_state = 0)
    rfp.fit(Xp,yp)
    rfn = RandomForestClassifier(random_state = 0)
    rfn.fit(Xn,yn)

    dffnn_f=pd.concat([dffn, dffnp])
    
    Xnn = dffn_f.drop(['Label'],axis=1)  
    ynn = dffn_f.iloc[:, -1].values.reshape(-1,1)
    ynn=np.ravel(ynn)

    rfnn = RandomForestClassifier(random_state = 0)
    rfnn.fit(Xnn,ynn)

    X2p = df2.drop(['Label'],axis=1) 
    y2p = df2.iloc[:, -1].values.reshape(-1,1)
    y2p=np.ravel(y2p)

    result2 = a_cluster.fit_predict(X2p)

    count=0
    a=np.zeros(n)
    b=np.zeros(n)
    for v in range(0,n):
        for i in range(0,len(y)):
            if result[i]==v:
                if y[i]==1:
                    a[v]=a[v]+1
                else:
                    b[v]=b[v]+1
    list1=[]
    list2=[]
    l1=[]
    l0=[]
    for v in range(0,n):
        if a[v]<=b[v]:
            list1.append(v)
        else: 
            list2.append(v)
    for v in range(0,len(y2p)):
        if result2[v] in list1:
            result2[v]=0
            l0.append(v)
        elif result2[v] in list2:
            result2[v]=1
            l1.append(v)
        else:
            print("-1")
    print(classification_report(y2p, result2))
    cm=confusion_matrix(y2p,result2)
    print(cm)

In [96]:
Anomaly_IDS(X_train, X_test, y_train, y_test, 16)

              precision    recall  f1-score   support

           0       0.27      0.19      0.23      1255
           1       0.38      0.49      0.43      1255

    accuracy                           0.34      2510
   macro avg       0.33      0.34      0.33      2510
weighted avg       0.33      0.34      0.33      2510

0.34143426294820717
[[ 242 1013]
 [ 640  615]]
28055
              precision    recall  f1-score   support

           0       0.38      0.31      0.34      1255
           1       0.42      0.49      0.45      1255

    accuracy                           0.40      2510
   macro avg       0.40      0.40      0.40      2510
weighted avg       0.40      0.40      0.40      2510

[[393 862]
 [640 615]]
