In [1]:
import pandas as pd
import glob
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, BatchNormalization, Dropout
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from imblearn import under_sampling, over_sampling, combine
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from collections import Counter

In [2]:
#reading data

path = '/Users/ahmetokanarik/Desktop/MScThesis/Dataset/UNSQ-NB15'
all_files = glob.glob(path + "/*.csv")
li = []

for filename in all_files:
    df = pd.read_csv(filename,header=None,sep=';',low_memory=False)
    li.append(df)


data = pd.concat(li, axis=0, ignore_index=True)
data.columns = ["srcip","sport","dstip","dsport","proto","state","dur","sbytes","dbytes","sttl","dttl","sloss","dloss","service","Sload","Dload","Spkts","Dpkts","swin","dwin","stcpb","dtcpb","smeansz","dmeansz","trans_depth","res_bdy_len","Sjit","Djit","Stime","Ltime","Sintpkt","Dintpkt","tcprtt","synack","ackdat","is_sm_ips_ports","ct_state_ttl","ct_flw_http_mthd","is_ftp_login","ct_ftp_cmd","ct_srv_src","ct_srv_dst","ct_dst_ltm","ct_src_ ltm","ct_src_dport_ltm","ct_dst_sport_ltm","ct_dst_src_ltm","attack_cat","Label"]


In [3]:
#one-hot encoding
dummies = pd.get_dummies(data[['service','proto','state']])
data.drop(['proto','service','state'],axis=1,inplace=True)
data = pd.concat([data, dummies], axis=1)
data = data[["dtcpb","stcpb","service_-","Dload","dmeansz","service_dns","smeansz","Sload","trans_depth","sttl",
            "service_ftp-data","ct_ftp_cmd","attack_cat"]]

data['attack_cat'] = data['attack_cat'].fillna('Normal')

In [4]:
replace_dict = {np.nan: 0, ' ': 0}
for cols in ['ct_ftp_cmd']:
    data[cols] = data[cols].replace(replace_dict)
    
replace_dict = {np.nan: 0, '0': 0}
for cols in ['ct_ftp_cmd']:
    data[cols] = data[cols].replace(replace_dict)
        

In [5]:
x = data.drop('attack_cat',axis=1).values.astype('float32')
y = data.attack_cat.values

In [6]:
#Target Variable Label
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [7]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
x = scaler.fit_transform(x)

In [8]:
x.shape

(2540047, 12)

In [9]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=3)

for train_index, test_index in skf.split(x, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

"""
from sklearn.model_selection import RepeatedStratifiedKFold

rskf = RepeatedStratifiedKFold(n_splits=6, n_repeats=6, random_state=36851234)
for train_index, test_index in rskf.split(x, y):
     print("TRAIN:", train_index, "TEST:", test_index)
     X_train, X_test = x[train_index], x[test_index]
     y_train, y_test = y[train_index], y[test_index]"""

TRAIN: [ 354206  354207  354208 ... 2540044 2540045 2540046] TEST: [      0       1       2 ... 1142311 1142327 1142344]
TRAIN: [      0       1       2 ... 2540044 2540045 2540046] TEST: [ 354206  354207  354208 ... 1711562 1711563 1711564]
TRAIN: [      0       1       2 ... 1711562 1711563 1711564] TEST: [1385884 1385885 1385886 ... 2540044 2540045 2540046]


'\nfrom sklearn.model_selection import RepeatedStratifiedKFold\n\nrskf = RepeatedStratifiedKFold(n_splits=6, n_repeats=6, random_state=36851234)\nfor train_index, test_index in rskf.split(x, y):\n     print("TRAIN:", train_index, "TEST:", test_index)\n     X_train, X_test = x[train_index], x[test_index]\n     y_train, y_test = y[train_index], y[test_index]'

In [10]:
print(sorted(Counter(y_train).items()))

[(0, 1785), (1, 1553), (2, 10902), (3, 29684), (4, 16164), (5, 143654), (6, 1479176), (7, 9324), (8, 1007), (9, 116)]


In [11]:
X_train.shape

(1693365, 12)

In [12]:
a = 177803 #y-traine göre belirlenmeli.
smo = SMOTE(sampling_strategy={0:a,1:a,2:a,3:a,4:a,5:a,7:a,8:a,9:a},random_state=42) 
X_train, y_train = smo.fit_resample(X_train, y_train)   
print(sorted(Counter(y_train).items()))

[(0, 177803), (1, 177803), (2, 177803), (3, 177803), (4, 177803), (5, 177803), (6, 1479176), (7, 177803), (8, 177803), (9, 177803)]


In [13]:
from imblearn.under_sampling import EditedNearestNeighbours

st = [6]
# define the undersampling method
undersample = EditedNearestNeighbours(n_neighbors=3, sampling_strategy=st)
X_train, y_train = undersample.fit_resample(X_train, y_train)


In [14]:
print(sorted(Counter(y_train).items()))

[(0, 177803), (1, 177803), (2, 177803), (3, 177803), (4, 177803), (5, 177803), (6, 1458279), (7, 177803), (8, 177803), (9, 177803)]


In [15]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
import xgboost as xgb
from sklearn.metrics import accuracy_score
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pylab as plt
import numpy as np

dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)

In [16]:
params = {
    'n_estimators':2000,
    'subsample':0.6,
    'colsample_bytree':0.7,
    'objective': 'multi:softmax',  # error evaluation for multiclass training
    'num_class': 10,
    'eta':0.05,
    'max_depth':3,
    'nthread':4,
    'scale_pos_weight':1,
    'min_child_weight':5,
    'learning_rate':0.1,
    'gamma':0,
    'seed':27
}
bst = xgb.train(params, dtrain)


Parameters: { "n_estimators", "scale_pos_weight" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [17]:
pred = bst.predict(dtest)

In [18]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

print("Classification report:",classification_report(y_test,pred))
print("Accuracy score: ", accuracy_score(y_test,pred))


Classification report:               precision    recall  f1-score   support

           0       0.07      0.14      0.09       892
           1       0.05      0.78      0.09       776
           2       0.24      0.00      0.00      5451
           3       0.73      0.39      0.51     14841
           4       0.41      0.51      0.46      8082
           5       1.00      0.97      0.98     71827
           6       1.00      0.99      0.99    739588
           7       0.72      0.78      0.75      4663
           8       0.06      0.67      0.11       504
           9       0.02      0.79      0.04        58

    accuracy                           0.96    846682
   macro avg       0.43      0.60      0.40    846682
weighted avg       0.98      0.96      0.97    846682

Accuracy score:  0.9626435899192377


In [19]:
f1_score(y_test, pred, average="weighted") * 100

96.8916208382921