# Intrusion Detection Evaluation Dataset (CIC-IDS2017)
By AliK604 

Intrusion Detection Systems (IDSs) and Intrusion Prevention Systems (IPSs) are the most important defense tools against the sophisticated and ever-growing network attacks. Due to the lack of reliable test and validation datasets, anomaly-based intrusion detection approaches are suffering from consistent and accurate performance evolutions.

In [1]:
# %config IPCompleter.greedy=True
import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib as matplot
import matplotlib.pyplot as plt
# %matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings, os 
# warnings.filterwarnings("ignore")

# from keras import Sequential
# from keras.models import Model, load_model
# from keras.layers import *
# from keras.callbacks import ModelCheckpoint
# from keras import regularizers

from sklearn.metrics import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA, TruncatedSVD, PCA
from sklearn.svm import LinearSVC

import xgboost, lightgbm
from mlxtend.classifier import EnsembleVoteClassifier 



In [2]:
ls = [] 
for filename in os.listdir(r'./'):
  if '.csv' in  filename:
    print(filename)
    df = pd.read_csv(filename)
    ls.append(df)
    print(f'Shape: {df.shape}. Attack Type {df[" Label"].unique()}')

Friday-WorkingHours-Morning.pcap_ISCX.csv
Shape: (191033, 79). Attack Type ['BENIGN' 'Bot']
Monday-WorkingHours.pcap_ISCX.csv
Shape: (529918, 79). Attack Type ['BENIGN']
Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Shape: (286467, 79). Attack Type ['BENIGN' 'PortScan']
Tuesday-WorkingHours.pcap_ISCX.csv
Shape: (445909, 79). Attack Type ['BENIGN' 'FTP-Patator' 'SSH-Patator']
Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Shape: (225745, 79). Attack Type ['BENIGN' 'DDoS']
Wednesday-workingHours.pcap_ISCX.csv
Shape: (692703, 79). Attack Type ['BENIGN' 'DoS slowloris' 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye'
 'Heartbleed']
Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Shape: (288602, 79). Attack Type ['BENIGN' 'Infiltration']
Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Shape: (170366, 79). Attack Type ['BENIGN' 'Web Attack � Brute Force' 'Web Attack � XSS'
 'Web Attack � Sql Injection']


In [3]:
for df in ls:
  cols = df.columns.to_list()
  for i in range(len(cols)):
    cols[i] = cols[i].strip()
  df.columns = cols

In [4]:
df = pd.concat(ls)
df.head(3)
assert df.shape[1] == 79

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3268,112740690,32,16,6448,1152,403,0,201.5,204.724205,72,72,72.0,0.0,67.411331,0.425756,2398738.0,5798697.94,16400000,3,113000000,3636796.0,6848760.823,16400000,3,113000000,7516023.2,8323384.915,16400000,3,1,0,0,0,1024,512,0.283837,0.141919,0,403,163.326531,178.931713,32016.55782,0,1,0,0,1,0,0,0,0,166.729167,201.5,72.0,1024,0,0,0,0,0,0,32,6448,16,1152,377,2079,15,32,359.4286,11.99802,380,343,16100000.0,498804.8,16400000,15400000,BENIGN
1,389,112740560,32,16,6448,5056,403,0,201.5,204.724205,316,316,316.0,0.0,102.039585,0.425756,2398735.0,5798709.67,16400000,2,113000000,3636792.0,6848776.836,16400000,2,113000000,7516016.133,8323376.147,16400000,4,1,0,0,0,1024,512,0.283838,0.141919,0,403,243.0,174.716914,30526.0,0,1,0,0,1,0,0,0,0,248.0625,201.5,316.0,1024,0,0,0,0,0,0,32,6448,16,5056,955,2079,15,32,320.2857,15.74499,330,285,16100000.0,498793.7,16400000,15400000,BENIGN
2,0,113757377,545,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,4.790898,209112.8,1395543.434,20800000,0,114000000,209112.8,1395543.434,20800000,0,0,0.0,0.0,0,0,0,0,0,0,0,0,4.790898,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,545,0,0,0,-1,-1,0,0,9361829.0,7324646.0,18900000,19,12200000.0,6935824.0,20800000,5504997,BENIGN


In [5]:
# mix of ints and floats. Label is a object (words)
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    # df.dtypes


In [6]:
from collections import Counter

Counter(df["Label"])

Counter({'BENIGN': 2273097,
         'Bot': 1966,
         'DDoS': 128027,
         'DoS GoldenEye': 10293,
         'DoS Hulk': 231073,
         'DoS Slowhttptest': 5499,
         'DoS slowloris': 5796,
         'FTP-Patator': 7938,
         'Heartbleed': 11,
         'Infiltration': 36,
         'PortScan': 158930,
         'SSH-Patator': 5897,
         'Web Attack � Brute Force': 1507,
         'Web Attack � Sql Injection': 21,
         'Web Attack � XSS': 652})

In [7]:
print(f'df.shape {df.shape} before sampling out most of benign data')
df = pd.concat([df[df['Label'] != 'BENIGN'], df[df['Label'] == 'BENIGN'].sample(frac=.1, random_state=42)]) # 20% of total is Malicious; Use only a sample of the BENIGN data
print(f'df.shape {df.shape} after sampling out most of benign data')

df.shape (2830743, 79) before sampling out most of benign data
df.shape (784956, 79) after sampling out most of benign data


In [8]:
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])
df.head(3)
le.classes_

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
24072,8080,60202640,9,9,322,256,322,0,35.777778,107.333333,256,0,28.444444,85.333333,9.600908,0.29899,3541332.0,4901981.0,10200000,47,51200000,6396442.0,5268490.0,10200000,234,60200000,7518953.625,4645137.0,10300000,637,0,0,0,0,296,296,0.149495,0.149495,0,322,30.421053,91.783753,8424.25731,0,0,0,1,0,0,0,0,1,32.111111,35.777778,28.444444,296,0,0,0,0,0,0,9,322,9,256,29200,110,1,32,63678.2,22252.53596,103175,50911,10200000.0,34941.27201,10200000,10100000,1
24677,8080,57891,1,1,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,34.547684,57891.0,0.0,57891,57891,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,32,32,17.273842,17.273842,0,0,0.0,0.0,0.0,0,0,0,0,1,1,0,0,1,0.0,0.0,0.0,32,0,0,0,0,0,0,1,0,1,0,237,110,0,32,0.0,0.0,0,0,0.0,0.0,0,0,1
56376,8080,134812,4,3,206,134,194,0,51.5,95.042096,128,0,44.666667,72.231111,2522.03068,51.924161,22468.67,53230.91,131123,123,134812,44937.33,76126.82,132841,949,132783,66391.5,91544.17,131123,1660,0,0,0,0,92,72,29.670949,22.253212,0,194,42.5,75.288018,5668.285714,0,0,0,1,0,0,0,0,0,48.571429,51.5,44.666667,92,0,0,0,0,0,0,4,206,3,134,8192,237,3,20,0.0,0.0,0,0,0.0,0.0,0,0,1


array(['BENIGN', 'Bot', 'DDoS', 'DoS GoldenEye', 'DoS Hulk',
       'DoS Slowhttptest', 'DoS slowloris', 'FTP-Patator', 'Heartbleed',
       'Infiltration', 'PortScan', 'SSH-Patator',
       'Web Attack � Brute Force', 'Web Attack � Sql Injection',
       'Web Attack � XSS'], dtype=object)

In [9]:
lowSTD = list(df.std().to_frame().nsmallest(20, columns=0).index)
df[lowSTD].head(3)

Unnamed: 0,Bwd PSH Flags,Bwd URG Flags,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Fwd URG Flags,CWE Flag Count,RST Flag Count,ECE Flag Count,Fwd PSH Flags,SYN Flag Count,URG Flag Count,FIN Flag Count,ACK Flag Count,PSH Flag Count,Down/Up Ratio,Label
24072,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
24677,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1
56376,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


In [10]:
lowCORR = list(df.corr().abs().sort_values('Label')['Label'].nsmallest(20).index) # .where(lambda x: x < 0.005).dropna()
df[lowCORR].head(3)

Unnamed: 0,Bwd Header Length,Fwd Header Length,Fwd Header Length.1,min_seg_size_forward,Destination Port,Subflow Bwd Bytes,Total Length of Bwd Packets,Subflow Bwd Packets,Total Backward Packets,act_data_pkt_fwd,Subflow Fwd Packets,Total Fwd Packets,CWE Flag Count,Fwd URG Flags,FIN Flag Count,ECE Flag Count,RST Flag Count,Fwd Packets/s,Bwd IAT Std,Flow Packets/s
24072,296,296,296,32,8080,256,256,9,9,1,9,9,0,0,0,0,0,0.149495,4645137.0,0.29899
24677,32,32,32,32,8080,0,0,1,1,0,1,1,0,0,0,0,0,17.273842,0.0,34.547684
56376,72,92,92,20,8080,134,134,3,3,3,4,4,0,0,0,0,0,29.670949,91544.17,51.924161


In [11]:
print(f'Intersection: {set(lowSTD).intersection(set(lowCORR))}')
print(f'Union:        {len(set(lowSTD).union(set(lowCORR)))}')

Intersection: {'ECE Flag Count', 'FIN Flag Count', 'RST Flag Count', 'CWE Flag Count', 'Fwd URG Flags'}
Union:        35


In [12]:
import gc 
gc.collect()

11

In [13]:
df.dropna(inplace=True)
indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
df = df[indices_to_keep]


for i in df.columns:
    df = df[df[i] != "Infinity"]
    df = df[df[i] != np.nan]
    df = df[df[i] != np.inf]
    df = df[df[i] != -np.inf]
    df = df[df[i] != ",,"]
    df = df[df[i] != ", ,"]
    
print(np.any(np.isnan(df)))
print(np.any(np.isfinite(df)))

X_train, X_test, y_train, y_test = train_test_split(df.drop(['Label'],axis=1), df['Label'], test_size=.20, random_state=42)
X_train.head(2)
y_train.head(2)

False
True


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
51164,443,280071,9,8,3242,991,2917,0,360.222222,961.661294,853,0,123.875,296.37883,15114.02466,60.698894,17504.44,34521.52,87684,3,280071,35008.88,46450.26,98199,4,182676,26096.57143,43773.01175,93132,3,0,0,0,0,200,164,32.134709,28.564186,0,2917,235.166667,699.09027,488727.2,0,0,0,1,0,0,0,0,0,249.0,360.222222,123.875,200,0,0,0,0,0,0,9,3242,8,991,29200,65057,4,20,0.0,0.0,0,0,0.0,0.0,0,0
148374,80,85573048,5,5,402,11595,390,0,80.4,173.09766,7240,0,2319.0,3331.638186,140.196011,0.116859,9508116.0,28500000.0,85400000,4,85400000,21400000.0,42700000.0,85400000,4,150444,37611.0,66024.75868,136158,86,0,0,0,0,144,168,0.05843,0.05843,0,7240,1091.181818,2415.425628,5834281.0,1,0,0,0,0,0,0,0,1,1200.3,80.4,2319.0,144,0,0,0,0,0,0,5,402,5,11595,0,235,2,20,3028.0,0.0,3028,3028,85400000.0,0.0,85400000,85400000


51164     0
148374    4
Name: Label, dtype: int64

In [14]:
def benchmark(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
  GBM = lightgbm.LGBMClassifier(objective='binary', n_estimators= 50)
  clfs = [GBM]
  for clf in clfs:
      _ = clf.fit(X_train, y_train)
      pred = clf.score(X_test, y_test)
      name = str(type(clf)).split(".")[-1][:-2]
      print("Acc: %0.5f for the %s" % (pred, name))
  return clfs

In [15]:
print('Baseline with all features')
clfs = benchmark()

Baseline with all features
Acc: 0.98591 for the LGBMClassifier


In [16]:
print('Solely with features identified as useless')

GBM = clfs[0]
remove = X_train.columns.to_numpy()[GBM.feature_importances_ == 0] 
clfs = benchmark(X_train[remove], X_test[remove])
print(remove)

Solely with features identified as useless
Acc: 0.95991 for the LGBMClassifier
['Bwd PSH Flags' 'Bwd URG Flags' 'SYN Flag Count' 'RST Flag Count'
 'CWE Flag Count' 'ECE Flag Count' 'Avg Fwd Segment Size'
 'Avg Bwd Segment Size' 'Fwd Header Length.1' 'Fwd Avg Bytes/Bulk'
 'Fwd Avg Packets/Bulk' 'Fwd Avg Bulk Rate' 'Bwd Avg Bytes/Bulk'
 'Bwd Avg Packets/Bulk' 'Bwd Avg Bulk Rate' 'Subflow Fwd Packets'
 'Subflow Fwd Bytes' 'Subflow Bwd Packets' 'Subflow Bwd Bytes']


In [17]:
print('Solely with features identified as useful')
clfs = benchmark(X_train.drop(remove,1), X_test.drop(remove,1))

Solely with features identified as useful
Acc: 0.96422 for the LGBMClassifier


In [18]:
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

print('PCA 5')
pca = PCA(5)
_ = benchmark(pca.fit_transform(X_train), pca.transform(X_test))

PCA 5
Acc: 0.92331 for the LGBMClassifier


In [19]:
print('PCA 15')
pca = PCA(15)
_ = benchmark(pca.fit_transform(X_train), pca.transform(X_test))

PCA 15
Acc: 0.95175 for the LGBMClassifier


In [20]:
print('PCA 25')
pca = PCA(25)
_ = benchmark(pca.fit_transform(X_train), pca.transform(X_test))

PCA 25
Acc: 0.96008 for the LGBMClassifier


In [21]:
tmp = np.argsort(GBM.feature_importances_)
tmp = tmp[::-1] # large to small 
top = tmp[:20] # index of the top N
rest = tmp[20:]
print(GBM.feature_importances_[top]) # check 

[1715 1540 1300 1250 1109  940  767  721  667  608  550  530  505  449
  406  403  387  362  345  338]


In [22]:
print('Top 20 features (per `LGBMClassifier`) + PCA(15) of remaining') # 2nd highest 
pca = PCA(15)

a = pd.concat([X_train.iloc[:, top], pd.DataFrame(pca.fit_transform(X_train.iloc[:, rest]))], axis=1, ignore_index=True) 
b = pd.concat([X_test.iloc[:, top] , pd.DataFrame(pca.transform    (X_test.iloc[:, rest])) ], axis=1, ignore_index=True) 

_ = benchmark(a, b)

Top 20 features (per `LGBMClassifier`) + PCA(15) of remaining
Acc: 0.98277 for the LGBMClassifier


# Conclusion 

**Baseline with all features**
* Acc: 0.98591 for the LGBMClassifier

Solely with features identified as useless
* Acc: 0.95991 for the LGBMClassifier

Solely with features identified as useful
* Acc: 0.96422 for the LGBMClassifier

PCA 5
* Acc: 0.92331 for the LGBMClassifier

PCA 15
* Acc: 0.94629 for the LGBMClassifier

PCA 25
* Acc: 0.94210 for the LGBMClassifier

**Top 20 features (per `LGBMClassifier`) + PCA(15) of remaining**
* Acc: 0.98277 for the LGBMClassifier