In [1]:
# Move the working directory two levels up so that the relative paths used
# further down ('saved_dfs/...') and the `utils` package import resolve.
%cd ../..

C:\Users\bram_\home\msc


# Exclude almost all `bruteForce` attack types from `week1` training set and almost all `pingScan` attack types from `week2` training set
We need to keep just a few entries of the attack type that we want to exclude, because the model must have at least a chance of correctly predicting the excluded attack type. Otherwise we could not draw conclusions such as "the accuracy is lower because the model is not predicting the excluded attack type correctly", since the model would have no chance at all of predicting the excluded attack type. Therefore just a few entries are included.

# Imports

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

from timeit import default_timer as timer
import utils.cidds_001 as utils

from utils.cidds_001 import columns_to_drop

# Start global timer

In [3]:
# Reference timestamp; the last cell subtracts it from `end_global` to report
# how long the complete notebook took to run.
start_global = timer()

# Load and shuffle datasets

In [4]:
# Read the cleaned flow records of both weeks from their feather files.
week1 = pd.read_feather('saved_dfs/cidds-001/traffic/OpenStack/CIDDS-001-internal-week1-cleaned.feather')
week2 = pd.read_feather('saved_dfs/cidds-001/traffic/OpenStack/CIDDS-001-internal-week2-cleaned.feather')

# Shuffle every row (frac=1) with a fixed seed so the row order is
# reproducible, then renumber the index from 0.
week1_shuffled = week1.sample(frac=1, random_state=13)
week1_shuffled = week1_shuffled.reset_index(drop=True)

week2_shuffled = week2.sample(frac=1, random_state=13)
week2_shuffled = week2_shuffled.reset_index(drop=True)

In [5]:
# Number of week-1 flows per attack type, as a one-column ('count') frame
# indexed by attack_type. The index is reused later to label confusion matrices.
count_week1_df = week1_shuffled.groupby('attack_type').size().to_frame(name='count')
count_week1_df

Unnamed: 0_level_0,count
attack_type,Unnamed: 1_level_1
---,7010897
bruteForce,1626
dos,1252127
pingScan,3359
portScan,183511


In [6]:
# Number of week-2 flows per attack type, as a one-column ('count') frame
# indexed by attack_type.
count_week2_df = week2_shuffled.groupby('attack_type').size().to_frame(name='count')
count_week2_df

Unnamed: 0_level_0,count
attack_type,Unnamed: 1_level_1
---,8515329
bruteForce,3366
dos,1706900
pingScan,2731
portScan,82407


# Remove all `bruteForce` flows from week1 and all `pingScan` flows from week2

In [7]:
# Drop every flow of the attack type that is held out for each week.
#
# A boolean mask is used instead of `.where(cond).dropna()`: `where` first
# overwrites all non-matching rows with NaN (which upcasts integer/boolean
# columns), and the following `dropna` would additionally discard any row we
# want to keep that happens to contain a missing value of its own.

# Remove all of the flows of bruteForce from week1 dataset
week1_excl = week1_shuffled.loc[week1_shuffled['attack_type'] != 'bruteForce'].reset_index(drop=True)

# Remove all of the flows of pingScan from week2 dataset
week2_excl = week2_shuffled.loc[week2_shuffled['attack_type'] != 'pingScan'].reset_index(drop=True)

# NOTE(review): disabled variant that re-adds a single flow of the excluded
# attack type. `DataFrame.append` was removed in pandas 2.0, so it is shown
# with `pd.concat`; enable only if a few excluded flows should stay in.
# week1_one_brute = week1_shuffled.loc[week1_shuffled['attack_type'] == 'bruteForce'].head(n=1)
# week1_excl = pd.concat([week1_excl, week1_one_brute], ignore_index=True)
# week2_one_ping = week2_shuffled.loc[week2_shuffled['attack_type'] == 'pingScan'].head(n=1)
# week2_excl = pd.concat([week2_excl, week2_one_ping], ignore_index=True)

In [8]:
# Flows per attack type left in week 1 after the exclusion, as a flat table.
week1_excl.groupby('attack_type').size().reset_index(name='count')

Unnamed: 0,attack_type,count
0,---,7010897
1,dos,1252127
2,pingScan,3359
3,portScan,183511


In [9]:
# Flows per attack type left in week 2 after the exclusion, as a flat table.
week2_excl.groupby('attack_type').size().reset_index(name='count')

Unnamed: 0,attack_type,count
0,---,8515329
1,bruteForce,3366
2,dos,1706900
3,portScan,82407


# Create balanced datasets (ignoring the entries of the excluded attack type)

In [10]:
# For each filtered week: take the balanced subset produced by
# utils.get_balanced_cidds, then remove the columns listed in columns_to_drop,
# which are not used for classification. The drop returns a new frame instead
# of mutating in place.
week1_excl_balanced = utils.get_balanced_cidds(week1_excl).drop(columns=columns_to_drop)
week2_excl_balanced = utils.get_balanced_cidds(week2_excl).drop(columns=columns_to_drop)

### Confirm having created a balanced dataset

In [11]:
# Every remaining attack type of week 1 should show the same count here.
week1_excl_balanced.groupby('attack_type').size().reset_index(name='count')

Unnamed: 0,attack_type,count
0,---,3359
1,dos,3359
2,pingScan,3359
3,portScan,3359


In [12]:
# Every remaining attack type of week 2 should show the same count here.
week2_excl_balanced.groupby('attack_type').size().reset_index(name='count')

Unnamed: 0,attack_type,count
0,---,3366
1,bruteForce,3366
2,dos,3366
3,portScan,3366


# Split datasets in training and test sets

In [13]:
# Hold out 20% of each balanced set; the fixed seed keeps the split reproducible.
split_options = {'test_size': 0.2, 'random_state': 0}

# week 1: the label is the 'attack_type' column, the features are all other columns
y_week1_excl = week1_excl_balanced['attack_type']
x_week1_excl = week1_excl_balanced.drop(columns='attack_type')
(x_train_week1_excl, x_test_week1_excl,
 y_train_week1_excl, y_test_week1_excl) = train_test_split(x_week1_excl, y_week1_excl, **split_options)

# week 2: same layout as week 1
y_week2_excl = week2_excl_balanced['attack_type']
x_week2_excl = week2_excl_balanced.drop(columns='attack_type')
(x_train_week2_excl, x_test_week2_excl,
 y_train_week2_excl, y_test_week2_excl) = train_test_split(x_week2_excl, y_week2_excl, **split_options)

# Random Forest Classification

## Fit model of week 1 with data of week 1

In [14]:
# Random forest for week 1 (trees limited to depth 7). The seed is fixed so
# that a fresh Restart & Run All grows the same forest; shuffling and the
# train/test split are already seeded, and without `random_state` the
# bootstrap/feature sampling made every run produce different results.
rfc_week1_excl = RandomForestClassifier(max_depth=7, random_state=0)

# Time only the fit itself.
start = timer()
rfc_week1_excl.fit(x_train_week1_excl, y_train_week1_excl)
end = timer()

print('Fitting RFC took {} seconds'.format(end - start))

Fitting RFC took 0.5335565000000031 seconds


## Confusion matrix

In [15]:
# Let the week-1 model label the same rows it was fitted on.
# NOTE(review): this is the training split; the held-out
# x_test_week1_excl / y_test_week1_excl are not evaluated here - confirm intent.
predicted_y = rfc_week1_excl.predict(x_train_week1_excl)

# Confusion matrix of the true training labels against those predictions.
confusion_matrix(y_true=y_train_week1_excl, y_pred=predicted_y)

array([[2717,    0,    0,    0],
       [   1, 2704,    0,    0],
       [  77,    0, 2576,    2],
       [  49,    0,   65, 2557]], dtype=int64)

## Fit model of week 2 with data of week 2

In [16]:
# Random forest for week 2 (trees limited to depth 7). The seed is fixed so
# that a fresh Restart & Run All grows the same forest; shuffling and the
# train/test split are already seeded, and without `random_state` the
# bootstrap/feature sampling made every run produce different results.
rfc_week2_excl = RandomForestClassifier(max_depth=7, random_state=0)

# Time only the fit itself.
start = timer()
rfc_week2_excl.fit(x_train_week2_excl, y_train_week2_excl)
end = timer()

print('Fitting RFC took {} seconds'.format(end - start))

Fitting RFC took 0.4861468000000002 seconds


## Confusion matrix

In [17]:
# Let the week-2 model label the same rows it was fitted on.
# NOTE(review): this is the training split; the held-out
# x_test_week2_excl / y_test_week2_excl are not evaluated here - confirm intent.
predicted_y = rfc_week2_excl.predict(x_train_week2_excl)

# Confusion matrix of the true training labels against those predictions.
confusion_matrix(y_true=y_train_week2_excl, y_pred=predicted_y)

array([[2681,   36,    1,    3],
       [   3, 2693,    0,    0],
       [   0,    0, 2667,    0],
       [  44,   68,    0, 2575]], dtype=int64)

## Score model of week 1 with data of week 2

In [18]:
# Ground-truth labels of the complete (unfiltered, unbalanced) week 2 data.
y_week2_all = week2['attack_type']

# Model input: week 2 without the label and without the columns that were
# also dropped before training.
non_feature_columns = columns_to_drop + ['attack_type']
x_week2_all = week2.drop(columns=non_feature_columns)

# Label all of week 2 with the model fitted on week 1.
predicted_y = rfc_week1_excl.predict(x_week2_all)

# Summarise the predictions with utils.analyze_classification_results and
# order the rows by attack type.
results_df = utils.analyze_classification_results(predicted_y, y_week2_all)
results_df = results_df.sort_values(by='attack_type').reset_index(drop=True)
results_df

Unnamed: 0,attack_type,correct,total,acc
0,---,8506966,8515329,0.999018
1,bruteForce,0,3366,0.0
2,dos,1705550,1706900,0.999209
3,pingScan,2483,2731,0.909191
4,portScan,76334,82407,0.926305
5,total,10291333,10310733,0.998118


In [19]:
# Label order shared by the matrix and its headers: every class that occurs
# in the true or the predicted labels, sorted. Passing the list explicitly to
# confusion_matrix guarantees the headers match the rows/columns, instead of
# borrowing the index of an unrelated per-week count table and relying on it
# happening to contain the same classes in the same order.
class_labels = sorted(set(pd.unique(y_week2_all)) | set(pd.unique(predicted_y)))

# calculate confusion matrix
conf_mat = confusion_matrix(y_week2_all, predicted_y, labels=class_labels)

# put confusion matrix in DataFrame for nicer output in Jupyter
# (rows: true attack type, columns: predicted attack type)
conf_df = pd.DataFrame(conf_mat, index=class_labels, columns=class_labels)

conf_df

Unnamed: 0,---,bruteForce,dos,pingScan,portScan
---,8506966,0,3015,771,4577
bruteForce,3155,0,0,0,211
dos,1349,0,1705550,0,1
pingScan,243,0,0,2483,5
portScan,3696,0,0,2377,76334


## Score model of week 2 with data of week 1

In [20]:
# Ground-truth labels of the complete (unfiltered, unbalanced) week 1 data.
y_week1_all = week1['attack_type']

# Model input: week 1 without the label and without the columns that were
# also dropped before training.
non_feature_columns = columns_to_drop + ['attack_type']
x_week1_all = week1.drop(columns=non_feature_columns)

# Label all of week 1 with the model fitted on week 2.
predicted_y = rfc_week2_excl.predict(x_week1_all)

# Summarise the predictions with utils.analyze_classification_results and
# order the rows by attack type.
results_df = utils.analyze_classification_results(predicted_y, y_week1_all)
results_df = results_df.sort_values(by='attack_type').reset_index(drop=True)
results_df

Unnamed: 0,attack_type,correct,total,acc
0,---,6933654,7010897,0.988982
1,bruteForce,1565,1626,0.962485
2,dos,1251771,1252127,0.999716
3,pingScan,0,3359,0.0
4,portScan,180245,183511,0.982203
5,total,8367235,8451520,0.990027


In [21]:
# Label order shared by the matrix and its headers: every class that occurs
# in the true or the predicted labels, sorted. Passing the list explicitly to
# confusion_matrix guarantees the headers match the rows/columns, instead of
# relying on a separately computed count table containing the same classes in
# the same order.
class_labels = sorted(set(pd.unique(y_week1_all)) | set(pd.unique(predicted_y)))

# calculate confusion matrix
conf_mat = confusion_matrix(y_week1_all, predicted_y, labels=class_labels)

# put confusion matrix in DataFrame for nicer output in Jupyter
# (rows: true attack type, columns: predicted attack type)
conf_df = pd.DataFrame(conf_mat, index=class_labels, columns=class_labels)

conf_df

Unnamed: 0,---,bruteForce,dos,pingScan,portScan
---,6933654,61464,6448,0,9331
bruteForce,61,1565,0,0,0
dos,344,9,1251771,0,3
pingScan,88,25,0,0,3246
portScan,2016,1250,0,0,180245


# End global timer

In [22]:
# Take the closing timestamp and report the time elapsed since `start_global`
# was set near the top of the notebook.
end_global = timer()
print(f'Running the complete notebook took {end_global - start_global} seconds.')

Running the complete notebook took 762.1824141 seconds.
