In [1]:
%cd ../..

C:\Users\bram_\home\msc


# Imports

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

from timeit import default_timer as timer
import utils.cidds_001 as utils

from utils.cidds_001 import columns_to_drop

# Load and shuffle datasets

In [3]:
# load and shuffle week1
week1 = pd.read_feather('saved_dfs/cidds-001/traffic/OpenStack/CIDDS-001-internal-week1-cleaned.feather')
week1_shuffled = week1.sample(frac=1, random_state=13).reset_index(drop=True)

# load and shuffle week2
week2 = pd.read_feather('saved_dfs/cidds-001/traffic/OpenStack/CIDDS-001-internal-week2-cleaned.feather')
week2_shuffled = week2.sample(frac=1, random_state=13).reset_index(drop=True)

In [4]:
count_week1_df = pd.DataFrame(week1_shuffled.groupby(by='attack_type').size(), columns=['count'])
count_week1_df

Unnamed: 0_level_0,count
attack_type,Unnamed: 1_level_1
---,7010897
bruteForce,1626
dos,1252127
pingScan,3359
portScan,183511


In [5]:
count_week2_df = pd.DataFrame(week2_shuffled.groupby(by='attack_type').size(), columns=['count'])
count_week2_df

Unnamed: 0_level_0,count
attack_type,Unnamed: 1_level_1
---,8515329
bruteForce,3366
dos,1706900
pingScan,2731
portScan,82407


# Create balanced datasets

In [6]:
# get balanced dataset
week1_dataset = utils.get_balanced_cidds(week1_shuffled)
week2_dataset = utils.get_balanced_cidds(week2_shuffled)

# drop columns that are of no use for classification
week1_dataset.drop(columns=columns_to_drop, inplace=True)
week2_dataset.drop(columns=columns_to_drop, inplace=True)

## Confirm having created balanced datasets

In [7]:
pd.DataFrame(week1_dataset.groupby(by='attack_type').size(), columns=['count']).reset_index()

Unnamed: 0,attack_type,count
0,---,1626
1,bruteForce,1626
2,dos,1626
3,pingScan,1626
4,portScan,1626


In [8]:
pd.DataFrame(week2_dataset.groupby(by='attack_type').size(), columns=['count']).reset_index()

Unnamed: 0,attack_type,count
0,---,2731
1,bruteForce,2731
2,dos,2731
3,pingScan,2731
4,portScan,2731


# Create training and test sets

In [9]:
# week 1
x_week1 = week1_dataset.drop(columns='attack_type')
y_week1 = week1_dataset['attack_type']
x_train_week1, x_test_week1, y_train_week1, y_test_week1 = train_test_split(x_week1, y_week1, test_size=0.2, random_state=13)

# week 2
x_week2 = week2_dataset.drop(columns='attack_type')
y_week2 = week2_dataset['attack_type']
x_train_week2, x_test_week2, y_train_week2, y_test_week2 = train_test_split(x_week2, y_week2, test_size=0.2, random_state=13)

# Random Forest Classification

## OpenStack - week 1

In [10]:
rfc_week1 = RandomForestClassifier(max_depth=5)

start = timer()
rfc_week1.fit(x_train_week1, y_train_week1)
end = timer()

print('Fitting RFC took {} seconds'.format(end - start))

Fitting RFC took 0.33198290000001407 seconds


In [11]:
# predict the test data and analyze results
predicted_y = rfc_week1.predict(x_test_week1)
results_df = utils.analyze_classification_results(predicted_y, y_test_week1).sort_values(by='attack_type').reset_index(drop=True)
results_df

Unnamed: 0,attack_type,correct,total,acc
0,---,321,333,0.963964
1,bruteForce,310,328,0.945122
2,dos,331,331,1.0
3,pingScan,285,307,0.928339
4,portScan,317,327,0.969419
5,total,1564,1626,0.96187


In [12]:
# calculate confucion matrix
conf_mat = confusion_matrix(y_test_week1, predicted_y)

# put confusion matrix in DataFrame for nicer output in Jupyter
conf_df = pd.DataFrame(conf_mat)

# the order rows/columns of the confusion matrix depends on the order at which
# each of the variables is first seen in y_week1_all. This order is the same
# as the order of the indexes when groupby() was used on week1.
conf_df.columns = count_week1_df.index.to_list()
conf_df.index = count_week1_df.index.to_list()

conf_df

Unnamed: 0,---,bruteForce,dos,pingScan,portScan
---,321,12,0,0,0
bruteForce,18,310,0,0,0
dos,0,0,331,0,0
pingScan,15,6,0,285,1
portScan,7,0,0,3,317


## OpenStack - week 2

In [13]:
rfc_week2 = RandomForestClassifier(max_depth=5)

start = timer()
rfc_week2.fit(x_train_week2, y_train_week2)
end = timer()

print('Fitting RFC took {} seconds'.format(end - start))

Fitting RFC took 0.3873338000000217 seconds


In [14]:
# predict the test data and analyze results
predicted_y = rfc_week2.predict(x_test_week2)
results_df = utils.analyze_classification_results(predicted_y, y_test_week2).sort_values(by='attack_type').reset_index(drop=True)
results_df

Unnamed: 0,attack_type,correct,total,acc
0,---,557,562,0.991103
1,bruteForce,436,495,0.880808
2,dos,575,575,1.0
3,pingScan,520,571,0.910683
4,portScan,482,528,0.912879
5,total,2570,2731,0.941047


In [15]:
# calculate confucion matrix
conf_mat = confusion_matrix(y_test_week2, predicted_y)

# put confusion matrix in DataFrame for nicer output in Jupyter
conf_df = pd.DataFrame(conf_mat)

# the order rows/columns of the confusion matrix depends on the order at which
# each of the variables is first seen in y_week1_all. This order is the same
# as the order of the indexes when groupby() was used on week1.
conf_df.columns = count_week1_df.index.to_list()
conf_df.index = count_week1_df.index.to_list()

conf_df

Unnamed: 0,---,bruteForce,dos,pingScan,portScan
---,557,0,1,0,4
bruteForce,47,436,12,0,0
dos,0,0,575,0,0
pingScan,51,0,0,520,0
portScan,24,0,0,22,482


## Score model of week 1 with data of week 2

In [16]:
# extract rfc_week1's input data (i.e. week2 data without the columns dropped for training and without 'attack_type')
x_week2_all = week2.drop(columns=(columns_to_drop + ['attack_type']))

# extract week2's labels
y_week2_all = week2['attack_type']

# predict the test data and analyze results
predicted_y = rfc_week1.predict(x_week2_all)
results_df = utils.analyze_classification_results(predicted_y, y_week2_all).sort_values(by='attack_type').reset_index(drop=True)
results_df

Unnamed: 0,attack_type,correct,total,acc
0,---,8163134,8515329,0.95864
1,bruteForce,2896,3366,0.860368
2,dos,1706527,1706900,0.999781
3,pingScan,2485,2731,0.909923
4,portScan,76274,82407,0.925577
5,total,9951316,10310733,0.965141


In [17]:
# calculate confucion matrix
conf_mat = confusion_matrix(y_week2_all, predicted_y)

# put confusion matrix in DataFrame for nicer output in Jupyter
conf_df = pd.DataFrame(conf_mat)

# the order rows/columns of the confusion matrix depends on the order at which
# each of the variables is first seen in y_week1_all. This order is the same
# as the order of the indexes when groupby() was used on week1.
conf_df.columns = count_week1_df.index.to_list()
conf_df.index = count_week1_df.index.to_list()

conf_df

Unnamed: 0,---,bruteForce,dos,pingScan,portScan
---,8163134,244239,76010,620,31326
bruteForce,259,2896,0,0,211
dos,366,6,1706527,0,1
pingScan,231,12,0,2485,3
portScan,3655,76,0,2402,76274


## Score model of week 2 with data of week 1

In [18]:
# extract rfc_week2's input data (i.e. week1 data without the columns dropped for training and without 'attack_type')
x_week1_all = week1.drop(columns=(columns_to_drop + ['attack_type']))

# extract week1's labels
y_week1_all = week1['attack_type']

# predict the test data and analyze results
predicted_y = rfc_week2.predict(x_week1_all)
results_df = utils.analyze_classification_results(predicted_y, y_week1_all).sort_values(by='attack_type').reset_index(drop=True)
results_df

Unnamed: 0,attack_type,correct,total,acc
0,---,6938096,7010897,0.989616
1,bruteForce,1185,1626,0.728782
2,dos,1251905,1252127,0.999823
3,pingScan,3244,3359,0.965764
4,portScan,175087,183511,0.954095
5,total,8369517,8451520,0.990297


In [19]:
# calculate confucion matrix
conf_mat = confusion_matrix(y_week1_all, predicted_y)

# put confusion matrix in DataFrame for nicer output in Jupyter
conf_df = pd.DataFrame(conf_mat)

# the order rows/columns of the confusion matrix depends on the order at which
# each of the variables is first seen in y_week1_all. This order is the same
# as the order of the indexes when groupby() was used on week1.
conf_df.columns = count_week1_df.index.to_list()
conf_df.index = count_week1_df.index.to_list()

conf_df

Unnamed: 0,---,bruteForce,dos,pingScan,portScan
---,6938096,8183,18822,634,45162
bruteForce,331,1185,110,0,0
dos,216,6,1251905,0,0
pingScan,113,0,0,3244,2
portScan,3131,110,0,5183,175087
