In [1]:
%cd ../..

C:\Users\bram_\home\msc


# Imports

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

from timeit import default_timer as timer
import utils.cidds_001 as utils

from utils.cidds_001 import columns_to_drop

# Load and shuffle datasets

In [3]:
# load and shuffle week1
week1 = pd.read_feather('saved_dfs/cidds-001/traffic/OpenStack/CIDDS-001-internal-week1-cleaned.feather')
week1_shuffled = week1.sample(frac=1, random_state=13).reset_index(drop=True)

# load and shuffle week2
week2 = pd.read_feather('saved_dfs/cidds-001/traffic/OpenStack/CIDDS-001-internal-week2-cleaned.feather')
week2_shuffled = week2.sample(frac=1, random_state=13).reset_index(drop=True)

In [4]:
count_week1_df = pd.DataFrame(week1_shuffled.groupby(by='attack_type').size(), columns=['count'])
count_week1_df

Unnamed: 0_level_0,count
attack_type,Unnamed: 1_level_1
---,7010897
bruteForce,1626
dos,1252127
pingScan,3359
portScan,183511


In [5]:
count_week2_df = pd.DataFrame(week2_shuffled.groupby(by='attack_type').size(), columns=['count'])
count_week2_df

Unnamed: 0_level_0,count
attack_type,Unnamed: 1_level_1
---,8515329
bruteForce,3366
dos,1706900
pingScan,2731
portScan,82407


# Create balanced datasets

In [6]:
# get balanced dataset
week1_dataset = utils.get_balanced_cidds(week1_shuffled)
week2_dataset = utils.get_balanced_cidds(week2_shuffled)

# drop columns that are of no use for classification
week1_dataset.drop(columns=columns_to_drop, inplace=True)
week2_dataset.drop(columns=columns_to_drop, inplace=True)

## Confirm having created balanced datasets

In [7]:
pd.DataFrame(week1_dataset.groupby(by='attack_type').size(), columns=['count']).reset_index()

Unnamed: 0,attack_type,count
0,---,1626
1,bruteForce,1626
2,dos,1626
3,pingScan,1626
4,portScan,1626


In [8]:
pd.DataFrame(week2_dataset.groupby(by='attack_type').size(), columns=['count']).reset_index()

Unnamed: 0,attack_type,count
0,---,2731
1,bruteForce,2731
2,dos,2731
3,pingScan,2731
4,portScan,2731


# Create training and test sets

In [9]:
# week 1
x_week1 = week1_dataset.drop(columns='attack_type')
y_week1 = week1_dataset['attack_type']
x_train_week1, x_test_week1, y_train_week1, y_test_week1 = train_test_split(x_week1, y_week1, test_size=0.2, random_state=13)

# week 2
x_week2 = week2_dataset.drop(columns='attack_type')
y_week2 = week2_dataset['attack_type']
x_train_week2, x_test_week2, y_train_week2, y_test_week2 = train_test_split(x_week2, y_week2, test_size=0.2, random_state=13)

# Random Forest Classification

## OpenStack - week 1

In [10]:
rfc_week1 = RandomForestClassifier(max_depth=10)

start = timer()
rfc_week1.fit(x_train_week1, y_train_week1)
end = timer()

print('Fitting RFC took {} seconds'.format(end - start))

Fitting RFC took 0.3693336000000045 seconds


In [11]:
# predict the test data and analyze results
predicted_y = rfc_week1.predict(x_test_week1)
results_df = utils.analyze_classification_results(predicted_y, y_test_week1).sort_values(by='attack_type').reset_index(drop=True)
results_df

Unnamed: 0,attack_type,correct,total,acc
0,---,326,333,0.978979
1,bruteForce,328,328,1.0
2,dos,331,331,1.0
3,pingScan,290,307,0.944625
4,portScan,320,327,0.978593
5,total,1595,1626,0.980935


In [12]:
# calculate confucion matrix
conf_mat = confusion_matrix(y_test_week1, predicted_y)

# put confusion matrix in DataFrame for nicer output in Jupyter
conf_df = pd.DataFrame(conf_mat)

# the order rows/columns of the confusion matrix depends on the order at which
# each of the variables is first seen in y_week1_all. This order is the same
# as the order of the indexes when groupby() was used on week1.
conf_df.columns = count_week1_df.index.to_list()
conf_df.index = count_week1_df.index.to_list()

conf_df

Unnamed: 0,---,bruteForce,dos,pingScan,portScan
---,326,7,0,0,0
bruteForce,0,328,0,0,0
dos,0,0,331,0,0
pingScan,5,11,0,290,1
portScan,1,3,0,3,320


## OpenStack - week 2

In [13]:
rfc_week2 = RandomForestClassifier(max_depth=10)

start = timer()
rfc_week2.fit(x_train_week2, y_train_week2)
end = timer()

print('Fitting RFC took {} seconds'.format(end - start))

Fitting RFC took 0.4715700999999797 seconds


In [14]:
# predict the test data and analyze results
predicted_y = rfc_week2.predict(x_test_week2)
results_df = utils.analyze_classification_results(predicted_y, y_test_week2).sort_values(by='attack_type').reset_index(drop=True)
results_df

Unnamed: 0,attack_type,correct,total,acc
0,---,554,562,0.985765
1,bruteForce,494,495,0.99798
2,dos,572,575,0.994783
3,pingScan,542,571,0.949212
4,portScan,484,528,0.916667
5,total,2646,2731,0.968876


In [15]:
# calculate confucion matrix
conf_mat = confusion_matrix(y_test_week2, predicted_y)

# put confusion matrix in DataFrame for nicer output in Jupyter
conf_df = pd.DataFrame(conf_mat)

# the order rows/columns of the confusion matrix depends on the order at which
# each of the variables is first seen in y_week1_all. This order is the same
# as the order of the indexes when groupby() was used on week1.
conf_df.columns = count_week1_df.index.to_list()
conf_df.index = count_week1_df.index.to_list()

conf_df

Unnamed: 0,---,bruteForce,dos,pingScan,portScan
---,554,6,0,2,0
bruteForce,0,494,0,1,0
dos,3,0,572,0,0
pingScan,14,15,0,542,0
portScan,5,12,0,27,484


## Score model of week 1 with data of week 2

In [16]:
# extract rfc_week1's input data (i.e. week2 data without the columns dropped for training and without 'attack_type')
x_week2_all = week2.drop(columns=(columns_to_drop + ['attack_type']))

# extract week2's labels
y_week2_all = week2['attack_type']

# predict the test data and analyze results
predicted_y = rfc_week1.predict(x_week2_all)
results_df = utils.analyze_classification_results(predicted_y, y_week2_all).sort_values(by='attack_type').reset_index(drop=True)
results_df

Unnamed: 0,attack_type,correct,total,acc
0,---,8428773,8515329,0.989835
1,bruteForce,2999,3366,0.890969
2,dos,1703808,1706900,0.998189
3,pingScan,2503,2731,0.916514
4,portScan,76830,82407,0.932324
5,total,10214913,10310733,0.990707


In [17]:
# calculate confucion matrix
conf_mat = confusion_matrix(y_week2_all, predicted_y)

# put confusion matrix in DataFrame for nicer output in Jupyter
conf_df = pd.DataFrame(conf_mat)

# the order rows/columns of the confusion matrix depends on the order at which
# each of the variables is first seen in y_week1_all. This order is the same
# as the order of the indexes when groupby() was used on week1.
conf_df.columns = count_week1_df.index.to_list()
conf_df.index = count_week1_df.index.to_list()

conf_df

Unnamed: 0,---,bruteForce,dos,pingScan,portScan
---,8428773,59277,2928,9163,15188
bruteForce,152,2999,0,4,211
dos,3023,56,1703808,12,1
pingScan,167,54,0,2503,7
portScan,2075,1045,0,2457,76830


## Score model of week 2 with data of week 1

In [18]:
# extract rfc_week2's input data (i.e. week1 data without the columns dropped for training and without 'attack_type')
x_week1_all = week1.drop(columns=(columns_to_drop + ['attack_type']))

# extract week1's labels
y_week1_all = week1['attack_type']

# predict the test data and analyze results
predicted_y = rfc_week2.predict(x_week1_all)
results_df = utils.analyze_classification_results(predicted_y, y_week1_all).sort_values(by='attack_type').reset_index(drop=True)
results_df

Unnamed: 0,attack_type,correct,total,acc
0,---,6930108,7010897,0.988477
1,bruteForce,1567,1626,0.963715
2,dos,1251339,1252127,0.999371
3,pingScan,3276,3359,0.97529
4,portScan,175149,183511,0.954433
5,total,8361439,8451520,0.989341


In [19]:
# calculate confucion matrix
conf_mat = confusion_matrix(y_week1_all, predicted_y)

# put confusion matrix in DataFrame for nicer output in Jupyter
conf_df = pd.DataFrame(conf_mat)

# the order rows/columns of the confusion matrix depends on the order at which
# each of the variables is first seen in y_week1_all. This order is the same
# as the order of the indexes when groupby() was used on week1.
conf_df.columns = count_week1_df.index.to_list()
conf_df.index = count_week1_df.index.to_list()

conf_df

Unnamed: 0,---,bruteForce,dos,pingScan,portScan
---,6930108,54556,2949,13693,9591
bruteForce,28,1567,0,30,1
dos,716,9,1251339,52,11
pingScan,53,22,0,3276,8
portScan,1846,1228,0,5288,175149
