## Partitioning UNSW-NB15-Train-Basic into 5 nodes 

Each partitioning scheme below splits the dataset across 5 nodes, which can be balanced or unbalanced in size. Each attack category may appear in all nodes or only in a subset of them.

In [1]:
import numpy as np  # for array
import pandas as pd  # for csv files and dataframe
import matplotlib.pyplot as plt  # for plotting
import seaborn as sns  # plotting
from scipy import stats

import pickle  # To load data int disk

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.metrics import auc, f1_score, roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_predict

In [2]:
# Load the full UNSW-NB15-Train-Basic dataset; every partitioning scheme below slices this frame.
train_basic_csv = 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic.csv'
complete = pd.read_csv(train_basic_csv)

### id = 5A : Partition with 5 balanced nodes 

All traffic classes are represented in each of the 5 nodes. 


- UNSW-NB15-Train-Basic-Part1 (87103): 
    - Normal: 43560 (%)
    - Generic: 32274 (%)
    - Exploits: 6708 (%)
    - DoS: 2461 (%)
    - Reconnaissance: 2100 (%)

- UNSW-NB15-Train-Basic-Part2 (87104): 
    - Normal: 43560 (%)
    - Generic: 32275 (%)
    - Exploits: 6707 (%)
    - DoS: 2461 (%)
    - Reconnaissance: 2101 (%)

- UNSW-NB15-Train-Basic-Part3 (87104): 
    - Normal: 43560 (%)
    - Generic: 32275 (%)
    - Exploits: 6707 (%)
    - DoS: 2461 (%)
    - Reconnaissance: 2101 (%)

- UNSW-NB15-Train-Basic-Part4 (87104): 
    - Normal: 43560 (%)
    - Generic: 32275 (%)
    - Exploits: 6707 (%)
    - DoS: 2461 (%)
    - Reconnaissance: 2101 (%)

- UNSW-NB15-Train-Basic-Part5 (87104): 
    - Normal: 43560 (%)
    - Generic: 32275 (%)
    - Exploits: 6707 (%)
    - DoS: 2462 (%)
    - Reconnaissance: 2100 (%)



In [3]:
# Build the 5A slices: every traffic class is cut into five consecutive chunks,
# one per node. Rows keep the order they have in `complete`.

# Select each class once; all positional slices below are taken from these frames.
normal_rows = complete[complete['label'] == 0]
generic_rows = complete[complete['attack_cat'] == "generic"]
exploits_rows = complete[complete['attack_cat'] == "exploits"]
dos_rows = complete[complete['attack_cat'] == "dos"]
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]

# Normal: four chunks of 43560 rows; node 5 takes whatever is left.
normal1 = normal_rows.iloc[:43560]
normal2 = normal_rows.iloc[43560:2 * 43560]
normal3 = normal_rows.iloc[2 * 43560:3 * 43560]
normal4 = normal_rows.iloc[3 * 43560:4 * 43560]
normal5 = normal_rows.iloc[4 * 43560:]

# Generic: first chunk of 32274 rows, then chunks of 32275; node 5 takes the rest.
generic1 = generic_rows.iloc[:32274]
generic2 = generic_rows.iloc[32274:32274 + 32275]
generic3 = generic_rows.iloc[32274 + 32275:32274 + 2 * 32275]
generic4 = generic_rows.iloc[32274 + 2 * 32275:32274 + 3 * 32275]
generic5 = generic_rows.iloc[32274 + 3 * 32275:]

# Exploits: first chunk of 6708 rows, then chunks of 6707; node 5 takes the rest.
exploits1 = exploits_rows.iloc[:6708]
exploits2 = exploits_rows.iloc[6708:6708 + 6707]
exploits3 = exploits_rows.iloc[6708 + 6707:6708 + 2 * 6707]
exploits4 = exploits_rows.iloc[6708 + 2 * 6707:6708 + 3 * 6707]
exploits5 = exploits_rows.iloc[6708 + 3 * 6707:]

# DoS: four chunks of 2461 rows; node 5 takes the rest.
dos1 = dos_rows.iloc[:2461]
dos2 = dos_rows.iloc[2461:2 * 2461]
dos3 = dos_rows.iloc[2 * 2461:3 * 2461]
dos4 = dos_rows.iloc[3 * 2461:4 * 2461]
dos5 = dos_rows.iloc[4 * 2461:]

# Reconnaissance: first chunk of 2100 rows, then chunks of 2101; node 5 takes the rest.
recon1 = recon_rows.iloc[:2100]
recon2 = recon_rows.iloc[2100:2100 + 2101]
recon3 = recon_rows.iloc[2100 + 2101:2100 + 2 * 2101]
recon4 = recon_rows.iloc[2100 + 2 * 2101:2100 + 3 * 2101]
recon5 = recon_rows.iloc[2100 + 3 * 2101:]

In [4]:
# Assemble the five 5A node datasets (one chunk of every class per node)
# and write each one to its own CSV file without the index column.
part1 = pd.concat([normal1, generic1, exploits1, dos1, recon1])
part2 = pd.concat([normal2, generic2, exploits2, dos2, recon2])
part3 = pd.concat([normal3, generic3, exploits3, dos3, recon3])
part4 = pd.concat([normal4, generic4, exploits4, dos4, recon4])
part5 = pd.concat([normal5, generic5, exploits5, dos5, recon5])

# Destination file -> node frame; written in node order.
exports_5a = {
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5A-Part1.csv': part1,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5A-Part2.csv': part2,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5A-Part3.csv': part3,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5A-Part4.csv': part4,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5A-Part5.csv': part5,
}
for csv_path, node_frame in exports_5a.items():
    node_frame.to_csv(csv_path, index=False)

### id = 5B : Partition with 5 balanced nodes, with some traffic classes missing from some nodes 



- UNSW-NB15-Train-Basic-Part1 (): 
    - Normal: 43560 (%)
    - Generic: 24530 (%)
    - Exploits: 6708 (%)
    - DoS: 12306 (%)
    - Reconnaissance: 0 (%)

- UNSW-NB15-Train-Basic-Part2 (): 
    - Normal: 43560 (%)
    - Generic: 34211 (%)
    - Exploits: 6707 (%)
    - DoS: 0 (%)
    - Reconnaissance: 2626 (%)

- UNSW-NB15-Train-Basic-Part3 (): 
    - Normal: 77770 (%)
    - Generic: 0 (%)
    - Exploits: 6707 (%)
    - DoS: 0 (%)
    - Reconnaissance: 2626 (%)

- UNSW-NB15-Train-Basic-Part4 (): 
    - Normal: 0 (%)
    - Generic: 77771 (%)
    - Exploits: 6707 (%)
    - DoS: 0 (%)
    - Reconnaissance: 2626 (%)

- UNSW-NB15-Train-Basic-Part5 (): 
    - Normal: 52910 (%)
    - Generic: 24862 (%)
    - Exploits: 6707 (%)
    - DoS: 0 (%)
    - Reconnaissance: 2625 (%)



In [5]:
# Build the 5B slices. Not every class is split five ways here: normal,
# generic and reconnaissance are cut into four chunks, exploits into five,
# and DoS is kept whole. Rows keep the order they have in `complete`.

# Select each class once; all positional slices below are taken from these frames.
normal_rows = complete[complete['label'] == 0]
generic_rows = complete[complete['attack_cat'] == "generic"]
exploits_rows = complete[complete['attack_cat'] == "exploits"]
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]

# Normal: 43560, 43560, 77770 rows, then the remainder.
normal1 = normal_rows.iloc[:43560]
normal2 = normal_rows.iloc[43560:2 * 43560]
normal3 = normal_rows.iloc[2 * 43560:2 * 43560 + 77770]
normal4 = normal_rows.iloc[2 * 43560 + 77770:]

# Generic: 24530, 34211, 77771 rows, then the remainder.
generic1 = generic_rows.iloc[:24530]
generic2 = generic_rows.iloc[24530:24530 + 34211]
generic3 = generic_rows.iloc[24530 + 34211:24530 + 34211 + 77771]
generic4 = generic_rows.iloc[24530 + 34211 + 77771:]

# Exploits: first chunk of 6708 rows, then chunks of 6707; the last chunk takes the rest.
exploits1 = exploits_rows.iloc[:6708]
exploits2 = exploits_rows.iloc[6708:6708 + 6707]
exploits3 = exploits_rows.iloc[6708 + 6707:6708 + 2 * 6707]
exploits4 = exploits_rows.iloc[6708 + 2 * 6707:6708 + 3 * 6707]
exploits5 = exploits_rows.iloc[6708 + 3 * 6707:]

# DoS: not split, the whole class stays in a single frame.
dos1 = complete[complete['attack_cat'] == "dos"]

# Reconnaissance: three chunks of 2626 rows, then the remainder.
recon1 = recon_rows.iloc[:2626]
recon2 = recon_rows.iloc[2626:2 * 2626]
recon3 = recon_rows.iloc[2 * 2626:3 * 2626]
recon4 = recon_rows.iloc[3 * 2626:]

In [6]:
# Assemble the five 5B node datasets and write each one to its own CSV file
# without the index column. Nodes receive different subsets of the classes:
# DoS only goes to node 1, node 3 gets no generic rows and node 4 no normal rows.
part1 = pd.concat([normal1, generic1, exploits1, dos1])
part2 = pd.concat([normal2, generic2, exploits2, recon1])
part3 = pd.concat([normal3, exploits3, recon2])
part4 = pd.concat([generic3, exploits4, recon3])
part5 = pd.concat([normal4, generic4, exploits5, recon4])

# Destination file -> node frame; written in node order.
exports_5b = {
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5B-Part1.csv': part1,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5B-Part2.csv': part2,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5B-Part3.csv': part3,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5B-Part4.csv': part4,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5B-Part5.csv': part5,
}
for csv_path, node_frame in exports_5b.items():
    node_frame.to_csv(csv_path, index=False)

### id = 5C : Partition with 5 unbalanced nodes 

- UNSW-NB15-Train-Basic-Part1 (): 
    - Normal: 21780 (%)
    - Generic: 8309 (%)
    - Exploits: 8384 (%)
    - DoS: 2461 (%)
    - Reconnaissance: 2626 (%)

- UNSW-NB15-Train-Basic-Part2 (): 
    - Normal: 21780 (%)
    - Generic: 8309 (%)
    - Exploits: 0 (%)
    - DoS: 2461  (%)
    - Reconnaissance: 2626 (%)

- UNSW-NB15-Train-Basic-Part3 (): 
    - Normal: 43560 (%)
    - Generic: 8309 (%)
    - Exploits: 8384 (%)
    - DoS: 2461  (%)
    - Reconnaissance: 2626 (%)

- UNSW-NB15-Train-Basic-Part4 (): 
    - Normal: 0 (%)
    - Generic: 60000 (%)
    - Exploits: 8384 (%)
    - DoS: 2461 (%)
    - Reconnaissance: 2625 (%)

- UNSW-NB15-Train-Basic-Part5 (): 
    - Normal: 130680 (%)
    - Generic: 76447 (%)
    - Exploits: 8384 (%)
    - DoS: 2462 (%)
    - Reconnaissance: 0 (%)



In [7]:
# Build the 5C slices. Normal and reconnaissance are cut into four chunks,
# exploits into four, generic and DoS into five. Rows keep the order they
# have in `complete`.

# Select each class once; all positional slices below are taken from these frames.
normal_rows = complete[complete['label'] == 0]
generic_rows = complete[complete['attack_cat'] == "generic"]
exploits_rows = complete[complete['attack_cat'] == "exploits"]
dos_rows = complete[complete['attack_cat'] == "dos"]
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]

# Normal: 21780, 21780, 43560 rows, then the remainder.
normal1 = normal_rows.iloc[:21780]
normal2 = normal_rows.iloc[21780:21780*2]
normal3 = normal_rows.iloc[21780*2:(21780*2+43560)]
normal4 = normal_rows.iloc[(21780*2+43560):]

# Generic: three chunks of 8309 rows, one of 60000, then the remainder.
generic1 = generic_rows.iloc[:8309]
generic2 = generic_rows.iloc[8309:(8309*2)]
generic3 = generic_rows.iloc[(8309*2):(8309*3)]
generic4 = generic_rows.iloc[(8309*3):(8309*3+60000)]
generic5 = generic_rows.iloc[(8309*3+60000):]

# Exploits: three chunks of 8384 rows, then the remainder.
# The third and fourth chunks previously used the generic chunk size (8309) as
# their offsets, which made exploits3 overlap exploits2 and skewed both sizes.
exploits1 = exploits_rows.iloc[:8384]
exploits2 = exploits_rows.iloc[8384:(8384*2)]
exploits3 = exploits_rows.iloc[(8384*2):(8384*3)]
exploits4 = exploits_rows.iloc[(8384*3):]
# Contiguous, non-overlapping chunks must add up to the whole class.
assert len(exploits1) + len(exploits2) + len(exploits3) + len(exploits4) == len(exploits_rows)

# DoS: four chunks of 2461 rows, then the remainder.
dos1 = dos_rows.iloc[:2461]
dos2 = dos_rows.iloc[2461:(2461*2)]
dos3 = dos_rows.iloc[(2461*2):(2461*3)]
dos4 = dos_rows.iloc[(2461*3):(2461*4)]
dos5 = dos_rows.iloc[(2461*4):]

# Reconnaissance: three chunks of 2626 rows, then the remainder.
recon1 = recon_rows.iloc[:2626]
recon2 = recon_rows.iloc[2626:(2626*2)]
recon3 = recon_rows.iloc[(2626*2):(2626*3)]
recon4 = recon_rows.iloc[(2626*3):]

In [8]:
# Assemble the five 5C node datasets and write each one to its own CSV file
# without the index column. Node 2 gets no exploits, node 4 no normal rows
# and node 5 no reconnaissance rows.
part1 = pd.concat([normal1, generic1, exploits1, dos1, recon1])
part2 = pd.concat([normal2, generic2, dos2, recon2])
part3 = pd.concat([normal3, generic3, exploits2, dos3, recon3])
part4 = pd.concat([generic4, exploits3, dos4, recon4])
part5 = pd.concat([normal4, generic5, exploits4, dos5])
# Files are tagged 5C; the previous 5B tag overwrote the 5B partition files.
part1.to_csv('C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5C-Part1.csv', index=False)
part2.to_csv('C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5C-Part2.csv', index=False)
part3.to_csv('C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5C-Part3.csv', index=False)
part4.to_csv('C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5C-Part4.csv', index=False)
part5.to_csv('C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-5C-Part5.csv', index=False)