## Partitioning UNSW-NB15-Train-Basic into 7 nodes 

The dataset is split into 7 nodes. The partitions can be balanced or unbalanced, and each attack category may appear in all nodes or only in a subset of them. 

In [2]:
import numpy as np  # for array
import pandas as pd  # for csv files and dataframe
import matplotlib.pyplot as plt  # for plotting
import seaborn as sns  # plotting
from scipy import stats

import pickle  # To load data int disk

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.metrics import auc, f1_score, roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_predict

In [3]:
# Read the UNSW-NB15-Train-Basic CSV into `complete`, the frame that every
# partitioning cell below filters and slices.
train_basic_csv = 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic.csv'
complete = pd.read_csv(train_basic_csv)

### id = 7A : Partition with 7 balanced nodes 

All traffic categories are represented in each of the 7 nodes. 


- UNSW-NB15-Train-Basic-Part1 (): 
    - Normal: 31114 (%)
    - Generic: 23054 (%)
    - Exploits: 4790 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1501 (%)

- UNSW-NB15-Train-Basic-Part2 (): 
    - Normal: 31115 (%)
    - Generic: 23053 (%)
    - Exploits: 4791 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1500 (%)

- UNSW-NB15-Train-Basic-Part3 (): 
    - Normal: 31114 (%)
    - Generic: 23054 (%)
    - Exploits: 4791 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1500 (%)

- UNSW-NB15-Train-Basic-Part4 (): 
    - Normal: 31115 (%)
    - Generic: 23053 (%)
    - Exploits: 4791 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1500 (%)

- UNSW-NB15-Train-Basic-Part5 (): 
    - Normal: 31114 (%)
    - Generic: 23054 (%)
    - Exploits: 4791 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1500 (%)

- UNSW-NB15-Train-Basic-Part6 (): 
    - Normal: 31114 (%)
    - Generic: 23053 (%)
    - Exploits: 4791 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1501 (%)

- UNSW-NB15-Train-Basic-Part7 (): 
    - Normal: 31114 (%)
    - Generic: 23053 (%)
    - Exploits: 4791 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1501 (%)



In [4]:
# Create the 7A partitions: each traffic class is filtered out of `complete`
# once and then cut into seven consecutive, non-overlapping positional ranges
# (one per node). Every *_bounds list holds the iloc cut points; the trailing
# None leaves the last range open-ended, so slice 7 takes all remaining rows.

# Normal traffic (label == 0); slice lengths 31114, 31115, 31114, 31115, 31114, 31114, rest
normal_rows = complete[complete['label'] == 0]
normal_bounds = [0, 31114, 62229, 93343, 124458, 155572, 186686, None]
normal1, normal2, normal3, normal4, normal5, normal6, normal7 = (
    normal_rows.iloc[lo:hi] for lo, hi in zip(normal_bounds, normal_bounds[1:])
)

# Generic attacks; slice lengths 23054, 23053, 23054, 23053, 23054, 23053, rest
generic_rows = complete[complete['attack_cat'] == "generic"]
generic_bounds = [0, 23054, 46107, 69161, 92214, 115268, 138321, None]
generic1, generic2, generic3, generic4, generic5, generic6, generic7 = (
    generic_rows.iloc[lo:hi] for lo, hi in zip(generic_bounds, generic_bounds[1:])
)

# Exploits; slice lengths 4790, then 4791 five times, rest
exploits_rows = complete[complete['attack_cat'] == "exploits"]
exploits_bounds = [0, 4790, 9581, 14372, 19163, 23954, 28745, None]
exploits1, exploits2, exploits3, exploits4, exploits5, exploits6, exploits7 = (
    exploits_rows.iloc[lo:hi] for lo, hi in zip(exploits_bounds, exploits_bounds[1:])
)

# DoS; six slices of 1758 rows, rest
dos_rows = complete[complete['attack_cat'] == "dos"]
dos_bounds = [0, 1758, 3516, 5274, 7032, 8790, 10548, None]
dos1, dos2, dos3, dos4, dos5, dos6, dos7 = (
    dos_rows.iloc[lo:hi] for lo, hi in zip(dos_bounds, dos_bounds[1:])
)

# Reconnaissance; slice lengths 1501, 1500, 1500, 1500, 1500, 1501, rest
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]
recon_bounds = [0, 1501, 3001, 4501, 6001, 7501, 9002, None]
recon1, recon2, recon3, recon4, recon5, recon6, recon7 = (
    recon_rows.iloc[lo:hi] for lo, hi in zip(recon_bounds, recon_bounds[1:])
)

In [5]:

# Build the seven 7A nodes: node k stacks the k-th slice of every class, in the
# order normal, generic, exploits, DoS, reconnaissance.
part1, part2, part3, part4, part5, part6, part7 = (
    pd.concat(list(node_slices))
    for node_slices in zip(
        (normal1, normal2, normal3, normal4, normal5, normal6, normal7),
        (generic1, generic2, generic3, generic4, generic5, generic6, generic7),
        (exploits1, exploits2, exploits3, exploits4, exploits5, exploits6, exploits7),
        (dos1, dos2, dos3, dos4, dos5, dos6, dos7),
        (recon1, recon2, recon3, recon4, recon5, recon6, recon7),
    )
)

# Write each node to its own CSV (Part1 .. Part7) without the index column.
out_dir_7a = 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/'
for node_no, node_frame in enumerate((part1, part2, part3, part4, part5, part6, part7), start=1):
    node_frame.to_csv(f'{out_dir_7a}UNSW-NB15-Train-Basic-7A-Part{node_no}.csv', index=False)

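### id = 7B : Partition with 7 balanced nodes 

Every node holds the same number of rows (62217), but each attack category is missing from one node: Exploits from Part1, Reconnaissance from Part2, DoS from Part6 and Generic from Part7. 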



- UNSW-NB15-Train-Basic-Part1 (): 
    - Normal: 31814 (%)
    - Generic: 26895 (%)
    - Exploits: 0 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1750 (%)

- UNSW-NB15-Train-Basic-Part2 (): 
    - Normal: 27974 (%)
    - Generic: 26895 (%)
    - Exploits: 5590 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 0 (%)

- UNSW-NB15-Train-Basic-Part3 (): 
    - Normal: 26224 (%)
    - Generic: 26896 (%)
    - Exploits: 5589 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1750 (%)

- UNSW-NB15-Train-Basic-Part4 (): 
    - Normal: 26224 (%)
    - Generic: 26896 (%)
    - Exploits: 5589 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1750 (%)

- UNSW-NB15-Train-Basic-Part5 (): 
    - Normal: 24465 (%)
    - Generic: 26896 (%)
    - Exploits: 5589 (%)
    - DoS: 3516 (%)
    - Reconnaissance: 1751 (%)

- UNSW-NB15-Train-Basic-Part6 (): 
    - Normal: 27981 (%)
    - Generic: 26896 (%)
    - Exploits: 5589 (%)
    - DoS: 0 (%)
    - Reconnaissance: 1751 (%)

- UNSW-NB15-Train-Basic-Part7 (): 
    - Normal: 53118 (%)
    - Generic: 0 (%)
    - Exploits: 5590 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1751 (%)



In [6]:
# Create the 7B slices: each traffic class is filtered out of `complete` once
# and cut into consecutive positional ranges. Every *_bounds list holds the
# iloc cut points; the trailing None leaves the last range open-ended.

# Normal traffic (label == 0); slice lengths 31814, 27974, 26224, 26224, 24465, 27981, rest
normal_rows = complete[complete['label'] == 0]
normal_bounds = [0, 31814, 59788, 86012, 112236, 136701, 164682, None]
normal1, normal2, normal3, normal4, normal5, normal6, normal7 = (
    normal_rows.iloc[lo:hi] for lo, hi in zip(normal_bounds, normal_bounds[1:])
)

# Generic attacks; slice lengths 26895, 26895, then 26896 four times.
# generic7 is whatever remains after the sixth slice; the 7B concat cell does
# not use it (the markdown above lists 0 generic rows for Part7).
generic_rows = complete[complete['attack_cat'] == "generic"]
generic_bounds = [0, 26895, 53790, 80686, 107582, 134478, 161374, None]
generic1, generic2, generic3, generic4, generic5, generic6, generic7 = (
    generic_rows.iloc[lo:hi] for lo, hi in zip(generic_bounds, generic_bounds[1:])
)

# Exploits, six slices; lengths 5590, then 5589 four times, rest
exploits_rows = complete[complete['attack_cat'] == "exploits"]
exploits_bounds = [0, 5590, 11179, 16768, 22357, 27946, None]
exploits1, exploits2, exploits3, exploits4, exploits5, exploits6 = (
    exploits_rows.iloc[lo:hi] for lo, hi in zip(exploits_bounds, exploits_bounds[1:])
)

# DoS, six slices; four of 1758 rows, one double-sized slice of 3516 rows, rest
dos_rows = complete[complete['attack_cat'] == "dos"]
dos_bounds = [0, 1758, 3516, 5274, 7032, 10548, None]
dos1, dos2, dos3, dos4, dos5, dos6 = (
    dos_rows.iloc[lo:hi] for lo, hi in zip(dos_bounds, dos_bounds[1:])
)

# Reconnaissance, six slices; lengths 1750, 1750, 1750, 1751, 1751, rest
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]
recon_bounds = [0, 1750, 3500, 5250, 7001, 8752, None]
recon1, recon2, recon3, recon4, recon5, recon6 = (
    recon_rows.iloc[lo:hi] for lo, hi in zip(recon_bounds, recon_bounds[1:])
)

In [7]:
# Composition of the seven 7B nodes. Some nodes deliberately skip a class:
# node 1 has no exploits, node 2 no reconnaissance, node 6 no DoS and node 7
# no generic slice. The order inside each list is the row order of the output.
node_slices_7b = [
    [normal1, generic1, dos1, recon1],
    [normal2, generic2, exploits1, dos2],
    [normal3, generic3, exploits2, dos3, recon2],
    [normal4, generic4, exploits3, dos4, recon3],
    [normal5, generic5, exploits4, dos5, recon4],
    [normal6, generic6, exploits5, recon5],
    [normal7, dos6, exploits6, recon6],
]
part1, part2, part3, part4, part5, part6, part7 = (
    pd.concat(slices) for slices in node_slices_7b
)

# Write each node to its own CSV (Part1 .. Part7) without the index column.
out_dir_7b = 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/'
for node_no, node_frame in enumerate((part1, part2, part3, part4, part5, part6, part7), start=1):
    node_frame.to_csv(f'{out_dir_7b}UNSW-NB15-Train-Basic-7B-Part{node_no}.csv', index=False)

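### id = 7C : Partition with 7 unbalanced nodes 

The attack slices are the same as in 7B, but normal traffic is spread unevenly across the nodes (Part4 receives none), so the node sizes differ. 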



- UNSW-NB15-Train-Basic-Part1 (): 
    - Normal: 10000 (%)
    - Generic: 26895 (%)
    - Exploits: 0 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1750 (%)

- UNSW-NB15-Train-Basic-Part2 (): 
    - Normal: 50000 (%)
    - Generic: 26895 (%)
    - Exploits: 5590 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 0 (%)

- UNSW-NB15-Train-Basic-Part3 (): 
    - Normal: 20000 (%)
    - Generic: 26896 (%)
    - Exploits: 5589 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1750 (%)

- UNSW-NB15-Train-Basic-Part4 (): 
    - Normal: 0 (%)
    - Generic: 26896 (%)
    - Exploits: 5589 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1750 (%)

- UNSW-NB15-Train-Basic-Part5 (): 
    - Normal: 75000 (%)
    - Generic: 26896 (%)
    - Exploits: 5589 (%)
    - DoS: 3516 (%)
    - Reconnaissance: 1751 (%)

- UNSW-NB15-Train-Basic-Part6 (): 
    - Normal: 52800 (%)
    - Generic: 26896 (%)
    - Exploits: 5589 (%)
    - DoS: 0 (%)
    - Reconnaissance: 1751 (%)

- UNSW-NB15-Train-Basic-Part7 (): 
    - Normal: 10000 (%)
    - Generic: 0 (%)
    - Exploits: 5590 (%)
    - DoS: 1758 (%)
    - Reconnaissance: 1751 (%)



In [8]:
# Create the 7C slices: each traffic class is filtered out of `complete` once
# and cut into consecutive positional ranges. Every *_bounds list holds the
# iloc cut points; the trailing None leaves the last range open-ended.

# Normal traffic (label == 0), six slices; lengths 10000, 50000, 20000, 75000, 52800, rest
normal_rows = complete[complete['label'] == 0]
normal_bounds = [0, 10000, 60000, 80000, 155000, 207800, None]
normal1, normal2, normal3, normal4, normal5, normal6 = (
    normal_rows.iloc[lo:hi] for lo, hi in zip(normal_bounds, normal_bounds[1:])
)


# Generic attacks; slice lengths 26895, 26895, then 26896 four times.
# generic7 is whatever remains after the sixth slice; the 7C concat cell does
# not use it (the markdown above lists 0 generic rows for Part7).
generic_rows = complete[complete['attack_cat'] == "generic"]
generic_bounds = [0, 26895, 53790, 80686, 107582, 134478, 161374, None]
generic1, generic2, generic3, generic4, generic5, generic6, generic7 = (
    generic_rows.iloc[lo:hi] for lo, hi in zip(generic_bounds, generic_bounds[1:])
)

# Exploits, six slices; lengths 5590, then 5589 four times, rest
exploits_rows = complete[complete['attack_cat'] == "exploits"]
exploits_bounds = [0, 5590, 11179, 16768, 22357, 27946, None]
exploits1, exploits2, exploits3, exploits4, exploits5, exploits6 = (
    exploits_rows.iloc[lo:hi] for lo, hi in zip(exploits_bounds, exploits_bounds[1:])
)

# DoS, six slices; four of 1758 rows, one double-sized slice of 3516 rows, rest
dos_rows = complete[complete['attack_cat'] == "dos"]
dos_bounds = [0, 1758, 3516, 5274, 7032, 10548, None]
dos1, dos2, dos3, dos4, dos5, dos6 = (
    dos_rows.iloc[lo:hi] for lo, hi in zip(dos_bounds, dos_bounds[1:])
)

# Reconnaissance, six slices; lengths 1750, 1750, 1750, 1751, 1751, rest
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]
recon_bounds = [0, 1750, 3500, 5250, 7001, 8752, None]
recon1, recon2, recon3, recon4, recon5, recon6 = (
    recon_rows.iloc[lo:hi] for lo, hi in zip(recon_bounds, recon_bounds[1:])
)

In [9]:
# Composition of the seven 7C nodes. Node 4 gets no normal traffic, so the
# normal slice index lags the node number by one from node 5 onwards. As for
# the attack classes: node 1 has no exploits, node 2 no reconnaissance,
# node 6 no DoS and node 7 no generic slice. The order inside each list is the
# row order of the output.
node_slices_7c = [
    [normal1, generic1, dos1, recon1],
    [normal2, generic2, exploits1, dos2],
    [normal3, generic3, exploits2, dos3, recon2],
    [generic4, exploits3, dos4, recon3],
    [normal4, generic5, exploits4, dos5, recon4],
    [normal5, generic6, exploits5, recon5],
    [normal6, dos6, exploits6, recon6],
]
part1, part2, part3, part4, part5, part6, part7 = (
    pd.concat(slices) for slices in node_slices_7c
)

# Write each node to its own CSV (Part1 .. Part7) without the index column.
out_dir_7c = 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/'
for node_no, node_frame in enumerate((part1, part2, part3, part4, part5, part6, part7), start=1):
    node_frame.to_csv(f'{out_dir_7c}UNSW-NB15-Train-Basic-7C-Part{node_no}.csv', index=False)