## Partitioning UNSW-NB15-Train-Basic into 5 nodes 

The dataset is split into 5 nodes; the resulting partitions can be balanced or unbalanced. Each attack category may appear in every node or only in a subset of them. 

In [1]:
import numpy as np  # for array
import pandas as pd  # for csv files and dataframe
import matplotlib.pyplot as plt  # for plotting
import seaborn as sns  # plotting
from scipy import stats

import pickle  # To load data int disk

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.metrics import auc, f1_score, roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_predict

In [2]:
# Read the UNSW-NB15-Train-Basic CSV into the DataFrame that every partition below is cut from
train_basic_csv = 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic.csv'
complete = pd.read_csv(train_basic_csv)

In [3]:
def filter(dataset):
    """Return the rows of `dataset` whose 'state' and 'proto' values are both allowed.

    Allowed states: PAR, ACC, ECO, CON, FIN, INT, REQ, RST.
    Allowed protocols: igmp, arp, icmp, udp, tcp, ipv6-icmp, rarp.

    The input frame is not modified and the surviving rows keep their
    original index labels and order.

    NOTE: this name shadows Python's built-in `filter` inside the notebook.
    """
    allowed_states = ['PAR', 'ACC', 'ECO', 'CON', 'FIN', 'INT', 'REQ', 'RST']
    allowed_protos = ['igmp', 'arp', 'icmp', 'udp', 'tcp', 'ipv6-icmp', 'rarp']
    # A row is kept only when it passes both whitelists
    keep_row = dataset['state'].isin(allowed_states) & dataset['proto'].isin(allowed_protos)
    return dataset[keep_row]

In [4]:
# Keep only the rows with a whitelisted state and protocol; `complete` is rebound to the filtered frame
complete = filter(complete)

### id = 5A : Partition with 5 balanced nodes 

All traffic categories are represented in each of the 5 nodes. 


- UNSW-NB15-Train-Basic-Part1 (77295): 
    - Normal: 38647 (%)
    - Generic: 32008 (%)
    - Exploits: 4274 (%)
    - DoS: 583 (%)
    - Reconnaissance: 1783 (%)

- UNSW-NB15-Train-Basic-Part2 (77295): 
    - Normal: 38647 (%)
    - Generic: 32008 (%)
    - Exploits: 4274 (%)
    - DoS: 583 (%)
    - Reconnaissance: 1783 (%)

- UNSW-NB15-Train-Basic-Part3 (77294): 
    - Normal: 38647 (%)
    - Generic: 32008 (%)
    - Exploits: 4274 (%)
    - DoS: 583 (%)
    - Reconnaissance: 1782 (%)

- UNSW-NB15-Train-Basic-Part4 (77294): 
    - Normal: 38647 (%)
    - Generic: 32008 (%)
    - Exploits: 4274 (%)
    - DoS: 583 (%)
    - Reconnaissance: 1782 (%)

- UNSW-NB15-Train-Basic-Part5 (77294): 
    - Normal: 38648 (%)
    - Generic: 32009 (%)
    - Exploits: 4273 (%)
    - DoS: 582 (%)
    - Reconnaissance: 1782 (%)



In [6]:
# Filt5A slices: every traffic class is cut into five consecutive positional
# chunks (one per node), in the row order of `complete`.

# Select each class once, then slice it
normal_rows = complete[complete['label'] == 0]
generic_rows = complete[complete['attack_cat'] == "generic"]
exploit_rows = complete[complete['attack_cat'] == "exploits"]
dos_rows = complete[complete['attack_cat'] == "dos"]
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]

normal1 = normal_rows.iloc[:38647]
normal2 = normal_rows.iloc[38647:38647*2]
normal3 = normal_rows.iloc[38647*2:38647*3]
normal4 = normal_rows.iloc[38647*3:38647*4]
normal5 = normal_rows.iloc[38647*4:(38647*5+1)]

generic1 = generic_rows.iloc[:32008]
generic2 = generic_rows.iloc[32008:32008*2]
generic3 = generic_rows.iloc[32008*2:32008*3]
generic4 = generic_rows.iloc[32008*3:32008*4]
generic5 = generic_rows.iloc[32008*4:]  # remainder

exploits1 = exploit_rows.iloc[:4274]
exploits2 = exploit_rows.iloc[4274:4274*2]
exploits3 = exploit_rows.iloc[4274*2:4274*3]
exploits4 = exploit_rows.iloc[4274*3:4274*4]
exploits5 = exploit_rows.iloc[4274*4:]  # remainder

dos1 = dos_rows.iloc[:583]
dos2 = dos_rows.iloc[583:(583*2)]
dos3 = dos_rows.iloc[(583*2):(583*3)]
dos4 = dos_rows.iloc[(583*3):(583*4)]
dos5 = dos_rows.iloc[(583*4):]  # remainder

# Chunks 1-2 hold 1783 rows, chunks 3-4 hold 1782, chunk 5 takes the remainder
recon1 = recon_rows.iloc[:1783]
recon2 = recon_rows.iloc[1783:1783*2]
recon3 = recon_rows.iloc[1783*2:1783*3-1]
recon4 = recon_rows.iloc[1783*3-1:(1783*4-2)]
recon5 = recon_rows.iloc[(1783*4-2):]

In [7]:
# Assemble the five Filt5A node datasets (one slice of every class per node)
part1, part2, part3, part4, part5 = (
    pd.concat(class_slices)
    for class_slices in (
        [normal1, generic1, exploits1, dos1, recon1],
        [normal2, generic2, exploits2, dos2, recon2],
        [normal3, generic3, exploits3, dos3, recon3],
        [normal4, generic4, exploits4, dos4, recon4],
        [normal5, generic5, exploits5, dos5, recon5],
    )
)

# Write each node to UNSW-NB15-Train-Basic-Filt5A-PartN.csv, N = 1..5
for node_no, node_df in enumerate((part1, part2, part3, part4, part5), start=1):
    node_df.to_csv(f'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt5A-Part{node_no}.csv', index=False)

### id = 5B : Partition with 5 balanced nodes 



- UNSW-NB15-Train-Basic-Part1 (77294): 
    - Normal: 38647 (%)
    - Generic: 31459 (%)
    - Exploits: 4274 (%)
    - DoS: 2914 (%)
    - Reconnaissance: 0 (%)

- UNSW-NB15-Train-Basic-Part2 (77294): 
    - Normal: 38647 (%)
    - Generic: 32145 (%)
    - Exploits: 4274 (%)
    - DoS: 0 (%)
    - Reconnaissance: 2228 (%)

- UNSW-NB15-Train-Basic-Part3 (77295): 
    - Normal: 70793 (%)
    - Generic: 0 (%)
    - Exploits: 4274 (%)
    - DoS: 0 (%)
    - Reconnaissance: 2228 (%)

- UNSW-NB15-Train-Basic-Part4 (77295): 
    - Normal: 0 (%)
    - Generic: 70793 (%)
    - Exploits: 4274 (%)
    - DoS: 0 (%)
    - Reconnaissance: 2228 (%)

- UNSW-NB15-Train-Basic-Part5 (77294): 
    - Normal: 45149 (%)
    - Generic: 25644 (%)
    - Exploits: 4273 (%)
    - DoS: 0 (%)
    - Reconnaissance: 2228 (%)



In [13]:
# Filt5B slices: consecutive positional chunks per class, with chunk sizes
# chosen per class because not every class goes to every node.

# Select each class once, then slice it
normal_rows = complete[complete['label'] == 0]
generic_rows = complete[complete['attack_cat'] == "generic"]
exploit_rows = complete[complete['attack_cat'] == "exploits"]
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]

normal1 = normal_rows.iloc[:38647]
normal2 = normal_rows.iloc[38647:38647*2]
normal3 = normal_rows.iloc[38647*2:(38647*2+70793)]
normal4 = normal_rows.iloc[(38647*2+70793):(38647*2+70793+45149)]

generic1 = generic_rows.iloc[:31459]
generic2 = generic_rows.iloc[31459:(31459+32145)]
generic3 = generic_rows.iloc[(31459+32145):(31459+32145+70793)]
generic4 = generic_rows.iloc[(31459+32145+70793):]  # remainder

exploits1 = exploit_rows.iloc[:4274]
exploits2 = exploit_rows.iloc[4274*1:4274*2]
exploits3 = exploit_rows.iloc[4274*2:4274*3]
exploits4 = exploit_rows.iloc[4274*3:4274*4]
exploits5 = exploit_rows.iloc[4274*4:]  # remainder

# All DoS rows are kept together in a single slice
dos1 = complete[complete['attack_cat'] == "dos"]

recon1 = recon_rows.iloc[:2228]
recon2 = recon_rows.iloc[2228:(2228*2)]
recon3 = recon_rows.iloc[(2228*2):(2228*3)]
recon4 = recon_rows.iloc[(2228*3):]  # remainder

In [14]:
# Assemble the five Filt5B node datasets; each node only receives some of the classes
part1, part2, part3, part4, part5 = (
    pd.concat(class_slices)
    for class_slices in (
        [normal1, generic1, exploits1, dos1],
        [normal2, generic2, exploits2, recon1],
        [normal3, exploits3, recon2],
        [generic3, exploits4, recon3],
        [normal4, generic4, exploits5, recon4],
    )
)

# Write each node to UNSW-NB15-Train-Basic-Filt5B-PartN.csv, N = 1..5
for node_no, node_df in enumerate((part1, part2, part3, part4, part5), start=1):
    node_df.to_csv(f'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt5B-Part{node_no}.csv', index=False)

### id = 5C : Partition with 5 unbalanced nodes 

- UNSW-NB15-Train-Basic-Part1 (): 
    - Normal: 24154 (%)
    - Generic: 7865 (%)
    - Exploits: 5342 (%)
    - DoS: 583 (%)
    - Reconnaissance: 2228 (%)

- UNSW-NB15-Train-Basic-Part2 (): 
    - Normal: 24154 (%)
    - Generic: 7865 (%)
    - Exploits: 0 (%)
    - DoS: 583  (%)
    - Reconnaissance: 2228 (%)

- UNSW-NB15-Train-Basic-Part3 (): 
    - Normal: 48308 (%)
    - Generic: 7865 (%)
    - Exploits: 5342 (%)
    - DoS: 583  (%)
    - Reconnaissance: 2228 (%)

- UNSW-NB15-Train-Basic-Part4 (): 
    - Normal: 0 (%)
    - Generic: 60000 (%)
    - Exploits: 5342 (%)
    - DoS: 583 (%)
    - Reconnaissance: 2228 (%)

- UNSW-NB15-Train-Basic-Part5 (): 
    - Normal: 96620 (%)
    - Generic: 76446 (%)
    - Exploits: 5343 (%)
    - DoS: 582 (%)
    - Reconnaissance: 0 (%)



In [15]:
# Filt5C slices: consecutive positional chunks per class with deliberately
# uneven sizes, so that the resulting nodes are unbalanced.

# Select each class once, then slice it
normal_rows = complete[complete['label'] == 0]
generic_rows = complete[complete['attack_cat'] == "generic"]
exploit_rows = complete[complete['attack_cat'] == "exploits"]
dos_rows = complete[complete['attack_cat'] == "dos"]
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]

normal1 = normal_rows.iloc[:24154]
normal2 = normal_rows.iloc[24154:24154*2]
normal3 = normal_rows.iloc[24154*2:(24154*2+48308)]
normal4 = normal_rows.iloc[(24154*2+48308):(24154*2+48308+96620)]

generic1 = generic_rows.iloc[:7865]
generic2 = generic_rows.iloc[7865:(7865*2)]
generic3 = generic_rows.iloc[(7865*2):(7865*3)]
generic4 = generic_rows.iloc[(7865*3):(7865*3+60000)]
generic5 = generic_rows.iloc[(7865*3+60000):]  # remainder

exploits1 = exploit_rows.iloc[:5342]
exploits2 = exploit_rows.iloc[5342:(5342*2)]
exploits3 = exploit_rows.iloc[((5342*2)):(5342*3)]
exploits4 = exploit_rows.iloc[(5342*3):]  # remainder

dos1 = dos_rows.iloc[:583]
dos2 = dos_rows.iloc[583:(583*2)]
dos3 = dos_rows.iloc[(583*2):(583*3)]
dos4 = dos_rows.iloc[(583*3):(583*4)]
dos5 = dos_rows.iloc[(583*4):]  # remainder

recon1 = recon_rows.iloc[:2228]
recon2 = recon_rows.iloc[2228:(2228*2)]
recon3 = recon_rows.iloc[(2228*2):(2228*3)]
recon4 = recon_rows.iloc[(2228*3):]  # remainder

In [16]:
# Assemble the five Filt5C node datasets; some nodes skip a class entirely
part1, part2, part3, part4, part5 = (
    pd.concat(class_slices)
    for class_slices in (
        [normal1, generic1, exploits1, dos1, recon1],
        [normal2, generic2, dos2, recon2],
        [normal3, generic3, exploits2, dos3, recon3],
        [generic4, exploits3, dos4, recon4],
        [normal4, generic5, exploits4, dos5],
    )
)

# Write each node to UNSW-NB15-Train-Basic-Filt5C-PartN.csv, N = 1..5
for node_no, node_df in enumerate((part1, part2, part3, part4, part5), start=1):
    node_df.to_csv(f'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt5C-Part{node_no}.csv', index=False)

### Classification datasets 

Normal traffic is excluded from these datasets, because they feed the models that categorize attacks (which run only after the detection model has flagged an attack). The percentages and the distribution of attacks are documented in the "datasets.xlsx" file. 

### CAT5A

In [7]:

# Cat5A slices: attack classes only, each cut into five consecutive positional chunks.

# Select each attack class once, then slice it
generic_rows = complete[complete['attack_cat'] == "generic"]
exploit_rows = complete[complete['attack_cat'] == "exploits"]
dos_rows = complete[complete['attack_cat'] == "dos"]
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]

generic1 = generic_rows.iloc[:32008]
generic2 = generic_rows.iloc[32008:32008*2]
generic3 = generic_rows.iloc[32008*2:32008*3]
generic4 = generic_rows.iloc[32008*3:32008*4]
generic5 = generic_rows.iloc[32008*4:]  # remainder

exploits1 = exploit_rows.iloc[:4274]
exploits2 = exploit_rows.iloc[4274:4274*2]
exploits3 = exploit_rows.iloc[4274*2:4274*3]
exploits4 = exploit_rows.iloc[4274*3:4274*4]
exploits5 = exploit_rows.iloc[4274*4:]  # remainder

dos1 = dos_rows.iloc[:583]
dos2 = dos_rows.iloc[583:(583*2)]
dos3 = dos_rows.iloc[(583*2):(583*3)]
dos4 = dos_rows.iloc[(583*3):(583*4)]
dos5 = dos_rows.iloc[(583*4):]  # remainder

# Chunks 1-3 hold 1782 rows, chunk 4 holds 1783, chunk 5 takes the remainder
recon1 = recon_rows.iloc[:1782]
recon2 = recon_rows.iloc[1782:1782*2]
recon3 = recon_rows.iloc[1782*2:1782*3]
recon4 = recon_rows.iloc[1782*3:(1782*4+1)]
recon5 = recon_rows.iloc[(1782*4+1):]

In [8]:
# Assemble the five Cat5A node datasets (attack classes only)
part1, part2, part3, part4, part5 = (
    pd.concat(class_slices)
    for class_slices in (
        [generic1, exploits1, dos1, recon1],
        [generic2, exploits2, dos2, recon2],
        [generic3, exploits3, dos3, recon3],
        [generic4, exploits4, dos4, recon4],
        [generic5, exploits5, dos5, recon5],
    )
)

# Write each node to UNSW-NB15-Train-Basic-Cat5A-PartN.csv, N = 1..5
for node_no, node_df in enumerate((part1, part2, part3, part4, part5), start=1):
    node_df.to_csv(f'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat5A-Part{node_no}.csv', index=False)

### CAT5B

In [9]:

# Cat5B slices: attack classes only; 600-row chunks for generic, exploits and
# reconnaissance, 583-row chunks (plus remainder) for DoS.

# Select each attack class once, then slice it
generic_rows = complete[complete['attack_cat'] == "generic"]
exploit_rows = complete[complete['attack_cat'] == "exploits"]
dos_rows = complete[complete['attack_cat'] == "dos"]
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]

generic1 = generic_rows.iloc[:600]
generic2 = generic_rows.iloc[600:600*2]
generic3 = generic_rows.iloc[600*2:600*3]
generic4 = generic_rows.iloc[600*3:600*4]
generic5 = generic_rows.iloc[600*4:600*5]

exploits1 = exploit_rows.iloc[:600]
exploits2 = exploit_rows.iloc[600:600*2]
exploits3 = exploit_rows.iloc[600*2:600*3]
exploits4 = exploit_rows.iloc[600*3:600*4]
exploits5 = exploit_rows.iloc[600*4:600*5]

dos1 = dos_rows.iloc[:583]
dos2 = dos_rows.iloc[583:(583*2)]
dos3 = dos_rows.iloc[(583*2):(583*3)]
dos4 = dos_rows.iloc[(583*3):(583*4)]
dos5 = dos_rows.iloc[(583*4):]  # remainder

recon1 = recon_rows.iloc[:600]
recon2 = recon_rows.iloc[600:600*2]
recon3 = recon_rows.iloc[600*2:600*3]
recon4 = recon_rows.iloc[600*3:600*4]
recon5 = recon_rows.iloc[600*4:600*5]

In [10]:
# Assemble the five Cat5B node datasets (attack classes only)
part1, part2, part3, part4, part5 = (
    pd.concat(class_slices)
    for class_slices in (
        [generic1, exploits1, dos1, recon1],
        [generic2, exploits2, dos2, recon2],
        [generic3, exploits3, dos3, recon3],
        [generic4, exploits4, dos4, recon4],
        [generic5, exploits5, dos5, recon5],
    )
)

# Write each node to UNSW-NB15-Train-Basic-Cat5B-PartN.csv, N = 1..5
for node_no, node_df in enumerate((part1, part2, part3, part4, part5), start=1):
    node_df.to_csv(f'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat5B-Part{node_no}.csv', index=False)

### CAT5C

In [11]:

# Cat5C slices: attack classes only, with uneven chunk sizes so that each node
# ends up holding just two attack classes.

# Select each attack class once, then slice it
generic_rows = complete[complete['attack_cat'] == "generic"]
exploit_rows = complete[complete['attack_cat'] == "exploits"]
dos_rows = complete[complete['attack_cat'] == "dos"]
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]

generic1 = generic_rows.iloc[:1457]
generic2 = generic_rows.iloc[1457:(1457+3728)]
generic3 = generic_rows.iloc[(1457+3728):(1457+3728+17642)]

exploits1 = exploit_rows.iloc[:3727]
exploits2 = exploit_rows.iloc[3727:(17642+3727)]

dos1 = dos_rows.iloc[:1457]
dos2 = dos_rows.iloc[1457:]  # remainder

recon1 = recon_rows.iloc[:1457]
recon2 = recon_rows.iloc[(1457):(1457+3727)]
recon3 = recon_rows.iloc[(1457+3727):(1457+3727+3728)]


In [12]:
# Assemble the five Cat5C node datasets; every node combines two attack classes
part1, part2, part3, part4, part5 = (
    pd.concat(class_slices)
    for class_slices in (
        [generic1, dos1],
        [dos2, recon1],
        [exploits1, recon2],
        [generic2,recon3],
        [generic3, exploits2],
    )
)

# Write each node to UNSW-NB15-Train-Basic-Cat5C-PartN.csv, N = 1..5
for node_no, node_df in enumerate((part1, part2, part3, part4, part5), start=1):
    node_df.to_csv(f'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat5C-Part{node_no}.csv', index=False)

### Corrupted datasets
Corruption of Filt5B node 1 and Filt5C node 1 and node 3 -> percentages of 5, 10 and 25

#### Corr5A

In [5]:
# Corr5A slices: the same five-way positional split per class that the Filt5A
# partition uses; node 1 is corrupted in the cells that follow.

# Select each class once, then slice it
normal_rows = complete[complete['label'] == 0]
generic_rows = complete[complete['attack_cat'] == "generic"]
exploit_rows = complete[complete['attack_cat'] == "exploits"]
dos_rows = complete[complete['attack_cat'] == "dos"]
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]

normal1 = normal_rows.iloc[:38647]
normal2 = normal_rows.iloc[38647:38647*2]
normal3 = normal_rows.iloc[38647*2:38647*3]
normal4 = normal_rows.iloc[38647*3:38647*4]
normal5 = normal_rows.iloc[38647*4:(38647*5+1)]

generic1 = generic_rows.iloc[:32008]
generic2 = generic_rows.iloc[32008:32008*2]
generic3 = generic_rows.iloc[32008*2:32008*3]
generic4 = generic_rows.iloc[32008*3:32008*4]
generic5 = generic_rows.iloc[32008*4:]  # remainder

exploits1 = exploit_rows.iloc[:4274]
exploits2 = exploit_rows.iloc[4274:4274*2]
exploits3 = exploit_rows.iloc[4274*2:4274*3]
exploits4 = exploit_rows.iloc[4274*3:4274*4]
exploits5 = exploit_rows.iloc[4274*4:]  # remainder

dos1 = dos_rows.iloc[:583]
dos2 = dos_rows.iloc[583:(583*2)]
dos3 = dos_rows.iloc[(583*2):(583*3)]
dos4 = dos_rows.iloc[(583*3):(583*4)]
dos5 = dos_rows.iloc[(583*4):]  # remainder

# Chunks 1-2 hold 1783 rows, chunks 3-4 hold 1782, chunk 5 takes the remainder
recon1 = recon_rows.iloc[:1783]
recon2 = recon_rows.iloc[1783:1783*2]
recon3 = recon_rows.iloc[1783*2:1783*3-1]
recon4 = recon_rows.iloc[1783*3-1:(1783*4-2)]
recon5 = recon_rows.iloc[(1783*4-2):]

In [6]:
# Assemble the five uncorrupted node datasets from their per-class slices
# (nothing is written to disk in this cell)
part1, part2, part3, part4, part5 = (
    pd.concat(class_slices)
    for class_slices in (
        [normal1, generic1, exploits1, dos1, recon1],
        [normal2, generic2, exploits2, dos2, recon2],
        [normal3, generic3, exploits3, dos3, recon3],
        [normal4, generic4, exploits4, dos4, recon4],
        [normal5, generic5, exploits5, dos5, recon5],
    )
)


In [7]:
# Label-flip corruption of node 1: a fixed-seed random 2.5% of the label-0 rows
# and 2.5% of the label-1 rows of part1 get their labels swapped.
flip_frac = 0.025

# Label-0 rows relabelled as attacks of category "generic"
part1_sample_label0 = (
    part1.loc[part1['label'] == 0]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Label-1 rows relabelled as normal traffic
part1_sample_label1 = (
    part1.loc[part1['label'] == 1]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Rows that were not sampled come first, followed by the two relabelled samples
untouched = part1.drop(part1_sample_label0.index).drop(part1_sample_label1.index)
part1_concatenated = pd.concat([untouched, part1_sample_label0, part1_sample_label1])

# Shuffle all rows with a fixed seed
part1_changed = part1_concatenated.sample(frac=1, random_state=42)

In [8]:
# Export the corrupted node 1 together with the unmodified nodes 2-5
for node_no, node_df in enumerate((part1_changed, part2, part3, part4, part5), start=1):
    node_df.to_csv(f'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%15A-Part{node_no}.csv', index=False)

In [9]:
# Rebuild the five uncorrupted node datasets from their per-class slices
# (nothing is written to disk in this cell)
part1, part2, part3, part4, part5 = (
    pd.concat(class_slices)
    for class_slices in (
        [normal1, generic1, exploits1, dos1, recon1],
        [normal2, generic2, exploits2, dos2, recon2],
        [normal3, generic3, exploits3, dos3, recon3],
        [normal4, generic4, exploits4, dos4, recon4],
        [normal5, generic5, exploits5, dos5, recon5],
    )
)


In [10]:
# Label-flip corruption of node 1: a fixed-seed random 5% of the label-0 rows
# and 5% of the label-1 rows of part1 get their labels swapped.
flip_frac = 0.05

# Label-0 rows relabelled as attacks of category "generic"
part1_sample_label0 = (
    part1.loc[part1['label'] == 0]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Label-1 rows relabelled as normal traffic
part1_sample_label1 = (
    part1.loc[part1['label'] == 1]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Rows that were not sampled come first, followed by the two relabelled samples
untouched = part1.drop(part1_sample_label0.index).drop(part1_sample_label1.index)
part1_concatenated = pd.concat([untouched, part1_sample_label0, part1_sample_label1])

# Shuffle all rows with a fixed seed
part1_changed = part1_concatenated.sample(frac=1, random_state=42)

In [11]:
# Export the corrupted node 1 together with the unmodified nodes 2-5
for node_no, node_df in enumerate((part1_changed, part2, part3, part4, part5), start=1):
    node_df.to_csv(f'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr10%15A-Part{node_no}.csv', index=False)

In [12]:
# Rebuild the five uncorrupted node datasets from their per-class slices
# (nothing is written to disk in this cell)
part1, part2, part3, part4, part5 = (
    pd.concat(class_slices)
    for class_slices in (
        [normal1, generic1, exploits1, dos1, recon1],
        [normal2, generic2, exploits2, dos2, recon2],
        [normal3, generic3, exploits3, dos3, recon3],
        [normal4, generic4, exploits4, dos4, recon4],
        [normal5, generic5, exploits5, dos5, recon5],
    )
)


In [13]:
# Label-flip corruption of node 1 for the output exported as "Corr25%": a
# fixed-seed random 12.5% of the label-0 rows and 12.5% of the label-1 rows of
# part1 get their labels swapped.
#
# FIX: the fraction was 0.0125, ten times too small for this setting. The 5%
# and 10% cells use 0.025 and 0.05 per class, and the Corr5B 25% cell uses
# 0.125, so 0.125 is the value that matches the exported file name.
flip_frac = 0.125

# Label-0 rows relabelled as attacks of category "generic"
part1_sample_label0 = part1[part1['label'] == 0].sample(frac=flip_frac, random_state=42)
part1_sample_label0['label'] = 1
part1_sample_label0['attack_cat'] = "generic"

# Label-1 rows relabelled as normal traffic
part1_sample_label1 = part1[part1['label'] == 1].sample(frac=flip_frac, random_state=42)
part1_sample_label1['label'] = 0
part1_sample_label1['attack_cat'] = "normal"

# Rows that were not sampled come first, followed by the two relabelled samples
part1_concatenated = pd.concat([part1.drop(part1_sample_label0.index).drop(part1_sample_label1.index), part1_sample_label0, part1_sample_label1])

# Shuffle all rows with a fixed seed
part1_changed = part1_concatenated.sample(frac=1, random_state=42)

In [14]:
# Export the corrupted node 1 together with the unmodified nodes 2-5
for node_no, node_df in enumerate((part1_changed, part2, part3, part4, part5), start=1):
    node_df.to_csv(f'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%15A-Part{node_no}.csv', index=False)

#### Corr5B

In [10]:
# Corr5B slices: the same per-class positional split that the Filt5B partition
# uses; node 1 is corrupted in the cells that follow.

# Select each class once, then slice it
normal_rows = complete[complete['label'] == 0]
generic_rows = complete[complete['attack_cat'] == "generic"]
exploit_rows = complete[complete['attack_cat'] == "exploits"]
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]

normal1 = normal_rows.iloc[:38647]
normal2 = normal_rows.iloc[38647:38647*2]
normal3 = normal_rows.iloc[38647*2:(38647*2+70793)]
normal4 = normal_rows.iloc[(38647*2+70793):(38647*2+70793+45149)]

generic1 = generic_rows.iloc[:31459]
generic2 = generic_rows.iloc[31459:(31459+32145)]
generic3 = generic_rows.iloc[(31459+32145):(31459+32145+70793)]
generic4 = generic_rows.iloc[(31459+32145+70793):]  # remainder

exploits1 = exploit_rows.iloc[:4274]
exploits2 = exploit_rows.iloc[4274*1:4274*2]
exploits3 = exploit_rows.iloc[4274*2:4274*3]
exploits4 = exploit_rows.iloc[4274*3:4274*4]
exploits5 = exploit_rows.iloc[4274*4:]  # remainder

# All DoS rows are kept together in a single slice
dos1 = complete[complete['attack_cat'] == "dos"]

recon1 = recon_rows.iloc[:2228]
recon2 = recon_rows.iloc[2228:(2228*2)]
recon3 = recon_rows.iloc[(2228*2):(2228*3)]
recon4 = recon_rows.iloc[(2228*3):]  # remainder

In [11]:
# Assemble the five uncorrupted Corr5B node datasets; each node only receives some of the classes
part1, part2, part3, part4, part5 = (
    pd.concat(class_slices)
    for class_slices in (
        [normal1, generic1, exploits1, dos1],
        [normal2, generic2, exploits2, recon1],
        [normal3, exploits3, recon2],
        [generic3, exploits4, recon3],
        [normal4, generic4, exploits5, recon4],
    )
)

In [12]:
# Label-flip corruption of node 1: a fixed-seed random 2.5% of the label-0 rows
# and 2.5% of the label-1 rows of part1 get their labels swapped.
flip_frac = 0.025

# Label-0 rows relabelled as attacks of category "generic"
part1_sample_label0 = (
    part1.loc[part1['label'] == 0]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Label-1 rows relabelled as normal traffic
part1_sample_label1 = (
    part1.loc[part1['label'] == 1]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Rows that were not sampled come first, followed by the two relabelled samples
untouched = part1.drop(part1_sample_label0.index).drop(part1_sample_label1.index)
part1_concatenated = pd.concat([untouched, part1_sample_label0, part1_sample_label1])

# Shuffle all rows with a fixed seed
part1_changed = part1_concatenated.sample(frac=1, random_state=42)

In [13]:
# Export the corrupted node 1 together with the unmodified nodes 2-5
for node_no, node_df in enumerate((part1_changed, part2, part3, part4, part5), start=1):
    node_df.to_csv(f'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%15B-Part{node_no}.csv', index=False)

In [17]:
# Label-flip corruption of node 1: a fixed-seed random 5% of the label-0 rows
# and 5% of the label-1 rows of part1 get their labels swapped.
flip_frac = 0.05

# Label-0 rows relabelled as attacks of category "generic"
part1_sample_label0 = (
    part1.loc[part1['label'] == 0]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Label-1 rows relabelled as normal traffic
part1_sample_label1 = (
    part1.loc[part1['label'] == 1]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Rows that were not sampled come first, followed by the two relabelled samples
untouched = part1.drop(part1_sample_label0.index).drop(part1_sample_label1.index)
part1_concatenated = pd.concat([untouched, part1_sample_label0, part1_sample_label1])

# Shuffle all rows with a fixed seed
part1_changed = part1_concatenated.sample(frac=1, random_state=42)

In [18]:
# Export the corrupted node 1 together with the unmodified nodes 2-5
for node_no, node_df in enumerate((part1_changed, part2, part3, part4, part5), start=1):
    node_df.to_csv(f'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr10%15B-Part{node_no}.csv', index=False)

In [19]:
# Label-flip corruption of node 1: a fixed-seed random 12.5% of the label-0 rows
# and 12.5% of the label-1 rows of part1 get their labels swapped.
flip_frac = 0.125

# Label-0 rows relabelled as attacks of category "generic"
part1_sample_label0 = (
    part1.loc[part1['label'] == 0]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Label-1 rows relabelled as normal traffic
part1_sample_label1 = (
    part1.loc[part1['label'] == 1]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Rows that were not sampled come first, followed by the two relabelled samples
untouched = part1.drop(part1_sample_label0.index).drop(part1_sample_label1.index)
part1_concatenated = pd.concat([untouched, part1_sample_label0, part1_sample_label1])

# Shuffle all rows with a fixed seed
part1_changed = part1_concatenated.sample(frac=1, random_state=42)

In [20]:
# Write the five node files of the "Corr25%15B" variant: node 1 comes from the
# label-flipped frame (part1_changed), nodes 2-5 are written as they are.
csv_targets = {
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%15B-Part1.csv': part1_changed,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%15B-Part2.csv': part2,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%15B-Part3.csv': part3,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%15B-Part4.csv': part4,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%15B-Part5.csv': part5,
}
# index=False keeps the pandas row index out of the CSV files
for csv_path, node_frame in csv_targets.items():
    node_frame.to_csv(csv_path, index=False)

#### Corr5C node 1

In [25]:
# Create the partitions
# Each class is filtered out of `complete` once, then cut into consecutive,
# non-overlapping positional ranges with .iloc (rows keep the order they have in `complete`).
normal_rows = complete[complete['label'] == 0]
generic_rows = complete[complete['attack_cat'] == "generic"]
exploits_rows = complete[complete['attack_cat'] == "exploits"]
dos_rows = complete[complete['attack_cat'] == "dos"]
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]

# Normal traffic: slices of 24154, 24154, 48308 and 96620 rows.
# The last slice has an explicit end bound, so normal rows past it (if any) go to no node.
normal1 = normal_rows.iloc[:24154]
normal2 = normal_rows.iloc[24154:24154*2]
normal3 = normal_rows.iloc[24154*2:(24154*2+48308)]
normal4 = normal_rows.iloc[(24154*2+48308):(24154*2+48308+96620)]

# Generic: three slices of 7865 rows, one of 60000, then everything that is left
generic1 = generic_rows.iloc[:7865]
generic2 = generic_rows.iloc[7865:(7865*2)]
generic3 = generic_rows.iloc[(7865*2):(7865*3)]
generic4 = generic_rows.iloc[(7865*3):(7865*3+60000)]
generic5 = generic_rows.iloc[(7865*3+60000):]

# Exploits: three slices of 5342 rows, then everything that is left
exploits1 = exploits_rows.iloc[:5342]
exploits2 = exploits_rows.iloc[5342:(5342*2)]
exploits3 = exploits_rows.iloc[(5342*2):(5342*3)]
exploits4 = exploits_rows.iloc[(5342*3):]

# DoS: four slices of 583 rows, then everything that is left
dos1 = dos_rows.iloc[:583]
dos2 = dos_rows.iloc[583:(583*2)]
dos3 = dos_rows.iloc[(583*2):(583*3)]
dos4 = dos_rows.iloc[(583*3):(583*4)]
dos5 = dos_rows.iloc[(583*4):]

# Reconnaissance: three slices of 2228 rows, then everything that is left
recon1 = recon_rows.iloc[:2228]
recon2 = recon_rows.iloc[2228:(2228*2)]
recon3 = recon_rows.iloc[(2228*2):(2228*3)]
recon4 = recon_rows.iloc[(2228*3):]

In [26]:
# Assemble the five nodes by stacking the per-class slices row-wise.
# Not every class goes to every node: node 2 gets no exploits slice,
# node 4 gets no normal slice and node 5 gets no reconnaissance slice.
part1 = pd.concat(
    (normal1, generic1, exploits1, dos1, recon1),
    axis=0,
)
part2 = pd.concat(
    (normal2, generic2, dos2, recon2),
    axis=0,
)
part3 = pd.concat(
    (normal3, generic3, exploits2, dos3, recon3),
    axis=0,
)
part4 = pd.concat(
    (generic4, exploits3, dos4, recon4),
    axis=0,
)
part5 = pd.concat(
    (normal4, generic5, exploits4, dos5),
    axis=0,
)

In [27]:
# Label-noise injection on node 1: 2.5% of each class is sampled (seed 42) and relabelled.
# NOTE(review): the export that follows tags these files "Corr5%", while frac=0.025 flips
# 2.5% of the rows of each class — confirm the intended corruption rate.

# Draw 2.5% of the label-0 rows and turn them into attacks (label 1, attack_cat "generic")
part1_sample_label0 = (
    part1.loc[part1['label'] == 0]
    .sample(frac=0.025, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Draw 2.5% of the label-1 rows and turn them into normal traffic (label 0, attack_cat "normal")
part1_sample_label1 = (
    part1.loc[part1['label'] == 1]
    .sample(frac=0.025, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Take the original versions of the sampled rows out of part1, then append the relabelled versions
part1_untouched = part1.drop(index=part1_sample_label0.index).drop(index=part1_sample_label1.index)
part1_concatenated = pd.concat([part1_untouched, part1_sample_label0, part1_sample_label1])

# Randomly reorder all rows (frac=1 keeps every row; the seed makes the order repeatable)
part1_changed = part1_concatenated.sample(frac=1, random_state=42)

In [28]:
# Write the five node files of the "Corr5%15C" variant: node 1 comes from the
# label-flipped frame (part1_changed), nodes 2-5 are written as they are.
csv_targets = {
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%15C-Part1.csv': part1_changed,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%15C-Part2.csv': part2,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%15C-Part3.csv': part3,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%15C-Part4.csv': part4,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%15C-Part5.csv': part5,
}
# index=False keeps the pandas row index out of the CSV files
for csv_path, node_frame in csv_targets.items():
    node_frame.to_csv(csv_path, index=False)

In [29]:
# Label-noise injection on node 1: 5% of each class is sampled (seed 42) and relabelled.
# NOTE(review): the export that follows tags these files "Corr10%", but frac=0.05 flips
# 5% of the rows of each class, i.e. roughly 5% of node 1 overall — confirm the
# intended corruption rate.

# Randomly select 5% of samples with label 0 and change their label to 1 and attack_cat to "generic"
part1_sample_label0 = (
    part1[part1['label'] == 0]
    .sample(frac=0.05, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Randomly select 5% of samples with label 1 and change their label to 0 and attack_cat to "normal"
part1_sample_label1 = (
    part1[part1['label'] == 1]
    .sample(frac=0.05, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Drop the original versions of the flipped rows from part1, then append the flipped versions
part1_flipped_index = part1_sample_label0.index.append(part1_sample_label1.index)
part1_concatenated = pd.concat([part1.drop(index=part1_flipped_index), part1_sample_label0, part1_sample_label1])

# Flipping only relabels rows; this fails if part1's index is not unique,
# because drop() would then remove more rows than were sampled.
assert len(part1_concatenated) == len(part1), "label flipping must not add or remove rows"

# Shuffle the rows of the concatenated dataframe
part1_changed = part1_concatenated.sample(frac=1, random_state=42)

In [30]:
# Write the five node files of the "Corr10%15C" variant: node 1 comes from the
# label-flipped frame (part1_changed), nodes 2-5 are written as they are.
csv_targets = {
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr10%15C-Part1.csv': part1_changed,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr10%15C-Part2.csv': part2,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr10%15C-Part3.csv': part3,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr10%15C-Part4.csv': part4,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr10%15C-Part5.csv': part5,
}
# index=False keeps the pandas row index out of the CSV files
for csv_path, node_frame in csv_targets.items():
    node_frame.to_csv(csv_path, index=False)

In [31]:
# Label-noise injection on node 1: 12.5% of each class is sampled (seed 42) and relabelled.
# NOTE(review): the export that follows tags these files "Corr25%", but frac=0.125 flips
# 12.5% of the rows of each class, i.e. roughly 12.5% of node 1 overall — confirm the
# intended corruption rate.

# Randomly select 12.5% of samples with label 0 and change their label to 1 and attack_cat to "generic"
part1_sample_label0 = (
    part1[part1['label'] == 0]
    .sample(frac=0.125, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Randomly select 12.5% of samples with label 1 and change their label to 0 and attack_cat to "normal"
part1_sample_label1 = (
    part1[part1['label'] == 1]
    .sample(frac=0.125, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Drop the original versions of the flipped rows from part1, then append the flipped versions
part1_flipped_index = part1_sample_label0.index.append(part1_sample_label1.index)
part1_concatenated = pd.concat([part1.drop(index=part1_flipped_index), part1_sample_label0, part1_sample_label1])

# Flipping only relabels rows; this fails if part1's index is not unique,
# because drop() would then remove more rows than were sampled.
assert len(part1_concatenated) == len(part1), "label flipping must not add or remove rows"

# Shuffle the rows of the concatenated dataframe
part1_changed = part1_concatenated.sample(frac=1, random_state=42)

In [32]:
# Write the five node files of the "Corr25%15C" variant: node 1 comes from the
# label-flipped frame (part1_changed), nodes 2-5 are written as they are.
csv_targets = {
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%15C-Part1.csv': part1_changed,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%15C-Part2.csv': part2,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%15C-Part3.csv': part3,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%15C-Part4.csv': part4,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%15C-Part5.csv': part5,
}
# index=False keeps the pandas row index out of the CSV files
for csv_path, node_frame in csv_targets.items():
    node_frame.to_csv(csv_path, index=False)

#### Corr 5C node 3

In [33]:
# Create the partitions
# Same row ranges as in the "Corr5C node 1" section, rebuilt here from `complete`.
# Each class is filtered out of `complete` once, then cut into consecutive,
# non-overlapping positional ranges with .iloc (rows keep the order they have in `complete`).
normal_rows = complete[complete['label'] == 0]
generic_rows = complete[complete['attack_cat'] == "generic"]
exploits_rows = complete[complete['attack_cat'] == "exploits"]
dos_rows = complete[complete['attack_cat'] == "dos"]
recon_rows = complete[complete['attack_cat'] == "reconnaissance"]

# Normal traffic: slices of 24154, 24154, 48308 and 96620 rows.
# The last slice has an explicit end bound, so normal rows past it (if any) go to no node.
normal1 = normal_rows.iloc[:24154]
normal2 = normal_rows.iloc[24154:24154*2]
normal3 = normal_rows.iloc[24154*2:(24154*2+48308)]
normal4 = normal_rows.iloc[(24154*2+48308):(24154*2+48308+96620)]

# Generic: three slices of 7865 rows, one of 60000, then everything that is left
generic1 = generic_rows.iloc[:7865]
generic2 = generic_rows.iloc[7865:(7865*2)]
generic3 = generic_rows.iloc[(7865*2):(7865*3)]
generic4 = generic_rows.iloc[(7865*3):(7865*3+60000)]
generic5 = generic_rows.iloc[(7865*3+60000):]

# Exploits: three slices of 5342 rows, then everything that is left
exploits1 = exploits_rows.iloc[:5342]
exploits2 = exploits_rows.iloc[5342:(5342*2)]
exploits3 = exploits_rows.iloc[(5342*2):(5342*3)]
exploits4 = exploits_rows.iloc[(5342*3):]

# DoS: four slices of 583 rows, then everything that is left
dos1 = dos_rows.iloc[:583]
dos2 = dos_rows.iloc[583:(583*2)]
dos3 = dos_rows.iloc[(583*2):(583*3)]
dos4 = dos_rows.iloc[(583*3):(583*4)]
dos5 = dos_rows.iloc[(583*4):]

# Reconnaissance: three slices of 2228 rows, then everything that is left
recon1 = recon_rows.iloc[:2228]
recon2 = recon_rows.iloc[2228:(2228*2)]
recon3 = recon_rows.iloc[(2228*2):(2228*3)]
recon4 = recon_rows.iloc[(2228*3):]

In [34]:
# Assemble the five nodes by stacking the per-class slices row-wise.
# Not every class goes to every node: node 2 gets no exploits slice,
# node 4 gets no normal slice and node 5 gets no reconnaissance slice.
part1 = pd.concat(
    (normal1, generic1, exploits1, dos1, recon1),
    axis=0,
)
part2 = pd.concat(
    (normal2, generic2, dos2, recon2),
    axis=0,
)
part3 = pd.concat(
    (normal3, generic3, exploits2, dos3, recon3),
    axis=0,
)
part4 = pd.concat(
    (generic4, exploits3, dos4, recon4),
    axis=0,
)
part5 = pd.concat(
    (normal4, generic5, exploits4, dos5),
    axis=0,
)

In [35]:
# Label-noise injection on node 3: 2.5% of each class is sampled (seed 42) and relabelled.
# NOTE(review): the export that follows tags these files "Corr5%", but frac=0.025 flips
# 2.5% of the rows of each class, i.e. roughly 2.5% of node 3 overall — confirm the
# intended corruption rate.

# Randomly select 2.5% of samples with label 0 and change their label to 1 and attack_cat to "generic"
part3_sample_label0 = (
    part3[part3['label'] == 0]
    .sample(frac=0.025, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Randomly select 2.5% of samples with label 1 and change their label to 0 and attack_cat to "normal"
part3_sample_label1 = (
    part3[part3['label'] == 1]
    .sample(frac=0.025, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Drop the original versions of the flipped rows from part3, then append the flipped versions
part3_flipped_index = part3_sample_label0.index.append(part3_sample_label1.index)
part3_concatenated = pd.concat([part3.drop(index=part3_flipped_index), part3_sample_label0, part3_sample_label1])

# Flipping only relabels rows; this fails if part3's index is not unique,
# because drop() would then remove more rows than were sampled.
assert len(part3_concatenated) == len(part3), "label flipping must not add or remove rows"

# Shuffle the rows of the concatenated dataframe
part3_changed = part3_concatenated.sample(frac=1, random_state=42)

In [36]:
# Write the five node files of the "Corr5%35C" variant: node 3 comes from the
# label-flipped frame (part3_changed), the other four nodes are written as they are.
csv_targets = {
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%35C-Part1.csv': part1,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%35C-Part2.csv': part2,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%35C-Part3.csv': part3_changed,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%35C-Part4.csv': part4,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%35C-Part5.csv': part5,
}
# index=False keeps the pandas row index out of the CSV files
for csv_path, node_frame in csv_targets.items():
    node_frame.to_csv(csv_path, index=False)

In [37]:
# Label-noise injection on node 3: 5% of each class is sampled (seed 42) and relabelled.
# NOTE(review): the export that follows tags these files "Corr10%", but frac=0.05 flips
# 5% of the rows of each class, i.e. roughly 5% of node 3 overall — confirm the
# intended corruption rate.

# Randomly select 5% of samples with label 0 and change their label to 1 and attack_cat to "generic"
part3_sample_label0 = (
    part3[part3['label'] == 0]
    .sample(frac=0.05, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Randomly select 5% of samples with label 1 and change their label to 0 and attack_cat to "normal"
part3_sample_label1 = (
    part3[part3['label'] == 1]
    .sample(frac=0.05, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Drop the original versions of the flipped rows from part3, then append the flipped versions
part3_flipped_index = part3_sample_label0.index.append(part3_sample_label1.index)
part3_concatenated = pd.concat([part3.drop(index=part3_flipped_index), part3_sample_label0, part3_sample_label1])

# Flipping only relabels rows; this fails if part3's index is not unique,
# because drop() would then remove more rows than were sampled.
assert len(part3_concatenated) == len(part3), "label flipping must not add or remove rows"

# Shuffle the rows of the concatenated dataframe
part3_changed = part3_concatenated.sample(frac=1, random_state=42)

In [38]:
# Write the five node files of the "Corr10%35C" variant: node 3 comes from the
# label-flipped frame (part3_changed), the other four nodes are written as they are.
csv_targets = {
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr10%35C-Part1.csv': part1,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr10%35C-Part2.csv': part2,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr10%35C-Part3.csv': part3_changed,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr10%35C-Part4.csv': part4,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr10%35C-Part5.csv': part5,
}
# index=False keeps the pandas row index out of the CSV files
for csv_path, node_frame in csv_targets.items():
    node_frame.to_csv(csv_path, index=False)

In [39]:
# Label-noise injection on node 3: 12.5% of each class is sampled (seed 42) and relabelled.
# NOTE(review): the export that follows tags these files "Corr25%", but frac=0.125 flips
# 12.5% of the rows of each class, i.e. roughly 12.5% of node 3 overall — confirm the
# intended corruption rate.

# Randomly select 12.5% of samples with label 0 and change their label to 1 and attack_cat to "generic"
part3_sample_label0 = (
    part3[part3['label'] == 0]
    .sample(frac=0.125, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Randomly select 12.5% of samples with label 1 and change their label to 0 and attack_cat to "normal"
part3_sample_label1 = (
    part3[part3['label'] == 1]
    .sample(frac=0.125, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Drop the original versions of the flipped rows from part3, then append the flipped versions
part3_flipped_index = part3_sample_label0.index.append(part3_sample_label1.index)
part3_concatenated = pd.concat([part3.drop(index=part3_flipped_index), part3_sample_label0, part3_sample_label1])

# Flipping only relabels rows; this fails if part3's index is not unique,
# because drop() would then remove more rows than were sampled.
assert len(part3_concatenated) == len(part3), "label flipping must not add or remove rows"

# Shuffle the rows of the concatenated dataframe
part3_changed = part3_concatenated.sample(frac=1, random_state=42)

In [40]:
# Write the five node files of the "Corr25%35C" variant: node 3 comes from the
# label-flipped frame (part3_changed), the other four nodes are written as they are.
csv_targets = {
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%35C-Part1.csv': part1,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%35C-Part2.csv': part2,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%35C-Part3.csv': part3_changed,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%35C-Part4.csv': part4,
    'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%35C-Part5.csv': part5,
}
# index=False keeps the pandas row index out of the CSV files
for csv_path, node_frame in csv_targets.items():
    node_frame.to_csv(csv_path, index=False)