## Partitioning UNSW-NB15-Train-Basic into 3 nodes 

The dataset is split across 3 nodes. The resulting partitions can be balanced or unbalanced, and each attack category may appear in every node or only in a subset of them. 

In [1]:
import numpy as np  # for array
import pandas as pd  # for csv files and dataframe
import matplotlib.pyplot as plt  # for plotting
import seaborn as sns  # plotting
from scipy import stats

import pickle  # To load data int disk

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.metrics import auc, f1_score, roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_predict

In [2]:
# Load the UNSW-NB15-Train-Basic dataset from CSV into the `complete` DataFrame.
# NOTE: the path is an absolute, machine-specific location; adjust it before running elsewhere.
complete = pd.read_csv('C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic.csv')

In [3]:
# Display the loaded frame (pandas renders a truncated preview of the first and last rows)
complete

Unnamed: 0,proto,srcip,sport,dstip,dsport,spkts,dpkts,sbytes,dbytes,state,stime,ltime,dur,label,attack_cat
0,udp,59.166.0.7,45584,149.171.126.0,53,2,2,130,162,CON,1424257612,1424257612,0.003362,0,normal
1,tcp,59.166.0.2,18633,149.171.126.3,8908,38,40,2438,19440,FIN,1424258064,1424258064,0.013975,0,normal
2,tcp,59.166.0.7,48428,149.171.126.4,143,122,126,7824,14814,FIN,1421951117,1421951118,0.757193,0,normal
3,unas,175.45.176.1,0,149.171.126.17,0,2,0,200,0,INT,1424244417,1424244417,0.000004,1,dos
4,tcp,175.45.176.1,65485,149.171.126.17,179,10,6,876,268,FIN,1421928270,1421928271,0.436004,0,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435514,udp,175.45.176.2,31072,149.171.126.19,500,2,0,136,0,INT,1424223755,1424223755,0.000001,1,dos
435515,tcp,59.166.0.5,2758,149.171.126.8,51824,44,46,2766,28558,FIN,1424261274,1424261274,0.014994,0,normal
435516,udp,175.45.176.0,47439,149.171.126.10,53,2,0,114,0,INT,1424259155,1424259155,0.000002,1,generic
435517,udp,175.45.176.1,47439,149.171.126.18,53,2,0,114,0,INT,1424238658,1424238658,0.000002,1,generic


In [3]:
def filter(dataset):
    """Keep only the rows whose ``state`` and ``proto`` values are whitelisted.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding at least a ``state`` and a ``proto`` column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dataset`` (original order and index labels kept) whose
        ``state`` is one of PAR, ACC, ECO, CON, FIN, INT, REQ, RST and whose
        ``proto`` is one of igmp, arp, icmp, udp, tcp, ipv6-icmp, rarp.
    """
    # NOTE: this name shadows Python's built-in ``filter`` once the cell has run.
    allowed_states = ['PAR', 'ACC', 'ECO', 'CON', 'FIN', 'INT', 'REQ', 'RST']
    allowed_protos = ['igmp', 'arp', 'icmp', 'udp', 'tcp', 'ipv6-icmp', 'rarp']

    # A row survives only when both of its values are whitelisted
    state_ok = dataset['state'].isin(allowed_states)
    proto_ok = dataset['proto'].isin(allowed_protos)
    return dataset[state_ok & proto_ok]

In [4]:
# Overwrite `complete` with its filtered version; every partition below is cut from this filtered frame
complete = filter(complete)

### id = Filt3A : Partition with 3 balanced nodes 

Normal (3 nodes), Generic (3 nodes), Exploits (2 nodes), DoS and Reconnaissance (1 node).

WARNING: There are 24084 preprocessed normal traffic samples that are not being used to maintain a 50-50% rate in each node. 
- UNSW-NB15-Train-Basic-Part1 (128824): 
    - Normal: 64412 (50.0%)
    - Generic: 42360 (%)
    - Exploits: 13140 (%)
    - DoS: 0 (0.0%)
    - Reconnaissance: 8912 (%)

- UNSW-NB15-Train-Basic-Part2 (128824): 
    - Normal: 64412 (50.0%)
    - Generic: 61498 (%)
    - Exploits: 0 (0.0%)
    - DoS: 2914 (%)
    - Reconnaissance: 0 (0.0%)

- UNSW-NB15-Train-Basic-Part3 (128824): 
    - Normal: 64412 (50.0%)
    - Generic: 56183 (%)
    - Exploits: 8229 (%)
    - DoS: 0 (0.0%)
    - Reconnaissance: 0 (0.0%)

Get amount of samples for every type of traffic (attack subtypes as well): 

In [8]:
# All normal-traffic rows (label 0); the cell displays their (rows, columns) shape
normal = complete.loc[complete['label'] == 0]
normal.shape

(217320, 15)

In [9]:
# All rows of the "generic" attack category; the cell displays their (rows, columns) shape
generic = complete.loc[complete['attack_cat'] == "generic"]
generic.shape

(160041, 15)

In [10]:
# All rows of the "exploits" attack category; the cell displays their (rows, columns) shape
exploits = complete.loc[complete['attack_cat'] == "exploits"]
exploits.shape

(21369, 15)

In [11]:
# All rows of the "dos" attack category; the cell displays their (rows, columns) shape
dos = complete.loc[complete['attack_cat'] == "dos"]
dos.shape

(2914, 15)

In [12]:
# All rows of the "reconnaissance" attack category; the cell displays their (rows, columns) shape
reconnaissance = complete.loc[complete['attack_cat'] == "reconnaissance"]
reconnaissance.shape

(8912, 15)

Construct the partitions for each subtype: 

In [24]:
# Normal traffic: three consecutive, non-overlapping slices of 64412 rows each
normal_rows = complete.loc[complete['label'] == 0]
normal1 = normal_rows.iloc[:64412]
normal2 = normal_rows.iloc[64412:2 * 64412]
normal3 = normal_rows.iloc[2 * 64412:3 * 64412]


In [25]:
# Generic attacks: consecutive slices of 42360 and 61498 rows; the third slice takes every remaining row
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic1 = generic_rows.iloc[:42360]
generic2 = generic_rows.iloc[42360:42360 + 61498]
generic3 = generic_rows.iloc[42360 + 61498:]

In [26]:
# Exploits: the first 13140 rows, then every remaining row
exploits_rows = complete.loc[complete['attack_cat'] == "exploits"]
exploits1 = exploits_rows.iloc[:13140]
exploits2 = exploits_rows.iloc[13140:]

In [27]:
# DoS and Reconnaissance are not split: each category is taken whole
dos = complete.loc[complete['attack_cat'] == "dos"]
recon = complete.loc[complete['attack_cat'] == "reconnaissance"]

Concatenate the different df to obtain the three final partitions: 

In [28]:
# Assemble the three node datasets (UNSW-NB15-Train-Basic-PartN, N = 1,2,3)
# by stacking the slices assigned to each node, in the listed order
part1, part2, part3 = (
    pd.concat(pieces)
    for pieces in (
        [normal1, generic1, exploits1, recon],
        [normal2, generic2, dos],
        [normal3, generic3, exploits2],
    )
)

In [29]:
# Export every Filt3A node dataset to its own CSV file, without the index column
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3A-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3A-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3A-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

### id = Filt3B : Partition with 3 balanced nodes 

Normal (2 nodes), Generic (2 nodes), Exploits (1 node), DoS and Reconnaissance (1 node).


- UNSW-NB15-Train-Basic-Part1 (128824): 
    - Normal: 128824 (100.0%)
    - Generic: 0 (0.0%)
    - Exploits: 0 (0.0%)
    - DoS: 0 (0.0%)
    - Reconnaissance: 0 (0.0%)

- UNSW-NB15-Train-Basic-Part2 (128824): 
    - Normal: 64412 (50.0%)
    - Generic: 61498 (%)
    - Exploits: 0 (0.0%)
    - DoS: 2914 (%)
    - Reconnaissance: 0 (0.0%)

- UNSW-NB15-Train-Basic-Part3 (128824): 
    - Normal: 0 (0.0%)
    - Generic: 98543 (76.5%)
    - Exploits: 21369 (16.6%)
    - DoS: 0 (0.0%)
    - Reconnaissance: 8912 (6.9%)

In [22]:
# Create the partitions

# Normal traffic: the first 128824 rows, then the next 64412 rows
normal_rows = complete.loc[complete['label'] == 0]
normal1 = normal_rows.iloc[:128824]
normal2 = normal_rows.iloc[128824:128824 + 64412]

# Generic attacks: the first 61498 rows, then every remaining row
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic1 = generic_rows.iloc[:61498]
generic2 = generic_rows.iloc[61498:]

# Exploits, DoS and Reconnaissance are not split: each category is taken whole
exploits1 = complete.loc[complete['attack_cat'] == "exploits"]
dos = complete.loc[complete['attack_cat'] == "dos"]
recon = complete.loc[complete['attack_cat'] == "reconnaissance"]

In [23]:
# Create the UNSW-NB15-Train-Basic-PartN (N = 1,2,3) datasets for Filt3B and export them to csv
# Node 1: normal traffic only
part1 = pd.concat([normal1])
# Node 2: second normal slice, first generic slice and all DoS rows
part2 = pd.concat([normal2, generic1, dos])
# Node 3: attacks only. It must take the second generic slice (generic2); generic1
# already belongs to node 2 and reusing it here would duplicate those rows across nodes.
part3 = pd.concat([generic2, exploits1, recon])
# Write each node to its own CSV file, without the index column
part1.to_csv('C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3B-Part1.csv', index=False)
part2.to_csv('C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3B-Part2.csv', index=False)
part3.to_csv('C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3B-Part3.csv', index=False)

### id = Filt3C : Partition with 3 balanced nodes 

All types of traffic represented in all nodes: 
 
- UNSW-NB15-Train-Basic-Part1 (128824): 
    - Normal: 64412 (50.0%)
    - Generic: 53347 (%)
    - Exploits: 7123 (%)
    - DoS: 971 (%)
    - Reconnaissance: 2971 (%)

- UNSW-NB15-Train-Basic-Part2 (128824): 
    - Normal: 64412 (50.0%)
    - Generic: 53347 (%)
    - Exploits: 7123 (%)
    - DoS: 971 (%)
    - Reconnaissance: 2971 (%)

- UNSW-NB15-Train-Basic-Part3 (128824): 
    - Normal: 64412 (50.0%)
    - Generic: 53347 (%)
    - Exploits: 7123 (%)
    - DoS: 972 (%)
    - Reconnaissance: 2970 (%)

In [17]:
# Create the partitions

# Normal traffic: three consecutive slices of 64412 rows each
normal_rows = complete.loc[complete['label'] == 0]
normal1 = normal_rows.iloc[:64412]
normal2 = normal_rows.iloc[64412:2 * 64412]
normal3 = normal_rows.iloc[2 * 64412:3 * 64412]

# Generic attacks: two slices of 53347 rows, the third takes every remaining row
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic1 = generic_rows.iloc[:53347]
generic2 = generic_rows.iloc[53347:2 * 53347]
generic3 = generic_rows.iloc[2 * 53347:]

# Exploits: two slices of 7123 rows, the third takes every remaining row
exploits_rows = complete.loc[complete['attack_cat'] == "exploits"]
exploits1 = exploits_rows.iloc[:7123]
exploits2 = exploits_rows.iloc[7123:2 * 7123]
exploits3 = exploits_rows.iloc[2 * 7123:]

# DoS: two slices of 971 rows, the third takes every remaining row
dos_rows = complete.loc[complete['attack_cat'] == "dos"]
dos1 = dos_rows.iloc[:971]
dos2 = dos_rows.iloc[971:2 * 971]
dos3 = dos_rows.iloc[2 * 971:]

# Reconnaissance: two slices of 2971 rows, the third takes every remaining row
recon_rows = complete.loc[complete['attack_cat'] == "reconnaissance"]
recon1 = recon_rows.iloc[:2971]
recon2 = recon_rows.iloc[2971:2 * 2971]
recon3 = recon_rows.iloc[2 * 2971:]

In [18]:
# Assemble the three Filt3C node datasets (UNSW-NB15-Train-Basic-PartN, N = 1,2,3):
# node N stacks the N-th slice of every traffic type
part1, part2, part3 = (
    pd.concat(pieces)
    for pieces in (
        [normal1, generic1, exploits1, dos1, recon1],
        [normal2, generic2, exploits2, dos2, recon2],
        [normal3, generic3, exploits3, dos3, recon3],
    )
)

# Export each node to its own CSV file, without the index column
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3C-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3C-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3C-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

### id = Filt3D : Partition with 3 balanced nodes 

To try with balanced nodes (same number of samples) that only contain Normal or Generic in one node, we need to restrict the number of samples at each node. 
- UNSW-NB15-Train-Basic-Part1 (33195): 
    - Normal: 33195 (100.0%)
    - Generic: 0 (0.0%)
    - Exploits: 0 (0.0%)
    - DoS: 0 (0.0%)
    - Reconnaissance: 0 (0.0%)

- UNSW-NB15-Train-Basic-Part2 (33195): 
    - Normal: 0 (0.0%)
    - Generic: 33195 (100.0%)
    - Exploits: 0 (0.0%)
    - DoS: 0 (0.0%)
    - Reconnaissance: 0 (0.0%)

- UNSW-NB15-Train-Basic-Part3 (33195): 
    - Normal: 0 (0.0%)
    - Generic: 0 (0.0%)
    - Exploits: 21369 (%)
    - DoS: 2914 (%)
    - Reconnaissance: 8912 (%)

In [32]:
# Create the partitions

# Grab 33195 random samples from normal traffic.
# random_state pins the draw so that re-running the notebook regenerates the same partition.
normal = complete[complete['attack_cat'] == "normal"].sample(33195, random_state=42)

# Grab 33195 random samples from generic attacks (same fixed seed, for the same reason)
generic = complete[complete['attack_cat'] == "generic"].sample(33195, random_state=42)

# Grab the samples with exploit attacks 
exploits1 = complete[complete['attack_cat'] == "exploits"]

# Get all dos samples and reconnaissance samples
dos = complete[complete['attack_cat'] == "dos"]
recon = complete[complete['attack_cat'] == "reconnaissance"]

In [33]:
# Assemble the three Filt3D node datasets (UNSW-NB15-Train-Basic-PartN, N = 1,2,3):
# node 1 holds only normal traffic, node 2 only generic attacks, node 3 the other attack categories
part1, part2, part3 = (
    pd.concat(pieces)
    for pieces in (
        [normal],
        [generic],
        [exploits1, dos, recon],
    )
)

# Export each node to its own CSV file, without the index column
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3D-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3D-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3D-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

### id = Filt3E : Partition with 3 unbalanced nodes 


- UNSW-NB15-Train-Basic-Part1 (128824): 
    - Normal: 66897 (%)
    - Generic: 53347 (%)
    - Exploits: 7123 (%)
    - DoS: 1457 (%)
    - Reconnaissance: 0 (0.0%)

- UNSW-NB15-Train-Basic-Part2 (128824): 
    - Normal: 62441 (%)
    - Generic: 53347 (%)
    - Exploits: 7123 (%)
    - DoS: 1457 (%)
    - Reconnaissance: 4456 (%)

- UNSW-NB15-Train-Basic-Part3 (128824): 
    - Normal: 63898 (%)
    - Generic: 53347 (%)
    - Exploits: 7123 (%)
    - DoS: 0 (0.0%)
    - Reconnaissance: 4456 (%)

In [34]:
# Create the partitions

# Normal traffic: consecutive slices of 66897, 62441 and 63898 rows
normal_rows = complete.loc[complete['label'] == 0]
normal1 = normal_rows.iloc[:66897]
normal2 = normal_rows.iloc[66897:66897 + 62441]
normal3 = normal_rows.iloc[66897 + 62441:66897 + 62441 + 63898]

# Generic attacks: two slices of 53347 rows, the third takes every remaining row
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic1 = generic_rows.iloc[:53347]
generic2 = generic_rows.iloc[53347:2 * 53347]
generic3 = generic_rows.iloc[2 * 53347:]

# Exploits: two slices of 7123 rows, the third takes every remaining row
exploits_rows = complete.loc[complete['attack_cat'] == "exploits"]
exploits1 = exploits_rows.iloc[:7123]
exploits2 = exploits_rows.iloc[7123:2 * 7123]
exploits3 = exploits_rows.iloc[2 * 7123:]

# DoS: the first 1457 rows, then every remaining row
dos_rows = complete.loc[complete['attack_cat'] == "dos"]
dos1 = dos_rows.iloc[:1457]
dos2 = dos_rows.iloc[1457:]

# Reconnaissance: the first 4456 rows, then every remaining row
recon_rows = complete.loc[complete['attack_cat'] == "reconnaissance"]
recon1 = recon_rows.iloc[:4456]
recon2 = recon_rows.iloc[4456:]

In [35]:
# Assemble the three Filt3E node datasets (UNSW-NB15-Train-Basic-PartN, N = 1,2,3):
# node 1 receives no reconnaissance slice and node 3 no DoS slice
part1, part2, part3 = (
    pd.concat(pieces)
    for pieces in (
        [normal1, generic1, exploits1, dos1],
        [normal2, generic2, exploits2, dos2, recon1],
        [normal3, generic3, exploits3, recon2],
    )
)

# Export each node to its own CSV file, without the index column
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3E-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3E-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3E-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

### id = Filt3F : Partition with 3 unbalanced nodes

Normal (3 nodes), Generic (3 nodes), Exploits (2 nodes), DoS (1 node), Reconnaissance (1 node)

- UNSW-NB15-Train-Basic-Part1 (100000): 
    - Normal: 50000 
    - Generic: 23174 
    - Exploits: 15000
    - DoS: 2914 
    - Reconnaissance: 8912 

- UNSW-NB15-Train-Basic-Part2 (250000): 
    - Normal: 137042 
    - Generic: 112958 
    - Exploits: 0 (0.0%)
    - DoS: 0 (0%)
    - Reconnaissance: 0 (0.0%)

- UNSW-NB15-Train-Basic-Part3 (60556): 
    - Normal: 30278
    - Generic: 23909
    - Exploits: 6369 
    - DoS: 0 (0.0%)
    - Reconnaissance: 0 (0.0%)

In [36]:
# Create the partitions

# Separate into three partitions normal samples (50000, 137042, remainder)
normal1 = complete[complete['label'] == 0].iloc[:50000]
normal2 = complete[complete['label'] == 0].iloc[50000:(50000+137042)]
normal3 = complete[complete['label'] == 0].iloc[(50000+137042):]

# Separate into three partitions generic samples (23174, 112958, remainder).
# The first boundary is 23174, the count listed for Part1 in the Filt3F description:
# it is what brings node 1 to 100000 rows (50000 + 23174 + 15000 + 2914 + 8912).
generic1 = complete[complete['attack_cat'] == "generic"].iloc[:23174]
generic2 = complete[complete['attack_cat'] == "generic"].iloc[23174:(23174+112958)]
generic3 = complete[complete['attack_cat'] == "generic"].iloc[(23174+112958):]

# Separate into two partitions exploits samples (15000, remainder)
exploits1 = complete[complete['attack_cat'] == "exploits"].iloc[:15000]
exploits2 = complete[complete['attack_cat'] == "exploits"].iloc[15000:]

# Gather all DoS components
dos1 = complete[complete['attack_cat'] == "dos"]

# Gather all Reconnaissance components 
recon1 = complete[complete['attack_cat'] == "reconnaissance"]

In [37]:
# Assemble the three Filt3F node datasets (UNSW-NB15-Train-Basic-PartN, N = 1,2,3):
# all DoS and Reconnaissance rows go to node 1, node 2 holds only normal and generic traffic
part1, part2, part3 = (
    pd.concat(pieces)
    for pieces in (
        [normal1, generic1, exploits1, dos1, recon1],
        [normal2, generic2],
        [normal3, generic3, exploits2],
    )
)

# Export each node to its own CSV file, without the index column
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3F-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3F-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3F-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

### id = Filt3G : Partition with 3 unbalanced nodes 

- UNSW-NB15-Train-Basic-Part1 (50000): 
    - Normal: 25000 
    - Generic: 13935
    - Exploits: 7123
    - DoS: 971
    - Reconnaissance: 2971

- UNSW-NB15-Train-Basic-Part2 (136472): 
    - Normal: 68236 
    - Generic: 57171
    - Exploits: 7123
    - DoS: 971
    - Reconnaissance: 2971

- UNSW-NB15-Train-Basic-Part3 (200000)
    - Normal: 100000
    - Generic: 88935
    - Exploits: 7123
    - DoS: 972
    - Reconnaissance: 2970

In [38]:
# Create the partitions

# Normal traffic: consecutive slices of 25000, 68236 and 100000 rows
normal_rows = complete.loc[complete['label'] == 0]
normal1 = normal_rows.iloc[:25000]
normal2 = normal_rows.iloc[25000:25000 + 68236]
normal3 = normal_rows.iloc[25000 + 68236:25000 + 68236 + 100000]

# Generic attacks: slices of 13935 and 57171 rows, the third takes every remaining row
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic1 = generic_rows.iloc[:13935]
generic2 = generic_rows.iloc[13935:13935 + 57171]
generic3 = generic_rows.iloc[13935 + 57171:]

# Exploits: two slices of 7123 rows, the third takes every remaining row
exploits_rows = complete.loc[complete['attack_cat'] == "exploits"]
exploits1 = exploits_rows.iloc[:7123]
exploits2 = exploits_rows.iloc[7123:2 * 7123]
exploits3 = exploits_rows.iloc[2 * 7123:]

# DoS: two slices of 971 rows, the third takes every remaining row
dos_rows = complete.loc[complete['attack_cat'] == "dos"]
dos1 = dos_rows.iloc[:971]
dos2 = dos_rows.iloc[971:2 * 971]
dos3 = dos_rows.iloc[2 * 971:]

# Reconnaissance: two slices of 2971 rows, the third takes every remaining row
recon_rows = complete.loc[complete['attack_cat'] == "reconnaissance"]
recon1 = recon_rows.iloc[:2971]
recon2 = recon_rows.iloc[2971:2 * 2971]
recon3 = recon_rows.iloc[2 * 2971:]

In [39]:
# Assemble the three Filt3G node datasets (UNSW-NB15-Train-Basic-PartN, N = 1,2,3):
# node N stacks the N-th slice of every traffic type, in the listed order
part1, part2, part3 = (
    pd.concat(pieces)
    for pieces in (
        [normal1, generic1, exploits1, dos1, recon1],
        [normal2, generic2, exploits2, dos2, recon2],
        [normal3, generic3, exploits3, recon3, dos3],
    )
)


In [None]:
# Export every Filt3G node dataset to its own CSV file, without the index column
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3G-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3G-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Filt3G-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

### Classification datasets 

Normal traffic is not considered, because these datasets feed the models that categorize attacks (used once the detection model has flagged traffic as an attack). Information about the percentages and distribution of attacks is included in the "datasets.xlsx" file. 

In [8]:
# Attack-only view of the data: every row whose attack_cat is not "normal"
complete_cat = complete.loc[complete['attack_cat'] != "normal"]

In [11]:
# Number of rows per attack category in the attack-only frame
complete_cat['attack_cat'].value_counts()

generic           160041
exploits           21369
reconnaissance      8912
dos                 2914
Name: attack_cat, dtype: int64

#### Cat3A 

In [115]:
# Create the partitions

# Generic attacks: two slices of 53347 rows, the third takes every remaining row
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic1 = generic_rows.iloc[:53347]
generic2 = generic_rows.iloc[53347:2 * 53347]
generic3 = generic_rows.iloc[2 * 53347:]

# Exploits: two slices of 7123 rows, the third takes every remaining row
exploits_rows = complete.loc[complete['attack_cat'] == "exploits"]
exploits1 = exploits_rows.iloc[:7123]
exploits2 = exploits_rows.iloc[7123:2 * 7123]
exploits3 = exploits_rows.iloc[2 * 7123:]

# DoS: two slices of 971 rows, the third takes every remaining row
dos_rows = complete.loc[complete['attack_cat'] == "dos"]
dos1 = dos_rows.iloc[:971]
dos2 = dos_rows.iloc[971:2 * 971]
dos3 = dos_rows.iloc[2 * 971:]

# Reconnaissance: two slices of 2971 rows, the third takes every remaining row
recon_rows = complete.loc[complete['attack_cat'] == "reconnaissance"]
recon1 = recon_rows.iloc[:2971]
recon2 = recon_rows.iloc[2971:2 * 2971]
recon3 = recon_rows.iloc[2 * 2971:]

In [116]:
# Assemble the three Cat3A node datasets (attack traffic only):
# node N stacks the N-th slice of every attack category
part1, part2, part3 = (
    pd.concat(pieces)
    for pieces in (
        [generic1, exploits1, dos1, recon1],
        [generic2, exploits2, dos2, recon2],
        [generic3, exploits3, dos3, recon3],
    )
)

# Export each node to its own CSV file, without the index column
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3A-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3A-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3A-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

#### Cat3B

In [117]:
# Create the partitions

# Generic attacks: only the first 3 * 7123 rows are used, in three slices of 7123 rows
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic1 = generic_rows.iloc[:7123]
generic2 = generic_rows.iloc[7123:2 * 7123]
generic3 = generic_rows.iloc[2 * 7123:3 * 7123]

# Exploits: two slices of 7123 rows, the third takes every remaining row
exploits_rows = complete.loc[complete['attack_cat'] == "exploits"]
exploits1 = exploits_rows.iloc[:7123]
exploits2 = exploits_rows.iloc[7123:2 * 7123]
exploits3 = exploits_rows.iloc[2 * 7123:]

# DoS: two slices of 971 rows, the third takes every remaining row
dos_rows = complete.loc[complete['attack_cat'] == "dos"]
dos1 = dos_rows.iloc[:971]
dos2 = dos_rows.iloc[971:2 * 971]
dos3 = dos_rows.iloc[2 * 971:]

# Reconnaissance: two slices of 2971 rows, the third takes every remaining row
recon_rows = complete.loc[complete['attack_cat'] == "reconnaissance"]
recon1 = recon_rows.iloc[:2971]
recon2 = recon_rows.iloc[2971:2 * 2971]
recon3 = recon_rows.iloc[2 * 2971:]

In [118]:
# Assemble the three Cat3B node datasets (attack traffic only):
# node N stacks the N-th slice of every attack category
part1, part2, part3 = (
    pd.concat(pieces)
    for pieces in (
        [generic1, exploits1, dos1, recon1],
        [generic2, exploits2, dos2, recon2],
        [generic3, exploits3, dos3, recon3],
    )
)

# Export each node to its own CSV file, without the index column
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3B-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3B-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3B-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

#### Cat3C

In [119]:
# Create the partitions

# Generic attacks: only the first 3000 rows are used, in three slices of 1000 rows
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic1 = generic_rows.iloc[:1000]
generic2 = generic_rows.iloc[1000:2 * 1000]
generic3 = generic_rows.iloc[2 * 1000:3 * 1000]

# Exploits: only the first 3000 rows are used, in three slices of 1000 rows
exploits_rows = complete.loc[complete['attack_cat'] == "exploits"]
exploits1 = exploits_rows.iloc[:1000]
exploits2 = exploits_rows.iloc[1000:2 * 1000]
exploits3 = exploits_rows.iloc[2 * 1000:3 * 1000]

# DoS: two slices of 971 rows, the third takes every remaining row
dos_rows = complete.loc[complete['attack_cat'] == "dos"]
dos1 = dos_rows.iloc[:971]
dos2 = dos_rows.iloc[971:2 * 971]
dos3 = dos_rows.iloc[2 * 971:]

# Reconnaissance: only the first 3000 rows are used, in three slices of 1000 rows
recon_rows = complete.loc[complete['attack_cat'] == "reconnaissance"]
recon1 = recon_rows.iloc[:1000]
recon2 = recon_rows.iloc[1000:2 * 1000]
recon3 = recon_rows.iloc[2 * 1000:3 * 1000]

In [120]:
# Assemble the three Cat3C node datasets (attack traffic only):
# node N stacks the N-th slice of every attack category
part1, part2, part3 = (
    pd.concat(pieces)
    for pieces in (
        [generic1, exploits1, dos1, recon1],
        [generic2, exploits2, dos2, recon2],
        [generic3, exploits3, dos3, recon3],
    )
)

# Export each node to its own CSV file, without the index column
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3C-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3C-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3C-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

#### Cat3D

In [121]:
# Create the partitions

# Generic attacks: consecutive slices of 21369, 2914 and 8912 rows
# (the row counts displayed above for exploits, dos and reconnaissance)
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic1 = generic_rows.iloc[:21369]
generic2 = generic_rows.iloc[21369:21369 + 2914]
generic3 = generic_rows.iloc[21369 + 2914:21369 + 2914 + 8912]

# Exploits, DoS and Reconnaissance are not split: each category is taken whole
exploits1 = complete.loc[complete['attack_cat'] == "exploits"]
dos1 = complete.loc[complete['attack_cat'] == "dos"]
recon1 = complete.loc[complete['attack_cat'] == "reconnaissance"]

In [122]:
# Assemble the three Cat3D node datasets: each node pairs one generic slice
# with one complete attack category (exploits, dos, reconnaissance)
part1, part2, part3 = (
    pd.concat(pieces)
    for pieces in (
        [generic1, exploits1],
        [generic2, dos1],
        [generic3, recon1],
    )
)

# Export each node to its own CSV file, without the index column
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3D-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3D-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3D-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

#### Cat3E

In [123]:
# Create the partitions

# No category is split here: every attack category is taken whole
generic1 = complete.loc[complete['attack_cat'] == "generic"]
exploits1 = complete.loc[complete['attack_cat'] == "exploits"]
dos1 = complete.loc[complete['attack_cat'] == "dos"]
recon1 = complete.loc[complete['attack_cat'] == "reconnaissance"]

In [124]:
# Assemble the three Cat3E node datasets: node 1 holds all generic attacks,
# node 2 all exploits, node 3 all DoS and Reconnaissance rows
part1, part2, part3 = (
    pd.concat(pieces)
    for pieces in (
        [generic1],
        [exploits1],
        [dos1, recon1],
    )
)

# Export each node to its own CSV file, without the index column
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3E-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3E-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Cat3E-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

### Corrupted datasets

These datasets are based on the idea of corrupting one local node's model by applying label switching while that node's partition is created. The name of each variation encodes the percentage of randomly selected samples whose labels are switched and the node in which the switching is performed. The project does not consider more than one corrupted node: with 3 nodes that would mean the majority is corrupted, and with 5 or 7 nodes it would further complicate the search for a robust aggregation function. 

#### Corr5%13C
5% of the samples of node 1 in the variation 3C are affected by label switching

In [11]:
# Create the partitions (same slicing as variation 3C)

# Normal traffic: three consecutive slices of 64412 rows each
normal_rows = complete.loc[complete['label'] == 0]
normal1 = normal_rows.iloc[:64412]
normal2 = normal_rows.iloc[64412:2 * 64412]
normal3 = normal_rows.iloc[2 * 64412:3 * 64412]

# Generic attacks: two slices of 53347 rows, the third takes every remaining row
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic1 = generic_rows.iloc[:53347]
generic2 = generic_rows.iloc[53347:2 * 53347]
generic3 = generic_rows.iloc[2 * 53347:]

# Exploits: two slices of 7123 rows, the third takes every remaining row
exploits_rows = complete.loc[complete['attack_cat'] == "exploits"]
exploits1 = exploits_rows.iloc[:7123]
exploits2 = exploits_rows.iloc[7123:2 * 7123]
exploits3 = exploits_rows.iloc[2 * 7123:]

# DoS: two slices of 971 rows, the third takes every remaining row
dos_rows = complete.loc[complete['attack_cat'] == "dos"]
dos1 = dos_rows.iloc[:971]
dos2 = dos_rows.iloc[971:2 * 971]
dos3 = dos_rows.iloc[2 * 971:]

# Reconnaissance: two slices of 2971 rows, the third takes every remaining row
recon_rows = complete.loc[complete['attack_cat'] == "reconnaissance"]
recon1 = recon_rows.iloc[:2971]
recon2 = recon_rows.iloc[2971:2 * 2971]
recon3 = recon_rows.iloc[2 * 2971:]

In [12]:
# Assemble the three node datasets before any corruption is applied:
# node N stacks the N-th slice of every traffic type
part1, part2, part3 = (
    pd.concat(pieces)
    for pieces in (
        [normal1, generic1, exploits1, dos1, recon1],
        [normal2, generic2, exploits2, dos2, recon2],
        [normal3, generic3, exploits3, dos3, recon3],
    )
)

In [17]:
# Label switching on node 1: a random 2.5% of each class is moved to the opposite class.
# NOTE(review): the section title announces 5%; 2.5% of each class amounts to 2.5% of the
# node's rows when both classes have the same size -- confirm the intended rate.

# Normal rows (label 0) drawn at random and relabelled as "generic" attacks (label 1)
part1_sample_label0 = (
    part1[part1['label'] == 0]
    .sample(frac=0.025, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Attack rows (label 1) drawn at random and relabelled as normal traffic (label 0)
part1_sample_label1 = (
    part1[part1['label'] == 1]
    .sample(frac=0.025, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Remove the original version of every switched row, then append the relabelled copies
part1_untouched = part1.drop(part1_sample_label0.index).drop(part1_sample_label1.index)
part1_concatenated = pd.concat([part1_untouched, part1_sample_label0, part1_sample_label1])

# Shuffle the rows with a fixed seed. part1 is overwritten here, so the cell that
# builds part1 has to be re-run before this cell is executed again.
part1 = part1_concatenated.sample(frac=1, random_state=42)

In [18]:
# Export the corrupted node 1 and the untouched nodes 2 and 3 to CSV, without the index column
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%13C-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%13C-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%13C-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

#### Corr25%13C
25% of the samples of node 1 in the variation 3C are affected by label switching

In [19]:
# Create the partitions (same slicing as variation 3C)

# Normal traffic: three consecutive slices of 64412 rows each
normal_rows = complete.loc[complete['label'] == 0]
normal1 = normal_rows.iloc[:64412]
normal2 = normal_rows.iloc[64412:2 * 64412]
normal3 = normal_rows.iloc[2 * 64412:3 * 64412]

# Generic attacks: two slices of 53347 rows, the third takes every remaining row
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic1 = generic_rows.iloc[:53347]
generic2 = generic_rows.iloc[53347:2 * 53347]
generic3 = generic_rows.iloc[2 * 53347:]

# Exploits: two slices of 7123 rows, the third takes every remaining row
exploits_rows = complete.loc[complete['attack_cat'] == "exploits"]
exploits1 = exploits_rows.iloc[:7123]
exploits2 = exploits_rows.iloc[7123:2 * 7123]
exploits3 = exploits_rows.iloc[2 * 7123:]

# DoS: two slices of 971 rows, the third takes every remaining row
dos_rows = complete.loc[complete['attack_cat'] == "dos"]
dos1 = dos_rows.iloc[:971]
dos2 = dos_rows.iloc[971:2 * 971]
dos3 = dos_rows.iloc[2 * 971:]

# Reconnaissance: two slices of 2971 rows, the third takes every remaining row
recon_rows = complete.loc[complete['attack_cat'] == "reconnaissance"]
recon1 = recon_rows.iloc[:2971]
recon2 = recon_rows.iloc[2971:2 * 2971]
recon3 = recon_rows.iloc[2 * 2971:]

In [20]:
# Assemble the dataset of each node N = 1, 2, 3 by stacking its class slices
# in the order normal, generic, exploits, dos, reconnaissance.
node1_slices = [normal1, generic1, exploits1, dos1, recon1]
node2_slices = [normal2, generic2, exploits2, dos2, recon2]
node3_slices = [normal3, generic3, exploits3, dos3, recon3]

part1 = pd.concat(node1_slices)
part2 = pd.concat(node2_slices)
part3 = pd.concat(node3_slices)

In [21]:
# Label-switching corruption of node 1 (part1).
# The same fraction is drawn independently from each class, so roughly that
# fraction of all rows of part1 ends up relabelled.
# NOTE(review): the exported files are named Corr25%, but only 12.5% of each
# class is flipped here — confirm this is the intended corruption level.
# NOTE: part1 is overwritten at the end of this cell, so re-running it without
# re-running the partition cells applies a second round of flips.
flip_frac = 0.125

# Randomly select 12.5% of the samples with label 0 and relabel them as attacks
# (label 1, attack_cat "generic"). .assign() returns a new frame, so nothing is
# written onto a slice of part1.
part1_sample_label0 = (
    part1[part1['label'] == 0]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Randomly select 12.5% of the samples with label 1 and relabel them as normal
# traffic (label 0, attack_cat "normal").
part1_sample_label1 = (
    part1[part1['label'] == 1]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Remove the original versions of the flipped rows, then append the relabelled
# copies (label-0 flips first, label-1 flips second).
part1_flipped_index = part1_sample_label0.index.append(part1_sample_label1.index)
part1_concatenated = pd.concat([
    part1.drop(index=part1_flipped_index),
    part1_sample_label0,
    part1_sample_label1,
])

# Shuffle the rows so the relabelled samples are not grouped at the end
part1 = part1_concatenated.sample(frac=1, random_state=42)

In [22]:
# Write the three node datasets of the Corr25%13C variation to CSV,
# leaving the DataFrame index out of the files.
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%13C-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%13C-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%13C-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

#### Corr50%13C
50% of the samples of node 1 in the variation 3C are affected by label switching

In [23]:
# Build the per-class slices for the three nodes.
# Every class is filtered out of `complete` once and then cut by row position.

# Normal traffic (label == 0): positions [0, 64412), [64412, 2*64412), [2*64412, 3*64412)
normal_rows = complete.loc[complete['label'] == 0]
normal1 = normal_rows.iloc[0:64412]
normal2 = normal_rows.iloc[64412:2 * 64412]
normal3 = normal_rows.iloc[2 * 64412:3 * 64412]

# Generic attacks: 53347 positions for node 1, 53347 for node 2, the remainder for node 3
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic1 = generic_rows.iloc[0:53347]
generic2 = generic_rows.iloc[53347:2 * 53347]
generic3 = generic_rows.iloc[2 * 53347:]

# Exploits: 7123 positions for node 1, 7123 for node 2, the remainder for node 3
exploits_rows = complete.loc[complete['attack_cat'] == "exploits"]
exploits1 = exploits_rows.iloc[0:7123]
exploits2 = exploits_rows.iloc[7123:2 * 7123]
exploits3 = exploits_rows.iloc[2 * 7123:]

# DoS: 971 positions for node 1, 971 for node 2, the remainder for node 3
dos_rows = complete.loc[complete['attack_cat'] == "dos"]
dos1 = dos_rows.iloc[0:971]
dos2 = dos_rows.iloc[971:2 * 971]
dos3 = dos_rows.iloc[2 * 971:]

# Reconnaissance: 2971 positions for node 1, 2971 for node 2, the remainder for node 3
recon_rows = complete.loc[complete['attack_cat'] == "reconnaissance"]
recon1 = recon_rows.iloc[0:2971]
recon2 = recon_rows.iloc[2971:2 * 2971]
recon3 = recon_rows.iloc[2 * 2971:]

In [24]:
# Assemble the dataset of each node N = 1, 2, 3 by stacking its class slices
# in the order normal, generic, exploits, dos, reconnaissance.
node1_slices = [normal1, generic1, exploits1, dos1, recon1]
node2_slices = [normal2, generic2, exploits2, dos2, recon2]
node3_slices = [normal3, generic3, exploits3, dos3, recon3]

part1 = pd.concat(node1_slices)
part2 = pd.concat(node2_slices)
part3 = pd.concat(node3_slices)

In [25]:
# Label-switching corruption of node 1 (part1).
# The same fraction is drawn independently from each class, so roughly that
# fraction of all rows of part1 ends up relabelled.
# NOTE(review): the exported files are named Corr50%, but only 25% of each
# class is flipped here — confirm this is the intended corruption level.
# NOTE: part1 is overwritten at the end of this cell, so re-running it without
# re-running the partition cells applies a second round of flips.
flip_frac = 0.25

# Randomly select 25% of the samples with label 0 and relabel them as attacks
# (label 1, attack_cat "generic"). .assign() returns a new frame, so nothing is
# written onto a slice of part1.
part1_sample_label0 = (
    part1[part1['label'] == 0]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Randomly select 25% of the samples with label 1 and relabel them as normal
# traffic (label 0, attack_cat "normal").
part1_sample_label1 = (
    part1[part1['label'] == 1]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Remove the original versions of the flipped rows, then append the relabelled
# copies (label-0 flips first, label-1 flips second).
part1_flipped_index = part1_sample_label0.index.append(part1_sample_label1.index)
part1_concatenated = pd.concat([
    part1.drop(index=part1_flipped_index),
    part1_sample_label0,
    part1_sample_label1,
])

# Shuffle the rows so the relabelled samples are not grouped at the end
part1 = part1_concatenated.sample(frac=1, random_state=42)

In [26]:
# Write the three node datasets of the Corr50%13C variation to CSV,
# leaving the DataFrame index out of the files.
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr50%13C-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr50%13C-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr50%13C-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

#### Corr5%23G

In [27]:
# Build the per-class slices for the three nodes.
# Every class is filtered out of `complete` once and then cut by row position.

# Normal traffic (label == 0): 25000 positions for node 1, the next 68236 for
# node 2 and the next 100000 for node 3
normal_rows = complete.loc[complete['label'] == 0]
normal_end1 = 25000
normal_end2 = 25000 + 68236
normal_end3 = 25000 + 68236 + 100000
normal1 = normal_rows.iloc[0:normal_end1]
normal2 = normal_rows.iloc[normal_end1:normal_end2]
normal3 = normal_rows.iloc[normal_end2:normal_end3]

# Generic attacks: 13935 positions for node 1, the next 57171 for node 2,
# the remainder for node 3
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic_end1 = 13935
generic_end2 = 13935 + 57171
generic1 = generic_rows.iloc[0:generic_end1]
generic2 = generic_rows.iloc[generic_end1:generic_end2]
generic3 = generic_rows.iloc[generic_end2:]

# Exploits: 7123 positions for node 1, 7123 for node 2, the remainder for node 3
exploits_rows = complete.loc[complete['attack_cat'] == "exploits"]
exploits1 = exploits_rows.iloc[0:7123]
exploits2 = exploits_rows.iloc[7123:2 * 7123]
exploits3 = exploits_rows.iloc[2 * 7123:]

# DoS: 971 positions for node 1, 971 for node 2, the remainder for node 3
dos_rows = complete.loc[complete['attack_cat'] == "dos"]
dos1 = dos_rows.iloc[0:971]
dos2 = dos_rows.iloc[971:2 * 971]
dos3 = dos_rows.iloc[2 * 971:]

# Reconnaissance: 2971 positions for node 1, 2971 for node 2, the remainder for node 3
recon_rows = complete.loc[complete['attack_cat'] == "reconnaissance"]
recon1 = recon_rows.iloc[0:2971]
recon2 = recon_rows.iloc[2971:2 * 2971]
recon3 = recon_rows.iloc[2 * 2971:]

In [28]:
# Assemble the dataset of each node N = 1, 2, 3 by stacking its class slices.
# Nodes 1 and 2 stack dos before reconnaissance; node 3 stacks reconnaissance
# before dos.
node1_slices = [normal1, generic1, exploits1, dos1, recon1]
node2_slices = [normal2, generic2, exploits2, dos2, recon2]
node3_slices = [normal3, generic3, exploits3, recon3, dos3]

part1 = pd.concat(node1_slices)
part2 = pd.concat(node2_slices)
part3 = pd.concat(node3_slices)


In [29]:
# Label-switching corruption of node 2 (part2).
# pandas is already imported as `pd` in the notebook's first cell, so it is not
# re-imported here.
# The same fraction is drawn independently from each class, so roughly that
# fraction of all rows of part2 ends up relabelled.
# NOTE(review): the exported files are named Corr5%, but only 2.5% of each
# class is flipped here — confirm this is the intended corruption level.
# NOTE: part2 is overwritten at the end of this cell, so re-running it without
# re-running the partition cells applies a second round of flips.
flip_frac = 0.025

# Randomly select 2.5% of the samples with label 0 and relabel them as attacks
# (label 1, attack_cat "generic"). .assign() returns a new frame, so nothing is
# written onto a slice of part2.
part2_sample_label0 = (
    part2[part2['label'] == 0]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Randomly select 2.5% of the samples with label 1 and relabel them as normal
# traffic (label 0, attack_cat "normal").
part2_sample_label1 = (
    part2[part2['label'] == 1]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Remove the original versions of the flipped rows, then append the relabelled
# copies (label-0 flips first, label-1 flips second).
part2_flipped_index = part2_sample_label0.index.append(part2_sample_label1.index)
part2_concatenated = pd.concat([
    part2.drop(index=part2_flipped_index),
    part2_sample_label0,
    part2_sample_label1,
])

# Shuffle the rows so the relabelled samples are not grouped at the end
part2 = part2_concatenated.sample(frac=1, random_state=42)


In [30]:
# Write the three node datasets of the Corr5%23G variation to CSV,
# leaving the DataFrame index out of the files.
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%23G-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%23G-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr5%23G-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

#### Corr25%23G

In [31]:
# Build the per-class slices for the three nodes.
# Every class is filtered out of `complete` once and then cut by row position.

# Normal traffic (label == 0): 25000 positions for node 1, the next 68236 for
# node 2 and the next 100000 for node 3
normal_rows = complete.loc[complete['label'] == 0]
normal_end1 = 25000
normal_end2 = 25000 + 68236
normal_end3 = 25000 + 68236 + 100000
normal1 = normal_rows.iloc[0:normal_end1]
normal2 = normal_rows.iloc[normal_end1:normal_end2]
normal3 = normal_rows.iloc[normal_end2:normal_end3]

# Generic attacks: 13935 positions for node 1, the next 57171 for node 2,
# the remainder for node 3
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic_end1 = 13935
generic_end2 = 13935 + 57171
generic1 = generic_rows.iloc[0:generic_end1]
generic2 = generic_rows.iloc[generic_end1:generic_end2]
generic3 = generic_rows.iloc[generic_end2:]

# Exploits: 7123 positions for node 1, 7123 for node 2, the remainder for node 3
exploits_rows = complete.loc[complete['attack_cat'] == "exploits"]
exploits1 = exploits_rows.iloc[0:7123]
exploits2 = exploits_rows.iloc[7123:2 * 7123]
exploits3 = exploits_rows.iloc[2 * 7123:]

# DoS: 971 positions for node 1, 971 for node 2, the remainder for node 3
dos_rows = complete.loc[complete['attack_cat'] == "dos"]
dos1 = dos_rows.iloc[0:971]
dos2 = dos_rows.iloc[971:2 * 971]
dos3 = dos_rows.iloc[2 * 971:]

# Reconnaissance: 2971 positions for node 1, 2971 for node 2, the remainder for node 3
recon_rows = complete.loc[complete['attack_cat'] == "reconnaissance"]
recon1 = recon_rows.iloc[0:2971]
recon2 = recon_rows.iloc[2971:2 * 2971]
recon3 = recon_rows.iloc[2 * 2971:]

In [32]:
# Assemble the dataset of each node N = 1, 2, 3 by stacking its class slices.
# Nodes 1 and 2 stack dos before reconnaissance; node 3 stacks reconnaissance
# before dos.
node1_slices = [normal1, generic1, exploits1, dos1, recon1]
node2_slices = [normal2, generic2, exploits2, dos2, recon2]
node3_slices = [normal3, generic3, exploits3, recon3, dos3]

part1 = pd.concat(node1_slices)
part2 = pd.concat(node2_slices)
part3 = pd.concat(node3_slices)


In [33]:
# Label-switching corruption of node 2 (part2).
# pandas is already imported as `pd` in the notebook's first cell, so it is not
# re-imported here.
# The same fraction is drawn independently from each class, so roughly that
# fraction of all rows of part2 ends up relabelled.
# NOTE(review): the exported files are named Corr25%, but only 12.5% of each
# class is flipped here — confirm this is the intended corruption level.
# NOTE: part2 is overwritten at the end of this cell, so re-running it without
# re-running the partition cells applies a second round of flips.
flip_frac = 0.125

# Randomly select 12.5% of the samples with label 0 and relabel them as attacks
# (label 1, attack_cat "generic"). .assign() returns a new frame, so nothing is
# written onto a slice of part2.
part2_sample_label0 = (
    part2[part2['label'] == 0]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Randomly select 12.5% of the samples with label 1 and relabel them as normal
# traffic (label 0, attack_cat "normal").
part2_sample_label1 = (
    part2[part2['label'] == 1]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Remove the original versions of the flipped rows, then append the relabelled
# copies (label-0 flips first, label-1 flips second).
part2_flipped_index = part2_sample_label0.index.append(part2_sample_label1.index)
part2_concatenated = pd.concat([
    part2.drop(index=part2_flipped_index),
    part2_sample_label0,
    part2_sample_label1,
])

# Shuffle the rows so the relabelled samples are not grouped at the end
part2 = part2_concatenated.sample(frac=1, random_state=42)


In [34]:
# Write the three node datasets of the Corr25%23G variation to CSV,
# leaving the DataFrame index out of the files.
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%23G-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%23G-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr25%23G-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)

#### Corr50%23G

In [35]:
# Build the per-class slices for the three nodes.
# Every class is filtered out of `complete` once and then cut by row position.

# Normal traffic (label == 0): 25000 positions for node 1, the next 68236 for
# node 2 and the next 100000 for node 3
normal_rows = complete.loc[complete['label'] == 0]
normal_end1 = 25000
normal_end2 = 25000 + 68236
normal_end3 = 25000 + 68236 + 100000
normal1 = normal_rows.iloc[0:normal_end1]
normal2 = normal_rows.iloc[normal_end1:normal_end2]
normal3 = normal_rows.iloc[normal_end2:normal_end3]

# Generic attacks: 13935 positions for node 1, the next 57171 for node 2,
# the remainder for node 3
generic_rows = complete.loc[complete['attack_cat'] == "generic"]
generic_end1 = 13935
generic_end2 = 13935 + 57171
generic1 = generic_rows.iloc[0:generic_end1]
generic2 = generic_rows.iloc[generic_end1:generic_end2]
generic3 = generic_rows.iloc[generic_end2:]

# Exploits: 7123 positions for node 1, 7123 for node 2, the remainder for node 3
exploits_rows = complete.loc[complete['attack_cat'] == "exploits"]
exploits1 = exploits_rows.iloc[0:7123]
exploits2 = exploits_rows.iloc[7123:2 * 7123]
exploits3 = exploits_rows.iloc[2 * 7123:]

# DoS: 971 positions for node 1, 971 for node 2, the remainder for node 3
dos_rows = complete.loc[complete['attack_cat'] == "dos"]
dos1 = dos_rows.iloc[0:971]
dos2 = dos_rows.iloc[971:2 * 971]
dos3 = dos_rows.iloc[2 * 971:]

# Reconnaissance: 2971 positions for node 1, 2971 for node 2, the remainder for node 3
recon_rows = complete.loc[complete['attack_cat'] == "reconnaissance"]
recon1 = recon_rows.iloc[0:2971]
recon2 = recon_rows.iloc[2971:2 * 2971]
recon3 = recon_rows.iloc[2 * 2971:]

In [36]:
# Assemble the dataset of each node N = 1, 2, 3 by stacking its class slices.
# Nodes 1 and 2 stack dos before reconnaissance; node 3 stacks reconnaissance
# before dos.
node1_slices = [normal1, generic1, exploits1, dos1, recon1]
node2_slices = [normal2, generic2, exploits2, dos2, recon2]
node3_slices = [normal3, generic3, exploits3, recon3, dos3]

part1 = pd.concat(node1_slices)
part2 = pd.concat(node2_slices)
part3 = pd.concat(node3_slices)


In [37]:
# Label-switching corruption of node 2 (part2).
# pandas is already imported as `pd` in the notebook's first cell, so it is not
# re-imported here.
# The same fraction is drawn independently from each class, so roughly that
# fraction of all rows of part2 ends up relabelled.
# NOTE(review): the exported files are named Corr50%, but only 25% of each
# class is flipped here — confirm this is the intended corruption level.
# NOTE: part2 is overwritten at the end of this cell, so re-running it without
# re-running the partition cells applies a second round of flips.
flip_frac = 0.25

# Randomly select 25% of the samples with label 0 and relabel them as attacks
# (label 1, attack_cat "generic"). .assign() returns a new frame, so nothing is
# written onto a slice of part2.
part2_sample_label0 = (
    part2[part2['label'] == 0]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=1, attack_cat="generic")
)

# Randomly select 25% of the samples with label 1 and relabel them as normal
# traffic (label 0, attack_cat "normal").
part2_sample_label1 = (
    part2[part2['label'] == 1]
    .sample(frac=flip_frac, random_state=42)
    .assign(label=0, attack_cat="normal")
)

# Remove the original versions of the flipped rows, then append the relabelled
# copies (label-0 flips first, label-1 flips second).
part2_flipped_index = part2_sample_label0.index.append(part2_sample_label1.index)
part2_concatenated = pd.concat([
    part2.drop(index=part2_flipped_index),
    part2_sample_label0,
    part2_sample_label1,
])

# Shuffle the rows so the relabelled samples are not grouped at the end
part2 = part2_concatenated.sample(frac=1, random_state=42)


In [38]:
# Write the three node datasets of the Corr50%23G variation to CSV,
# leaving the DataFrame index out of the files.
for node_frame, csv_path in (
    (part1, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr50%23G-Part1.csv'),
    (part2, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr50%23G-Part2.csv'),
    (part3, 'C:/Users/UX430/Documents/thesis/datasets/UNSW-NB15/UNSW-NB15-Train-Basic-Corr50%23G-Part3.csv'),
):
    node_frame.to_csv(csv_path, index=False)