In [1]:
import pandas as pd
import numpy as np
import os
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', None, 'display.max_columns', None)
print(pd.__version__)

1.5.3


In [2]:
# Define the path of the folder of datas
folder_path = '/Datasets/CIC_IDS2018/data'
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]    # Read all of dataset

# Create an empty dictionary to store the dataframes
dfs = {}

# Loop through each CSV file and read it into a dataframe
for file in csv_files:
    df_name = os.path.splitext(file)[0]          # Get the datafeame name from te file name
    file_path = os.path.join(folder_path, file)  # Get the full file path
    print(file,os.path.getsize(file_path))
    chunks=pd.read_table(file_path,chunksize=1000000,encoding='unicode_escape', sep=',', low_memory=False)
    df=pd.DataFrame()
    i=0
    for chunk in chunks:
         i+=1
         print("chunck :",i,chunk.shape)
         df=pd.concat([df,chunk])
    dfs[df_name] = df            # Store the dataframe in the dictionary with the same name as the file (e.g. dfs[data1] = df1)

02-23-2018.csv 382840456
chunck : 1 (1000000, 80)
chunck : 2 (48575, 80)
03-02-2018.csv 352368373
chunck : 1 (1000000, 80)
chunck : 2 (48575, 80)
02-21-2018.csv 328893673
chunck : 1 (1000000, 80)
chunck : 2 (48575, 80)
02-22-2018.csv 382636202
chunck : 1 (1000000, 80)
chunck : 2 (48575, 80)
02-15-2018.csv 375945899
chunck : 1 (1000000, 80)
chunck : 2 (48575, 80)
02-16-2018.csv 333723605
chunck : 1 (1000000, 80)
chunck : 2 (48575, 80)
02-14-2018.csv 358223333
chunck : 1 (1000000, 80)
chunck : 2 (48575, 80)
02-20-2018.csv 4054925350
chunck : 1 (1000000, 84)
chunck : 2 (1000000, 84)
chunck : 3 (1000000, 84)
chunck : 4 (1000000, 84)
chunck : 5 (1000000, 84)
chunck : 6 (1000000, 84)
chunck : 7 (1000000, 84)
chunck : 8 (948748, 84)
02-28-2018.csv 209249758
chunck : 1 (613104, 80)
03-01-2018.csv 107842858
chunck : 1 (331125, 80)


In [3]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    print(f'initial shape of {key} = {df.shape}')

initial shape of 02-23-2018 = (1048575, 80)
initial shape of 03-02-2018 = (1048575, 80)
initial shape of 02-21-2018 = (1048575, 80)
initial shape of 02-22-2018 = (1048575, 80)
initial shape of 02-15-2018 = (1048575, 80)
initial shape of 02-16-2018 = (1048575, 80)
initial shape of 02-14-2018 = (1048575, 80)
initial shape of 02-20-2018 = (7948748, 84)
initial shape of 02-28-2018 = (613104, 80)
initial shape of 03-01-2018 = (331125, 80)


In [4]:
# Drop Flow ID, Src IP and Dst IP columns of the 02-20-2018 dataset for being strings
for item in dfs:
    if item == '02-20-2018':
        dfs['02-20-2018'].drop(['Flow ID','Src IP','Dst IP', 'Src Port'], axis=1, inplace=True)

In [5]:
for key in dfs.keys():
    df = dfs[key]   # Get the dataframe corresponding to the key
    # print(f"Dataframe: '{key}'\n")

    # replace +ve and -ve infinity with NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Drop all NaN values
    df.dropna(inplace=True)

In [6]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')


for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    df['Timestamp'] = (df['Timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

In [7]:
for key in dfs.keys():
    df = dfs[key]
    for col in df.columns:
        #Check if the datatype of the column is object
        if df[col].dtype == 'object' and col != 'Label':
            # Change all values to numeric, and to NaN if it is a string
            df[col] = pd.to_numeric(df[col], errors='coerce')

In [8]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    # print(f"Dataframe: '{key}'\n")
    count_NA = df.isna().sum()
    # print(count_NA)


for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    df.dropna(inplace=True)

In [9]:
# Outliers Filtering:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key

    # Define a function to filter outliers using Z-score
    def filter_outliers_zscore(data, threshold):
        z_scores = np.abs(stats.zscore(data))
        outlier_mask = (z_scores > threshold).any(axis=1)
        return data[~outlier_mask], data[outlier_mask]

    # Define a threshold value
    threshold = 7

    # The filtering was removing all DDoS-LOIC-UDP, so we will not execute it on them
    if key == '02-21-2018':
        df_temp = df[df['Label'] == 'DDOS attack-LOIC-UDP']
        df = df[df['Label'] != 'DDOS attack-LOIC-UDP']

    # Loop through the columns of the dataframe and filter outliers in each column
    filtered_cols = []
    removed_outliers = []
    for col in df.columns:
        if col != 'Label':
            filtered_col, outliers = filter_outliers_zscore(df[[col]], threshold)

            filtered_cols.append(filtered_col)
            removed_outliers.append(outliers)

    # Combine the filtered columns back into a dataframe
    df_filtered = pd.concat(filtered_cols, axis=1)

    # Combine the removed outliers back into a dataframe
    df_outliers = pd.concat(removed_outliers, axis=1)

    n_outliers = df_outliers.shape[0]
 
    # Assign filtered dataframe columns to original one
    columns = [col for col in df.columns if col != 'Label']
    df.loc[:,columns] = df_filtered.loc[:,columns]

    # Recombine rows from 'DDOS attack-LOIC-UDP'
    if key == '02-21-2018':
        df = pd.concat([df,df_temp])

    values_orig = df.loc[df.index.isin(df_outliers.index), 'Label']
    # print(f'\n{values_orig.value_counts()}')

In [10]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    df.dropna(inplace=True)

In [11]:
# Feature Correlation Filtering
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key

    columns = [col for col in df.columns if col != 'Label']

    corr_matrix = df[columns].corr().abs()

    threshold = 0.99
    # Find features with high correlation
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

    # print(f'\nDataset: {key}')
    # Print features to drop
    # print(f"The following {len(to_drop)} features will be dropped due to high correlation: {to_drop}")

    df = df.drop(to_drop, axis = 1)

In [12]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    # print(f"Dataframe: '{key}'\n")
    # replace +ve and -ve infinity with NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Drop all NaN values
    df.dropna(inplace=True)

In [13]:
for item in dfs:

    if item == '02-14-2018':
        df = dfs['02-14-2018']
        df = df.sample(frac=1) #Randomize rows's sequence
        df2 = df[df["Label"] == "FTP-BruteForce"]
        df3 = df[df["Label"] == "SSH-Bruteforce"]
        df0 = df[df["Label"] == "Benign"][:df2.shape[0]]
        df1 = df[df["Label"] == "Benign"][:df3.shape[0]]
        df_equal_FTP_BruteForce = pd.concat([ df0,df2], axis =0)
        df_equal_SSH_BruteForce = pd.concat([ df1,df3], axis =0)


    if item == '02-15-2018':
        df = dfs['02-15-2018']
        df = df.sample(frac=1) #Randomize rows's sequence
        df2 = df[df["Label"] == "DoS attacks-GoldenEye"]
        df3 = df[df["Label"] == "DoS attacks-Slowloris"]
        df0 = df[df["Label"] == "Benign"][:df2.shape[0]]
        df1 = df[df["Label"] == "Benign"][:df3.shape[0]*100]
        df_equal_DoS_GoldenEye = pd.concat([ df0,df2], axis =0)
        df_equal_DoS_Slowloris = pd.concat([ df1,df3], axis =0)


    if item == '02-16-2018':
        df = dfs['02-16-2018']
        df = df.sample(frac=1) #Randomize rows's sequence
        df3 = df[df["Label"] == "DoS attacks-SlowHTTPTest"]
        df0 = df[df["Label"] == "Benign"]
        df2 = df[df["Label"] == "DoS attacks-Hulk"][:df0.shape[0]]
        df1 = df[df["Label"] == "Benign"][:df3.shape[0]]
        df_equal_DoS_Hulk = pd.concat([ df0,df2], axis =0)
        df_equal_DoS_SlowHTTPTest = pd.concat([ df1,df3], axis =0)


    if item == '02-21-2018':
        df = dfs['02-21-2018']
        df = df.sample(frac=1) #Randomize rows's sequence
        df3 = df[df["Label"] == "DDOS attack-LOIC-UDP"]
        df0 = df[df["Label"] == "Benign"]
        df2 = df[df["Label"] == "DDOS attack-HOIC"][:df0.shape[0]]
        df1 = df[df["Label"] == "Benign"][:df3.shape[0]*100]
        df_equal_DDoS_HOIC = pd.concat([ df0,df2], axis =0)
        df_equal_DDoS_LOIC_UDP = pd.concat([ df1,df3], axis =0)


    if item == '02-28-2018' or item == '03-01-2018':
        df_02_28 = dfs['02-28-2018']
        df_03_01 = dfs['03-01-2018']
        df_inf = pd.concat([ df_02_28,df_03_01], axis =0)
        df_inf = df_inf.sample(frac=1) #Randomize rows's sequence
        df2 = df_inf[df_inf["Label"] == "Infilteration"]
        df1 = df_inf[df_inf["Label"] == "Benign"][:df2.shape[0]]
        df_equal_Infilteration = pd.concat([ df1,df2], axis =0)


    if item == '03-02-2018':
        df = dfs['03-02-2018']
        df = df.sample(frac=1) #Randomize rows's sequence
        df2 = df[df["Label"] == "Bot"]
        df1 = df[df["Label"] == "Benign"][:df2.shape[0]]
        df_equal_Bot = pd.concat([ df1,df2], axis =0)


    if item == '02-23-2018' or item == '02-22-2018':
        df1 = dfs['02-23-2018']
        df2 = dfs['02-22-2018']
        df_BruteForce_Web_XSS = pd.concat([df1,df2], axis = 0)
        df_BruteForce_Web_XSS["Label"] = df_BruteForce_Web_XSS.Label.map(lambda a:"Benign" if a == 'Benign' else "Attack")


    if item == '02-20-2018':
        df = dfs['02-20-2018']
        df = df.sample(frac=1) #Randomize rows's sequence
        df2 = df[df["Label"] == "DDoS attacks-LOIC-HTTP"]
        df1 = df[df["Label"] == "Benign"][:df2.shape[0]]
        df_equal_DDoS_LOIC_HTTP = pd.concat([ df1,df2], axis =0)

In [14]:
# print shape of each dataset
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    print(f'final shape of {key} = {df.shape}')


dfs_final = {}

for name, df in zip(['df_equal_Bot','df_equal_DDoS_HOIC',
                         'df_equal_DDoS_LOIC_UDP','df_equal_DoS_GoldenEye','df_equal_DoS_Hulk',
                         'df_equal_DoS_SlowHTTPTest','df_equal_DoS_Slowloris','df_equal_FTP_BruteForce',
                         'df_equal_Infilteration','df_equal_SSH_BruteForce','df_BruteForce_Web_XSS',
                         'df_equal_DDoS_LOIC_HTTP'],
                         [df_equal_Bot,df_equal_DDoS_HOIC,
                         df_equal_DDoS_LOIC_UDP,df_equal_DoS_GoldenEye,df_equal_DoS_Hulk,
                         df_equal_DoS_SlowHTTPTest,df_equal_DoS_Slowloris,df_equal_FTP_BruteForce,
                         df_equal_Infilteration,df_equal_SSH_BruteForce,df_BruteForce_Web_XSS,
                         df_equal_DDoS_LOIC_HTTP]):
    dfs_final[name] = df

final shape of 02-23-2018 = (1007064, 80)
final shape of 03-02-2018 = (1009488, 80)
final shape of 02-21-2018 = (1048575, 80)
final shape of 02-22-2018 = (1011778, 80)
final shape of 02-15-2018 = (992169, 80)
final shape of 02-16-2018 = (1036539, 80)
final shape of 02-14-2018 = (1011097, 80)
final shape of 02-20-2018 = (7565845, 80)
final shape of 02-28-2018 = (575450, 80)
final shape of 03-01-2018 = (309531, 80)


In [15]:
combined = pd.concat(dfs_final.values())
print(combined['Label'].value_counts())
print(f"shape of final dataset: {combined.shape}")

Benign                      4836385
DDoS attacks-LOIC-HTTP       573347
DoS attacks-Hulk             439126
DDOS attack-HOIC             360833
Bot                          285763
FTP-BruteForce               193354
SSH-Bruteforce               187589
Infilteration                152861
DoS attacks-SlowHTTPTest     139890
DoS attacks-GoldenEye         39924
DoS attacks-Slowloris          2724
DDOS attack-LOIC-UDP           1730
Attack                          544
Name: Label, dtype: int64
shape of final dataset: (7214070, 80)


In [16]:
df = combined
# Replace the values of "Label" with numerical values
new_Label = df['Label'].replace({'Benign':0, 'DDoS attacks-LOIC-HTTP':1,
                                 'DoS attacks-Hulk':2, 'DDOS attack-HOIC':1,
                                 'Bot':3,'FTP-BruteForce':4, 'SSH-Bruteforce':4,
                                 'Infilteration':5, 'DoS attacks-SlowHTTPTest':2,
                                 'DoS attacks-GoldenEye':2, 'DoS attacks-Slowloris':2,
                                 'DDOS attack-LOIC-UDP':1, 'Attack':6})
df['Label'] = new_Label

In [17]:
# print the distribution of every class:
to_drop_Benign = [0]
to_drop_DDoS = [1]
to_drop_DoS = [2]
to_drop_Bot = [3]
to_drop_BruteForce = [4]
to_drop_Infilteration = [5]
to_drop_Attack = [6]

Benign_df = df[df['Label'].isin(to_drop_Benign)];
DDoS_df = df[df['Label'].isin(to_drop_DDoS)];
DoS_df = df[df['Label'].isin(to_drop_DoS)];
Bot_df = df[df['Label'].isin(to_drop_Bot)];
BruteForce_df = df[df['Label'].isin(to_drop_BruteForce)];
Infilteration_df = df[df['Label'].isin(to_drop_Infilteration)];
Attack_df = df[df['Label'].isin(to_drop_Attack)];


# split dataset into labels and rest of it:
X_Benign = Benign_df.drop('Label',1)
Y_Benign = Benign_df.Label
X_DDoS = DDoS_df.drop('Label',1)
Y_DDoS = DDoS_df.Label
X_DoS = DoS_df.drop('Label',1)
Y_DoS = DoS_df.Label
X_Bot = Bot_df.drop('Label',1)
Y_Bot = Bot_df.Label
X_BruteForce = BruteForce_df.drop('Label',1)
Y_BruteForce = BruteForce_df.Label
X_Infilteration = Infilteration_df.drop('Label',1)
Y_Infilteration = Infilteration_df.Label
X_Attack = Attack_df.drop('Label',1)
Y_Attack = Attack_df.Label

In [18]:
from sklearn.preprocessing import MinMaxScaler

colNames=list(df)

# Min_Max normalization:
scaler = MinMaxScaler()

X_Benign1 = scaler.fit_transform(X_Benign)
X_DDoS1 = scaler.fit_transform(X_DDoS)
X_DoS1 = scaler.fit_transform(X_DoS)
X_Bot1 = scaler.fit_transform(X_Bot)
X_BruteForce1 = scaler.fit_transform(X_BruteForce)
X_Infilteration1 = scaler.fit_transform(X_Infilteration)
X_Attack1 = scaler.fit_transform(X_Attack)

In [19]:
# Add the column of label into normalized datset
Benign = np.hstack((X_Benign1, Y_Benign[:, np.newaxis]))
DDoS = np.hstack((X_DDoS1, Y_DDoS[:, np.newaxis]))
DoS = np.hstack((X_DoS1, Y_DoS[:, np.newaxis]))
Bot = np.hstack((X_Bot1, Y_Bot[:, np.newaxis]))
BruteForce = np.hstack((X_BruteForce1, Y_BruteForce[:, np.newaxis]))
Infilteration = np.hstack((X_Infilteration1, Y_Infilteration[:, np.newaxis]))
Attack = np.hstack((X_Attack1, Y_Attack[:, np.newaxis]))

np_data = np.concatenate((Benign, DDoS, DoS, Bot, BruteForce, Infilteration, Attack))
np.random.shuffle(np_data)
print(np_data.shape)

df1 = pd.DataFrame(np_data, columns=colNames)
print(df1.shape)

(7214070, 80)
(7214070, 80)


In [20]:
output_dir = '/Datasets/CIC_IDS2018/'
filename = os.path.join(output_dir, 'normalized_CIC_IDS' + '.csv')
df1.to_csv(filename, index = False)
print(df1['Label'].value_counts())

0.0    4836385
1.0     935910
2.0     621664
4.0     380943
3.0     285763
5.0     152861
6.0        544
Name: Label, dtype: int64
