# 1. Imports

In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# 2. Importing the Datasets

In [2]:
ddos_portmap_2018_df = pd.read_csv("../data/train_test/CSE_CIC_IDS2018/ddos_portmap_2018_train.csv", index_col=0)

In [None]:
ddos_ldap_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_ldap_2019_train.csv")

In [None]:
ddos_netbios_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_netbios_2019_train.csv")

In [None]:
ddos_syn_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_syn_2019_train.csv")

In [None]:
ddos_udp_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_udp_2019_train.csv")

In [6]:
target_feature = 'Label'

# 3. Feature Selection Method 1 - Variance Threshold

In [33]:
# Investigating the Variance Threshold Behavior

def VarianceThreshold_Helper(X_train, threshold):
    
    new_X_train = X_train.copy()

    constant_filter = VarianceThreshold(threshold= threshold)
    constant_filter.fit(new_X_train)

    len(new_X_train.columns[constant_filter.get_support()])

    constant_columns = [column for column in new_X_train.columns
                        if column not in new_X_train.columns[constant_filter.get_support()]]

    new_X_train.drop(labels=constant_columns, axis=1, inplace=True)
    
    return new_X_train.columns.tolist()

In [34]:
X_train_portmap = ddos_portmap_2018_df.drop([target_feature], axis=1)
y_train_portmap = ddos_portmap_2018_df[target_feature]

In [35]:
X_train_portmap.columns

Index(['DestinationPort', 'FlowDuration', 'TotalFwdPackets',
       'TotalBackwardPackets', 'TotalLengthofFwdPackets',
       'TotalLengthofBwdPackets', 'FwdPacketLengthMax', 'FwdPacketLengthMin',
       'FwdPacketLengthMean', 'FwdPacketLengthStd', 'BwdPacketLengthMax',
       'BwdPacketLengthMin', 'BwdPacketLengthMean', 'BwdPacketLengthStd',
       'FlowBytes/s', 'FlowPackets/s', 'FlowIATMean', 'FlowIATStd',
       'FlowIATMax', 'FlowIATMin', 'FwdIATTotal', 'FwdIATMean', 'FwdIATStd',
       'FwdIATMax', 'FwdIATMin', 'BwdIATTotal', 'BwdIATMean', 'BwdIATStd',
       'BwdIATMax', 'BwdIATMin', 'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags',
       'BwdURGFlags', 'FwdHeaderLength', 'BwdHeaderLength', 'FwdPackets/s',
       'BwdPackets/s', 'MinPacketLength', 'MaxPacketLength',
       'PacketLengthMean', 'PacketLengthStd', 'PacketLengthVariance',
       'FINFlagCount', 'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount',
       'ACKFlagCount', 'URGFlagCount', 'CWEFlagCount', 'ECEFlagCount',
       '

In [37]:
for t in [0.00001, 0.0001, 0.001, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3]:
    features = VarianceThreshold_Helper(X_train = X_train_portmap, threshold = t)
    print("Variance Threshold "+str(t)+": "+". Features: ("+str(len(features))+") "+str(features))
    print("_______________________________________________________________________________")

Variance Threshold 1e-05: . Features: (68) ['DestinationPort', 'FlowDuration', 'TotalFwdPackets', 'TotalBackwardPackets', 'TotalLengthofFwdPackets', 'TotalLengthofBwdPackets', 'FwdPacketLengthMax', 'FwdPacketLengthMin', 'FwdPacketLengthMean', 'FwdPacketLengthStd', 'BwdPacketLengthMax', 'BwdPacketLengthMin', 'BwdPacketLengthMean', 'BwdPacketLengthStd', 'FlowBytes/s', 'FlowPackets/s', 'FlowIATMean', 'FlowIATStd', 'FlowIATMax', 'FlowIATMin', 'FwdIATTotal', 'FwdIATMean', 'FwdIATStd', 'FwdIATMax', 'FwdIATMin', 'BwdIATTotal', 'BwdIATMean', 'BwdIATStd', 'BwdIATMax', 'BwdIATMin', 'FwdPSHFlags', 'FwdHeaderLength', 'BwdHeaderLength', 'FwdPackets/s', 'BwdPackets/s', 'MinPacketLength', 'MaxPacketLength', 'PacketLengthMean', 'PacketLengthStd', 'PacketLengthVariance', 'FINFlagCount', 'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount', 'ACKFlagCount', 'URGFlagCount', 'ECEFlagCount', 'Down/UpRatio', 'AveragePacketSize', 'AvgFwdSegmentSize', 'AvgBwdSegmentSize', 'FwdHeaderLength.1', 'SubflowFwdPackets', 'S

ValueError: No feature in X meets the variance threshold 0.30000

In [45]:
def stable_VarianceThreshold(train_df, num_splits, threshold):
    
    selectedFeatures = []
    
    df_split = np.array_split(train_df, num_splits)
    
    for small_df in df_split:
        X_train = small_df.drop([target_feature], axis=1)
        y_train = small_df[target_feature]
        
        selectedFeatures = selectedFeatures + VarianceThreshold_Helper(X_train, threshold)
        
    for feature in train_df.columns.tolist():
        print("Feature: "+feature+". Count: "+str(selectedFeatures.count(feature))+"/"+str(num_splits))

In [48]:
portmap_ranking = stable_VarianceThreshold(ddos_portmap_2018_df, 5, 0.00005)

Feature: DestinationPort. Count: 5/5
Feature: FlowDuration. Count: 5/5
Feature: TotalFwdPackets. Count: 3/5
Feature: TotalBackwardPackets. Count: 3/5
Feature: TotalLengthofFwdPackets. Count: 5/5
Feature: TotalLengthofBwdPackets. Count: 3/5
Feature: FwdPacketLengthMax. Count: 5/5
Feature: FwdPacketLengthMin. Count: 5/5
Feature: FwdPacketLengthMean. Count: 5/5
Feature: FwdPacketLengthStd. Count: 5/5
Feature: BwdPacketLengthMax. Count: 5/5
Feature: BwdPacketLengthMin. Count: 5/5
Feature: BwdPacketLengthMean. Count: 5/5
Feature: BwdPacketLengthStd. Count: 5/5
Feature: FlowBytes/s. Count: 4/5
Feature: FlowPackets/s. Count: 5/5
Feature: FlowIATMean. Count: 5/5
Feature: FlowIATStd. Count: 5/5
Feature: FlowIATMax. Count: 5/5
Feature: FlowIATMin. Count: 4/5
Feature: FwdIATTotal. Count: 5/5
Feature: FwdIATMean. Count: 5/5
Feature: FwdIATStd. Count: 5/5
Feature: FwdIATMax. Count: 5/5
Feature: FwdIATMin. Count: 5/5
Feature: BwdIATTotal. Count: 5/5
Feature: BwdIATMean. Count: 5/5
Feature: BwdIATStd

In [None]:
# Save portmap_ranking in csv

# 4. Saving the Ranking

CODE TO SAVE A CSV FILE WITH:
- 'feature' is a column of all of the features in the original dataset
- 'ranking' is a column with the rankings (in range 0 to 1) of the feature