# 1. Imports

In [6]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from collinearity import SelectNonCollinear

# 2. Importing the Datasets

In [7]:
# Portmap training CSV; the first CSV column becomes the DataFrame index.
ddos_portmap_2018_df = pd.read_csv(
    "../data/train_test/CSE_CIC_IDS2018/ddos_portmap_2018_train.csv",
    index_col=0,
)

In [None]:
# LDAP training CSV; the first CSV column becomes the DataFrame index.
ddos_ldap_2019_df = pd.read_csv(
    "../data/train_test/CIC_DDoS2019/ddos_ldap_2019_train.csv",
    index_col=0,
)

In [None]:
# NetBIOS training CSV; the first CSV column becomes the DataFrame index.
ddos_netbios_2019_df = pd.read_csv(
    "../data/train_test/CIC_DDoS2019/ddos_netbios_2019_train.csv",
    index_col=0,
)

In [None]:
# SYN training CSV; the first CSV column becomes the DataFrame index.
ddos_syn_2019_df = pd.read_csv(
    "../data/train_test/CIC_DDoS2019/ddos_syn_2019_train.csv",
    index_col=0,
)

In [None]:
# UDP training CSV; the first CSV column becomes the DataFrame index.
ddos_udp_2019_df = pd.read_csv(
    "../data/train_test/CIC_DDoS2019/ddos_udp_2019_train.csv",
    index_col=0,
)

In [8]:
# Name of the column that is split off as the target (y) throughout this notebook.
target_feature = "Label"

# 3. Feature Selection - NonCollinear

### Helper Functions

In [31]:
def NonCollinear_Helper(X_train, y_train, threshold):
    """Fit a ``SelectNonCollinear`` selector and return its support mask.

    Parameters
    ----------
    X_train
        Predictor data; a copy is fitted so the caller's object is not the
        one handed to the selector.
    y_train
        Target values passed to ``fit`` alongside the predictors.
    threshold
        Forwarded as the single positional argument of ``SelectNonCollinear``.

    Returns
    -------
    list
        ``selector.get_support()`` converted with ``.tolist()`` -- a mask with
        one entry per predictor column, not a list of column names.
    """
    predictors = X_train.copy()

    non_collinear = SelectNonCollinear(threshold)
    non_collinear.fit(predictors, y_train)

    return non_collinear.get_support().tolist()

In [32]:
def stable_NonCollinear(train_df, num_splits, threshold, verbose):
    """Score each column by how consistently the non-collinear selector keeps it.

    The rows of ``train_df`` are cut into ``num_splits`` consecutive chunks,
    ``NonCollinear_Helper`` is run on every chunk, and a column's rank is the
    fraction of chunks in which it was selected.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data that contains the ``target_feature`` column.
    num_splits : int
        Number of row chunks to evaluate.
    threshold : float
        Passed unchanged to ``NonCollinear_Helper``.
    verbose : bool
        When true, print the selection count of every column.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``train_df`` with the columns ``feature`` and
        ``rank``. The target column is listed as well; it is never part of
        the selector input, so its rank is always 0.
    """
    features = train_df.columns.tolist()
    selection_counts = dict.fromkeys(features, 0)

    # Split row positions instead of the DataFrame itself: the partition is the
    # same, but the DataFrame is not handed to NumPy (a path that recent pandas
    # versions warn about).
    row_chunks = np.array_split(np.arange(len(train_df)), num_splits)

    for row_positions in row_chunks:
        small_df = train_df.iloc[row_positions]
        X_train = small_df.drop([target_feature], axis=1)
        y_train = small_df[target_feature]

        mask = NonCollinear_Helper(X_train, y_train, threshold)
        # NOTE TO ARYAN, PRANAV, AND ANISHA: THIS IS THE LINE YOU SHOULD CHANGE, AFTER IMPLEMENTING YOUR FEATURE
        # SELECTION METHOD

        # The helper returns one support flag per predictor column, so map the
        # flags back to column names before counting.
        for column, keep in zip(X_train.columns, mask):
            if keep:
                selection_counts[column] += 1

    ranks = []
    for feature in features:
        count = selection_counts[feature]
        if verbose:
            print(f"Feature: {feature}. Count: {count}/{num_splits}")
        ranks.append(count / num_splits)

    return pd.DataFrame({'feature': features, 'rank': ranks})

### Investigating a Good Choice of Threshold

In [33]:
# Separate the portmap label column (y) from the remaining predictor columns (X).
y_train_portmap = ddos_portmap_2018_df[target_feature]
X_train_portmap = ddos_portmap_2018_df.drop(columns=[target_feature])

In [34]:
# Display the predictor column names (the target column has already been dropped).
X_train_portmap.columns

Index(['DestinationPort', 'FlowDuration', 'TotalFwdPackets',
       'TotalBackwardPackets', 'TotalLengthofFwdPackets',
       'TotalLengthofBwdPackets', 'FwdPacketLengthMax', 'FwdPacketLengthMin',
       'FwdPacketLengthMean', 'FwdPacketLengthStd', 'BwdPacketLengthMax',
       'BwdPacketLengthMin', 'BwdPacketLengthMean', 'BwdPacketLengthStd',
       'FlowBytes/s', 'FlowPackets/s', 'FlowIATMean', 'FlowIATStd',
       'FlowIATMax', 'FlowIATMin', 'FwdIATTotal', 'FwdIATMean', 'FwdIATStd',
       'FwdIATMax', 'FwdIATMin', 'BwdIATTotal', 'BwdIATMean', 'BwdIATStd',
       'BwdIATMax', 'BwdIATMin', 'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags',
       'BwdURGFlags', 'FwdHeaderLength', 'BwdHeaderLength', 'FwdPackets/s',
       'BwdPackets/s', 'MinPacketLength', 'MaxPacketLength',
       'PacketLengthMean', 'PacketLengthStd', 'PacketLengthVariance',
       'FINFlagCount', 'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount',
       'ACKFlagCount', 'URGFlagCount', 'CWEFlagCount', 'ECEFlagCount',
       '

In [38]:
# Report the position of every NaN entry in the portmap target column.
temp = y_train_portmap.tolist()
for index, label in enumerate(temp):
    if np.isnan(label):
        print("FOUND")
        print(index)

FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND
FOUND


In [35]:
# Sweep thresholds just below 1.0 and report which predictors are kept at each one.
for t in [0.00001, 0.0001, 0.001, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3]:
    threshold = 1.0 - t
    mask = NonCollinear_Helper(X_train_portmap, y_train_portmap, threshold=threshold)
    # The helper returns one support flag per column; translate the flags to
    # column names so the printed count is the number of selected features
    # rather than the total number of columns.
    features = [name for name, keep in zip(X_train_portmap.columns, mask) if keep]
    print("NonCollinear Threshold "+str(threshold)+": "+". Features: ("+str(len(features))+") "+str(features))
    print("_______________________________________________________________________________")

ValueError: Input y contains NaN.

# 4. Saving the Rankings

In [None]:
# Rank the portmap columns over 10 splits, save the ranking, and preview it.
# NOTE(review): 0.00005 looks carried over from a variance-threshold notebook;
# SelectNonCollinear expects a correlation threshold -- confirm the intended value.
portmap_ranking = stable_NonCollinear(ddos_portmap_2018_df, 10, 0.00005, verbose=False)
portmap_ranking.to_csv("../ranking/CSE_CIC_IDS2018/ddos_portmap_2018_NonCollinear.csv")
portmap_ranking.head(20)

In [None]:
# Rank the LDAP columns over 10 splits, save the ranking, and preview it.
# NOTE(review): 0.00005 looks carried over from a variance-threshold notebook;
# SelectNonCollinear expects a correlation threshold -- confirm the intended value.
ldap_ranking = stable_NonCollinear(ddos_ldap_2019_df, 10, 0.00005, verbose=False)
ldap_ranking.to_csv("../ranking/CIC_DDoS2019/ddos_ldap_2019_NonCollinear.csv")
ldap_ranking.head(20)

In [None]:
# Rank the NetBIOS columns over 10 splits, save the ranking, and preview it.
# NOTE(review): 0.00005 looks carried over from a variance-threshold notebook;
# SelectNonCollinear expects a correlation threshold -- confirm the intended value.
netbios_ranking = stable_NonCollinear(ddos_netbios_2019_df, 10, 0.00005, verbose=False)
netbios_ranking.to_csv("../ranking/CIC_DDoS2019/ddos_netbios_2019_NonCollinear.csv")
netbios_ranking.head(20)

In [None]:
# Rank the SYN columns over 10 splits, save the ranking, and preview it.
# NOTE(review): 0.00005 looks carried over from a variance-threshold notebook;
# SelectNonCollinear expects a correlation threshold -- confirm the intended value.
syn_ranking = stable_NonCollinear(ddos_syn_2019_df, 10, 0.00005, verbose=False)
syn_ranking.to_csv("../ranking/CIC_DDoS2019/ddos_syn_2019_NonCollinear.csv")
syn_ranking.head(20)

In [None]:
# Rank the UDP columns over 10 splits, save the ranking, and preview it.
# NOTE(review): 0.00005 looks carried over from a variance-threshold notebook;
# SelectNonCollinear expects a correlation threshold -- confirm the intended value.
udp_ranking = stable_NonCollinear(ddos_udp_2019_df, 10, 0.00005, verbose=False)
udp_ranking.to_csv("../ranking/CIC_DDoS2019/ddos_udp_2019_NonCollinear.csv")
udp_ranking.head(20)