# 1. Imports

In [80]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.linear_model import LinearRegression

# 2. Importing the Datasets

In [81]:
# Portmap DDoS training split (CSE_CIC_IDS2018 folder); the first CSV column is used as the index.
ddos_portmap_2018_df = pd.read_csv("../data/train_test/CSE_CIC_IDS2018/ddos_portmap_2018_train.csv", index_col=0)

In [82]:
# LDAP DDoS training split (CIC_DDoS2019 folder); the first CSV column is used as the index.
ddos_ldap_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_ldap_2019_train.csv", index_col=0)

In [83]:
# NetBIOS DDoS training split (CIC_DDoS2019 folder); the first CSV column is used as the index.
ddos_netbios_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_netbios_2019_train.csv", index_col=0)

In [84]:
# SYN DDoS training split (CIC_DDoS2019 folder); the first CSV column is used as the index.
ddos_syn_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_syn_2019_train.csv", index_col=0)

In [85]:
# UDP DDoS training split (CIC_DDoS2019 folder); the first CSV column is used as the index.
ddos_udp_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_udp_2019_train.csv", index_col=0)

In [86]:
# Name of the label column; the functions below read this global and drop the
# column from the feature matrix before running the VIF selection.
target_feature = 'Label'

# 3. Feature Selection - Variance Inflation Factor

### Helper Functions

In [87]:
def VIF_Helper(X_train, threshold):
    """Return the features whose variance inflation factor is below ``threshold``.

    Each column of ``X_train`` is regressed (ordinary least squares) on all the
    remaining columns; the in-sample R^2 of that fit gives
    ``VIF = 1 / (1 - R^2)``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix without the target column.
        NOTE(review): the regression presumably needs numeric, finite values
        in every column -- confirm against the upstream cleaning step.
    threshold : float
        Exclusive upper bound; a feature is kept only when ``VIF < threshold``.

    Returns
    -------
    list
        Names of the kept features, in the column order of ``X_train``.
        An empty frame yields an empty list.
    """
    features = X_train.columns.tolist()

    # A lone feature has no other predictors to be collinear with: R^2 = 0 and
    # therefore VIF = 1. Fitting a regression on zero predictors would fail.
    if len(features) == 1:
        return list(features) if 1.0 < threshold else []

    selected_features = []

    for feature in features:
        predictors = [f for f in features if f != feature]
        X, y = X_train[predictors], X_train[feature]
        r2 = LinearRegression().fit(X, y).score(X, y)

        if r2 >= 1.0:
            # Perfect collinearity: the VIF is unbounded, so the feature must
            # be rejected for every finite threshold (a fixed stand-in value
            # would let it through whenever the threshold is larger).
            vif = float("inf")
        else:
            vif = 1 / (1 - r2)

        if vif < threshold:
            selected_features.append(feature)

    return selected_features

In [62]:
def stable_VIF(train_df, num_splits, threshold, verbose):
    """Rank features by how often VIF selection keeps them across row-wise splits.

    ``train_df`` is cut into ``num_splits`` consecutive row chunks of nearly
    equal size. ``VIF_Helper`` is run on every chunk (with the global
    ``target_feature`` column removed) and each feature's rank is the fraction
    of chunks in which it was selected.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data including the ``target_feature`` column.
    num_splits : int
        Number of row chunks to evaluate.
    threshold : float
        VIF cut-off forwarded to ``VIF_Helper``.
    verbose : bool
        When true, print the selection count of every feature.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``rank``, one row per column of ``train_df``.
        The target column is listed as well; it is never passed to
        ``VIF_Helper``, so its count stays 0.
    """
    # Split row positions rather than the DataFrame itself: np.array_split on a
    # DataFrame relies on the deprecated DataFrame.swapaxes. The chunk
    # boundaries are the same as before.
    row_chunks = np.array_split(np.arange(len(train_df)), num_splits)

    selectedFeatures = []
    for rows in row_chunks:
        small_df = train_df.iloc[rows]
        X_train = small_df.drop([target_feature], axis=1)

        # Extension point: swap in another feature-selection method here.
        selectedFeatures = selectedFeatures + VIF_Helper(X_train, threshold)

    features = train_df.columns.tolist()
    ranks = []

    for feature in features:
        count = selectedFeatures.count(feature)
        if verbose:
            print("Feature: "+feature+". Count: "+str(count)+"/"+str(num_splits))
        ranks.append(count/num_splits)

    return pd.DataFrame({'feature': features, 'rank': ranks})

### Investigating a Good Choice of Threshold

In [63]:
# Separate the Portmap frame into the label vector and the feature matrix.
y_train_portmap = ddos_portmap_2018_df[target_feature]
X_train_portmap = ddos_portmap_2018_df.drop(columns=[target_feature])

In [64]:
# Number of candidate features in the Portmap feature matrix.
len(X_train_portmap.columns)

78

- VIF = 1 — the feature is not correlated with the other features
- 1 < VIF < 5 — features are moderately correlated
- VIF > 5 — features are highly correlated
- VIF > 10 — the correlation between features is high enough to be a cause for concern

In [65]:
# Features of the full Portmap matrix that pass a VIF cut-off of 10.
t = 10.0
features = VIF_Helper(X_train_portmap, t)
print("NonCollinear Threshold ", t, ": ", ". Features: (", len(features), ") ", features, sep="")
print("_______________________________________________________________________________")

NonCollinear Threshold 10.0: . Features: (10) ['DestinationPort', 'BwdPacketLengthMin', 'FlowBytes/s', 'MinPacketLength', 'FINFlagCount', 'URGFlagCount', 'Down/UpRatio', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'min_seg_size_forward']
_______________________________________________________________________________


In [66]:
# Features of the full Portmap matrix that pass a VIF cut-off of 9.
t = 9.0
features = VIF_Helper(X_train_portmap, t)
print("NonCollinear Threshold ", t, ": ", ". Features: (", len(features), ") ", features, sep="")
print("_______________________________________________________________________________")

NonCollinear Threshold 9.0: . Features: (10) ['DestinationPort', 'BwdPacketLengthMin', 'FlowBytes/s', 'MinPacketLength', 'FINFlagCount', 'URGFlagCount', 'Down/UpRatio', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'min_seg_size_forward']
_______________________________________________________________________________


In [67]:
# Features of the full Portmap matrix that pass a VIF cut-off of 5.
t = 5.0
features = VIF_Helper(X_train_portmap, t)
print("NonCollinear Threshold ", t, ": ", ". Features: (", len(features), ") ", features, sep="")
print("_______________________________________________________________________________")

NonCollinear Threshold 5.0: . Features: (7) ['DestinationPort', 'BwdPacketLengthMin', 'FlowBytes/s', 'FINFlagCount', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'min_seg_size_forward']
_______________________________________________________________________________


In [68]:
# Features of the full Portmap matrix that pass a VIF cut-off of 4.
t = 4.0
features = VIF_Helper(X_train_portmap, t)
print("NonCollinear Threshold ", t, ": ", ". Features: (", len(features), ") ", features, sep="")
print("_______________________________________________________________________________")

NonCollinear Threshold 4.0: . Features: (6) ['BwdPacketLengthMin', 'FlowBytes/s', 'FINFlagCount', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'min_seg_size_forward']
_______________________________________________________________________________


In [69]:
# Features of the full Portmap matrix that pass a VIF cut-off of 1.
# The comparison in VIF_Helper is strict (VIF < threshold).
t = 1.0
features = VIF_Helper(X_train_portmap, t)
print("NonCollinear Threshold ", t, ": ", ". Features: (", len(features), ") ", features, sep="")
print("_______________________________________________________________________________")

NonCollinear Threshold 1.0: . Features: (0) []
_______________________________________________________________________________


# 4. Saving the Rankings

In [72]:
# Stability ranking for Portmap: 10 row splits, VIF cut-off 10; saved as CSV.
portmap_ranking = stable_VIF(
    train_df=ddos_portmap_2018_df,
    num_splits=10,
    threshold=10.0,
    verbose=False,
)
portmap_ranking.to_csv("../ranking/CSE_CIC_IDS2018/ddos_portmap_2018_VIF.csv")
portmap_ranking.head(20)

Unnamed: 0,feature,rank
0,DestinationPort,1.0
1,FlowDuration,0.0
2,TotalFwdPackets,0.0
3,TotalBackwardPackets,0.0
4,TotalLengthofFwdPackets,0.0
5,TotalLengthofBwdPackets,0.0
6,FwdPacketLengthMax,0.0
7,FwdPacketLengthMin,0.0
8,FwdPacketLengthMean,0.0
9,FwdPacketLengthStd,0.0


In [75]:
# Writes the Portmap ranking to the same path that the ranking cell above
# already wrote to; re-running this cell overwrites that file.
portmap_ranking.to_csv("../ranking/CSE_CIC_IDS2018/ddos_portmap_2018_VIF.csv")

In [76]:
# Stability ranking for LDAP: 10 row splits, VIF cut-off 10; saved as CSV.
ldap_ranking = stable_VIF(
    train_df=ddos_ldap_2019_df,
    num_splits=10,
    threshold=10.0,
    verbose=False,
)
ldap_ranking.to_csv("../ranking/CIC_DDoS2019/ddos_ldap_2019_VIF.csv")
ldap_ranking.head(20)

Unnamed: 0,feature,rank
0,FlowID,1.0
1,SourceIP,1.0
2,SourcePort,0.0
3,DestinationIP,1.0
4,DestinationPort,1.0
5,Protocol,0.0
6,FlowDuration,0.0
7,TotalFwdPackets,0.0
8,TotalBackwardPackets,0.0
9,TotalLengthofFwdPackets,0.0


In [77]:
# Stability ranking for NetBIOS: 10 row splits, VIF cut-off 10; saved as CSV.
netbios_ranking = stable_VIF(
    train_df=ddos_netbios_2019_df,
    num_splits=10,
    threshold=10.0,
    verbose=False,
)
netbios_ranking.to_csv("../ranking/CIC_DDoS2019/ddos_netbios_2019_VIF.csv")
netbios_ranking.head(20)

Unnamed: 0,feature,rank
0,FlowID,1.0
1,SourceIP,0.0
2,SourcePort,1.0
3,DestinationIP,1.0
4,DestinationPort,1.0
5,Protocol,0.0
6,FlowDuration,0.0
7,TotalFwdPackets,0.0
8,TotalBackwardPackets,0.0
9,TotalLengthofFwdPackets,0.0


In [78]:
# Stability ranking for SYN: 10 row splits, VIF cut-off 10; saved as CSV.
syn_ranking = stable_VIF(
    train_df=ddos_syn_2019_df,
    num_splits=10,
    threshold=10.0,
    verbose=False,
)
syn_ranking.to_csv("../ranking/CIC_DDoS2019/ddos_syn_2019_VIF.csv")
syn_ranking.head(20)

Unnamed: 0,feature,rank
0,FlowID,1.0
1,SourceIP,0.0
2,SourcePort,1.0
3,DestinationIP,0.0
4,DestinationPort,1.0
5,Protocol,0.0
6,FlowDuration,0.0
7,TotalFwdPackets,0.0
8,TotalBackwardPackets,0.0
9,TotalLengthofFwdPackets,0.0


In [79]:
# Stability ranking for UDP: 10 row splits, VIF cut-off 10; saved as CSV.
udp_ranking = stable_VIF(
    train_df=ddos_udp_2019_df,
    num_splits=10,
    threshold=10.0,
    verbose=False,
)
udp_ranking.to_csv("../ranking/CIC_DDoS2019/ddos_udp_2019_VIF.csv")
udp_ranking.head(20)

Unnamed: 0,feature,rank
0,FlowID,1.0
1,SourceIP,1.0
2,SourcePort,1.0
3,DestinationIP,1.0
4,DestinationPort,1.0
5,Protocol,0.0
6,FlowDuration,0.0
7,TotalFwdPackets,0.0
8,TotalBackwardPackets,0.0
9,TotalLengthofFwdPackets,0.0


# 5. Baselines

In [88]:
def baseline_VIF(train_df, threshold, verbose):
    """Single-pass VIF selection over the whole training frame.

    The global ``target_feature`` column is removed, ``VIF_Helper`` is run once
    on the remaining columns, and every column of ``train_df`` is reported with
    the number of times it appears in the selection.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data including the ``target_feature`` column.
    threshold : float
        VIF cut-off forwarded to ``VIF_Helper``.
    verbose : bool
        When true, print the count of every feature.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``rank``, one row per column of ``train_df``.
    """
    predictors = train_df.drop(columns=[target_feature])
    kept = VIF_Helper(predictors, threshold)

    all_columns = train_df.columns.tolist()
    hits = [kept.count(column) for column in all_columns]

    if verbose:
        for column, hit in zip(all_columns, hits):
            print("Feature: "+column+". Count: "+str(hit))

    return pd.DataFrame({'feature': all_columns, 'rank': hits})

In [93]:
# Baseline (no splitting) ranking for Portmap with a VIF cut-off of 20; saved as CSV.
portmap_ranking_baseline = baseline_VIF(
    train_df=ddos_portmap_2018_df,
    threshold=20.0,
    verbose=False,
)
portmap_ranking_baseline.to_csv("../baseline_ranking/CSE_CIC_IDS2018/ddos_portmap_2018_VIF.csv")
portmap_ranking_baseline.head(5)

Unnamed: 0,feature,rank
0,DestinationPort,1
1,FlowDuration,0
2,TotalFwdPackets,0
3,TotalBackwardPackets,0
4,TotalLengthofFwdPackets,0


In [94]:
# Baseline (no splitting) ranking for LDAP with a VIF cut-off of 20; saved as CSV.
ldap_ranking_baseline = baseline_VIF(
    train_df=ddos_ldap_2019_df,
    threshold=20.0,
    verbose=False,
)
ldap_ranking_baseline.to_csv("../baseline_ranking/CIC_DDoS2019/ddos_ldap_2019_VIF.csv")
ldap_ranking_baseline.head(5)

Unnamed: 0,feature,rank
0,FlowID,1
1,SourceIP,1
2,SourcePort,1
3,DestinationIP,1
4,DestinationPort,1


In [95]:
# Baseline (no splitting) ranking for UDP with a VIF cut-off of 20; saved as CSV.
udp_ranking_baseline = baseline_VIF(
    train_df=ddos_udp_2019_df,
    threshold=20.0,
    verbose=False,
)
udp_ranking_baseline.to_csv("../baseline_ranking/CIC_DDoS2019/ddos_udp_2019_VIF.csv")
udp_ranking_baseline.head(5)

Unnamed: 0,feature,rank
0,FlowID,1
1,SourceIP,1
2,SourcePort,1
3,DestinationIP,1
4,DestinationPort,1


In [96]:
# Baseline (no splitting) ranking for NetBIOS with a VIF cut-off of 20; saved as CSV.
netbios_ranking_baseline = baseline_VIF(
    train_df=ddos_netbios_2019_df,
    threshold=20.0,
    verbose=False,
)
netbios_ranking_baseline.to_csv("../baseline_ranking/CIC_DDoS2019/ddos_netbios_2019_VIF.csv")
netbios_ranking_baseline.head(5)

Unnamed: 0,feature,rank
0,FlowID,1
1,SourceIP,1
2,SourcePort,1
3,DestinationIP,1
4,DestinationPort,1


In [97]:
# Baseline (no splitting) ranking for SYN with a VIF cut-off of 20; saved as CSV.
syn_ranking_baseline = baseline_VIF(
    train_df=ddos_syn_2019_df,
    threshold=20.0,
    verbose=False,
)
syn_ranking_baseline.to_csv("../baseline_ranking/CIC_DDoS2019/ddos_syn_2019_VIF.csv")
syn_ranking_baseline.head(5)

Unnamed: 0,feature,rank
0,FlowID,1
1,SourceIP,1
2,SourcePort,1
3,DestinationIP,1
4,DestinationPort,1
