# 1. Imports

In [8]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# 2. Importing the Datasets

In [36]:
# Portmap training data from the CSE_CIC_IDS2018 folder; the CSV's first column becomes the row index.
ddos_portmap_2018_df = pd.read_csv("../data/train_test/CSE_CIC_IDS2018/ddos_portmap_2018_train.csv", index_col=0)

In [37]:
# LDAP training data from the CIC_DDoS2019 folder; the CSV's first column becomes the row index.
ddos_ldap_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_ldap_2019_train.csv", index_col=0)

In [38]:
# NetBIOS training data from the CIC_DDoS2019 folder; the CSV's first column becomes the row index.
ddos_netbios_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_netbios_2019_train.csv", index_col=0)

In [39]:
# SYN training data from the CIC_DDoS2019 folder; the CSV's first column becomes the row index.
ddos_syn_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_syn_2019_train.csv", index_col=0)

In [40]:
# UDP training data from the CIC_DDoS2019 folder; the CSV's first column becomes the row index.
ddos_udp_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_udp_2019_train.csv", index_col=0)

In [41]:
# Name of the target column: the selection functions in this notebook drop it from X and use it as y.
target_feature = 'Label'

# 3. Feature Selection Method

In [57]:
def SelectKBest_Helper(X_train, y_train, k, features):
    """Return the names of the ``k`` features with the best chi-squared scores.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Feature matrix. scikit-learn's ``chi2`` requires non-negative values.
    y_train : array-like of shape (n_samples,)
        Class labels.
    k : int or "all"
        Number of top-scoring features to keep (forwarded to ``SelectKBest``).
    features : sequence
        Names of the columns of ``X_train``, in column order.

    Returns
    -------
    list
        Names of the selected features, in column order.
    """
    selector = SelectKBest(score_func=chi2, k=k)
    # Only the fitted support mask is needed, so the transformed matrix is
    # never materialised.
    selector.fit(X_train, y_train)
    support_mask = selector.get_support()

    column_names = list(features)
    # Guard against misaligned names: if the caller passed a name list whose
    # length differs from the number of fitted columns (e.g. it still contains
    # the target column), zipping would silently attach names to the wrong
    # mask entries. When X_train carries its own labels, trust those instead.
    if len(column_names) != len(support_mask) and hasattr(X_train, "columns"):
        column_names = list(X_train.columns)

    return [name for name, keep in zip(column_names, support_mask) if keep]

In [58]:
def stable_SelectKBest(train_df, num_splits, k, verbose):
    """Rank features by how consistently SelectKBest picks them across row chunks.

    The frame is cut into ``num_splits`` consecutive row chunks (sized the same
    way ``np.array_split`` sizes them), the top-``k`` chi-squared features are
    selected on each chunk, and every column is scored with the fraction of
    chunks in which it was selected.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data containing the ``target_feature`` column.
    num_splits : int or None
        Number of row chunks. ``None`` is treated as a single chunk (the whole
        frame).
    k : int or "all"
        Number of features selected per chunk.
    verbose : bool
        When true, print the selection count of every column.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``rank``; one row per column of ``train_df``
        (the target column is listed too and is never selected).

    Raises
    ------
    ValueError
        If ``num_splits`` is smaller than 1.
    """
    if num_splits is None:
        # Previously this path left the chunk list undefined (NameError).
        num_splits = 1
    if num_splits < 1:
        raise ValueError("num_splits must be a positive integer")

    # Chunk sizes follow np.array_split: the first `extra` chunks get one more
    # row. Slicing with .iloc avoids calling np.array_split on a DataFrame,
    # which depends on the deprecated DataFrame.swapaxes.
    base_size, extra = divmod(len(train_df), num_splits)

    selection_counts = {}
    start = 0
    for chunk_idx in range(num_splits):
        stop = start + base_size + (1 if chunk_idx < extra else 0)
        small_df = train_df.iloc[start:stop]
        start = stop

        X_train = small_df.drop([target_feature], axis=1)
        y_train = small_df[target_feature]

        # Names must come from X_train (target already removed) so that they
        # line up one-to-one with the selector's support mask.
        features = X_train.columns.tolist()

        # NOTE (Aryan, Pranav, Anisha): this is the line to change after
        # implementing your own feature selection method.
        chosen = SelectKBest_Helper(X_train, y_train, k, features)
        for name in chosen:
            selection_counts[name] = selection_counts.get(name, 0) + 1

    features = train_df.columns.tolist()
    ranks = []
    for feature in features:
        count = selection_counts.get(feature, 0)
        if verbose:
            print("Feature: "+feature+". Count: "+str(count)+"/"+str(num_splits))
        ranks.append(count/num_splits)

    return pd.DataFrame({'feature': features, 'rank': ranks})

In [59]:
# Stability ranking for the Portmap data: 10 row chunks, top 40 chi2 features per chunk.
portmap_ranking = stable_SelectKBest(ddos_portmap_2018_df, 10, 40, verbose=False)
# Persist the ranking, then preview its first 20 rows.
portmap_ranking.to_csv("../ranking/CSE_CIC_IDS2018/ddos_portmap_2018_SelectKBest.csv")
portmap_ranking.head(20)

Unnamed: 0,feature,rank
0,DestinationPort,1.0
1,FlowDuration,0.0
2,TotalFwdPackets,0.0
3,TotalBackwardPackets,0.0
4,TotalLengthofFwdPackets,1.0
5,TotalLengthofBwdPackets,0.0
6,FwdPacketLengthMax,1.0
7,FwdPacketLengthMin,1.0
8,FwdPacketLengthMean,1.0
9,FwdPacketLengthStd,1.0


In [60]:
# Stability ranking for the LDAP data: 10 row chunks, top 40 chi2 features per chunk.
ldap_ranking = stable_SelectKBest(ddos_ldap_2019_df, 10, 40, verbose=False)
# Persist the ranking, then preview its first 20 rows.
ldap_ranking.to_csv("../ranking/CIC_DDoS2019/ddos_ldap_2019_SelectKBest.csv")
ldap_ranking.head(20)

Unnamed: 0,feature,rank
0,FlowID,0.0
1,SourceIP,1.0
2,SourcePort,1.0
3,DestinationIP,0.0
4,DestinationPort,0.0
5,Protocol,0.0
6,FlowDuration,1.0
7,TotalFwdPackets,0.0
8,TotalBackwardPackets,0.0
9,TotalLengthofFwdPackets,0.0


In [61]:
# Stability ranking for the NetBIOS data: 10 row chunks, top 40 chi2 features per chunk.
netbios_ranking = stable_SelectKBest(ddos_netbios_2019_df, 10, 40, verbose=False)
# Persist the ranking, then preview its first 20 rows.
netbios_ranking.to_csv("../ranking/CIC_DDoS2019/ddos_netbios_2019_SelectKBest.csv")
netbios_ranking.head(20)

Unnamed: 0,feature,rank
0,FlowID,0.3
1,SourceIP,1.0
2,SourcePort,1.0
3,DestinationIP,0.0
4,DestinationPort,0.0
5,Protocol,0.0
6,FlowDuration,1.0
7,TotalFwdPackets,0.0
8,TotalBackwardPackets,1.0
9,TotalLengthofFwdPackets,0.0


In [62]:
# Stability ranking for the SYN data: 10 row chunks, top 40 chi2 features per chunk.
syn_ranking = stable_SelectKBest(ddos_syn_2019_df, 10, 40, verbose=False)
# Persist the ranking, then preview its first 20 rows.
syn_ranking.to_csv("../ranking/CIC_DDoS2019/ddos_syn_2019_SelectKBest.csv")
syn_ranking.head(20)

Unnamed: 0,feature,rank
0,FlowID,1.0
1,SourceIP,1.0
2,SourcePort,0.0
3,DestinationIP,0.3
4,DestinationPort,0.6
5,Protocol,0.6
6,FlowDuration,0.0
7,TotalFwdPackets,0.0
8,TotalBackwardPackets,0.3
9,TotalLengthofFwdPackets,1.0


In [63]:
# Stability ranking for the UDP data: 10 row chunks, top 40 chi2 features per chunk.
udp_ranking = stable_SelectKBest(ddos_udp_2019_df, 10, 40, verbose=False)
# Persist the ranking, then preview its first 20 rows.
udp_ranking.to_csv("../ranking/CIC_DDoS2019/ddos_udp_2019_SelectKBest.csv")
udp_ranking.head(20)

Unnamed: 0,feature,rank
0,FlowID,1.0
1,SourceIP,1.0
2,SourcePort,0.8
3,DestinationIP,0.0
4,DestinationPort,1.0
5,Protocol,1.0
6,FlowDuration,1.0
7,TotalFwdPackets,0.0
8,TotalBackwardPackets,0.0
9,TotalLengthofFwdPackets,0.0


# 4. Baselines

In [64]:
def baseline_SelectKBest(train_df, k, verbose, num_splits=20):
    """Baseline ranking: run SelectKBest once on the first chunk of the data.

    Only the first of ``num_splits`` consecutive row chunks is used (sized the
    way ``np.array_split`` would size it). Every column receives rank 1 if it
    is among the top-``k`` chi-squared features on that chunk, else 0.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data containing the ``target_feature`` column.
    k : int or "all"
        Number of features to select.
    verbose : bool
        When true, print the selection count of every column.
    num_splits : int, default 20
        Number of chunks the frame is notionally divided into; the baseline is
        computed on the first one only.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``rank`` (integer counts); one row per column
        of ``train_df``, the target column included.

    Raises
    ------
    ValueError
        If ``num_splits`` is smaller than 1.
    """
    if num_splits < 1:
        raise ValueError("num_splits must be a positive integer")

    # Size of the first chunk np.array_split would produce. Slicing with .iloc
    # avoids np.array_split on a DataFrame (deprecated DataFrame.swapaxes) and
    # leaves the caller's frame untouched.
    base_size, extra = divmod(len(train_df), num_splits)
    subset_df = train_df.iloc[:base_size + (1 if extra else 0)]

    X_train = subset_df.drop([target_feature], axis=1)
    y_train = subset_df[target_feature]

    # Names come from X_train (target already removed) so that they line up
    # one-to-one with the selector's support mask.
    selectedFeatures = SelectKBest_Helper(X_train, y_train, k, X_train.columns.tolist())

    features = subset_df.columns.tolist()
    ranks = []
    for feature in features:
        count = selectedFeatures.count(feature)
        if verbose:
            print("Feature: "+feature+". Count: "+str(count))
        ranks.append(count)

    return pd.DataFrame({'feature': features, 'rank': ranks})

In [65]:
# Baseline for the Portmap data: a single SelectKBest pass (k=40) on one slice of the frame.
portmap_ranking_baseline = baseline_SelectKBest(ddos_portmap_2018_df, 40, verbose=False)
# Persist the baseline ranking, then preview its first 5 rows.
portmap_ranking_baseline.to_csv("../baseline_ranking/CSE_CIC_IDS2018/ddos_portmap_2018_SelectKBest.csv")
portmap_ranking_baseline.head(5)

Unnamed: 0,feature,rank
0,DestinationPort,1
1,FlowDuration,0
2,TotalFwdPackets,0
3,TotalBackwardPackets,0
4,TotalLengthofFwdPackets,1


In [66]:
# Baseline for the LDAP data: a single SelectKBest pass (k=40) on one slice of the frame.
ldap_ranking_baseline = baseline_SelectKBest(ddos_ldap_2019_df, 40, verbose=False)
# Persist the baseline ranking, then preview its first 5 rows.
ldap_ranking_baseline.to_csv("../baseline_ranking/CIC_DDoS2019/ddos_ldap_2019_SelectKBest.csv")
ldap_ranking_baseline.head(5)

Unnamed: 0,feature,rank
0,FlowID,0
1,SourceIP,1
2,SourcePort,1
3,DestinationIP,0
4,DestinationPort,0


In [67]:
# Baseline for the UDP data: a single SelectKBest pass (k=40) on one slice of the frame.
udp_ranking_baseline = baseline_SelectKBest(ddos_udp_2019_df, 40, verbose=False)
# Persist the baseline ranking, then preview its first 5 rows.
udp_ranking_baseline.to_csv("../baseline_ranking/CIC_DDoS2019/ddos_udp_2019_SelectKBest.csv")
udp_ranking_baseline.head(5)

Unnamed: 0,feature,rank
0,FlowID,1
1,SourceIP,1
2,SourcePort,1
3,DestinationIP,0
4,DestinationPort,1


In [70]:
# Baseline for the NetBIOS data: a single SelectKBest pass (k=40) on one slice of the frame.
netbios_ranking_baseline = baseline_SelectKBest(ddos_netbios_2019_df, 40, verbose=False)
# Persist the baseline ranking, then preview its first 5 rows.
netbios_ranking_baseline.to_csv("../baseline_ranking/CIC_DDoS2019/ddos_netbios_2019_SelectKBest.csv")
netbios_ranking_baseline.head(5)

Unnamed: 0,feature,rank
0,FlowID,0
1,SourceIP,1
2,SourcePort,1
3,DestinationIP,0
4,DestinationPort,0


In [71]:
# Baseline for the SYN data: a single SelectKBest pass (k=40) on one slice of the frame.
syn_ranking_baseline = baseline_SelectKBest(ddos_syn_2019_df, 40, verbose=False)
# Persist the baseline ranking, then preview its first 5 rows.
syn_ranking_baseline.to_csv("../baseline_ranking/CIC_DDoS2019/ddos_syn_2019_SelectKBest.csv")
syn_ranking_baseline.head(5)

Unnamed: 0,feature,rank
0,FlowID,1
1,SourceIP,1
2,SourcePort,0
3,DestinationIP,0
4,DestinationPort,1
