# 1. Imports

In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np

# 2. Importing the Datasets

In [2]:
# Load ddos_portmap_2018_train.csv; index_col=0 uses the first CSV column as the row index.
ddos_portmap_2018_df = pd.read_csv("../data/train_test/CSE_CIC_IDS2018/ddos_portmap_2018_train.csv", index_col=0)

In [3]:
# Load ddos_ldap_2019_train.csv; index_col=0 uses the first CSV column as the row index.
ddos_ldap_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_ldap_2019_train.csv", index_col=0)

In [4]:
# Load ddos_netbios_2019_train.csv; index_col=0 uses the first CSV column as the row index.
ddos_netbios_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_netbios_2019_train.csv", index_col=0)

In [5]:
# Load ddos_syn_2019_train.csv; index_col=0 uses the first CSV column as the row index.
ddos_syn_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_syn_2019_train.csv", index_col=0)

In [6]:
# Load ddos_udp_2019_train.csv; index_col=0 uses the first CSV column as the row index.
ddos_udp_2019_df = pd.read_csv("../data/train_test/CIC_DDoS2019/ddos_udp_2019_train.csv", index_col=0)

In [2]:
# Name of the label column. The ranking functions below read this as a global
# to separate the target from the feature columns.
target_feature = 'Label'

# 3. Feature Selection Method

In [3]:
from sklearn.feature_selection import mutual_info_classif

def MutualInformation_Helper(X_train, y_train, threshold, random_state=None):
    """Return the features whose mutual information with the target exceeds a threshold.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix, one column per candidate feature. It is not modified.
    y_train : array-like
        Target labels aligned with the rows of ``X_train``.
    threshold : float
        A feature is kept only when its estimated mutual information is
        strictly greater than this value.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``mutual_info_classif``. Pass an int to make the
        estimate repeatable between runs; ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    list
        Column labels of the selected features, in the column order of
        ``X_train``.
    """
    # One mutual-information score per column, in column order.
    mi_scores = mutual_info_classif(X_train, y_train, random_state=random_state)

    # Boolean mask over the columns: True where the score beats the threshold.
    selected_features_mask = mi_scores > threshold

    # Only the labels are needed, so index the column Index directly instead
    # of copying and slicing the whole frame.
    return X_train.columns[selected_features_mask].tolist()

In [4]:
def stable_MutualInformation(train_df, num_splits, threshold, verbose):
    """Rank columns by how often mutual-information selection picks them across row chunks.

    ``train_df`` is cut into ``num_splits`` consecutive row chunks (the same
    partition ``np.array_split`` produces: chunk sizes differ by at most one).
    ``MutualInformation_Helper`` is run on every chunk, and a column's rank is
    the number of chunks in which it was selected divided by ``num_splits``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data containing the feature columns and the column named by
        the global ``target_feature``.
    num_splits : int
        Number of row chunks to evaluate.
    threshold : float
        Mutual-information threshold forwarded to ``MutualInformation_Helper``.
    verbose : bool
        When true, print the selection count of every column.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'feature'`` (every column of ``train_df``, in order,
        including the target column, which is dropped before selection and
        so is never counted) and ``'rank'`` (selection count / ``num_splits``).

    Raises
    ------
    ValueError
        Raised by ``np.array_split`` when ``num_splits`` is not positive.
    """
    from collections import Counter

    # Partition row *positions* rather than the frame itself: calling
    # np.array_split directly on a DataFrame goes through DataFrame.swapaxes,
    # which recent pandas releases deprecate.
    row_chunks = np.array_split(np.arange(len(train_df)), num_splits)

    # Number of chunks in which each feature was selected.
    selection_counts = Counter()
    for rows in row_chunks:
        small_df = train_df.iloc[rows]
        X_train = small_df.drop([target_feature], axis=1)
        y_train = small_df[target_feature]
        selection_counts.update(MutualInformation_Helper(X_train, y_train, threshold))

    features = train_df.columns.tolist()
    ranks = []
    for feature in features:
        count = selection_counts[feature]
        if verbose:
            print(f"Feature: {feature}. Count: {count}/{num_splits}")
        ranks.append(count / num_splits)

    return pd.DataFrame({'feature': features, 'rank': ranks})

In [21]:
# NOTE(review): the result is stored as `portmap_ranking`, but the input frame
# is the SYN dataset (ddos_syn_2019_df) -- confirm which dataset was intended.
portmap_ranking = stable_MutualInformation(
    train_df=ddos_syn_2019_df,
    num_splits=10,
    threshold=0.004,
    verbose=False,
)

In [22]:
# Preview the first 20 rows of the ranking computed in the previous cell.
portmap_ranking.head(20)

Unnamed: 0,feature,rank
0,FlowID,1.0
1,SourceIP,1.0
2,SourcePort,1.0
3,DestinationIP,1.0
4,DestinationPort,1.0
5,Protocol,1.0
6,FlowDuration,1.0
7,TotalFwdPackets,0.7
8,TotalBackwardPackets,0.8
9,TotalLengthofFwdPackets,1.0


In [26]:
# NOTE(review): this writes to the same CSV path as the section-4 portmap cell,
# which uses threshold 0.004; whichever cell runs last determines the file on disk.
portmap_ranking = stable_MutualInformation(
    train_df=ddos_portmap_2018_df,
    num_splits=10,
    threshold=0.1,
    verbose=False,
)
portmap_ranking.to_csv(
    "../ranking/CSE_CIC_IDS2018/ddos_portmap_2018_MutualInformation.csv"
)
portmap_ranking.head(n=20)

Unnamed: 0,feature,rank
0,DestinationPort,1.0
1,FlowDuration,1.0
2,TotalFwdPackets,1.0
3,TotalBackwardPackets,1.0
4,TotalLengthofFwdPackets,1.0
5,TotalLengthofBwdPackets,1.0
6,FwdPacketLengthMax,1.0
7,FwdPacketLengthMin,1.0
8,FwdPacketLengthMean,1.0
9,FwdPacketLengthStd,1.0


# 4. Saving the Ranking

Code to save a CSV file with two columns:
- 'feature': every column of the original dataset
- 'rank': the rank of that feature (in the range 0 to 1)

In [27]:
# Portmap 2018: rank over 10 row chunks with threshold 0.004, save, then preview.
portmap_ranking = stable_MutualInformation(
    train_df=ddos_portmap_2018_df,
    num_splits=10,
    threshold=0.004,
    verbose=False,
)
portmap_ranking.to_csv(
    "../ranking/CSE_CIC_IDS2018/ddos_portmap_2018_MutualInformation.csv"
)
portmap_ranking.head(n=20)

Unnamed: 0,feature,rank
0,DestinationPort,1.0
1,FlowDuration,1.0
2,TotalFwdPackets,1.0
3,TotalBackwardPackets,1.0
4,TotalLengthofFwdPackets,1.0
5,TotalLengthofBwdPackets,1.0
6,FwdPacketLengthMax,1.0
7,FwdPacketLengthMin,1.0
8,FwdPacketLengthMean,1.0
9,FwdPacketLengthStd,1.0


In [20]:
# LDAP 2019: rank over 10 row chunks with threshold 0.05, save, then preview.
# NOTE(review): the stored output of this cell lists an 'Unnamed: 0' feature,
# which suggests it was produced from a frame loaded without index_col=0 --
# re-run after the load cell to refresh it.
ldap_ranking = stable_MutualInformation(
    train_df=ddos_ldap_2019_df,
    num_splits=10,
    threshold=0.05,
    verbose=False,
)
ldap_ranking.to_csv(
    "../ranking/CIC_DDoS2019/ddos_ldap_2019_MutualInformation.csv"
)
ldap_ranking.head(n=20)

Unnamed: 0,feature,rank
0,Unnamed: 0,1.0
1,FlowID,1.0
2,SourceIP,1.0
3,SourcePort,1.0
4,DestinationIP,1.0
5,DestinationPort,1.0
6,Protocol,1.0
7,FlowDuration,1.0
8,TotalFwdPackets,1.0
9,TotalBackwardPackets,1.0


In [None]:
# NetBIOS 2019: rank over 10 row chunks with threshold 0.009, save, then preview.
netbios_ranking = stable_MutualInformation(
    train_df=ddos_netbios_2019_df,
    num_splits=10,
    threshold=0.009,
    verbose=False,
)
netbios_ranking.to_csv(
    "../ranking/CIC_DDoS2019/ddos_netbios_2019_MutualInformation.csv"
)
netbios_ranking.head(n=20)

In [None]:
# SYN 2019: rank over 10 row chunks with threshold 0.008, save, then preview.
syn_ranking = stable_MutualInformation(
    train_df=ddos_syn_2019_df,
    num_splits=10,
    threshold=0.008,
    verbose=False,
)
syn_ranking.to_csv(
    "../ranking/CIC_DDoS2019/ddos_syn_2019_MutualInformation.csv"
)
syn_ranking.head(n=20)

In [31]:
# UDP 2019: rank over 10 row chunks with threshold 0.08, save, then preview.
udp_ranking = stable_MutualInformation(
    train_df=ddos_udp_2019_df,
    num_splits=10,
    threshold=0.08,
    verbose=False,
)
udp_ranking.to_csv(
    "../ranking/CIC_DDoS2019/ddos_udp_2019_MutualInformation.csv"
)
udp_ranking.head(n=20)

Unnamed: 0,feature,rank
0,FlowID,1.0
1,SourceIP,1.0
2,SourcePort,1.0
3,DestinationIP,1.0
4,DestinationPort,1.0
5,Protocol,1.0
6,FlowDuration,1.0
7,TotalFwdPackets,1.0
8,TotalBackwardPackets,1.0
9,TotalLengthofFwdPackets,1.0


# 5. Baselines

In [9]:
def baseline_MutualInformation(train_df, threshold, verbose):
    """Run mutual-information selection once on the whole frame and report it per column.

    Unlike ``stable_MutualInformation`` there is no chunking: a single call to
    ``MutualInformation_Helper`` decides which features are selected.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data containing the feature columns and the column named by
        the global ``target_feature``.
    threshold : float
        Mutual-information threshold forwarded to ``MutualInformation_Helper``.
    verbose : bool
        When true, print the selection count of every column.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'feature'`` (every column of ``train_df``, in order,
        including the target column) and ``'rank'`` (how many times the
        column appears in the helper's selection; 0 for unselected columns).
    """
    from collections import Counter

    X_train = train_df.drop([target_feature], axis=1)
    y_train = train_df[target_feature]

    # Count occurrences once instead of rescanning the selection list for
    # every column.
    selection_counts = Counter(MutualInformation_Helper(X_train, y_train, threshold))

    features = train_df.columns.tolist()
    ranks = []
    for feature in features:
        count = selection_counts[feature]
        if verbose:
            print(f"Feature: {feature}. Count: {count}")
        ranks.append(count)

    return pd.DataFrame({'feature': features, 'rank': ranks})

In [10]:
# Portmap 2018 baseline: single-pass selection with threshold 0.004, save, then preview.
portmap_ranking_baseline = baseline_MutualInformation(
    train_df=ddos_portmap_2018_df,
    threshold=0.004,
    verbose=False,
)
portmap_ranking_baseline.to_csv(
    "../baseline_ranking/CSE_CIC_IDS2018/ddos_portmap_2018_MutualInformation.csv"
)
portmap_ranking_baseline.head(n=5)

Unnamed: 0,feature,rank
0,DestinationPort,1
1,FlowDuration,1
2,TotalFwdPackets,1
3,TotalBackwardPackets,1
4,TotalLengthofFwdPackets,1


In [15]:
# LDAP 2019 baseline: single-pass selection with threshold 0.05, save, then preview.
ldap_ranking_baseline = baseline_MutualInformation(
    train_df=ddos_ldap_2019_df,
    threshold=0.05,
    verbose=False,
)
ldap_ranking_baseline.to_csv(
    "../baseline_ranking/CIC_DDoS2019/ddos_ldap_2019_MutualInformation.csv"
)
ldap_ranking_baseline.head(n=5)

Unnamed: 0,feature,rank
0,FlowID,1
1,SourceIP,1
2,SourcePort,1
3,DestinationIP,1
4,DestinationPort,1


In [14]:
# UDP 2019 baseline: single-pass selection with threshold 0.08, save, then preview.
udp_ranking_baseline = baseline_MutualInformation(
    train_df=ddos_udp_2019_df,
    threshold=0.08,
    verbose=False,
)
udp_ranking_baseline.to_csv(
    "../baseline_ranking/CIC_DDoS2019/ddos_udp_2019_MutualInformation.csv"
)
udp_ranking_baseline.head(n=5)

Unnamed: 0,feature,rank
0,FlowID,1
1,SourceIP,1
2,SourcePort,1
3,DestinationIP,1
4,DestinationPort,1


In [16]:
# NetBIOS 2019 baseline: single-pass selection with threshold 0.009, save, then preview.
netbios_ranking_baseline = baseline_MutualInformation(
    train_df=ddos_netbios_2019_df,
    threshold=0.009,
    verbose=False,
)
netbios_ranking_baseline.to_csv(
    "../baseline_ranking/CIC_DDoS2019/ddos_netbios_2019_MutualInformation.csv"
)
netbios_ranking_baseline.head(n=5)

Unnamed: 0,feature,rank
0,FlowID,1
1,SourceIP,1
2,SourcePort,1
3,DestinationIP,1
4,DestinationPort,1


In [17]:
# SYN 2019 baseline: single-pass selection with threshold 0.008, save, then preview.
syn_ranking_baseline = baseline_MutualInformation(
    train_df=ddos_syn_2019_df,
    threshold=0.008,
    verbose=False,
)
syn_ranking_baseline.to_csv(
    "../baseline_ranking/CIC_DDoS2019/ddos_syn_2019_MutualInformation.csv"
)
syn_ranking_baseline.head(n=5)

Unnamed: 0,feature,rank
0,FlowID,1
1,SourceIP,1
2,SourcePort,1
3,DestinationIP,1
4,DestinationPort,1


# 6. Repeating with Assignment Datasets

In [14]:
# Load both assignment training sets; the first CSV column is used as the row index.
a2_train = pd.read_csv(
    "../data/csi5388_assignment2_3_data/assignment2_train.csv",
    index_col=0,
)
a3_train = pd.read_csv(
    "../data/csi5388_assignment2_3_data/assignment3_train.csv",
    index_col=0,
)

# Chunked rankings (10 row chunks each).
# Only the last expression of a cell is rendered, so the intermediate
# .head() calls in this cell produce no visible output.
a2ranking = stable_MutualInformation(
    train_df=a2_train, num_splits=10, threshold=0.008, verbose=False
)
a2ranking.to_csv(
    "../ranking/csi5388_assignment2_3_data/assignment2_MutualInformation.csv"
)
a2ranking.head(n=20)

a3ranking = stable_MutualInformation(
    train_df=a3_train, num_splits=10, threshold=0.08, verbose=False
)
a3ranking.to_csv(
    "../ranking/csi5388_assignment2_3_data/assignment3_MutualInformation.csv"
)
a3ranking.head(n=10)

# Baselines: single-pass selection with the same thresholds.

a2_baseline = baseline_MutualInformation(
    train_df=a2_train, threshold=0.008, verbose=False
)
a2_baseline.to_csv(
    "../baseline_ranking/csi5388_assignment2_3_data/assignment2_MutualInformation.csv"
)
a2_baseline.head(n=5)

a3_baseline = baseline_MutualInformation(
    train_df=a3_train, threshold=0.08, verbose=False
)
a3_baseline.to_csv(
    "../baseline_ranking/csi5388_assignment2_3_data/assignment3_MutualInformation.csv"
)
a3_baseline.head(n=5)

Unnamed: 0,feature,rank
0,length_longest_word,1
1,FQDN_count,1
2,sld_vec,1
3,private_192,0
4,lower,1
