In [92]:
# for reading csv & plotting
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

# for features selection
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_regression, VarianceThreshold, SelectFromModel, RFECV

labels = ['normal', 'botnet', 'botnet_spam']

#### Import Data

In [93]:
# Load the datasets
original_df = pd.read_csv('Datasets/train-train.csv')

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
df = original_df.copy()

In [None]:
print(f'{"No":<4}{"Feature":<15}{"Types":<10}{"Num Of Uniques Values":<15}{"NaN%":<20}')
print(f'{"==":<4}{"=======":<15}{"=====":<10}{"=====":<15}{"=====":<20}')
for i, a in enumerate(df):
    print(f'{str(i):<4}{a:<15}{str(df[a].dtypes):<10}{len(df[a].unique()):<15}{df[a].isna().sum() / df.shape[0] * 100:<20}')

No  Feature        Types     Num Of Uniques ValuesNaN%                
0   Dport          int64     92091          0.0                 
1   igmp           int64     2              0.0                 
2   SrcBytes       int64     59614          0.0                 
3   SrcAddr        int64     1327895        0.0                 
4   ipnip          int64     1              0.0                 
5   unas           int64     2              0.0                 
6   gre            int64     2              0.0                 
7   pim            int64     2              0.0                 
8   TotPkts        int64     9512           0.0                 
9   tcp            int64     2              0.0                 
10  rtp            int64     2              0.0                 
11  TotBytes       int64     158360         0.0                 
12  State          int64     400            0.0                 
13  llc            int64     2              0.0                 
14  ipv6-icmp      

### Preprocessing

In [None]:
df = df.drop(
    columns=[
        'StartTime', 
        'dTos', 
        'sTos', 
        'ActivityLabel', 
        'SensorId',
        'BotnetName'
    ], errors='ignore'
)

In [None]:
num_cols = list(df.select_dtypes(include='number').columns)
print(num_cols)

['Dport', 'igmp', 'SrcBytes', 'SrcAddr', 'ipnip', 'unas', 'gre', 'pim', 'TotPkts', 'tcp', 'rtp', 'TotBytes', 'State', 'llc', 'ipv6-icmp', 'Dur', 'Sport', 'ipv6', 'udp', 'icmp', 'DstAddr', 'ipx/spx', 'arp', 'rsvp', 'esp', 'rtcp', 'Dir', 'rarp', 'udt']


In [None]:
cat_cols = list(df.select_dtypes(include='object').columns)
print(cat_cols)

['Label']


In [None]:
# simplify the label column
def categorize_label(label):
    label = str(label).lower()
    if 'botnet' in label:
        if 'spam' in label:
            return 2
        else:
            return 1
    else:
        return 0

df['Label'] = df['Label'].apply(categorize_label)

In [None]:
df.head()

Unnamed: 0,Dport,igmp,SrcBytes,SrcAddr,ipnip,unas,gre,pim,TotPkts,tcp,...,DstAddr,ipx/spx,arp,rsvp,esp,rtcp,Dir,Label,rarp,udt
0,17871,0,164,496054,0,0,0,0,2,0,...,38042,0,0,0,0,0,0,0,0,0
1,31480,0,158,113373,0,0,0,0,4,0,...,72130,0,0,0,0,0,3,0,0,0
2,55624,0,25912,259676,0,0,0,0,316,0,...,115045,0,0,0,0,0,0,1,0,0
3,86375,0,853,239121,0,0,0,0,32,1,...,250690,0,0,0,0,0,0,0,0,0
4,61472,0,1594,238974,0,0,0,0,17,1,...,187705,0,0,0,0,0,0,0,0,0


In [None]:
df.shape

(15030469, 30)

In [None]:
x_original = df.drop(columns=['Label'])
y = df['Label']

#### SKB-C2

In [None]:
x = x_original.copy()
x.shape

(15030469, 29)

In [None]:
selector = SelectKBest(score_func=chi2, k='all')
x_new = selector.fit_transform(x, y)

# Get scores and p-values
scores = selector.scores_
p_values = selector.pvalues_

# Create a DataFrame for easy interpretation
feature_scores = pd.DataFrame({'Feature': x.columns, 'Chi2 Score': scores, 'P-Value': p_values})
feature_scores = feature_scores.sort_values(by='Chi2 Score', ascending=False)

print(feature_scores)

      Feature    Chi2 Score        P-Value
2    SrcBytes  2.317723e+12   0.000000e+00
11   TotBytes  8.859559e+11   0.000000e+00
3     SrcAddr  3.565977e+10   0.000000e+00
20    DstAddr  2.601352e+10   0.000000e+00
16      Sport  7.841966e+09   0.000000e+00
0       Dport  3.384967e+09   0.000000e+00
8     TotPkts  1.208021e+09   0.000000e+00
12      State  1.668538e+08   0.000000e+00
15        Dur  3.776797e+07   0.000000e+00
9         tcp  8.946061e+05   0.000000e+00
26        Dir  8.899830e+05   0.000000e+00
18        udp  2.582138e+05   0.000000e+00
19       icmp  1.005956e+04   0.000000e+00
1        igmp  7.757354e+02  3.558025e-169
10        rtp  5.777763e+02  3.447233e-126
25       rtcp  4.975488e+02  9.091687e-109
22        arp  2.089142e+02   4.313752e-46
14  ipv6-icmp  2.456972e+01   4.621174e-06
21    ipx/spx  5.057069e+00   7.977583e-02
17       ipv6  3.673531e+00   1.593319e-01
7         pim  3.578115e+00   1.671176e-01
28        udt  2.480826e+00   2.892647e-01
24        e

In [None]:
result_skb_chi2 = list(feature_scores['Feature'])
print('skb-chi2', result_skb_chi2)

skb-chi2 ['SrcBytes', 'TotBytes', 'SrcAddr', 'DstAddr', 'Sport', 'Dport', 'TotPkts', 'State', 'Dur', 'tcp', 'Dir', 'udp', 'icmp', 'igmp', 'rtp', 'rtcp', 'arp', 'ipv6-icmp', 'ipx/spx', 'ipv6', 'pim', 'udt', 'esp', 'rarp', 'unas', 'llc', 'gre', 'rsvp', 'ipnip']


#### SKB-AF

In [None]:
x = x_original.copy()

In [None]:
selector = SelectKBest(score_func=f_classif, k='all')
x_new = selector.fit_transform(x, y)

# Get scores and p-values
scores = selector.scores_
p_values = selector.pvalues_

# Create a DataFrame for easy interpretation
feature_scores = pd.DataFrame({'Feature': x.columns, 'ANOVA F-Value': scores, 'P-Value': p_values})
feature_scores = feature_scores.sort_values(by='ANOVA F-Value', ascending=False)

print(feature_scores)

  f = msb / msw


      Feature  ANOVA F-Value        P-Value
12      State  683001.922907   0.000000e+00
9         tcp  616378.113132   0.000000e+00
18        udp  594301.105690   0.000000e+00
26        Dir  580133.811052   0.000000e+00
16      Sport  442971.794988   0.000000e+00
20    DstAddr  278301.946253   0.000000e+00
0       Dport  172641.416047   0.000000e+00
3     SrcAddr   76641.078507   0.000000e+00
2    SrcBytes   10006.574964   0.000000e+00
15        Dur    8038.491132   0.000000e+00
19       icmp    5118.653437   0.000000e+00
11   TotBytes    1254.278438   0.000000e+00
8     TotPkts     623.057883  2.634000e-271
1        igmp     388.307711  2.314518e-169
10        rtp     289.134701  2.709009e-126
25       rtcp     248.955346  7.618298e-109
22        arp     104.488980   4.181462e-46
14  ipv6-icmp      12.285300   4.619194e-06
21    ipx/spx       2.528553   7.977442e-02
17       ipv6       1.836775   1.593304e-01
7         pim       1.789067   1.671161e-01
28        udt       1.240417   2

In [None]:
result_skb_af = list(feature_scores['Feature'])
print('skb-af', result_skb_af)

skb-af ['State', 'tcp', 'udp', 'Dir', 'Sport', 'DstAddr', 'Dport', 'SrcAddr', 'SrcBytes', 'Dur', 'icmp', 'TotBytes', 'TotPkts', 'igmp', 'rtp', 'rtcp', 'arp', 'ipv6-icmp', 'ipx/spx', 'ipv6', 'pim', 'udt', 'esp', 'rarp', 'unas', 'llc', 'gre', 'rsvp', 'ipnip']


#### SKB-MI

In [None]:
x = x_original.copy()

In [None]:
selector = SelectKBest(score_func=mutual_info_regression, k='all')
x_new = selector.fit_transform(x, y)

scores = selector.scores_

# Create a DataFrame for easy interpretation
feature_scores = pd.DataFrame({'Feature': x.columns, 'MI Score': scores})
feature_scores = feature_scores.sort_values(by='MI Score', ascending=False)

print(feature_scores)

      Feature  MI Score
3     SrcAddr  0.185832
20    DstAddr  0.147128
15        Dur  0.139174
11   TotBytes  0.112358
16      Sport  0.108606
0       Dport  0.101501
2    SrcBytes  0.093358
12      State  0.064125
8     TotPkts  0.041918
26        Dir  0.033127
18        udp  0.031062
9         tcp  0.030335
23       rsvp  0.000478
19       icmp  0.000410
25       rtcp  0.000351
7         pim  0.000332
14  ipv6-icmp  0.000162
21    ipx/spx  0.000090
22        arp  0.000082
17       ipv6  0.000069
4       ipnip  0.000019
1        igmp  0.000000
13        llc  0.000000
10        rtp  0.000000
6         gre  0.000000
24        esp  0.000000
5        unas  0.000000
27       rarp  0.000000
28        udt  0.000000


In [None]:
result_skb_mi = list(feature_scores['Feature'])
print('skb-mi', result_skb_mi)

skb-mi ['SrcAddr', 'DstAddr', 'Dur', 'TotBytes', 'Sport', 'Dport', 'SrcBytes', 'State', 'TotPkts', 'Dir', 'udp', 'tcp', 'rsvp', 'icmp', 'rtcp', 'pim', 'ipv6-icmp', 'ipx/spx', 'arp', 'ipv6', 'ipnip', 'igmp', 'llc', 'rtp', 'gre', 'esp', 'unas', 'rarp', 'udt']


#### Variance Threshold (VT)

In [None]:
x = x_original.copy()

In [None]:
# Example: Compute variance for all features
feature_scores = pd.DataFrame({
    'Feature': x.columns,
    'Variance': np.var(x, axis=0)  # Variance across rows (samples)
})
feature_scores = feature_scores.sort_values(by='Variance', ascending=False)

print(feature_scores['Variance'])

TotBytes     1.388817e+13
SrcBytes     1.634101e+12
SrcAddr      8.787212e+10
DstAddr      4.124137e+09
Dport        5.681540e+08
Sport        3.696682e+08
TotPkts      5.082002e+07
Dur          8.065728e+05
State        8.081372e+03
Dir          1.804884e+00
udp          1.794677e-01
tcp          1.686523e-01
icmp         1.641520e-02
igmp         1.080632e-03
rtp          8.138155e-04
rtcp         6.933758e-04
arp          2.912567e-04
ipv6-icmp    3.426256e-05
ipx/spx      7.052292e-06
ipv6         5.122901e-06
pim          4.989839e-06
udt          3.459627e-06
esp          3.393096e-06
rarp         1.264097e-06
unas         5.322519e-07
llc          4.657204e-07
gre          1.330630e-07
rsvp         6.653152e-08
ipnip        0.000000e+00
Name: Variance, dtype: float64


In [None]:
result_vt = list(feature_scores['Feature'])

In [None]:
print('vt', result_vt)

vt ['TotBytes', 'SrcBytes', 'SrcAddr', 'DstAddr', 'Dport', 'Sport', 'TotPkts', 'Dur', 'State', 'Dir', 'udp', 'tcp', 'icmp', 'igmp', 'rtp', 'rtcp', 'arp', 'ipv6-icmp', 'ipx/spx', 'ipv6', 'pim', 'udt', 'esp', 'rarp', 'unas', 'llc', 'gre', 'rsvp', 'ipnip']


#### Backward Elimination (BE)

In [None]:
x = x_original.copy()

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree Classifier model
model = DecisionTreeClassifier()
rank = []

# Backward Elimination
cols = list(x.columns)
while len(cols) > 0:
    # Fit the model
    model.fit(x[cols], y)

    # Get feature importances
    feature_importances = model.feature_importances_

    # Create a pandas Series with feature importances and corresponding columns
    imp_series = pd.Series(feature_importances, index=cols)

    # Identify the least important feature
    least_important_feature = imp_series.idxmin()

    # If the least important feature's importance is below a certain threshold (e.g., 0.01), remove it
    if imp_series.min() < 0.01:
        rank.append(least_important_feature)
        cols.remove(least_important_feature)
    else:
        break

selected_features_BE = cols
print(selected_features_BE)

x_new = x[selected_features_BE]

['Dport', 'SrcBytes', 'SrcAddr', 'TotBytes', 'State', 'Sport', 'DstAddr']


In [None]:
print(rank) #this is still reversed

['igmp', 'ipnip', 'unas', 'gre', 'pim', 'rtp', 'llc', 'ipv6-icmp', 'ipv6', 'ipx/spx', 'icmp', 'arp', 'rsvp', 'esp', 'rtcp', 'rarp', 'udt', 'udp', 'Dir', 'Dur', 'tcp', 'TotPkts']


In [None]:
# Refit the model with the selected features
model.fit(x[selected_features_BE], y)

# Get feature importances for the selected features
feature_importances = model.feature_importances_

# Create a pandas Series to rank the selected features
ranking = pd.Series(feature_importances, index=selected_features_BE)

# Sort by importance
ranking = ranking.sort_values(ascending=False)
print("Feature Ranking:\n", ranking)


Feature Ranking:
 SrcAddr     0.599155
Dport       0.147451
DstAddr     0.113137
Sport       0.078608
State       0.031414
TotBytes    0.021241
SrcBytes    0.008993
dtype: float64


In [None]:
a = list(ranking.index)
a

['SrcAddr', 'Dport', 'DstAddr', 'Sport', 'State', 'TotBytes', 'SrcBytes']

In [None]:
rank.reverse()

In [None]:
result_be = a + rank
print('be', result_be)

be ['SrcAddr', 'Dport', 'DstAddr', 'Sport', 'State', 'TotBytes', 'SrcBytes', 'TotPkts', 'tcp', 'Dur', 'Dir', 'udp', 'udt', 'rarp', 'rtcp', 'esp', 'rsvp', 'arp', 'icmp', 'ipx/spx', 'ipv6', 'ipv6-icmp', 'llc', 'rtp', 'pim', 'gre', 'unas', 'ipnip', 'igmp']


#### Recursive Feature Elimination (RFE)

In [None]:
x = x_original.copy()

In [None]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

# Initialize the model
model = DecisionTreeClassifier()

# Initialize RFE
rfe = RFE(estimator=model, n_features_to_select=1)  # Rank all features
rfe.fit(x, y)

In [None]:
# Get feature rankings
ranking = pd.Series(rfe.ranking_, index=x.columns)
ranking = ranking.sort_values()

print("Feature Rankings (lower is better):\n", ranking)

Feature Rankings (lower is better):
 SrcAddr       1
DstAddr       2
Dport         3
Sport         4
State         5
TotBytes      6
SrcBytes      7
TotPkts       8
tcp           9
Dur          10
Dir          11
udp          12
icmp         13
rtcp         14
ipv6         15
pim          16
gre          17
unas         18
ipnip        19
rtp          20
rsvp         21
esp          22
arp          23
igmp         24
ipx/spx      25
rarp         26
udt          27
llc          28
ipv6-icmp    29
dtype: int32


In [None]:
result_rfe = list(ranking.index)
print('rfe', result_rfe)

rfe ['SrcAddr', 'DstAddr', 'Dport', 'Sport', 'State', 'TotBytes', 'SrcBytes', 'TotPkts', 'tcp', 'Dur', 'Dir', 'udp', 'icmp', 'rtcp', 'ipv6', 'pim', 'gre', 'unas', 'ipnip', 'rtp', 'rsvp', 'esp', 'arp', 'igmp', 'ipx/spx', 'rarp', 'udt', 'llc', 'ipv6-icmp']


#### SelectFromModel-TreeBase (SFM-TB)

In [None]:
x = x_original.copy()

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier  # Example tree-based model

# Initialize the tree-based model
model = RandomForestClassifier(random_state=42)

In [None]:
# Fit the model
sfm = SelectFromModel(estimator=model, threshold='mean')  # Use 'mean' threshold or specify a custom one
sfm.fit(x, y)

# Get the selected features
selected_features = x.columns[sfm.get_support()]
print("Selected features:", selected_features)

Selected features: Index(['Dport', 'SrcBytes', 'SrcAddr', 'TotPkts', 'TotBytes', 'State', 'Dur',
       'Sport', 'DstAddr'],
      dtype='object')


In [None]:
# Fit the model directly to get feature importances
model.fit(x, y)
feature_importances = model.feature_importances_

# Rank the features
ranking = pd.Series(feature_importances, index=x.columns).sort_values(ascending=False)
print("Feature Importances:\n", ranking)

Feature Importances:
 SrcAddr      3.133136e-01
Dport        1.314256e-01
Sport        1.262605e-01
DstAddr      1.052739e-01
State        6.292738e-02
TotBytes     6.167875e-02
SrcBytes     5.826493e-02
Dur          5.112238e-02
TotPkts      3.971427e-02
tcp          1.956690e-02
udp          1.560864e-02
Dir          1.284862e-02
icmp         1.711601e-03
igmp         1.847918e-04
rtp          6.445326e-05
arp          1.354726e-05
rtcp         9.104541e-06
ipv6-icmp    4.356675e-06
ipx/spx      4.152026e-06
esp          7.259911e-07
pim          7.116750e-07
ipv6         6.214885e-07
llc          2.267059e-07
udt          1.104209e-07
unas         3.905885e-08
gre          2.023068e-08
rarp         1.717662e-08
rsvp         4.505477e-09
ipnip        0.000000e+00
dtype: float64


In [None]:
result_sfm_tb = list(ranking.index)
print('sfm_tb', result_sfm_tb)

sfm_tb ['SrcAddr', 'Dport', 'Sport', 'DstAddr', 'State', 'TotBytes', 'SrcBytes', 'Dur', 'TotPkts', 'tcp', 'udp', 'Dir', 'icmp', 'igmp', 'rtp', 'arp', 'rtcp', 'ipv6-icmp', 'ipx/spx', 'esp', 'pim', 'ipv6', 'llc', 'udt', 'unas', 'gre', 'rarp', 'rsvp', 'ipnip']


#### Rank Aggregation

In [None]:
print(result_skb_chi2)
print(result_skb_af)
print(result_skb_mi)
print(result_vt)
print(result_be)
print(result_rfe)
print(result_sfm_tb)

['SrcBytes', 'TotBytes', 'SrcAddr', 'DstAddr', 'Sport', 'Dport', 'TotPkts', 'State', 'Dur', 'tcp', 'Dir', 'udp', 'icmp', 'igmp', 'rtp', 'rtcp', 'arp', 'ipv6-icmp', 'ipx/spx', 'ipv6', 'pim', 'udt', 'esp', 'rarp', 'unas', 'llc', 'gre', 'rsvp', 'ipnip']
['State', 'tcp', 'udp', 'Dir', 'Sport', 'DstAddr', 'Dport', 'SrcAddr', 'SrcBytes', 'Dur', 'icmp', 'TotBytes', 'TotPkts', 'igmp', 'rtp', 'rtcp', 'arp', 'ipv6-icmp', 'ipx/spx', 'ipv6', 'pim', 'udt', 'esp', 'rarp', 'unas', 'llc', 'gre', 'rsvp', 'ipnip']
['SrcAddr', 'DstAddr', 'Dur', 'TotBytes', 'Sport', 'Dport', 'SrcBytes', 'State', 'TotPkts', 'Dir', 'udp', 'tcp', 'rsvp', 'icmp', 'rtcp', 'pim', 'ipv6-icmp', 'ipx/spx', 'arp', 'ipv6', 'ipnip', 'igmp', 'llc', 'rtp', 'gre', 'esp', 'unas', 'rarp', 'udt']
['TotBytes', 'SrcBytes', 'SrcAddr', 'DstAddr', 'Dport', 'Sport', 'TotPkts', 'Dur', 'State', 'Dir', 'udp', 'tcp', 'icmp', 'igmp', 'rtp', 'rtcp', 'arp', 'ipv6-icmp', 'ipx/spx', 'ipv6', 'pim', 'udt', 'esp', 'rarp', 'unas', 'llc', 'gre', 'rsvp', 'ipni

In [None]:
features = {
    'SrcAddr': 0, 
    'TotBytes': 0, 
    'DstAddr': 0, 
    'Sport': 0, 
    'Dport': 0, 
    'SrcBytes': 0, 
    'Dur': 0, 
    'State': 0, 
    'TotPkts': 0, 
    'tcp': 0, 
    'Dir': 0, 
    'udp': 0, 
    'icmp': 0, 
    'igmp': 0, 
    'rtp': 0, 
    'rtcp': 0, 
    'arp': 0, 
    'ipv6-icmp': 0, 
    'ipx/spx': 0, 
    'pim': 0, 
    'udt': 0, 
    'ipv6': 0, 
    'esp': 0, 
    'rarp': 0, 
    'unas': 0, 
    'gre': 0, 
    'ipnip': 0, 
    'llc': 0, 
    'rsvp': 0
}

In [None]:
for i, j in enumerate(result_skb_chi2):
    features[j] += i
for i, j in enumerate(result_skb_af):
    features[j] += i
for i, j in enumerate(result_skb_mi):
    features[j] += i
for i, j in enumerate(result_vt):
    features[j] += i
for i, j in enumerate(result_be):
    features[j] += i
for i, j in enumerate(result_rfe):
    features[j] += i
for i, j in enumerate(result_sfm_tb):
    features[j] += i

In [None]:
result_final = dict(sorted(features.items(), key=lambda item: item[1]))
result_final

{'SrcAddr': 11,
 'DstAddr': 18,
 'Dport': 24,
 'Sport': 25,
 'TotBytes': 30,
 'SrcBytes': 33,
 'State': 34,
 'Dur': 51,
 'TotPkts': 54,
 'tcp': 57,
 'Dir': 62,
 'udp': 65,
 'icmp': 89,
 'rtcp': 102,
 'arp': 120,
 'rtp': 121,
 'igmp': 124,
 'ipv6': 131,
 'ipx/spx': 132,
 'ipv6-icmp': 133,
 'pim': 134,
 'esp': 146,
 'udt': 152,
 'rsvp': 156,
 'rarp': 160,
 'unas': 165,
 'gre': 168,
 'llc': 168,
 'ipnip': 177}