In [3]:
# for reading csv & plotting
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

# for features selection
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_regression, VarianceThreshold, SelectFromModel, RFECV

labels = ['normal', 'botnet', 'botnet_spam']

#### Import Data

In [4]:
# Load the datasets
original_df = pd.read_csv('train-train.csv')

In [5]:
df = original_df.copy()

In [6]:
print(f'{"No":<4}{"Feature":<15}{"Types":<10}{"Num Of Uniques Values":<15}{"NaN%":<20}')
print(f'{"==":<4}{"=======":<15}{"=====":<10}{"=====":<15}{"=====":<20}')
for i, a in enumerate(df):
    print(f'{str(i):<4}{a:<15}{str(df[a].dtypes):<10}{len(df[a].unique()):<15}{df[a].isna().sum() / df.shape[0] * 100:<20}')

No  Feature        Types     Num Of Uniques ValuesNaN%                
0   ipx/spx        int64     1              0.0                 
1   pim            int64     1              0.0                 
2   arp            int64     1              0.0                 
3   gre            int64     1              0.0                 
4   rtp            int64     1              0.0                 
5   rsvp           int64     1              0.0                 
6   SrcBytes       int64     33             0.0                 
7   udp            int64     2              0.0                 
8   esp            int64     1              0.0                 
9   Label          object    3              0.0                 
10  DstAddr        int64     27             0.0                 
11  Dport          int64     12             0.0                 
12  TotPkts        int64     19             0.0                 
13  igmp           int64     1              0.0                 
14  ipv6-icmp      

### Preprocessing

In [7]:
df = df.drop(
    columns=[
        'StartTime', 
        'dTos', 
        'sTos', 
        'ActivityLabel', 
        'SensorId',
        'BotnetName'
    ], errors='ignore'
)

In [8]:
num_cols = list(df.select_dtypes(include='number').columns)
print(num_cols)

['ipx/spx', 'pim', 'arp', 'gre', 'rtp', 'rsvp', 'SrcBytes', 'udp', 'esp', 'DstAddr', 'Dport', 'TotPkts', 'igmp', 'ipv6-icmp', 'Dir', 'icmp', 'SrcAddr', 'State', 'llc', 'Dur', 'Sport', 'tcp', 'ipnip', 'unas', 'rtcp', 'rarp', 'TotBytes', 'udt', 'ipv6']


In [9]:
cat_cols = list(df.select_dtypes(include='object').columns)
print(cat_cols)

['Label']


In [10]:
# simplify the label column
def categorize_label(label):
    label = str(label).lower()
    if 'botnet' in label:
        if 'spam' in label:
            return 2
        else:
            return 1
    else:
        return 0

df['Label'] = df['Label'].apply(categorize_label)

In [11]:
df.head()

Unnamed: 0,ipx/spx,pim,arp,gre,rtp,rsvp,SrcBytes,udp,esp,Label,...,Dur,Sport,tcp,ipnip,unas,rtcp,rarp,TotBytes,udt,ipv6
0,0,0,0,0,0,0,75,1,0,1,...,0.054957,17,0,0,0,0,0,278,0,0
1,0,0,0,0,0,0,62,0,0,2,...,0.0,34,1,0,0,0,0,62,0,0
2,0,0,0,0,0,0,186,0,0,2,...,9.012276,17,1,0,0,0,0,186,0,0
3,0,0,0,0,0,0,79,1,0,0,...,0.000301,41,0,0,0,0,0,208,0,0
4,0,0,0,0,0,0,186,0,0,2,...,8.755043,9,1,0,0,0,0,186,0,0


In [12]:
df.shape

(54, 30)

In [13]:
x_original = df.drop(columns=['Label'])
y = df['Label']

#### SKB-C2

In [14]:
x = x_original.copy()
x.shape

(54, 29)

In [15]:
selector = SelectKBest(score_func=chi2, k='all')
x_new = selector.fit_transform(x, y)

# Get scores and p-values
scores = selector.scores_
p_values = selector.pvalues_

# Create a DataFrame for easy interpretation
feature_scores = pd.DataFrame({'Feature': x.columns, 'Chi2 Score': scores, 'P-Value': p_values})
feature_scores = feature_scores.sort_values(by='Chi2 Score', ascending=False)

print(feature_scores)

      Feature     Chi2 Score        P-Value
26   TotBytes  952556.773980   0.000000e+00
6    SrcBytes   14449.625595   0.000000e+00
19        Dur    4754.819628   0.000000e+00
11    TotPkts     713.681682  1.061676e-155
9     DstAddr     155.748837   1.512062e-34
16    SrcAddr      95.069307   2.269668e-21
20      Sport      94.666105   2.776621e-21
17      State      79.197802   6.344768e-18
7         udp      22.842105   1.096225e-05
14        Dir      21.000000   2.753645e-05
10      Dport      18.534351   9.447498e-05
21        tcp      12.400000   2.029431e-03
0     ipx/spx            NaN            NaN
1         pim            NaN            NaN
2         arp            NaN            NaN
3         gre            NaN            NaN
4         rtp            NaN            NaN
5        rsvp            NaN            NaN
8         esp            NaN            NaN
12       igmp            NaN            NaN
13  ipv6-icmp            NaN            NaN
15       icmp            NaN    

In [16]:
result_skb_chi2 = list(feature_scores['Feature'])
print('skb-chi2', result_skb_chi2)

skb-chi2 ['TotBytes', 'SrcBytes', 'Dur', 'TotPkts', 'DstAddr', 'SrcAddr', 'Sport', 'State', 'udp', 'Dir', 'Dport', 'tcp', 'ipx/spx', 'pim', 'arp', 'gre', 'rtp', 'rsvp', 'esp', 'igmp', 'ipv6-icmp', 'icmp', 'llc', 'ipnip', 'unas', 'rtcp', 'rarp', 'udt', 'ipv6']


#### SKB-AF

In [17]:
x = x_original.copy()

In [18]:
selector = SelectKBest(score_func=f_classif, k='all')
x_new = selector.fit_transform(x, y)

# Get scores and p-values
scores = selector.scores_
p_values = selector.pvalues_

# Create a DataFrame for easy interpretation
feature_scores = pd.DataFrame({'Feature': x.columns, 'ANOVA F-Value': scores, 'P-Value': p_values})
feature_scores = feature_scores.sort_values(by='ANOVA F-Value', ascending=False)

print(feature_scores)

      Feature  ANOVA F-Value       P-Value
7         udp      47.909091  1.950534e-12
21        tcp      47.909091  1.950534e-12
14        Dir      35.700000  2.016570e-10
16    SrcAddr      21.750999  1.476706e-07
17      State      21.029694  2.186022e-07
9     DstAddr      15.202211  6.630030e-06
20      Sport       8.594616  6.071368e-04
10      Dport       5.720067  5.737360e-03
26   TotBytes       4.495216  1.592021e-02
11    TotPkts       4.131319  2.173324e-02
6    SrcBytes       2.822942  6.874585e-02
19        Dur       1.367301  2.639742e-01
0     ipx/spx            NaN           NaN
1         pim            NaN           NaN
2         arp            NaN           NaN
3         gre            NaN           NaN
4         rtp            NaN           NaN
5        rsvp            NaN           NaN
8         esp            NaN           NaN
12       igmp            NaN           NaN
13  ipv6-icmp            NaN           NaN
15       icmp            NaN           NaN
18        l

  f = msb / msw


In [19]:
result_skb_af = list(feature_scores['Feature'])
print('skb-af', result_skb_af)

skb-af ['udp', 'tcp', 'Dir', 'SrcAddr', 'State', 'DstAddr', 'Sport', 'Dport', 'TotBytes', 'TotPkts', 'SrcBytes', 'Dur', 'ipx/spx', 'pim', 'arp', 'gre', 'rtp', 'rsvp', 'esp', 'igmp', 'ipv6-icmp', 'icmp', 'llc', 'ipnip', 'unas', 'rtcp', 'rarp', 'udt', 'ipv6']


#### SKB-MI

In [20]:
x = x_original.copy()

In [21]:
selector = SelectKBest(score_func=mutual_info_regression, k='all')
x_new = selector.fit_transform(x, y)

scores = selector.scores_

# Create a DataFrame for easy interpretation
feature_scores = pd.DataFrame({'Feature': x.columns, 'MI Score': scores})
feature_scores = feature_scores.sort_values(by='MI Score', ascending=False)

print(feature_scores)

      Feature  MI Score
16    SrcAddr  0.637324
10      Dport  0.545291
21        tcp  0.350593
17      State  0.347475
7         udp  0.332202
6    SrcBytes  0.326664
9     DstAddr  0.314992
20      Sport  0.306981
19        Dur  0.236315
14        Dir  0.231366
11    TotPkts  0.217899
4         rtp  0.206031
26   TotBytes  0.178452
22      ipnip  0.108390
0     ipx/spx  0.093806
1         pim  0.074286
27        udt  0.049406
24       rtcp  0.035093
25       rarp  0.031932
12       igmp  0.029881
13  ipv6-icmp  0.019897
15       icmp  0.000000
18        llc  0.000000
8         esp  0.000000
5        rsvp  0.000000
3         gre  0.000000
23       unas  0.000000
2         arp  0.000000
28       ipv6  0.000000


In [22]:
result_skb_mi = list(feature_scores['Feature'])
print('skb-mi', result_skb_mi)

skb-mi ['SrcAddr', 'Dport', 'tcp', 'State', 'udp', 'SrcBytes', 'DstAddr', 'Sport', 'Dur', 'Dir', 'TotPkts', 'rtp', 'TotBytes', 'ipnip', 'ipx/spx', 'pim', 'udt', 'rtcp', 'rarp', 'igmp', 'ipv6-icmp', 'icmp', 'llc', 'esp', 'rsvp', 'gre', 'unas', 'arp', 'ipv6']


#### Variance Threshold (VT)

In [23]:
x = x_original.copy()

In [24]:
# Example: Compute variance for all features
feature_scores = pd.DataFrame({
    'Feature': x.columns,
    'Variance': np.var(x, axis=0)  # Variance across rows (samples)
})
feature_scores = feature_scores.sort_values(by='Variance', ascending=False)

print(feature_scores['Variance'])

TotBytes     1.374804e+09
SrcBytes     1.995295e+06
Dur          1.104748e+05
TotPkts      1.753657e+03
Sport        1.527394e+02
DstAddr      9.223765e+01
SrcAddr      2.145988e+01
State        1.093690e+01
Dport        9.089163e+00
udp          2.280521e-01
tcp          2.280521e-01
Dir          2.222222e-01
unas         0.000000e+00
llc          0.000000e+00
ipnip        0.000000e+00
rtcp         0.000000e+00
rarp         0.000000e+00
udt          0.000000e+00
ipx/spx      0.000000e+00
icmp         0.000000e+00
pim          0.000000e+00
ipv6-icmp    0.000000e+00
igmp         0.000000e+00
esp          0.000000e+00
rsvp         0.000000e+00
rtp          0.000000e+00
gre          0.000000e+00
arp          0.000000e+00
ipv6         0.000000e+00
Name: Variance, dtype: float64


In [25]:
result_vt = list(feature_scores['Feature'])

In [26]:
print('vt', result_vt)

vt ['TotBytes', 'SrcBytes', 'Dur', 'TotPkts', 'Sport', 'DstAddr', 'SrcAddr', 'State', 'Dport', 'udp', 'tcp', 'Dir', 'unas', 'llc', 'ipnip', 'rtcp', 'rarp', 'udt', 'ipx/spx', 'icmp', 'pim', 'ipv6-icmp', 'igmp', 'esp', 'rsvp', 'rtp', 'gre', 'arp', 'ipv6']


#### Backward Elimination (BE)

In [27]:
x = x_original.copy()

In [28]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree Classifier model
model = DecisionTreeClassifier()
rank = []

# Backward Elimination
cols = list(x.columns)
while len(cols) > 0:
    # Fit the model
    model.fit(x[cols], y)

    # Get feature importances
    feature_importances = model.feature_importances_

    # Create a pandas Series with feature importances and corresponding columns
    imp_series = pd.Series(feature_importances, index=cols)

    # Identify the least important feature
    least_important_feature = imp_series.idxmin()

    # If the least important feature's importance is below a certain threshold (e.g., 0.01), remove it
    if imp_series.min() < 0.01:
        rank.append(least_important_feature)
        cols.remove(least_important_feature)
    else:
        break

selected_features_BE = cols
print(selected_features_BE)

x_new = x[selected_features_BE]

['SrcBytes', 'DstAddr', 'Dport', 'SrcAddr', 'tcp', 'TotBytes']


In [29]:
print(rank) #this is still reversed

['ipx/spx', 'pim', 'arp', 'gre', 'rtp', 'rsvp', 'esp', 'igmp', 'udp', 'ipv6-icmp', 'TotPkts', 'Dir', 'icmp', 'State', 'llc', 'Sport', 'ipnip', 'unas', 'rtcp', 'Dur', 'rarp', 'udt', 'ipv6']


In [30]:
# Refit the model with the selected features
model.fit(x[selected_features_BE], y)

# Get feature importances for the selected features
feature_importances = model.feature_importances_

# Create a pandas Series to rank the selected features
ranking = pd.Series(feature_importances, index=selected_features_BE)

# Sort by importance
ranking = ranking.sort_values(ascending=False)
print("Feature Ranking:\n", ranking)


Feature Ranking:
 tcp         0.326316
TotBytes    0.299959
DstAddr     0.169981
Dport       0.085403
SrcAddr     0.080247
SrcBytes    0.038095
dtype: float64


In [31]:
a = list(ranking.index)
a

['tcp', 'TotBytes', 'DstAddr', 'Dport', 'SrcAddr', 'SrcBytes']

In [32]:
rank.reverse()

In [33]:
result_be = a + rank
print('be', result_be)

be ['tcp', 'TotBytes', 'DstAddr', 'Dport', 'SrcAddr', 'SrcBytes', 'ipv6', 'udt', 'rarp', 'Dur', 'rtcp', 'unas', 'ipnip', 'Sport', 'llc', 'State', 'icmp', 'Dir', 'TotPkts', 'ipv6-icmp', 'udp', 'igmp', 'esp', 'rsvp', 'rtp', 'gre', 'arp', 'pim', 'ipx/spx']


#### Recursive Feature Elimination (RFE)

In [34]:
x = x_original.copy()

In [35]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

# Initialize the model
model = DecisionTreeClassifier()

# Initialize RFE
rfe = RFE(estimator=model, n_features_to_select=1)  # Rank all features
rfe.fit(x, y)

In [36]:
# Get feature rankings
ranking = pd.Series(rfe.ranking_, index=x.columns)
ranking = ranking.sort_values()

print("Feature Rankings (lower is better):\n", ranking)

Feature Rankings (lower is better):
 Dport         1
tcp           2
TotPkts       3
DstAddr       4
SrcAddr       5
SrcBytes      6
unas          7
Sport         8
ipnip         9
Dur          10
llc          11
State        12
Dir          13
ipv6-icmp    14
igmp         15
esp          16
rtcp         17
icmp         18
rarp         19
TotBytes     20
udp          21
udt          22
ipv6         23
rsvp         24
rtp          25
gre          26
arp          27
pim          28
ipx/spx      29
dtype: int32


In [37]:
result_rfe = list(ranking.index)
print('rfe', result_rfe)

rfe ['Dport', 'tcp', 'TotPkts', 'DstAddr', 'SrcAddr', 'SrcBytes', 'unas', 'Sport', 'ipnip', 'Dur', 'llc', 'State', 'Dir', 'ipv6-icmp', 'igmp', 'esp', 'rtcp', 'icmp', 'rarp', 'TotBytes', 'udp', 'udt', 'ipv6', 'rsvp', 'rtp', 'gre', 'arp', 'pim', 'ipx/spx']


#### SelectFromModel-TreeBase (SFM-TB)

In [38]:
x = x_original.copy()

In [39]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier  # Example tree-based model

# Initialize the tree-based model
model = RandomForestClassifier(random_state=42)

In [40]:
# Fit the model
sfm = SelectFromModel(estimator=model, threshold='mean')  # Use 'mean' threshold or specify a custom one
sfm.fit(x, y)

# Get the selected features
selected_features = x.columns[sfm.get_support()]
print("Selected features:", selected_features)

Selected features: Index(['SrcBytes', 'udp', 'DstAddr', 'Dport', 'TotPkts', 'Dir', 'SrcAddr',
       'State', 'Dur', 'Sport', 'tcp', 'TotBytes'],
      dtype='object')


In [41]:
# Fit the model directly to get feature importances
model.fit(x, y)
feature_importances = model.feature_importances_

# Rank the features
ranking = pd.Series(feature_importances, index=x.columns).sort_values(ascending=False)
print("Feature Importances:\n", ranking)

Feature Importances:
 DstAddr      0.136709
Dport        0.126013
SrcAddr      0.100315
Sport        0.095884
TotBytes     0.088912
SrcBytes     0.075509
Dur          0.073852
udp          0.071668
tcp          0.069040
State        0.068124
TotPkts      0.055781
Dir          0.038195
llc          0.000000
ipnip        0.000000
unas         0.000000
rtcp         0.000000
rarp         0.000000
udt          0.000000
ipx/spx      0.000000
icmp         0.000000
pim          0.000000
ipv6-icmp    0.000000
igmp         0.000000
esp          0.000000
rsvp         0.000000
rtp          0.000000
gre          0.000000
arp          0.000000
ipv6         0.000000
dtype: float64


In [42]:
result_sfm_tb = list(ranking.index)
print('sfm_tb', result_sfm_tb)

sfm_tb ['DstAddr', 'Dport', 'SrcAddr', 'Sport', 'TotBytes', 'SrcBytes', 'Dur', 'udp', 'tcp', 'State', 'TotPkts', 'Dir', 'llc', 'ipnip', 'unas', 'rtcp', 'rarp', 'udt', 'ipx/spx', 'icmp', 'pim', 'ipv6-icmp', 'igmp', 'esp', 'rsvp', 'rtp', 'gre', 'arp', 'ipv6']


#### Rank Aggregation

In [43]:
print(result_skb_chi2)
print(result_skb_af)
print(result_skb_mi)
print(result_vt)
print(result_be)
print(result_rfe)
print(result_sfm_tb)

['TotBytes', 'SrcBytes', 'Dur', 'TotPkts', 'DstAddr', 'SrcAddr', 'Sport', 'State', 'udp', 'Dir', 'Dport', 'tcp', 'ipx/spx', 'pim', 'arp', 'gre', 'rtp', 'rsvp', 'esp', 'igmp', 'ipv6-icmp', 'icmp', 'llc', 'ipnip', 'unas', 'rtcp', 'rarp', 'udt', 'ipv6']
['udp', 'tcp', 'Dir', 'SrcAddr', 'State', 'DstAddr', 'Sport', 'Dport', 'TotBytes', 'TotPkts', 'SrcBytes', 'Dur', 'ipx/spx', 'pim', 'arp', 'gre', 'rtp', 'rsvp', 'esp', 'igmp', 'ipv6-icmp', 'icmp', 'llc', 'ipnip', 'unas', 'rtcp', 'rarp', 'udt', 'ipv6']
['SrcAddr', 'Dport', 'tcp', 'State', 'udp', 'SrcBytes', 'DstAddr', 'Sport', 'Dur', 'Dir', 'TotPkts', 'rtp', 'TotBytes', 'ipnip', 'ipx/spx', 'pim', 'udt', 'rtcp', 'rarp', 'igmp', 'ipv6-icmp', 'icmp', 'llc', 'esp', 'rsvp', 'gre', 'unas', 'arp', 'ipv6']
['TotBytes', 'SrcBytes', 'Dur', 'TotPkts', 'Sport', 'DstAddr', 'SrcAddr', 'State', 'Dport', 'udp', 'tcp', 'Dir', 'unas', 'llc', 'ipnip', 'rtcp', 'rarp', 'udt', 'ipx/spx', 'icmp', 'pim', 'ipv6-icmp', 'igmp', 'esp', 'rsvp', 'rtp', 'gre', 'arp', 'ipv

In [44]:
features = {
    'SrcAddr': 0, 
    'TotBytes': 0, 
    'DstAddr': 0, 
    'Sport': 0, 
    'Dport': 0, 
    'SrcBytes': 0, 
    'Dur': 0, 
    'State': 0, 
    'TotPkts': 0, 
    'tcp': 0, 
    'Dir': 0, 
    'udp': 0, 
    'icmp': 0, 
    'igmp': 0, 
    'rtp': 0, 
    'rtcp': 0, 
    'arp': 0, 
    'ipv6-icmp': 0, 
    'ipx/spx': 0, 
    'pim': 0, 
    'udt': 0, 
    'ipv6': 0, 
    'esp': 0, 
    'rarp': 0, 
    'unas': 0, 
    'gre': 0, 
    'ipnip': 0, 
    'llc': 0, 
    'rsvp': 0
}

In [45]:
for i, j in enumerate(result_skb_chi2):
    features[j] += i
for i, j in enumerate(result_skb_af):
    features[j] += i
for i, j in enumerate(result_skb_mi):
    features[j] += i
for i, j in enumerate(result_vt):
    features[j] += i
for i, j in enumerate(result_be):
    features[j] += i
for i, j in enumerate(result_rfe):
    features[j] += i
for i, j in enumerate(result_sfm_tb):
    features[j] += i

In [46]:
result_final = dict(sorted(features.items(), key=lambda item: item[1]))
result_final

{'SrcAddr': 24,
 'DstAddr': 25,
 'Dport': 30,
 'SrcBytes': 32,
 'tcp': 33,
 'TotBytes': 44,
 'Sport': 46,
 'Dur': 47,
 'TotPkts': 55,
 'State': 56,
 'udp': 68,
 'Dir': 71,
 'ipnip': 106,
 'llc': 115,
 'unas': 117,
 'rtcp': 123,
 'rarp': 128,
 'ipx/spx': 130,
 'udt': 132,
 'icmp': 134,
 'ipv6-icmp': 134,
 'pim': 135,
 'igmp': 136,
 'rtp': 141,
 'esp': 142,
 'rsvp': 152,
 'gre': 157,
 'arp': 161,
 'ipv6': 168}