In [1]:
# for reading csv & plotting
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

# for features selection
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_regression, VarianceThreshold, SelectFromModel, RFECV

labels = ['normal', 'botnet', 'botnet_spam']

#### Import Data

In [2]:
# Load the datasets
original_df = pd.read_csv('train-train.csv')

In [3]:
df = original_df.copy()

In [4]:
print(f'{"No":<4}{"Feature":<15}{"Types":<10}{"Num Of Uniques Values":<15}{"NaN%":<20}')
print(f'{"==":<4}{"=======":<15}{"=====":<10}{"=====":<15}{"=====":<20}')
for i, a in enumerate(df):
    print(f'{str(i):<4}{a:<15}{str(df[a].dtypes):<10}{len(df[a].unique()):<15}{df[a].isna().sum() / df.shape[0] * 100:<20}')

No  Feature        Types     Num Of Uniques ValuesNaN%                
0   DstAddr        int64     28             0.0                 
1   rtcp           int64     1              0.0                 
2   Label          object    3              0.0                 
3   arp            int64     1              0.0                 
4   unas           int64     1              0.0                 
5   tcp            int64     2              0.0                 
6   State          int64     8              0.0                 
7   udp            int64     2              0.0                 
8   TotPkts        int64     24             0.0                 
9   ipv6-icmp      int64     1              0.0                 
10  llc            int64     1              0.0                 
11  Dur            float64   54             0.0                 
12  SrcAddr        int64     16             0.0                 
13  rarp           int64     1              0.0                 
14  rtp            

### Preprocessing

In [5]:
df = df.drop(
    columns=[
        'StartTime', 
        'dTos', 
        'sTos', 
        'ActivityLabel', 
        'SensorId',
        'BotnetName'
    ], errors='ignore'
)

In [6]:
num_cols = list(df.select_dtypes(include='number').columns)
print(num_cols)

['DstAddr', 'rtcp', 'arp', 'unas', 'tcp', 'State', 'udp', 'TotPkts', 'ipv6-icmp', 'llc', 'Dur', 'SrcAddr', 'rarp', 'rtp', 'udt', 'pim', 'TotBytes', 'Dir', 'gre', 'ipv6', 'icmp', 'SrcBytes', 'rsvp', 'ipx/spx', 'Sport', 'esp', 'igmp', 'ipnip', 'Dport']


In [7]:
cat_cols = list(df.select_dtypes(include='object').columns)
print(cat_cols)

['Label']


In [8]:
# simplify the label column
def categorize_label(label):
    label = str(label).lower()
    if 'botnet' in label:
        if 'spam' in label:
            return 2
        else:
            return 1
    else:
        return 0

df['Label'] = df['Label'].apply(categorize_label)

In [9]:
df.head()

Unnamed: 0,DstAddr,rtcp,Label,arp,unas,tcp,State,udp,TotPkts,ipv6-icmp,...,ipv6,icmp,SrcBytes,rsvp,ipx/spx,Sport,esp,igmp,ipnip,Dport
0,0,0,1,0,0,0,0,1,2,0,...,0,0,81,0,0,14,0,0,0,5
1,24,0,2,0,0,1,5,0,1,0,...,0,0,62,0,0,27,0,0,0,2
2,19,0,2,0,0,1,7,0,3,0,...,0,0,186,0,0,10,0,0,0,2
3,1,0,0,0,0,1,3,0,29,0,...,0,0,1517,0,0,35,0,0,0,11
4,14,0,2,0,0,1,2,0,72,0,...,0,0,5573,0,0,18,0,0,0,6


In [10]:
df.shape

(54, 30)

In [11]:
x_original = df.drop(columns=['Label'])
y = df['Label']

#### SKB-C2

In [12]:
x = x_original.copy()
x.shape

(54, 29)

In [13]:
selector = SelectKBest(score_func=chi2, k='all')
x_new = selector.fit_transform(x, y)

# Get scores and p-values
scores = selector.scores_
p_values = selector.pvalues_

# Create a DataFrame for easy interpretation
feature_scores = pd.DataFrame({'Feature': x.columns, 'Chi2 Score': scores, 'P-Value': p_values})
feature_scores = feature_scores.sort_values(by='Chi2 Score', ascending=False)

print(feature_scores)

      Feature    Chi2 Score       P-Value
16   TotBytes  70880.628599  0.000000e+00
10        Dur  13746.869631  0.000000e+00
21   SrcBytes   8128.532478  0.000000e+00
24      Sport    186.323250  3.470726e-41
0     DstAddr    118.978261  1.459483e-26
11    SrcAddr     68.379747  1.417512e-15
7     TotPkts     54.588710  1.400272e-12
5       State     53.338028  2.616943e-12
17        Dir     21.000000  2.753645e-05
6         udp     21.000000  2.753645e-05
28      Dport     17.635179  1.481049e-04
4         tcp     10.500000  5.247518e-03
1        rtcp           NaN           NaN
2         arp           NaN           NaN
3        unas           NaN           NaN
8   ipv6-icmp           NaN           NaN
9         llc           NaN           NaN
12       rarp           NaN           NaN
13        rtp           NaN           NaN
14        udt           NaN           NaN
15        pim           NaN           NaN
18        gre           NaN           NaN
19       ipv6           NaN       

In [14]:
result_skb_chi2 = list(feature_scores['Feature'])
print('skb-chi2', result_skb_chi2)

skb-chi2 ['TotBytes', 'Dur', 'SrcBytes', 'Sport', 'DstAddr', 'SrcAddr', 'TotPkts', 'State', 'Dir', 'udp', 'Dport', 'tcp', 'rtcp', 'arp', 'unas', 'ipv6-icmp', 'llc', 'rarp', 'rtp', 'udt', 'pim', 'gre', 'ipv6', 'icmp', 'rsvp', 'ipx/spx', 'esp', 'igmp', 'ipnip']


#### SKB-AF

In [15]:
x = x_original.copy()

In [16]:
selector = SelectKBest(score_func=f_classif, k='all')
x_new = selector.fit_transform(x, y)

# Get scores and p-values
scores = selector.scores_
p_values = selector.pvalues_

# Create a DataFrame for easy interpretation
feature_scores = pd.DataFrame({'Feature': x.columns, 'ANOVA F-Value': scores, 'P-Value': p_values})
feature_scores = feature_scores.sort_values(by='ANOVA F-Value', ascending=False)

print(feature_scores)

      Feature  ANOVA F-Value       P-Value
4         tcp      35.700000  2.016570e-10
6         udp      35.700000  2.016570e-10
17        Dir      35.700000  2.016570e-10
5       State      15.527979  5.410426e-06
24      Sport      14.941868  7.808701e-06
11    SrcAddr      11.478293  7.658250e-05
0     DstAddr       9.646696  2.797177e-04
28      Dport       5.981932  4.636707e-03
10        Dur       3.033752  5.690142e-02
21   SrcBytes       0.489711  6.156552e-01
7     TotPkts       0.387359  6.808277e-01
16   TotBytes       0.357910  7.008776e-01
1        rtcp            NaN           NaN
2         arp            NaN           NaN
3        unas            NaN           NaN
8   ipv6-icmp            NaN           NaN
9         llc            NaN           NaN
12       rarp            NaN           NaN
13        rtp            NaN           NaN
14        udt            NaN           NaN
15        pim            NaN           NaN
18        gre            NaN           NaN
19       ip

  f = msb / msw


In [17]:
result_skb_af = list(feature_scores['Feature'])
print('skb-af', result_skb_af)

skb-af ['tcp', 'udp', 'Dir', 'State', 'Sport', 'SrcAddr', 'DstAddr', 'Dport', 'Dur', 'SrcBytes', 'TotPkts', 'TotBytes', 'rtcp', 'arp', 'unas', 'ipv6-icmp', 'llc', 'rarp', 'rtp', 'udt', 'pim', 'gre', 'ipv6', 'icmp', 'rsvp', 'ipx/spx', 'esp', 'igmp', 'ipnip']


#### SKB-MI

In [18]:
x = x_original.copy()

In [19]:
selector = SelectKBest(score_func=mutual_info_regression, k='all')
x_new = selector.fit_transform(x, y)

scores = selector.scores_

# Create a DataFrame for easy interpretation
feature_scores = pd.DataFrame({'Feature': x.columns, 'MI Score': scores})
feature_scores = feature_scores.sort_values(by='MI Score', ascending=False)

print(feature_scores)

      Feature  MI Score
10        Dur  0.588825
0     DstAddr  0.493428
28      Dport  0.480441
5       State  0.477989
21   SrcBytes  0.476600
7     TotPkts  0.473049
16   TotBytes  0.305632
4         tcp  0.231371
6         udp  0.224790
17        Dir  0.197476
11    SrcAddr  0.180604
8   ipv6-icmp  0.172914
25        esp  0.170057
24      Sport  0.151912
14        udt  0.102109
3        unas  0.083649
13        rtp  0.020714
9         llc  0.009643
22       rsvp  0.003646
2         arp  0.000000
1        rtcp  0.000000
12       rarp  0.000000
19       ipv6  0.000000
18        gre  0.000000
15        pim  0.000000
20       icmp  0.000000
23    ipx/spx  0.000000
26       igmp  0.000000
27      ipnip  0.000000


In [20]:
result_skb_mi = list(feature_scores['Feature'])
print('skb-mi', result_skb_mi)

skb-mi ['Dur', 'DstAddr', 'Dport', 'State', 'SrcBytes', 'TotPkts', 'TotBytes', 'tcp', 'udp', 'Dir', 'SrcAddr', 'ipv6-icmp', 'esp', 'Sport', 'udt', 'unas', 'rtp', 'llc', 'rsvp', 'arp', 'rtcp', 'rarp', 'ipv6', 'gre', 'pim', 'icmp', 'ipx/spx', 'igmp', 'ipnip']


#### Variance Threshold (VT)

In [21]:
x = x_original.copy()

In [22]:
# Example: Compute variance for all features
feature_scores = pd.DataFrame({
    'Feature': x.columns,
    'Variance': np.var(x, axis=0)  # Variance across rows (samples)
})
feature_scores = feature_scores.sort_values(by='Variance', ascending=False)

print(feature_scores['Variance'])

TotBytes     6.608248e+08
SrcBytes     1.363106e+07
Dur          3.515250e+05
TotPkts      1.241085e+03
Sport        2.000964e+02
DstAddr      9.573525e+01
SrcAddr      1.790432e+01
Dport        9.771262e+00
State        6.862826e+00
udp          2.222222e-01
Dir          2.222222e-01
tcp          2.222222e-01
rtcp         0.000000e+00
rarp         0.000000e+00
llc          0.000000e+00
ipv6-icmp    0.000000e+00
unas         0.000000e+00
arp          0.000000e+00
pim          0.000000e+00
udt          0.000000e+00
rtp          0.000000e+00
icmp         0.000000e+00
ipv6         0.000000e+00
gre          0.000000e+00
rsvp         0.000000e+00
ipx/spx      0.000000e+00
esp          0.000000e+00
igmp         0.000000e+00
ipnip        0.000000e+00
Name: Variance, dtype: float64


In [23]:
result_vt = list(feature_scores['Feature'])

In [24]:
print('vt', result_vt)

vt ['TotBytes', 'SrcBytes', 'Dur', 'TotPkts', 'Sport', 'DstAddr', 'SrcAddr', 'Dport', 'State', 'udp', 'Dir', 'tcp', 'rtcp', 'rarp', 'llc', 'ipv6-icmp', 'unas', 'arp', 'pim', 'udt', 'rtp', 'icmp', 'ipv6', 'gre', 'rsvp', 'ipx/spx', 'esp', 'igmp', 'ipnip']


#### Backward Elimination (BE)

In [25]:
x = x_original.copy()

In [26]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree Classifier model
model = DecisionTreeClassifier()
rank = []

# Backward Elimination
cols = list(x.columns)
while len(cols) > 0:
    # Fit the model
    model.fit(x[cols], y)

    # Get feature importances
    feature_importances = model.feature_importances_

    # Create a pandas Series with feature importances and corresponding columns
    imp_series = pd.Series(feature_importances, index=cols)

    # Identify the least important feature
    least_important_feature = imp_series.idxmin()

    # If the least important feature's importance is below a certain threshold (e.g., 0.01), remove it
    if imp_series.min() < 0.01:
        rank.append(least_important_feature)
        cols.remove(least_important_feature)
    else:
        break

selected_features_BE = cols
print(selected_features_BE)

x_new = x[selected_features_BE]

['DstAddr', 'Dur', 'Dir', 'Sport', 'Dport']


In [27]:
print(rank) #this is still reversed

['rtcp', 'arp', 'unas', 'tcp', 'udp', 'ipv6-icmp', 'llc', 'TotPkts', 'State', 'rarp', 'rtp', 'udt', 'pim', 'TotBytes', 'SrcAddr', 'gre', 'ipv6', 'icmp', 'rsvp', 'SrcBytes', 'ipx/spx', 'esp', 'igmp', 'ipnip']


In [28]:
# Refit the model with the selected features
model.fit(x[selected_features_BE], y)

# Get feature importances for the selected features
feature_importances = model.feature_importances_

# Create a pandas Series to rank the selected features
ranking = pd.Series(feature_importances, index=selected_features_BE)

# Sort by importance
ranking = ranking.sort_values(ascending=False)
print("Feature Ranking:\n", ranking)


Feature Ranking:
 Dur        0.460775
Dir        0.291667
Sport      0.109954
DstAddr    0.093954
Dport      0.043651
dtype: float64


In [29]:
a = list(ranking.index)
a

['Dur', 'Dir', 'Sport', 'DstAddr', 'Dport']

In [30]:
rank.reverse()

In [31]:
result_be = a + rank
print('be', result_be)

be ['Dur', 'Dir', 'Sport', 'DstAddr', 'Dport', 'ipnip', 'igmp', 'esp', 'ipx/spx', 'SrcBytes', 'rsvp', 'icmp', 'ipv6', 'gre', 'SrcAddr', 'TotBytes', 'pim', 'udt', 'rtp', 'rarp', 'State', 'TotPkts', 'llc', 'ipv6-icmp', 'udp', 'tcp', 'unas', 'arp', 'rtcp']


#### Recursive Feature Elimination (RFE)

In [32]:
x = x_original.copy()

In [33]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

# Initialize the model
model = DecisionTreeClassifier()

# Initialize RFE
rfe = RFE(estimator=model, n_features_to_select=1)  # Rank all features
rfe.fit(x, y)

0,1,2
,estimator,DecisionTreeClassifier()
,n_features_to_select,1
,step,1
,verbose,0
,importance_getter,'auto'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [34]:
# Get feature rankings
ranking = pd.Series(rfe.ranking_, index=x.columns)
ranking = ranking.sort_values()

print("Feature Rankings (lower is better):\n", ranking)

Feature Rankings (lower is better):
 Dur           1
Dir           2
Sport         3
DstAddr       4
esp           5
Dport         6
ipnip         7
igmp          8
ipx/spx       9
rsvp         10
SrcBytes     11
icmp         12
ipv6         13
gre          14
SrcAddr      15
TotBytes     16
pim          17
udt          18
rtp          19
rarp         20
llc          21
ipv6-icmp    22
TotPkts      23
State        24
udp          25
tcp          26
unas         27
arp          28
rtcp         29
dtype: int64


In [35]:
result_rfe = list(ranking.index)
print('rfe', result_rfe)

rfe ['Dur', 'Dir', 'Sport', 'DstAddr', 'esp', 'Dport', 'ipnip', 'igmp', 'ipx/spx', 'rsvp', 'SrcBytes', 'icmp', 'ipv6', 'gre', 'SrcAddr', 'TotBytes', 'pim', 'udt', 'rtp', 'rarp', 'llc', 'ipv6-icmp', 'TotPkts', 'State', 'udp', 'tcp', 'unas', 'arp', 'rtcp']


#### SelectFromModel-TreeBase (SFM-TB)

In [36]:
x = x_original.copy()

In [37]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier  # Example tree-based model

# Initialize the tree-based model
model = RandomForestClassifier(random_state=42)

In [38]:
# Fit the model
sfm = SelectFromModel(estimator=model, threshold='mean')  # Use 'mean' threshold or specify a custom one
sfm.fit(x, y)

# Get the selected features
selected_features = x.columns[sfm.get_support()]
print("Selected features:", selected_features)

Selected features: Index(['DstAddr', 'tcp', 'State', 'udp', 'TotPkts', 'Dur', 'SrcAddr',
       'TotBytes', 'Dir', 'SrcBytes', 'Sport', 'Dport'],
      dtype='object')


In [39]:
# Fit the model directly to get feature importances
model.fit(x, y)
feature_importances = model.feature_importances_

# Rank the features
ranking = pd.Series(feature_importances, index=x.columns).sort_values(ascending=False)
print("Feature Importances:\n", ranking)

Feature Importances:
 Dur          0.174753
Dport        0.103041
SrcAddr      0.090243
DstAddr      0.090098
SrcBytes     0.089044
Sport        0.075241
State        0.073038
TotBytes     0.071923
udp          0.067436
TotPkts      0.064622
tcp          0.054888
Dir          0.045672
rtcp         0.000000
rarp         0.000000
llc          0.000000
ipv6-icmp    0.000000
unas         0.000000
arp          0.000000
pim          0.000000
udt          0.000000
rtp          0.000000
icmp         0.000000
ipv6         0.000000
gre          0.000000
rsvp         0.000000
ipx/spx      0.000000
esp          0.000000
igmp         0.000000
ipnip        0.000000
dtype: float64


In [40]:
result_sfm_tb = list(ranking.index)
print('sfm_tb', result_sfm_tb)

sfm_tb ['Dur', 'Dport', 'SrcAddr', 'DstAddr', 'SrcBytes', 'Sport', 'State', 'TotBytes', 'udp', 'TotPkts', 'tcp', 'Dir', 'rtcp', 'rarp', 'llc', 'ipv6-icmp', 'unas', 'arp', 'pim', 'udt', 'rtp', 'icmp', 'ipv6', 'gre', 'rsvp', 'ipx/spx', 'esp', 'igmp', 'ipnip']


#### Rank Aggregation

In [41]:
print(result_skb_chi2)
print(result_skb_af)
print(result_skb_mi)
print(result_vt)
print(result_be)
print(result_rfe)
print(result_sfm_tb)

['TotBytes', 'Dur', 'SrcBytes', 'Sport', 'DstAddr', 'SrcAddr', 'TotPkts', 'State', 'Dir', 'udp', 'Dport', 'tcp', 'rtcp', 'arp', 'unas', 'ipv6-icmp', 'llc', 'rarp', 'rtp', 'udt', 'pim', 'gre', 'ipv6', 'icmp', 'rsvp', 'ipx/spx', 'esp', 'igmp', 'ipnip']
['tcp', 'udp', 'Dir', 'State', 'Sport', 'SrcAddr', 'DstAddr', 'Dport', 'Dur', 'SrcBytes', 'TotPkts', 'TotBytes', 'rtcp', 'arp', 'unas', 'ipv6-icmp', 'llc', 'rarp', 'rtp', 'udt', 'pim', 'gre', 'ipv6', 'icmp', 'rsvp', 'ipx/spx', 'esp', 'igmp', 'ipnip']
['Dur', 'DstAddr', 'Dport', 'State', 'SrcBytes', 'TotPkts', 'TotBytes', 'tcp', 'udp', 'Dir', 'SrcAddr', 'ipv6-icmp', 'esp', 'Sport', 'udt', 'unas', 'rtp', 'llc', 'rsvp', 'arp', 'rtcp', 'rarp', 'ipv6', 'gre', 'pim', 'icmp', 'ipx/spx', 'igmp', 'ipnip']
['TotBytes', 'SrcBytes', 'Dur', 'TotPkts', 'Sport', 'DstAddr', 'SrcAddr', 'Dport', 'State', 'udp', 'Dir', 'tcp', 'rtcp', 'rarp', 'llc', 'ipv6-icmp', 'unas', 'arp', 'pim', 'udt', 'rtp', 'icmp', 'ipv6', 'gre', 'rsvp', 'ipx/spx', 'esp', 'igmp', 'ipni

In [42]:
features = {
    'SrcAddr': 0, 
    'TotBytes': 0, 
    'DstAddr': 0, 
    'Sport': 0, 
    'Dport': 0, 
    'SrcBytes': 0, 
    'Dur': 0, 
    'State': 0, 
    'TotPkts': 0, 
    'tcp': 0, 
    'Dir': 0, 
    'udp': 0, 
    'icmp': 0, 
    'igmp': 0, 
    'rtp': 0, 
    'rtcp': 0, 
    'arp': 0, 
    'ipv6-icmp': 0, 
    'ipx/spx': 0, 
    'pim': 0, 
    'udt': 0, 
    'ipv6': 0, 
    'esp': 0, 
    'rarp': 0, 
    'unas': 0, 
    'gre': 0, 
    'ipnip': 0, 
    'llc': 0, 
    'rsvp': 0
}

In [43]:
for i, j in enumerate(result_skb_chi2):
    features[j] += i
for i, j in enumerate(result_skb_af):
    features[j] += i
for i, j in enumerate(result_skb_mi):
    features[j] += i
for i, j in enumerate(result_vt):
    features[j] += i
for i, j in enumerate(result_be):
    features[j] += i
for i, j in enumerate(result_rfe):
    features[j] += i
for i, j in enumerate(result_sfm_tb):
    features[j] += i

In [44]:
result_final = dict(sorted(features.items(), key=lambda item: item[1]))
result_final

{'Dur': 11,
 'DstAddr': 25,
 'Sport': 33,
 'Dport': 36,
 'SrcBytes': 39,
 'Dir': 42,
 'TotBytes': 54,
 'SrcAddr': 56,
 'State': 70,
 'TotPkts': 76,
 'udp': 83,
 'tcp': 89,
 'ipv6-icmp': 115,
 'rarp': 119,
 'llc': 119,
 'rtcp': 124,
 'udt': 124,
 'esp': 127,
 'unas': 127,
 'rtp': 128,
 'pim': 132,
 'arp': 133,
 'rsvp': 133,
 'ipv6': 134,
 'icmp': 135,
 'gre': 137,
 'ipx/spx': 142,
 'igmp': 148,
 'ipnip': 151}