## I. Importing libraries

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

In [2]:
import warnings
warnings.filterwarnings('ignore')

## II. Read the Data

In [4]:
train_df = pd.read_csv('scaled_train.csv')
train_df

Unnamed: 0.1,Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,...,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label,proto_target_encoded,service_target_encoded,state_target_encoded
0,0,0.179469,0.4,0.2,-0.131902,0.007260,-0.025220,-0.010417,0.892857,-0.009738,...,0,0,0.0,-0.285714,-0.3,0,0,-0.995621,-0.002417,-0.002039
1,1,0.970450,1.2,3.6,0.233129,37.976407,-0.025185,-1.000000,0.884921,-0.009803,...,0,0,0.0,-0.285714,0.2,0,0,-0.995493,-0.000370,-0.002362
2,2,2.427244,0.6,1.4,-0.050613,11.816697,-0.025700,-1.000000,0.884921,-0.009880,...,0,0,0.0,-0.142857,0.2,0,0,-0.992300,0.000000,0.000000
3,3,2.514830,1.0,1.0,0.151840,0.549909,-0.025704,-1.000000,0.884921,-0.009867,...,1,1,0.0,-0.142857,-0.3,0,0,-0.994307,0.129674,-0.001643
4,4,0.670406,0.8,0.4,0.079755,0.094374,-0.025546,0.000000,0.884921,-0.009801,...,0,0,0.0,-0.142857,3.5,0,0,-0.998246,-0.001737,-0.004067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,175336,-0.002355,0.0,-0.2,-0.242331,-0.148820,0.863309,0.000000,-0.115079,0.560186,...,0,0,0.0,3.000000,2.0,0,1,0.000919,0.997583,0.997562
175337,175337,0.754692,0.8,0.6,0.145706,0.172414,-0.025544,0.000000,0.884921,-0.009798,...,0,0,0.0,-0.285714,-0.3,0,1,-0.998246,-0.001737,-0.004067
175338,175338,-0.002355,0.0,-0.2,-0.242331,-0.148820,0.863309,0.000000,-0.115079,0.560186,...,0,0,0.0,0.000000,0.8,0,1,0.005291,1.004978,0.997961
175339,175339,-0.002355,0.0,-0.2,-0.242331,-0.148820,0.863309,0.000000,-0.115079,0.560186,...,0,0,0.0,3.857143,2.6,0,1,0.004434,1.000011,0.998558


In [6]:
train_df.drop('Unnamed: 0', axis = 1, inplace = True)

In [7]:
train_df.shape

(175341, 43)

## III. Feature Selection using Hybrid Technique

### 1. Correlation Matrix

In [12]:
def correlation_filter(df, threshold = 0.85):
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(np.bool_))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return df.drop(columns = to_drop, axis = 1)

In [15]:
df_filtered = correlation_filter(train_df.drop(columns=['label']))
print("Remaining features after correlation filter:", df_filtered.columns)
print("Length of columns:", len(df_filtered.columns))

Remaining features after correlation filter: Index(['dur', 'spkts', 'dpkts', 'rate', 'sttl', 'dttl', 'sload', 'dload',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'tcprtt',
       'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src',
       'ct_state_ttl', 'ct_dst_ltm', 'is_ftp_login', 'ct_flw_http_mthd',
       'proto_target_encoded', 'service_target_encoded',
       'state_target_encoded'],
      dtype='object')
Length of columns: 28


### 2. RFE with Random Forest

In [42]:
x = train_df[df_filtered.columns]
y = train_df['label']

In [44]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

In [52]:
feature_counts = [5, 10, 12, 15]

# Store results
results = []

# Iterate over the range of feature counts
for n_features in feature_counts:
    print(f"Testing with {n_features} features...")
    
    # Apply RFE with Random Forest
    rf = RandomForestClassifier(n_jobs = -1, class_weight = 'balanced', random_state = 42)
    rfe = RFE(estimator=rf, n_features_to_select=n_features)
    X_rfe_train = rfe.fit_transform(X_train, y_train)
    X_rfe_test = rfe.transform(X_test)

    # Get selected feature names
    selected_feature_names = df_filtered.columns[rfe.support_]
    
    # Train the model on selected features
    rf_final = RandomForestClassifier(n_jobs = -1, class_weight = 'balanced', random_state = 42)
    rf_final.fit(X_rfe_train, y_train)
    
    # Evaluate the model
    y_pred = rf_final.predict(X_rfe_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    # Store the results
    results.append({'n_features': n_features, 'accuracy': accuracy, 'features': selected_feature_names})
    print(f"Accuracy with {n_features} features: {accuracy:.2f}")

# Find the best number of features
best_result = max(results, key=lambda x: x['accuracy'])
print(f"Best number of features: {best_result['n_features']} with accuracy: {best_result['accuracy']:.2f}")
print(f"Features selected with best number of features: {', '.join(best_result['features'])}")

Testing with 5 features...
Accuracy with 5 features: 0.94
Testing with 10 features...
Accuracy with 10 features: 0.95
Testing with 12 features...
Accuracy with 12 features: 0.95
Testing with 15 features...
Accuracy with 15 features: 0.95
Best number of features: 15 with accuracy: 0.95
Features selected with best number of features: dur, rate, sttl, dttl, sload, dload, sinpkt, dinpkt, tcprtt, smean, dmean, ct_srv_src, ct_state_ttl, proto_target_encoded, service_target_encoded


## 3. RFE with XGBoost

In [51]:
feature_counts = [5, 10, 12, 15]
results = []

for n_features in feature_counts:
    print(f"Testing with {n_features} features...")
    
    xgb = XGBClassifier(n_jobs = -1, random_state = 42, eval_metric = 'logloss')
    rfe = RFE(estimator=xgb, n_features_to_select=n_features)
    X_rfe_train = rfe.fit_transform(X_train, y_train)
    X_rfe_test = rfe.transform(X_test)
    selected_feature_names = df_filtered.columns[rfe.support_]
    xgb_final = XGBClassifier(n_jobs = -1, random_state = 42, eval_metric = 'logloss')
    xgb_final.fit(X_rfe_train, y_train)
    y_pred = xgb_final.predict(X_rfe_test)
    accuracy = accuracy_score(y_test, y_pred)

    results.append({'n_features': n_features, 'accuracy': accuracy, 'features': selected_feature_names})
    print(f"Accuracy with {n_features} features: {accuracy:.2f}")

best_result = max(results, key=lambda x: x['accuracy'])
print(f"Best number of features: {best_result['n_features']} with accuracy: {best_result['accuracy']:.2f}")
print(f"Features selected with best number of features: {', '.join(best_result['features'])}")

Testing with 5 features...
Accuracy with 5 features: 0.95
Testing with 10 features...
Accuracy with 10 features: 0.94
Testing with 12 features...
Accuracy with 12 features: 0.94
Testing with 15 features...
Accuracy with 15 features: 0.94
Best number of features: 5 with accuracy: 0.95
Features selected with best number of features: dpkts, sttl, smean, ct_srv_src, proto_target_encoded


In [58]:
train_data = train_df[['dpkts', 'sttl', 'smean', 'ct_srv_src', 'proto_target_encoded']]
train_data['label'] = train_df['label']
train_data

Unnamed: 0,dpkts,sttl,smean,ct_srv_src,proto_target_encoded,label
0,0.2,-0.010417,-0.697674,-0.4,-0.995621,0
1,3.6,-1.000000,-0.488372,3.8,-0.995493,0
2,1.4,-1.000000,-0.627907,0.2,-0.992300,0
3,1.0,-1.000000,-0.488372,-0.4,-0.994307,0
4,0.4,0.000000,-0.465116,3.8,-0.998246,0
...,...,...,...,...,...,...
175336,-0.2,0.000000,-0.372093,1.9,0.000919,1
175337,0.6,0.000000,-0.255814,-0.4,-0.998246,1
175338,-0.2,0.000000,-0.372093,0.7,0.005291,1
175339,-0.2,0.000000,-0.372093,2.5,0.004434,1


In [59]:
train_data.to_csv('final_train.csv')