In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif

In [2]:
# Read the already-encoded training data from CSV into a DataFrame.

df = pd.read_csv(
    "../../data/BinaryClassify/train_nsl_kdd_binary_encoded.csv"
)

In [3]:
# Normalised mutual-information score between one feature and the target.
def calculate_gain_ratio(feature, target, random_state=0):
    """Score a feature by its mutual information with the target, normalised.

    The returned value is ``MI(feature; target) / MI(target; target)``, where
    both terms are estimated with ``mutual_info_classif``. The denominator is
    the target's mutual information with itself, i.e. an estimate of how much
    information the target carries in total.

    NOTE(review): despite the name, this is not the classic (Quinlan) gain
    ratio, which divides the information gain by the entropy of the
    *feature* (split information). The original normalisation by the target
    is kept here so existing rankings stay comparable -- confirm which
    metric is actually intended.

    Parameters
    ----------
    feature : pandas.Series
        Values of a single feature column.
    target : pandas.Series
        Class labels, aligned row-for-row with ``feature``.
    random_state : int, RandomState instance or None, default=0
        Forwarded to ``mutual_info_classif``. That estimator adds a small
        amount of random noise to inputs it treats as continuous, so without
        a fixed seed the scores (and therefore the ranking) change between
        runs. Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    float
        The normalised score, or ``0.0`` when the target's self-information
        estimate is zero (e.g. a constant target), to avoid dividing by zero.
    """
    # mutual_info_classif expects a 2-D (n_samples, n_features) matrix, so
    # each single column is reshaped to (n_samples, 1).
    feature_matrix = feature.values.reshape(-1, 1)
    target_matrix = target.values.reshape(-1, 1)

    # Each call returns one estimate per column; with a single column the
    # sum just unwraps that one value.
    mi_feature = float(
        sum(mutual_info_classif(feature_matrix, target, random_state=random_state))
    )
    mi_target = float(
        sum(mutual_info_classif(target_matrix, target, random_state=random_state))
    )

    if mi_target == 0:
        return 0.0  # Avoid division by zero
    return mi_feature / mi_target

In [6]:
# Name of the label column, and an empty mapping that will collect one
# score per feature (feature name -> score).
target_column = 'binaryoutcome'
feature_gain_ratio_values = dict()

In [7]:
# Score every column except the target. The target is excluded by name
# rather than by position, so the loop stays correct even when the target
# is not the last column of the frame.
# NOTE(review): the target's self-information is re-estimated inside
# calculate_gain_ratio for every feature; it is loop-invariant and could be
# computed once if this becomes slow.
for feature_column in df.columns.drop(target_column):
    feature_gain_ratio_values[feature_column] = calculate_gain_ratio(
        df[feature_column], df[target_column]
    )


In [8]:
# Rank the (feature, score) pairs from highest score to lowest.
sorted_results = list(feature_gain_ratio_values.items())
sorted_results.sort(key=lambda pair: pair[1], reverse=True)


In [9]:
# Display the ranked (feature, score) pairs, highest score first.
sorted_results

[('src_bytes', 0.7944693425165142),
 ('service', 0.6584089850320466),
 ('dst_bytes', 0.6186411087678977),
 ('flag', 0.5169036936655363),
 ('same_srv_rate', 0.5158688294609312),
 ('diff_srv_rate', 0.5068708800860247),
 ('dst_host_srv_count', 0.47149855098181587),
 ('dst_host_same_srv_rate', 0.4379657859498922),
 ('logged_in', 0.4113786546928319),
 ('dst_host_serror_rate', 0.40050322354641754),
 ('dst_host_diff_srv_rate', 0.39894913516931957),
 ('dst_host_srv_serror_rate', 0.3956625786120282),
 ('serror_rate', 0.390384089807984),
 ('srv_serror_rate', 0.37662815841154895),
 ('count', 0.3747829071696078),
 ('dst_host_srv_diff_host_rate', 0.262952008032936),
 ('level', 0.2164044365929448),
 ('dst_host_count', 0.20504653315654717),
 ('dst_host_same_src_port_rate', 0.18301484932285284),
 ('srv_diff_host_rate', 0.1363682903391294),
 ('srv_count', 0.09212899461821034),
 ('dst_host_srv_rerror_rate', 0.08703517890087313),
 ('protocol_type', 0.0760603525198568),
 ('rerror_rate', 0.0567030068986512

In [10]:
# Turn the ranked (feature, score) pairs into a two-column table.
result_df = pd.DataFrame(
    data=sorted_results,
    columns=['Feature', 'Gain Ratio'],
)

In [11]:
# Write the ranking to an Excel workbook in the working directory,
# leaving out the DataFrame's row index.
result_df.to_excel(
    excel_writer='gain_ratio_results.xlsx',
    index=False,
)

In [12]:
# Display the ranking table (columns 'Feature' and 'Gain Ratio').
result_df

Unnamed: 0,Feature,Gain Ratio
0,src_bytes,0.794469
1,service,0.658409
2,dst_bytes,0.618641
3,flag,0.516904
4,same_srv_rate,0.515869
5,diff_srv_rate,0.506871
6,dst_host_srv_count,0.471499
7,dst_host_same_srv_rate,0.437966
8,logged_in,0.411379
9,dst_host_serror_rate,0.400503
