In [4]:
import pandas as pd
import numpy as np

In [2]:
# Load the encoded dataset from its CSV file into a DataFrame.
train_csv_path = "../../data/BinaryClassify/train_nsl_kdd_binary_encoded.csv"

df = pd.read_csv(train_csv_path)

In [5]:
# Function to calculate Gini Index
def calculate_gini_index(data):
    """Return the Gini impurity of a collection of class labels.

    The impurity is ``1 - sum(p_i ** 2)``, where ``p_i`` is the fraction of
    entries in ``data`` that belong to the i-th distinct value.

    Args:
        data: Sized, array-like collection of labels (anything accepted by
            ``len`` and ``np.unique``, e.g. a list, ndarray or pandas Series).

    Returns:
        The Gini impurity as a float. An empty collection yields ``0.0``.
    """
    total_count = len(data)
    if total_count == 0:
        # Without this guard the sum over an empty probability array is 0,
        # so the formula would report 1.0 for a set with no samples at all.
        return 0.0

    # Only the per-class counts matter; the class labels themselves are unused.
    _, counts = np.unique(data, return_counts=True)
    probabilities = counts / total_count
    return 1 - np.sum(probabilities ** 2)


In [6]:
# Function to calculate Weighted Gini Index
def calculate_weighted_gini_index(feature, target):
    """Return the size-weighted average Gini index of ``target`` split by ``feature``.

    ``target`` is partitioned by the value of ``feature``. Each partition's
    Gini index (from ``calculate_gini_index``) is weighted by the fraction of
    all rows that fall into that partition, and the weighted values are summed.

    Args:
        feature: Grouping key for ``target``; a pandas Series is aligned to
            ``target`` by index.
        target: pandas Series of class labels.

    Returns:
        The weighted Gini index, or ``0`` when ``target`` is empty.
    """
    total_count = len(target)
    if total_count == 0:
        return 0

    weighted_gini_index = 0
    # A single groupby pass replaces building one full-length boolean mask per
    # distinct feature value. dropna=False keeps rows whose feature value is
    # NaN as a group of their own; an equality mask never matches NaN, so those
    # rows would otherwise be left out and the weights would not sum to 1.
    for _, subset_target in target.groupby(feature, sort=False, dropna=False):
        weight = len(subset_target) / total_count
        weighted_gini_index += weight * calculate_gini_index(subset_target)

    return weighted_gini_index

In [7]:
# Set up the per-feature Gini Index calculation:
# the name of the label column ...
target_column = 'binaryoutcome'
# ... and the mapping that will hold feature name -> Gini Index value.
feature_gini_index_values = dict()

In [8]:
# Score every column except the target. The target is excluded by name rather
# than by position, so the loop stays correct even if the target is not the
# last column of the DataFrame.
for feature_column in df.columns.drop(target_column):
    feature_gini_index_values[feature_column] = calculate_weighted_gini_index(
        df[feature_column], df[target_column]
    )

In [9]:
# Order the (feature, Gini Index) pairs from lowest to highest Gini Index
sorted_results = list(feature_gini_index_values.items())
sorted_results.sort(key=lambda pair: pair[1])

In [10]:
# Turn the sorted (feature, value) pairs into a two-column DataFrame
result_columns = ['Feature', 'Gini Index']
result_df = pd.DataFrame(data=sorted_results, columns=result_columns)

In [11]:
# Display result_df as this cell's output
result_df

Unnamed: 0,Feature,Gini Index
0,src_bytes,0.063043
1,service,0.129299
2,dst_bytes,0.149816
3,flag,0.198776
4,diff_srv_rate,0.204492
5,same_srv_rate,0.205581
6,dst_host_srv_count,0.220301
7,dst_host_same_srv_rate,0.232535
8,dst_host_diff_srv_rate,0.248657
9,logged_in,0.260573


In [12]:
# Write the results table to an Excel workbook, leaving out the row index
excel_output_path = 'gini_index_results.xlsx'
result_df.to_excel(excel_writer=excel_output_path, index=False)