In [1]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
import numpy as np

In [2]:
# Read the pre-encoded binary-classification training set from CSV.
df = pd.read_csv(
    filepath_or_buffer="../../data/BinaryClassify/train_nsl_kdd_binary_encoded.csv",
)

In [4]:
# Function to calculate Entropy
def calculate_entropy(data):
    """Return the Shannon entropy, in bits, of the values in ``data``.

    The distribution is the empirical one: each distinct value's probability
    is its occurrence count divided by ``len(data)``.
    """
    occurrences = np.unique(data, return_counts=True)[1]
    p = occurrences / len(data)
    # log2 gives the result in bits.
    return -np.sum(p * np.log2(p))

In [5]:
# Function to calculate Symmetric Uncertainty between a feature and the target variable
def calculate_symmetric_uncertainty(feature, target, random_state=None):
    """Symmetric uncertainty between one feature and the target.

    Computes ``SU = 2 * I(feature; target) / (H(feature) + H(target))``.

    Parameters
    ----------
    feature : pandas.Series
        Feature column. Its ``.values`` are reshaped to a single-column 2-D
        array, as required by ``mutual_info_classif``.
    target : array-like
        Class labels, one per entry of ``feature``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``mutual_info_classif``. Pass an int to make the
        mutual-information estimate reproducible between runs.

    Returns
    -------
    float
        The symmetric uncertainty, or ``0.0`` when both entropies are zero
        (the ratio is undefined in that case).
    """
    entropy_feature = calculate_entropy(feature)
    entropy_target = calculate_entropy(target)
    total_entropy = entropy_feature + entropy_target

    # Guard first: no point running the estimator when the ratio is undefined.
    if total_entropy == 0:
        return 0.0  # Avoid division by zero

    # mutual_info_classif returns one estimate per column; we pass exactly one.
    mutual_info_nats = mutual_info_classif(
        feature.values.reshape(-1, 1), target, random_state=random_state
    )[0]

    # scikit-learn reports mutual information in nats, whereas
    # calculate_entropy uses log2 (bits). Convert so numerator and
    # denominator share the same unit.
    mutual_info_bits = mutual_info_nats / np.log(2)

    # NOTE(review): the entropies are computed from unique-value counts (i.e.
    # the feature is treated as discrete), while mutual_info_classif's default
    # discrete_features='auto' treats dense input as continuous. Confirm this
    # matches how the encoded columns should be interpreted.
    return 2 * mutual_info_bits / total_entropy


In [6]:
# Name the label column and start with an empty feature -> score mapping
target_column = 'binaryoutcome'
feature_su_values = dict()


In [7]:
# Score every column against the target. The target is excluded by name rather
# than by position (df.columns[:-1]), so the result stays correct even when
# the target is not the last column of the frame.
for feature_column in df.columns:
    if feature_column == target_column:
        continue  # Exclude the target column
    su_value = calculate_symmetric_uncertainty(df[feature_column], df[target_column])
    feature_su_values[feature_column] = su_value

In [8]:
# Rank (feature, score) pairs from highest to lowest score
sorted_results = list(feature_su_values.items())
sorted_results.sort(key=lambda pair: pair[1], reverse=True)

In [9]:
# Tabulate the ranked pairs: one row per feature, with its score
result_df = pd.DataFrame(
    data=sorted_results,
    columns=['Feature', 'Symmetric Uncertainty'],
)

In [10]:
# Write the ranking to an Excel workbook, omitting the row index
result_df.to_excel(
    excel_writer='symmetric_uncertainty_results.xlsx',
    index=False,
)