In [1]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
import numpy as np

In [2]:
#loading encoded data from csv

df = pd.read_csv("../../data/BinaryClassify/train_nsl_kdd_binary_encoded.csv")

In [3]:
# Function to calculate Entropy
def calculate_entropy(data):
    _, counts = np.unique(data, return_counts=True)
    probabilities = counts / len(data)
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy

In [4]:
# Function to calculate Information Gain between a feature and the target variable
def calculate_information_gain(feature, target):
    entropy_target = calculate_entropy(target)
    entropy_feature = calculate_entropy(feature)
    mutual_info = mutual_info_classif(feature.values.reshape(-1, 1), target)
    
    information_gain = entropy_target - entropy_feature + mutual_info

    return information_gain[0]  # Extract the value from the array


In [5]:
# Calculate Information Gain for each feature
feature_information_gain_values = {}
target_column = 'binaryoutcome'

In [6]:
for feature_column in df.columns[:-1]:  # Exclude the target column
    information_gain_value = calculate_information_gain(df[feature_column], df[target_column])
    feature_information_gain_values[feature_column] = information_gain_value

In [7]:
# Sort the results in descending order
sorted_results = sorted(feature_information_gain_values.items(), key=lambda x: x[1], reverse=True)

In [8]:
# Create a DataFrame from the sorted results
result_df = pd.DataFrame(sorted_results, columns=['Feature', 'Information Gain'])

In [9]:
# Save the results to an Excel file
result_df.to_excel('information_gain_results.xlsx', index=False)

In [10]:
result_df

Unnamed: 0,Feature,Information Gain
0,is_host_login,0.997581
1,num_outbound_cmds,0.996547
2,urgent,0.995402
3,land,0.99395
4,num_shells,0.992347
5,su_attempted,0.989938
6,num_failed_logins,0.985044
7,root_shell,0.983342
8,num_file_creations,0.968503
9,num_access_files,0.965693


## Corrected INFO GAIN

In [1]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
import numpy as np

In [2]:
# Function to calculate Entropy
def calculate_entropy(data):
    _, counts = np.unique(data, return_counts=True)
    probabilities = counts / len(data)
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy

In [3]:
# Function to calculate Conditional Entropy
def calculate_conditional_entropy(feature, target):
    feature_values = np.unique(feature)
    weighted_feature_entropies = np.array([np.sum(feature == value) / len(feature) * calculate_entropy(target[feature == value]) for value in feature_values])
    return np.sum(weighted_feature_entropies)


In [4]:
# Function to calculate Information Gain between a feature and the target variable
def calculate_information_gain(feature, target):
    entropy_target = calculate_entropy(target)
    conditional_entropy = calculate_conditional_entropy(feature, target)
    
    information_gain = entropy_target - conditional_entropy

    return information_gain


In [5]:
#loading encoded data from csv

df = pd.read_csv("../../data/BinaryClassify/train_nsl_kdd_binary_encoded.csv")

In [10]:
# Calculate Information Gain for each feature
feature_information_gain_values = {}
target_column = 'binaryoutcome'

In [11]:
for feature_column in df.columns[:-1]:  # Exclude the target column
    information_gain_value = calculate_information_gain(df[feature_column], df[target_column])
    feature_information_gain_values[feature_column] = information_gain_value

In [12]:
# Sort the results in descending order
sorted_results = sorted(feature_information_gain_values.items(), key=lambda x: x[1], reverse=True)

In [13]:
# Create a DataFrame from the sorted results
result_df = pd.DataFrame(sorted_results, columns=['Feature', 'Information Gain'])

In [14]:
result_df

Unnamed: 0,Feature,Information Gain
0,src_bytes,0.822395
1,service,0.671567
2,dst_bytes,0.642459
3,diff_srv_rate,0.519832
4,flag,0.519387
5,same_srv_rate,0.510794
6,dst_host_srv_count,0.47801
7,dst_host_same_srv_rate,0.438669
8,dst_host_diff_srv_rate,0.41175
9,dst_host_serror_rate,0.406502


In [15]:
# Save the results to an Excel file
result_df.to_excel('information_gain_results.xlsx', index=False)