In [9]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

In [8]:
def passi_luuka_feat_sel(data, measure='luca', p=1):
    """Score every feature with a similarity-based entropy.

    Steps:

    1. Min-max scale the feature columns to [0, 1].
    2. For each class, form an "ideal vector": the per-feature mean of the
       samples that belong to that class.
    3. Compare every sample with the ideal vector of its *own* class using
       ``(1 - |ideal**p - x**p|) ** (1 / p)`` and sum those similarities per
       class and per feature.
    4. For each feature, turn the per-class sums into shares of their total
       and return the Shannon entropy (natural log) of those shares.

    With exactly two classes step 4 is the binary entropy of ``A / (A + B)``,
    where ``A`` and ``B`` are the similarity sums of class 0 and class 1.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features + 1)
        Numeric matrix whose last column holds the class label.  The number
        of classes is taken to be ``max(label) + 1``, so labels are expected
        to be the integers ``0 .. K-1``.  The input is not modified.
    measure : str, default 'luca'
        Accepted for backward compatibility; the value is currently ignored.
    p : int or float, default 1
        Exponent of the similarity measure.  Must be non-zero because
        ``1 / p`` is evaluated.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        Entropy score of each feature, in column order.
    """
    data = np.asarray(data, dtype=float)
    num_features = data.shape[1] - 1
    labels = data[:, -1]
    num_classes = int(np.max(labels) + 1)

    # Scale features to [0, 1] so that every similarity stays within [0, 1].
    features = MinMaxScaler().fit_transform(data[:, :num_features])

    # Only the per-class column sums of the similarities are needed, so they
    # are accumulated directly instead of materialising a
    # (classes x samples x features) tensor.
    class_totals = np.zeros((num_classes, num_features))
    for k in range(num_classes):
        members = features[labels == k]
        if members.shape[0] == 0:
            # A label value with no samples contributes nothing; skipping it
            # also avoids the NaN mean of an empty slice.
            continue
        ideal = members.mean(axis=0)
        similarity = (1 - np.abs(ideal ** p - members ** p)) ** (1 / p)
        class_totals[k] = similarity.sum(axis=0)

    shares = class_totals / class_totals.sum(axis=0)

    # Entropy with the convention 0 * log(0) = 0; the masked-out entries may
    # evaluate log(0), hence the suppressed floating-point warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        terms = np.where(shares > 0, shares * np.log(shares), 0.0)

    return -terms.sum(axis=0)


In [10]:
# Load the encoded training table (NSL-KDD, binary labels, going by the file name).
train_csv_path = "../../data/BinaryClassify/train_nsl_kdd_binary_encoded.csv"
df = pd.read_csv(train_csv_path)

In [11]:
# Materialise the frame as an independent NumPy matrix; the scoring function
# reads the last column as the class label.
data = df.to_numpy(copy=True)

In [20]:
# Score every feature column of `data` with the entropy measure defined above,
# using its default arguments (measure='luca', p=1).
p_luuka_entropy = passi_luuka_feat_sel(data)

In [22]:
# Map each feature name to its entropy score.  Scores are positional: entry i
# of `p_luuka_entropy` belongs to column i of `df`; the last column is the
# target and has no score.
feature_names = df.columns[:-1]
assert len(feature_names) == len(p_luuka_entropy), "expected one entropy score per feature column"
feature_entropy_values = dict(zip(feature_names, p_luuka_entropy))

In [23]:
# Rank the features by entropy, lowest first (sorted() is ascending by default).
sorted_results = sorted(feature_entropy_values.items(), key=lambda pair: pair[1])


In [24]:
# Display the ranked (feature, entropy) pairs.
sorted_results

[('dst_host_srv_serror_rate', 0.6220370297819582),
 ('srv_serror_rate', 0.6240198676675358),
 ('serror_rate', 0.6258052872785959),
 ('dst_host_serror_rate', 0.6263134871652549),
 ('same_srv_rate', 0.6638089566073053),
 ('srv_rerror_rate', 0.6686237522545229),
 ('dst_host_srv_rerror_rate', 0.6687463874678481),
 ('rerror_rate', 0.6692347132348808),
 ('dst_host_rerror_rate', 0.6718650027462356),
 ('logged_in', 0.6802573250080093),
 ('count', 0.6820047789010824),
 ('dst_host_same_src_port_rate', 0.683461201288031),
 ('service', 0.6845612765787346),
 ('flag', 0.6864304149179581),
 ('dst_host_diff_srv_rate', 0.6869653573503149),
 ('diff_srv_rate', 0.688718657889495),
 ('dst_host_srv_diff_host_rate', 0.6889881078118948),
 ('level', 0.689044575746235),
 ('wrong_fragment', 0.689507464492956),
 ('duration', 0.6903243925094846),
 ('land', 0.6907398215255411),
 ('src_bytes', 0.6907503751479369),
 ('dst_bytes', 0.6907520357597836),
 ('num_outbound_cmds', 0.6907538405042426),
 ('is_host_login', 0.69

In [25]:
# Tabulate the ranked (feature, entropy) pairs, one row per feature.
result_columns = ['Feature', 'PassiLuukaEntropy']
result_df = pd.DataFrame(sorted_results, columns=result_columns)

In [26]:
# Display the ranking table.
result_df

Unnamed: 0,Feature,PassiLuukaEntropy
0,dst_host_srv_serror_rate,0.622037
1,srv_serror_rate,0.62402
2,serror_rate,0.625805
3,dst_host_serror_rate,0.626313
4,same_srv_rate,0.663809
5,srv_rerror_rate,0.668624
6,dst_host_srv_rerror_rate,0.668746
7,rerror_rate,0.669235
8,dst_host_rerror_rate,0.671865
9,logged_in,0.680257


In [27]:
# Persist the ranking as an Excel workbook, leaving out the row index.
output_path = 'passi_luuka_results.xlsx'
result_df.to_excel(output_path, index=False)