In [4]:
import numpy as np
import pandas as pd

In [15]:


def feat_sel_sim(data, measure='luca', p=1):
    # Extracting dimensions
    l = np.max(data[:, -1])  # Number of classes
    m = data.shape[0]        # Number of samples
    t = data.shape[1] - 1    # Number of features
    
    dataold = data.copy()
    tmp = []
    
    # Forming idealvec using arithmetic mean
    idealvec_s = np.zeros((l, t))
    for k in range(1, int(l) + 1):
        idealvec_s[k - 1, :] = np.mean(data[data[:, -1] == k, :t], axis=0)

    # Scaling data between [0, 1]
    data_v = data[:, :t]
    data_c = data[:, t]  # Labels
    mins_v = np.min(data_v, axis=0)
    data_v = data_v + np.tile(np.abs(mins_v), (m, 1))
    tmp = np.tile(np.abs(mins_v), (l, 1))
    idealvec_s = idealvec_s + tmp
    maxs_v = np.max(data_v, axis=0)
    data_v = data_v / maxs_v
    idealvec_s = idealvec_s / np.tile(maxs_v, (l, 1))
    data = np.column_stack((data_v, data_c))
    
    # Sample data
    datalearn_s = data[:, :t]

    # Similarities
    sim = np.zeros((t, m, int(l)))
    for j in range(m):
        for i in range(t):
            for k in range(int(l)):
                sim[i, j, k] = (1 - np.abs(idealvec_s[k, i] ** p - datalearn_s[j, i]) ** p) ** (1 / p)

    # Reduce the number of dimensions in sim
    sim = sim.reshape((t, m * int(l))).T

    # Possibility for two different entropy measures
    if measure == 'luca':
        # Modifying zero and one values of the similarity values to work with De Luca's entropy measure
        delta = 1e-10
        sim[sim == 0] = delta
        sim[sim == 1] = 1 - delta
        H = np.sum(-sim * np.log(sim) - (1 - sim) * np.log(1 - sim))
    elif measure == 'park':
        H = np.sum(np.sin(np.pi / 2 * sim) + np.sin(np.pi / 2 * (1 - sim)) - 1)

    # Find the maximum feature
    index_rem = np.argmax(H)

    # Removing the feature from the data
    data_mod = np.column_stack((dataold[:, :index_rem], dataold[:, index_rem + 1:]))

    return data_mod, index_rem

# Example usage with dummy data
# Replace this with loading your MSL KDD dataset
# data = np.loadtxt('path/to/your/msl_kdd_dataset.csv', delimiter=',')
# data_mod, index_rem = feat_sel_sim(data)


In [103]:
data = pd.read_csv("../data/BinaryClassify/train_nsl_kdd_binary_encoded.csv")
df = pd.read_csv("../data/BinaryClassify/train_nsl_kdd_binary_encoded.csv")

In [104]:
data=data.values.astype(int)

In [24]:
data_mod, index_rem = feat_sel_sim(data)

  data_v = data_v / maxs_v
  idealvec_s = idealvec_s / np.tile(maxs_v, (l, 1))


In [26]:
data_mod = data

In [28]:
for i in range(data.shape[1]):
    data_mod, index_rem = feat_sel_sim(data_mod)
    print(data_mod.shape)
    print(index_rem)

  data_v = data_v / maxs_v
  idealvec_s = idealvec_s / np.tile(maxs_v, (l, 1))


(125972, 42)
0
(125972, 41)
0
(125972, 40)
0
(125972, 39)
0
(125972, 38)
0
(125972, 37)
0
(125972, 36)
0
(125972, 35)
0
(125972, 34)
0
(125972, 33)
0


KeyboardInterrupt: 

In [27]:
data.shape

(125972, 43)

In [106]:
import numpy as np

def feat_sel_sim(data, measure='luca', p=1):
    # Check if p is provided, otherwise set default value
    if p is None:
        p = 1
    
    # Check if measure is provided, otherwise set default value
    if measure is None:
        measure = 'luca'

    # Get the number of classes, samples, and features
    l = np.max(data[:, -1])
    m = data.shape[0]
    t = data.shape[1] - 1

    dataold = data.copy()

    # Forming idealvec using arithmetic mean
    idealvec_s = np.zeros((l, t))
    for k in range(1, l+1):
        idealvec_s[k-1, :] = np.mean(data[data[:, -1] == k, 0:t], axis=0)

    # Scaling data between [0,1]
    data_v = data[:, 0:t]
    data_c = data[:, t]
    mins_v = np.min(data_v, axis=0)
    data_v = data_v + np.ones(data_v.shape) * np.abs(mins_v)

    tmp = np.tile(np.abs(mins_v), (l, 1))
    idealvec_s = idealvec_s + tmp

    maxs_v = np.max(data_v, axis=0)
    # Check if maxs_v contains zero values to avoid division by zero
    maxs_v_nonzero = np.where(maxs_v == 0, 1, maxs_v)
    data_v = data_v / maxs_v_nonzero  # Broadcasting corrected here

    # Replace NaN or Inf values with zeros
    data_v = np.nan_to_num(data_v)
    idealvec_s = idealvec_s / np.tile(maxs_v_nonzero, (l, 1))
    # Replace NaN or Inf values with zeros
    idealvec_s = np.nan_to_num(idealvec_s)

    data = np.column_stack((data_v, data_c))

    # Sample data
    datalearn_s = data[:, 0:t]

    # Similarities
    sim = np.zeros((t, m, l))
    for j in range(m):
        for i in range(t):
            for k in range(l):
                sim[i, j, k] = (1 - np.abs(idealvec_s[k, i]**p - datalearn_s[j, i])**p)**(1/p)

    # Reduce number of dimensions in sim
    sim = sim.reshape(t, m*l).T

    # Possibility for two different entropy measures
    if measure == 'luca':
        # Modifying zero and one values of the similarity values to work with De Luca's entropy measure
        delta = 1E-10
        sim[sim == 0] = delta
        sim[sim == 1] = 1 - delta
        H = np.sum(-sim * np.log(sim) - (1 - sim) * np.log(1 - sim))

    elif measure == 'park':
        H = np.sum(np.sin(np.pi/2 * sim) + np.sin(np.pi/2 * (1 - sim)) - 1)

    # Find maximum feature
    index_rem = np.argmax(H)

    # Removing feature from the data
    data_mod = np.column_stack((dataold[:, 0:index_rem], dataold[:, index_rem+1:]))

    return data_mod, index_rem

# Example usage:
# data = ...  # provide your data matrix
# data_mod, index_rem = feat_sel_sim(data, measure='luca', p=1)


In [58]:
measure='luca'
p=1

In [59]:
# Check if p is provided, otherwise set default value
if p is None:
    p = 1
    
    # Check if measure is provided, otherwise set default value
if measure is None:
    measure = 'luca'

In [79]:
    # Get the number of classes, samples, and features
l = np.max(data[:, -1])+1
m = data.shape[0]
t = data.shape[1] - 1

In [80]:
dataold = data.copy()
tmp = []

In [82]:
# Forming idealvec using arithmetic mean
idealvec_s = np.zeros((l, t))
for k in range(0, int(l) + 1):
    idealvec_s[k, :] = np.mean(data[data[:, -1] == k, :t], axis=0)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


IndexError: index 2 is out of bounds for axis 0 with size 2

In [78]:
idealvec_s

array([[1.68589899e+02, 1.16520151e+00, 2.70085682e+01, 8.60372427e+00,
        1.31334671e+04, 4.32974952e+03, 1.03947017e-04, 0.00000000e+00,
        1.48495738e-04, 2.30658430e-01, 1.38101037e-03, 7.10656054e-01,
        5.07083247e-01, 2.03439161e-03, 2.04924119e-03, 5.62932494e-01,
        2.22743607e-02, 6.08832527e-04, 7.49903478e-03, 0.00000000e+00,
        1.48495738e-05, 1.29636779e-02, 2.25182501e+01, 2.76860355e+01,
        5.98437825e-03, 4.88550979e-03, 4.13560631e-02, 4.07620801e-02,
        9.41047192e-01, 1.46416798e-02, 6.81446942e-02, 1.47431885e+02,
        1.90288215e+02, 6.25954085e-01, 7.72177838e-04, 3.75545722e-02,
        1.72255056e-03, 9.65222298e-04, 4.90035936e-04, 2.99961391e-02,
        1.86956134e-02, 2.03159247e+01]])

In [67]:
l

1

In [83]:
import numpy as np

def feat_sel_sim(data, measure='luca', p=1):
    # Check if p is provided, otherwise set default value
    if p is None:
        p = 1
    
    # Check if measure is provided, otherwise set default value
    if measure is None:
        measure = 'luca'

    l = int(np.max(data[:, -1])) + 1  # Increment by 1 for class label 0
    m = data.shape[0]
    t = data.shape[1] - 1

    dataold = data.copy()
    tmp = []

    # Forming idealvec using arithmetic mean
    idealvec_s = np.zeros((l, t))
    for k in range(l):
        idealvec_s[k, :] = np.mean(data[data[:, -1] == k, 0:t], axis=0)

    # Scaling data between [0, 1]
    data_v = data[:, 0:t]
    data_c = data[:, t]
    mins_v = np.min(data_v, axis=0)
    data_v = data_v + np.ones(data_v.shape) * np.abs(mins_v)

    tmp = np.tile(np.abs(mins_v), (l, 1))
    idealvec_s = idealvec_s + tmp

    maxs_v = np.max(data_v, axis=0)
    maxs_v_nonzero = np.where(maxs_v == 0, 1, maxs_v)
    data_v = data_v / maxs_v_nonzero

    data_v = np.nan_to_num(data_v)
    idealvec_s = idealvec_s / np.tile(maxs_v_nonzero, (l, 1))
    idealvec_s = np.nan_to_num(idealvec_s)

    data = np.column_stack((data_v, data_c))

    # Sample data
    datalearn_s = data[:, 0:t]

    # Similarities
    sim = np.zeros((t, m, l))
    for j in range(m):
        for i in range(t):
            for k in range(l):
                sim[i, j, k] = (1 - np.abs(idealvec_s[k, i]**p - datalearn_s[j, i])**p)**(1/p)

    # Reduce number of dimensions in sim
    sim = sim.reshape(t, m*l).T

    # Possibility for two different entropy measures
    if measure == 'luca':
        delta = 1E-10
        sim[sim == 0] = delta
        sim[sim == 1] = 1 - delta
        H = np.sum(-sim * np.log(sim) - (1 - sim) * np.log(1 - sim), axis=1)

    elif measure == 'park':
        H = np.sum(np.sin(np.pi/2 * sim) + np.sin(np.pi/2 * (1 - sim)) - 1, axis=1)

    # Find maximum feature
    index_rem = np.argmax(H)

    # Removing feature from the data
    data_mod = np.column_stack((dataold[:, 0:index_rem], dataold[:, index_rem+1:]))

    return data_mod, index_rem

# Example usage:
# data = ...  # provide your data matrix with class labels 0 and 1
# data_mod, index_rem = feat_sel_sim(data, measure='luca', p=1)


In [84]:
data_mod, index_rem = feat_sel_sim(data, measure='luca', p=1)

In [85]:
index_rem

808

In [101]:
import numpy as np

def feat_sel_sim(data, measure='luca', p=1):
    """
    Feature selection using similarity measure and fuzzy entropy measures.

    Args:
        data (numpy.ndarray): Data matrix containing features and class values.
        measure (str, optional): Fuzzy entropy measure to use.
                                  Options are 'luca' or 'park'. Defaults to 'luca'.
        p (float, optional): Parameter of Lukasiewicz similarity measure.
                              Defaults to 1.

    Returns:
        tuple: (data_mod, index_rem)
            - data_mod (numpy.ndarray): Data without the removed feature.
            - index_rem (int): Index of the removed feature in the original data.
    """

    l = np.max(data[:, -1])  # Number of classes
    m = data.shape[0]        # Number of samples
    t = data.shape[1] - 1     # Number of features
    dataold = data.copy()

    # Form ideal vectors using arithmetic means for each class
    idealvec = np.zeros((l, t))
    for k in range(l):
        idealvec[k, :] = np.mean(data[data[:, -1] == k, :-1], axis=0)

    # Scale data between [0, 1]
    data_v = data[:, :-1]
    data_c = data[:, -1]  # Labels
    mins_v = np.min(data_v, axis=0)
    data_v = data_v + np.ones_like(data_v) * mins_v
    maxs_v = np.max(data_v, axis=0)
    maxs_v = np.max(data_v, axis=0)
    for i in range(t):
        if maxs_v[i] == 0:  # Check for zero
            data_v[:, i] = 0  # Set entire column to zero
            idealvec[:, i] = 0  # Set corresponding values in idealvec to zero
        else:
            data_v[:, i] /= maxs_v[i]
            idealvec[:, i] /= maxs_v[i]
    data_v = data_v / maxs_v
    idealvec = idealvec / np.tile(maxs_v, (l, 1))
    data = np.hstack((data_v, data_c[:, np.newaxis]))

    # Calculate similarities
    sim = np.zeros((t, m, l))
    for j in range(m):
        for i in range(t):
            for k in range(l):
                sim[i, j, k] = (1 - np.abs(idealvec[k, i]**p - data[j, i]**p)**p)**(1/p)

    sim = sim.reshape(t, m * l).T

    # Calculate fuzzy entropy
    if measure == 'luca':
        delta = 1e-10
        sim[sim == 0] = delta
        sim[sim == 1] = 1 - delta
        H = -np.sum(sim * np.log(sim) + (1 - sim) * np.log(1 - sim), axis=1)
    elif measure == 'park':
        H = np.sum(np.sin(np.pi / 2 * sim) + np.sin(np.pi / 2 * (1 - sim)) - 1, axis=1)

    # Find the feature with maximum entropy (corrected calculation)
    index_rem = np.argmax(H)  # Directly find the index of the maximum element

    # Remove the feature from the data (corrected indexing)
    data_mod = np.hstack((dataold[:, :index_rem], dataold[:, index_rem + 1:]))

    return data_mod, index_rem


In [105]:
data_mod, index_rem = feat_sel_sim(data, measure='luca', p=1)

TypeError: 'numpy.float64' object cannot be interpreted as an integer

In [95]:
index_rem

0

In [96]:
data_mod, index_rem = feat_sel_sim(data_mod, measure='luca', p=1)

  data_v = data_v / maxs_v
  idealvec = idealvec / np.tile(maxs_v, (l, 1))


In [113]:
import numpy as np

def fuzzy_entropy_feature_selection(data, labels, similarity_measure='euclidean'):
    """
    Performs feature selection using fuzzy entropy and similarity measures.

    Args:
        data: Data matrix (features as rows, samples as columns).
        labels: Class labels for each sample.
        similarity_measure: Similarity measure to use ('euclidean' or 'cosine').

    Returns:
        Indices of selected features with the lowest fuzzy entropy.
    """

    classes = np.unique(labels)
    num_features = data.shape[0]

    # Calculate ideal vectors for each class
    ideal_vectors = []
    for c in classes:
        class_data = data[:, labels == c]
        ideal_vector = np.mean(class_data, axis=1)  # You can adjust this based on your needs
        ideal_vectors.append(ideal_vector)

    # Calculate similarity between each feature vector and the ideal vectors
    similarity_matrix = np.zeros((num_features, len(classes)))
    for i in range(num_features):  # Iterate over features (rows)
        for j, ideal_vector in enumerate(ideal_vectors):
            if similarity_measure == 'euclidean':
                similarity = 1 / (1 + np.linalg.norm(data[i, :] - ideal_vector))
            elif similarity_measure == 'cosine':
                similarity = np.dot(data[i, :], ideal_vector.T) / (np.linalg.norm(data[i, :]) * np.linalg.norm(ideal_vector))


            else:
                raise ValueError("Invalid similarity measure")
            similarity_matrix[i, j] = similarity

    # Calculate fuzzy entropy for each feature
    fuzzy_entropy = -np.sum(similarity_matrix * np.log2(similarity_matrix), axis=1)

    # Rank features by fuzzy entropy
    feature_ranking = np.argsort(fuzzy_entropy)

    return feature_ranking

# Example usage:
data = np.random.rand(10, 50)  # Example data matrix
labels = np.random.randint(0, 2, 50)  # Example class labels

selected_features = fuzzy_entropy_feature_selection(data, labels, similarity_measure='cosine')
print("Selected features:", selected_features)


ValueError: shapes (50,) and (10,) not aligned: 50 (dim 0) != 10 (dim 0)

In [114]:
np.random.seed(42)  # Set seed for reproducibility

num_samples = 20  # Number of samples
num_features = 5  # Number of features
num_classes = 2  # Number of classes

# Generate random feature values
data = np.random.randint(low=0, high=10, size=(num_samples, num_features))

# Assign class labels randomly
labels = np.random.randint(low=0, high=num_classes, size=num_samples)

# Optional: Add some structure (e.g., make one feature perfectly correlated with labels)
data[:, 2] = labels  # Set feature 2 to match class labels

# Combine features and labels into a single NumPy array
data = np.hstack((data, labels[:, np.newaxis]))

# Print the generated array to verify its structure
print(data)

[[6 3 1 4 6 1]
 [9 2 1 7 4 1]
 [3 7 0 2 5 0]
 [4 1 1 5 1 1]
 [4 0 0 5 8 0]
 [0 9 0 6 3 0]
 [8 2 1 2 6 1]
 [4 8 1 1 3 1]
 [8 1 0 8 9 0]
 [4 1 0 6 7 0]
 [2 0 1 1 7 1]
 [3 1 1 5 9 1]
 [3 5 1 9 1 1]
 [9 3 0 6 8 0]
 [7 4 0 4 7 0]
 [9 8 0 0 8 0]
 [6 8 0 0 7 0]
 [7 2 0 7 2 0]
 [2 0 0 9 6 0]
 [9 8 1 8 7 1]]


In [125]:
import numpy as np

def feat_sel_sim(data, measure='luca', p=1):
    # Output initialization
    data_mod = None
    index_rem = None
    
    if p is None:
        p = 1

    if measure is None:
        measure = 'luca'

    l = np.max(data[:, -1])  # #-classes
    m = data.shape[0]        # #-samples
    t = data.shape[1] - 1    # #-features

    dataold = np.copy(data)
    tmp = []

    # Forming idealvec using arithmetic mean
    idealvec_s = np.zeros((l, t))
    for k in range(1, l + 1):
        idealvec_s[k-1, :] = np.mean(data[data[:, -1] == k, :t], axis=0)

    # Scaling data between [0,1]
    data_v = data[:, :t]
    data_c = data[:, t]  # labels
    mins_v = np.min(data_v, axis=0)
    ones = np.ones(data_v.shape)
    data_v = data_v + ones * np.abs(np.diag(mins_v))
    tmp = np.tile(np.abs(mins_v), (l, 1))
    idealvec_s = idealvec_s + tmp
    maxs_v = np.max(data_v, axis=0)
    data_v = data_v  / maxs_v
    idealvec_s = idealvec_s / np.tile(maxs_v, (l, 1))
    data = np.column_stack((data_v, data_c))

    # Sample data
    datalearn_s = data[:, :t]

    # Similarities
    sim = np.zeros((t, m, l))
    for j in range(m):
        for i in range(t):
            for k in range(l):
                sim[i, j, k] = (1 - np.abs(idealvec_s[k, i] ** p - datalearn_s[j, i]) ** p) ** (1 / p)

    # Reduce the number of dimensions in sim
    sim = np.reshape(sim, (t, m * l), order='F')

    # Possibility for two different entropy measures
    if measure == 'luca':
        # Modifying zero and one values of the similarity values to work with De Luca's entropy measure
        delta = 1E-10
        sim[sim == 0] = delta
        sim[sim == 1] = 1 - delta
        H = np.sum(-sim * np.log(sim) - (1 - sim) * np.log(1 - sim))

    elif measure == 'park':
        H = np.sum(np.sin(np.pi / 2 * sim) + np.sin(np.pi / 2 * (1 - sim)) - 1)

    # Find the maximum feature
    index_rem = np.argmax(H)

    # Removing feature from the data
    data_mod = np.column_stack((dataold[:, :index_rem], dataold[:, index_rem + 1:]))
    
    return data_mod, index_rem


In [120]:
data[:,-1] = data[:,-1]+1

In [126]:
data_mod, index_rem = feat_sel_sim(data, measure='luca', p=1)

ValueError: operands could not be broadcast together with shapes (20,5) (5,5) 