In [1]:
# inputs:
#    X: pandas.DataFrame, features
#    y: pandas.Series, target variable
#    K: number of features to select
def min_redun_max_relev(X, y, k):
    # compute F-statistics and initialize correlation matrix
    F = pd.Series(f_regression(X, y)[0], index = X.columns)
    corr = pd.DataFrame(.00001, index = X.columns, columns = X.columns)

    # initialize list of selected features and list of excluded features
    selected = []
    not_selected = X.columns.to_list()

    # initialize list of feature scores
    scores = []
    scores_ith = []

    redundancy = []
    relevancy = []
    # repeat K times
    for i in range(k):
        # compute (absolute) correlations between the last selected feature and all the (currently) excluded features
        if i > 0:
            last_selected = selected[-1]
            corr.loc[not_selected, last_selected] = X[not_selected].corrwith(X[last_selected]).abs().clip(.00001)

        # compute FCQ score for all the (currently) excluded features (this is Formula 2)
        score = F.loc[not_selected] / corr.loc[not_selected, selected].mean(axis = 1).fillna(.00001)
        relevancy.append(F.loc[not_selected])
        redundancy.append(corr.loc[not_selected, selected].mean(axis = 1).fillna(.00001))

        scores_ith.append(score)
        # find best feature, add it to selected and remove it from not_selected
        best = score.index[score.argmax()]
        selected.append(best)
        not_selected.remove(best)

        # add feature name and score to list of feature scores
        scores.append((best, score[best]))
        
    # create DataFrame of feature scores
    score_df = pd.DataFrame(scores, columns=['mRMR', 'Highest_score_each_iteration'])
    return scores,selected,scores_ith,score_df,relevancy,redundancy
    

In [2]:
import pandas as pd
import numpy as np
import time

from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_csv('bodyPerformance.csv')

In [9]:
df.shape

(13392, 12)

In [5]:
df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B


In [11]:
label_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

df['class'] = df['class'].replace(label_mapping)

label_mapping = {'M': 1, 'F': 2}

df['gender'] = df['gender'].replace(label_mapping)

In [19]:
# df = df.iloc[:,1:]

In [8]:
df = df.drop_duplicates()

In [12]:
df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,1,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,2
1,25.0,1,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,0
2,31.0,1,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,2
3,32.0,1,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,1
4,28.0,1,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,1


In [13]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# len_row, len_col = X.shape
# print('Row: ',len_row)

# X_col = X.columns.tolist()

# # Create a StandardScaler object
# scaler = StandardScaler()

# # Fit the scaler to the dataset
# scaler.fit(X)

# # Transform the dataset using the scaler
# X_std = scaler.transform(X)

# X_std = pd.DataFrame(data=X_std, columns=X_col)

In [23]:
X_std.head()

Unnamed: 0,Mean,Variance,Median,Mode,Skewness,Kurtosis,Energy,Entropy,MinimalGrayLevel,MaximalGrayLevel,...,InverseDifferenceMoment,SumAverage,SumVariance,SumEntropy,Entropy.1,DifferenceVariance,DifferenceEntropy,Information1,Information2,MaximalCorrelationCoefficient
0,-0.180999,-0.538099,-0.199735,-0.232269,0.449734,-0.32282,-0.341649,0.163847,0.084815,0.746944,...,-0.857312,-0.183544,-0.754107,-0.073805,0.509786,-0.737144,0.979899,1.509808,-2.788799,-1.190062
1,-0.127888,-0.607917,-0.15952,-0.232269,0.565162,-0.253321,-0.276182,0.051896,-0.878989,0.697722,...,-0.709831,-0.13147,-0.75201,-0.095683,0.35066,-0.637912,0.676104,1.29155,-1.99086,-0.65138
2,0.381789,-0.730274,0.323061,0.2682,0.147512,-0.095744,-0.146361,-0.164413,0.239023,0.746944,...,-0.479718,0.379138,-0.823882,-0.243414,0.087169,-0.488147,0.322242,1.006787,-1.201779,-0.641203
3,0.729801,-0.735365,0.765427,0.730172,-0.397183,-0.181066,-0.169533,-0.217264,1.279931,-0.877382,...,-0.591242,0.729276,-0.769401,-0.316136,0.078413,-0.480788,0.338579,1.166145,-1.749946,0.176481
4,-0.010595,-0.511046,-0.038875,-0.116776,0.084859,-0.28408,-0.376526,0.185699,-0.107946,0.746944,...,-0.467975,-0.012146,-0.548011,0.208174,0.25648,-0.45079,0.209389,0.454855,-0.023527,0.175982


- Dataset train = 70%
- Dataset test = 30%

In [27]:
print(X_std[['Variance', 'Mean']])

     Variance      Mean
0   -0.538099 -0.180999
1   -0.607917 -0.127888
2   -0.730274  0.381789
3   -0.735365  0.729801
4   -0.511046 -0.010595
..        ...       ...
115  1.230429  0.544807
116  0.287835  0.365655
117  1.106398  0.475829
118  1.237495  0.592479
119 -0.157495  0.217006

[120 rows x 2 columns]


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# Define the range of values for k
k_values = range(1, len_col+1)

# Initialize dictionaries to store results
accs = {}
selected_each_k = {}
timer = {}

for k in k_values:
    scores, selected, scores_ith, score_df, relevancy, redundancy = min_redun_max_relev(X, y, k)
    selected_each_k[f'k = {k}'] = selected
    
    # Remove all columns from X that are not selected
    X_selected = X_train[selected]
    
    # Train a random forest model on the training data
    model = RandomForestClassifier(random_state = 42)
    start_time = time.time()
    model.fit(X_selected, y_train)
    end_time = time.time()

    # Evaluate the model on the testing data
    X_selected_test = X_test[selected]
    y_pred = model.predict(X_selected_test)
    acc = accuracy_score(y_test, y_pred)
    
    times = end_time - start_time
    
    timer[f'k = {k}'] = times
    accs[f'k = {k}'] = acc

In [15]:
# Find the maximum number of values across all keys
max_values = max([len(val) if isinstance(val, list) else 1 for val in selected_each_k.values()])

# Iterate through the keys and add "n/a" values as necessary
for key, val in selected_each_k.items():
    # Check if the value is a string and split by comma if necessary
    if isinstance(val, str):
        values = val.split(',')
    else:
        values = val
        
    # If the number of values is less than the maximum, add "n/a" values
    num_values = len(values)
    if num_values < max_values:
        diff = max_values - num_values
        values += ['-'] * diff
    
    # Join the values with commas and update the dictionary
    selected_each_k[key] = values
    
selected_each_k_df = pd.DataFrame(selected_each_k)

In [17]:
selected_each_k_df.iloc[:, -1:].head(100)

Unnamed: 0,k = 11
0,sit and bend forward_cm
1,broad jump_cm
2,sit-ups counts
3,body fat_%
4,weight_kg
5,diastolic
6,gripForce
7,age
8,gender
9,systolic


In [18]:
selected_each_k_df.iloc[:, :100].head(100)

Unnamed: 0,k = 1,k = 2,k = 3,k = 4,k = 5,k = 6,k = 7,k = 8,k = 9,k = 10,k = 11
0,sit and bend forward_cm,sit and bend forward_cm,sit and bend forward_cm,sit and bend forward_cm,sit and bend forward_cm,sit and bend forward_cm,sit and bend forward_cm,sit and bend forward_cm,sit and bend forward_cm,sit and bend forward_cm,sit and bend forward_cm
1,-,broad jump_cm,broad jump_cm,broad jump_cm,broad jump_cm,broad jump_cm,broad jump_cm,broad jump_cm,broad jump_cm,broad jump_cm,broad jump_cm
2,-,-,sit-ups counts,sit-ups counts,sit-ups counts,sit-ups counts,sit-ups counts,sit-ups counts,sit-ups counts,sit-ups counts,sit-ups counts
3,-,-,-,body fat_%,body fat_%,body fat_%,body fat_%,body fat_%,body fat_%,body fat_%,body fat_%
4,-,-,-,-,weight_kg,weight_kg,weight_kg,weight_kg,weight_kg,weight_kg,weight_kg
5,-,-,-,-,-,diastolic,diastolic,diastolic,diastolic,diastolic,diastolic
6,-,-,-,-,-,-,gripForce,gripForce,gripForce,gripForce,gripForce
7,-,-,-,-,-,-,-,age,age,age,age
8,-,-,-,-,-,-,-,-,gender,gender,gender
9,-,-,-,-,-,-,-,-,-,systolic,systolic


In [16]:
accs

{'k = 1': 0.4,
 'k = 2': 0.23333333333333334,
 'k = 3': 0.3333333333333333,
 'k = 4': 0.3333333333333333,
 'k = 5': 0.3,
 'k = 6': 0.3,
 'k = 7': 0.26666666666666666,
 'k = 8': 0.26666666666666666,
 'k = 9': 0.23333333333333334,
 'k = 10': 0.3333333333333333,
 'k = 11': 0.3,
 'k = 12': 0.23333333333333334,
 'k = 13': 0.3,
 'k = 14': 0.3333333333333333,
 'k = 15': 0.23333333333333334,
 'k = 16': 0.23333333333333334,
 'k = 17': 0.23333333333333334,
 'k = 18': 0.26666666666666666,
 'k = 19': 0.3,
 'k = 20': 0.26666666666666666,
 'k = 21': 0.3,
 'k = 22': 0.26666666666666666,
 'k = 23': 0.26666666666666666,
 'k = 24': 0.23333333333333334,
 'k = 25': 0.3,
 'k = 26': 0.3,
 'k = 27': 0.3,
 'k = 28': 0.26666666666666666,
 'k = 29': 0.36666666666666664,
 'k = 30': 0.3333333333333333,
 'k = 31': 0.36666666666666664,
 'k = 32': 0.26666666666666666,
 'k = 33': 0.3333333333333333,
 'k = 34': 0.3,
 'k = 35': 0.36666666666666664}

In [15]:
print(f"Accuracy using all columns= {accs[f'k = {len_col}']}")
print(f"Elapsed Time = {timer[f'k = {len_col}']}")
print()
print(f"Accuracy using 3 best columns= {accs[f'k = 3']}")
print(f"Elapsed Time = {timer[f'k = 3']}")
print()
print(f"Accuracy using 10 best columns= {accs[f'k = 10']}")
print(f"Elapsed Time = {timer[f'k = 10']}")
print()
best_k = max(accs, key=accs.get)
print(f"Best {best_k} with Accuracy = {accs[best_k]}")
print(f"Elapsed Time = {timer[best_k]}")
print()

Accuracy using all columns= 0.36666666666666664
Elapsed Time = 0.18005943298339844

Accuracy using 3 best columns= 0.3333333333333333
Elapsed Time = 0.1597604751586914

Accuracy using 10 best columns= 0.3333333333333333
Elapsed Time = 0.17781281471252441

Best k = 1 with Accuracy = 0.4
Elapsed Time = 0.21074366569519043



In [16]:
worsts = [3,10]
for worst in worsts:
    # USING 3 Worst columns & USING 10 Worst columns
    X_select = X_selected.iloc[:,-worst:]

    start_time = time.time()
    # Train a random forest model on the training data
    model = RandomForestClassifier(random_state = 42)
    model.fit(X_select, y_train)
    end_time = time.time()

    # Evaluate the model on the testing data
    X_select_test = X_selected_test.iloc[:,-worst:]
    y_pred = model.predict(X_select_test)
    acc = accuracy_score(y_test, y_pred)

    times = end_time - start_time

    print(f"Accuracy using {worst} worst columns= {acc}")
    print(f"Elapsed Time = {times}")
    print()

Accuracy using 3 worst columns= 0.3
Elapsed Time = 0.23135709762573242

Accuracy using 10 worst columns= 0.16666666666666666
Elapsed Time = 0.15614604949951172



In [19]:
pd.DataFrame(scores, columns=['mRMR', 'Highest_score_each_iteration']).head(100)

Unnamed: 0,mRMR,Highest_score_each_iteration
0,sit and bend forward_cm,708023200.0
1,broad jump_cm,37296.45
2,sit-ups counts,7462.74
3,body fat_%,3933.49
4,weight_kg,2226.258
5,diastolic,602.6135
6,gripForce,527.1639
7,age,233.727
8,gender,160.7125
9,systolic,69.27774


In [20]:
for i in range(len(scores_ith)):
    combines = {'Relevancy':relevancy[i], 'Redundancy':redundancy[i],'MRMR':scores_ith[i]}
    scored_ith = pd.DataFrame(combines)
    scored_ith = scored_ith.sort_values('MRMR', ascending=False)
    print('ITERASI KE ', i+1)
    print(scored_ith)
    print()
    print()

ITERASI KE  1
                           Relevancy  Redundancy          MRMR
sit and bend forward_cm  7080.232375     0.00001  7.080232e+08
sit-ups counts           3453.239895     0.00001  3.453240e+08
body fat_%               1774.716607     0.00001  1.774717e+08
broad jump_cm             988.769614     0.00001  9.887696e+07
weight_kg                 642.688746     0.00001  6.426887e+07
gripForce                 253.040526     0.00001  2.530405e+07
gender                     76.729646     0.00001  7.672965e+06
diastolic                  59.809699     0.00001  5.980970e+06
age                        57.774806     0.00001  5.777481e+06
height_cm                  18.977819     0.00001  1.897782e+06
systolic                   16.608674     0.00001  1.660867e+06


ITERASI KE  2
                  Relevancy  Redundancy          MRMR
broad jump_cm    988.769614    0.026511  37296.454312
body fat_%      1774.716607    0.071269  24901.493411
sit-ups counts  3453.239895    0.177128  19495.72052