In [None]:
# inputs:
#    X: pandas.DataFrame, features
#    y: pandas.Series, target variable
#    K: number of features to select
def min_redun_max_relev(X, y, k):
    # compute F-statistics and initialize correlation matrix
    F = pd.Series(f_regression(X, y)[0], index = X.columns)
    corr = pd.DataFrame(.00001, index = X.columns, columns = X.columns)

    # initialize list of selected features and list of excluded features
    selected = []
    not_selected = X.columns.to_list()

    # initialize list of feature scores
    scores = []
    scores_ith = []

    redundancy = []
    relevancy = []
    # repeat K times
    for i in range(k):
        # compute (absolute) correlations between the last selected feature and all the (currently) excluded features
        if i > 0:
            last_selected = selected[-1]
            corr.loc[not_selected, last_selected] = X[not_selected].corrwith(X[last_selected]).abs().clip(.00001)

        # compute FCQ score for all the (currently) excluded features (this is Formula 2)
        score = F.loc[not_selected] / corr.loc[not_selected, selected].mean(axis = 1).fillna(.00001)
        relevancy.append(F.loc[not_selected])
        redundancy.append(corr.loc[not_selected, selected].mean(axis = 1).fillna(.00001))

        scores_ith.append(score)
        # find best feature, add it to selected and remove it from not_selected
        best = score.index[score.argmax()]
        selected.append(best)
        not_selected.remove(best)

        # add feature name and score to list of feature scores
        scores.append((best, score[best]))
        
    # create DataFrame of feature scores
    score_df = pd.DataFrame(scores, columns=['mRMR', 'Highest_score_each_iteration'])
    return scores,selected,scores_ith,score_df,relevancy,redundancy
    

In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('extraksiSidikJari.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,mu,varians_n,deviasi,skewness,energi,entropi,smoothness,asm0,idm0,...,entropi90,energy90,asm135,idm135,stdevy135,korelasi135,kontras135,entropi135,energy135,Target
0,0,145.878641,0.028206,42.826449,2.317846,0.114321,5.502868,0.027432,0.057637,0.332372,...,6.289958,0.053554,0.04055,0.260301,9977.57043,5.5e-05,12312.538867,6.502044,0.04055,1
1,1,157.544397,0.031288,45.105789,0.728292,0.186542,4.746257,0.030339,0.138548,0.465802,...,5.233408,0.129908,0.122249,0.4147,9977.57043,5.5e-05,9158.559511,5.368045,0.122249,2
2,2,122.033879,0.023271,38.899822,6.477194,0.107866,5.576895,0.022742,0.042778,0.307723,...,6.47349,0.039466,0.029825,0.24646,9977.57043,5.5e-05,12070.446598,6.619654,0.029825,3
3,3,127.809567,0.025716,40.892211,4.81516,0.090897,5.841717,0.025071,0.042098,0.306074,...,6.746558,0.033565,0.032915,0.256447,9977.57043,5.5e-05,9972.608595,6.751549,0.032915,4
4,4,147.56159,0.029471,43.776078,1.830432,0.10928,5.582667,0.028627,0.059552,0.352693,...,6.583078,0.040549,0.028748,0.219472,9977.57043,5.5e-05,15205.186223,6.692705,0.028748,5


In [4]:
df = df.iloc[:, 1:]

In [5]:
df.head()

Unnamed: 0,mu,varians_n,deviasi,skewness,energi,entropi,smoothness,asm0,idm0,stdevy0,...,entropi90,energy90,asm135,idm135,stdevy135,korelasi135,kontras135,entropi135,energy135,Target
0,145.878641,0.028206,42.826449,2.317846,0.114321,5.502868,0.027432,0.057637,0.332372,9836.113792,...,6.289958,0.053554,0.04055,0.260301,9977.57043,5.5e-05,12312.538867,6.502044,0.04055,1
1,157.544397,0.031288,45.105789,0.728292,0.186542,4.746257,0.030339,0.138548,0.465802,10113.634109,...,5.233408,0.129908,0.122249,0.4147,9977.57043,5.5e-05,9158.559511,5.368045,0.122249,2
2,122.033879,0.023271,38.899822,6.477194,0.107866,5.576895,0.022742,0.042778,0.307723,10333.1473,...,6.47349,0.039466,0.029825,0.24646,9977.57043,5.5e-05,12070.446598,6.619654,0.029825,3
3,127.809567,0.025716,40.892211,4.81516,0.090897,5.841717,0.025071,0.042098,0.306074,9775.880965,...,6.746558,0.033565,0.032915,0.256447,9977.57043,5.5e-05,9972.608595,6.751549,0.032915,4
4,147.56159,0.029471,43.776078,1.830432,0.10928,5.582667,0.028627,0.059552,0.352693,9695.617897,...,6.583078,0.040549,0.028748,0.219472,9977.57043,5.5e-05,15205.186223,6.692705,0.028748,5


In [6]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

len_row, len_col = X.shape
print('Row: ',len_row)

X_col = X.columns.tolist()

# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler to the dataset
scaler.fit(X)

# Transform the dataset using the scaler
X_std = scaler.transform(X)

X_std = pd.DataFrame(data=X_std, columns=X_col)

Row:  100


In [7]:
X_std.head()

Unnamed: 0,mu,varians_n,deviasi,skewness,energi,entropi,smoothness,asm0,idm0,stdevy0,...,kontras90,entropi90,energy90,asm135,idm135,stdevy135,korelasi135,kontras135,entropi135,energy135
0,0.380053,0.227138,0.29262,-0.539125,-0.840842,0.834465,0.234943,-0.824044,-1.091012,-0.680678,...,-0.590356,0.762053,-0.698056,-0.834334,-1.093025,1.818989e-12,-2.032879e-20,0.880022,0.975745,-0.834334
1,0.938519,0.616602,0.644057,-0.976033,0.401814,-0.31891,0.622727,0.49581,0.22065,-0.395559,...,-1.079166,-0.514038,0.598706,0.610813,0.471292,1.818989e-12,-2.032879e-20,-0.192325,-0.402878,0.610813
2,-0.761448,-0.396442,-0.312803,0.60412,-0.951918,0.947312,-0.390811,-1.066426,-1.333328,-0.170035,...,0.248263,0.983722,-0.937318,-1.024045,-1.233258,1.818989e-12,-2.032879e-20,0.797711,1.118725,-1.024045
3,-0.484953,-0.087523,-0.005609,0.147291,-1.243891,1.351005,-0.080063,-1.077516,-1.349537,-0.74256,...,0.086795,1.31353,-1.03755,-0.969388,-1.132074,1.818989e-12,-2.032879e-20,0.08445,1.279072,-0.969388
4,0.46062,0.386945,0.439037,-0.673097,-0.927583,0.95611,0.394341,-0.792797,-0.891248,-0.825021,...,1.267286,1.116081,-0.918941,-1.043097,-1.506693,1.818989e-12,-2.032879e-20,1.863517,1.207534,-1.043097


- Dataset train = 70%
- Dataset test = 30%

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.3)

# Define the range of values for k
k_values = range(1, len_col+1)

# Initialize dictionaries to store results
accs = {}
selected_each_k = {}
timer = {}

for k in k_values:
    scores, selected, scores_ith, score_df, relevancy, redundancy = min_redun_max_relev(X, y, k)
    selected_each_k[f'k = {k}'] = selected
    
    # Remove all columns from X that are not selected
    X_selected = X_train[selected]
    
    # Train a random forest model on the training data
    model = RandomForestClassifier(random_state = 42)
    start_time = time.time()
    model.fit(X_selected, y_train)
    end_time = time.time()

    # Evaluate the model on the testing data
    X_selected_test = X_test[selected]
    y_pred = model.predict(X_selected_test)
    acc = accuracy_score(y_test, y_pred)
    
    times = end_time - start_time
    
    timer[f'k = {k}'] = times
    accs[f'k = {k}'] = acc

In [9]:
# Find the maximum number of values across all keys
max_values = max([len(val) if isinstance(val, list) else 1 for val in selected_each_k.values()])

# Iterate through the keys and add "n/a" values as necessary
for key, val in selected_each_k.items():
    # Check if the value is a string and split by comma if necessary
    if isinstance(val, str):
        values = val.split(',')
    else:
        values = val
        
    # If the number of values is less than the maximum, add "n/a" values
    num_values = len(values)
    if num_values < max_values:
        diff = max_values - num_values
        values += ['-'] * diff
    
    # Join the values with commas and update the dictionary
    selected_each_k[key] = values
    
selected_each_k_df = pd.DataFrame(selected_each_k)

In [10]:
selected_each_k_df.iloc[:, :100].head(100)

Unnamed: 0,k = 1,k = 2,k = 3,k = 4,k = 5,k = 6,k = 7,k = 8,k = 9,k = 10,...,k = 26,k = 27,k = 28,k = 29,k = 30,k = 31,k = 32,k = 33,k = 34,k = 35
0,kontras90,kontras90,kontras90,kontras90,kontras90,kontras90,kontras90,kontras90,kontras90,kontras90,...,kontras90,kontras90,kontras90,kontras90,kontras90,kontras90,kontras90,kontras90,kontras90,kontras90
1,-,idm135,idm135,idm135,idm135,idm135,idm135,idm135,idm135,idm135,...,idm135,idm135,idm135,idm135,idm135,idm135,idm135,idm135,idm135,idm135
2,-,-,korelasi90,korelasi90,korelasi90,korelasi90,korelasi90,korelasi90,korelasi90,korelasi90,...,korelasi90,korelasi90,korelasi90,korelasi90,korelasi90,korelasi90,korelasi90,korelasi90,korelasi90,korelasi90
3,-,-,-,idm90,idm90,idm90,idm90,idm90,idm90,idm90,...,idm90,idm90,idm90,idm90,idm90,idm90,idm90,idm90,idm90,idm90
4,-,-,-,-,kontras135,kontras135,kontras135,kontras135,kontras135,kontras135,...,kontras135,kontras135,kontras135,kontras135,kontras135,kontras135,kontras135,kontras135,kontras135,kontras135
5,-,-,-,-,-,entropi90,entropi90,entropi90,entropi90,entropi90,...,entropi90,entropi90,entropi90,entropi90,entropi90,entropi90,entropi90,entropi90,entropi90,entropi90
6,-,-,-,-,-,-,entropi135,entropi135,entropi135,entropi135,...,entropi135,entropi135,entropi135,entropi135,entropi135,entropi135,entropi135,entropi135,entropi135,entropi135
7,-,-,-,-,-,-,-,asm90,asm90,asm90,...,asm90,asm90,asm90,asm90,asm90,asm90,asm90,asm90,asm90,asm90
8,-,-,-,-,-,-,-,-,energy90,energy90,...,energy90,energy90,energy90,energy90,energy90,energy90,energy90,energy90,energy90,energy90
9,-,-,-,-,-,-,-,-,-,idm45,...,idm45,idm45,idm45,idm45,idm45,idm45,idm45,idm45,idm45,idm45


In [11]:
print(f"Accuracy using all columns= {accs[f'k = {len_col}']}")
print(f"Elapsed Time = {timer[f'k = {len_col}']}")
print()
print(f"Accuracy using 3 best columns= {accs[f'k = 3']}")
print(f"Elapsed Time = {timer[f'k = 3']}")
print()
print(f"Accuracy using 10 best columns= {accs[f'k = 10']}")
print(f"Elapsed Time = {timer[f'k = 10']}")
print()
best_k = max(accs, key=accs.get)
print(f"Best {best_k} with Accuracy = {accs[best_k]}")
print(f"Elapsed Time = {timer[best_k]}")
print()

Accuracy using all columns= 0.3333333333333333
Elapsed Time = 0.18199396133422852

Accuracy using 3 best columns= 0.4666666666666667
Elapsed Time = 0.14115619659423828

Accuracy using 10 best columns= 0.4666666666666667
Elapsed Time = 0.15731000900268555

Best k = 5 with Accuracy = 0.5
Elapsed Time = 0.14145636558532715



In [12]:
worsts = [3,10]
for worst in worsts:
    # USING 3 Worst columns & USING 10 Worst columns
    X_select = X_selected.iloc[:,-worst:]

    start_time = time.time()
    # Train a random forest model on the training data
    model = RandomForestClassifier(random_state = 42)
    model.fit(X_select, y_train)
    end_time = time.time()

    # Evaluate the model on the testing data
    X_select_test = X_selected_test.iloc[:,-worst:]
    y_pred = model.predict(X_select_test)
    acc = accuracy_score(y_test, y_pred)

    times = end_time - start_time

    print(f"Accuracy using {worst} worst columns= {acc}")
    print(f"Elapsed Time = {times}")
    print()

Accuracy using 3 worst columns= 0.3
Elapsed Time = 0.19614148139953613

Accuracy using 10 worst columns= 0.26666666666666666
Elapsed Time = 0.21077871322631836



In [13]:
pd.DataFrame(scores, columns=['mRMR', 'Highest_score_each_iteration']).head(100)

Unnamed: 0,mRMR,Highest_score_each_iteration
0,kontras90,3038004.0
1,idm135,33.12771
2,korelasi90,36.41094
3,idm90,27.29712
4,kontras135,22.11606
5,entropi90,18.17265
6,entropi135,14.43311
7,asm90,13.14775
8,energy90,12.45281
9,idm45,11.75746


In [14]:
for i in range(len(scores_ith)):
    combines = {'Relevancy':relevancy[i], 'Redundancy':redundancy[i],'MRMR':scores_ith[i]}
    scored_ith = pd.DataFrame(combines)
    scored_ith = scored_ith.sort_values('MRMR', ascending=False)
    print('ITERASI KE ', i+1)
    print(scored_ith)
    print()
    print()

ITERASI KE  1
                Relevancy  Redundancy          MRMR
kontras90    3.038004e+01     0.00001  3.038004e+06
korelasi90   2.122558e+01     0.00001  2.122558e+06
idm90        1.611027e+01     0.00001  1.611027e+06
idm135       1.285394e+01     0.00001  1.285394e+06
kontras135   1.116440e+01     0.00001  1.116440e+06
entropi90    1.025288e+01     0.00001  1.025288e+06
energy90     9.089673e+00     0.00001  9.089673e+05
asm90        9.089673e+00     0.00001  9.089673e+05
entropi135   9.019979e+00     0.00001  9.019979e+05
idm45        8.315262e+00     0.00001  8.315262e+05
asm135       7.998717e+00     0.00001  7.998717e+05
energy135    7.998717e+00     0.00001  7.998717e+05
entropi45    7.339152e+00     0.00001  7.339152e+05
entropi      6.449364e+00     0.00001  6.449364e+05
asm45        6.378795e+00     0.00001  6.378795e+05
energy45     6.378795e+00     0.00001  6.378795e+05
energi       6.299865e+00     0.00001  6.299865e+05
entropi0     5.536523e+00     0.00001  5.536523e+0

korelasi135  0.000000e+00    0.000010  0.000000e+00


ITERASI KE  25
                Relevancy  Redundancy          MRMR
smoothness   1.027996e+00    0.210275  4.888825e+00
varians_n    1.016248e+00    0.212522  4.781852e+00
stdevy0      1.037729e+00    0.315238  3.291891e+00
stdevy45     9.596722e-01    0.296559  3.236029e+00
stdevy90     9.019316e-01    0.282657  3.190901e+00
skewness     9.970007e-02    0.172441  5.781706e-01
mu           1.407710e-01    0.319356  4.407960e-01
kontras0     1.086703e-01    0.540776  2.009526e-01
korelasi0    7.047826e-03    0.352704  1.998230e-02
stdevy135    6.800116e-19    0.000010  6.800116e-14
korelasi135  0.000000e+00    0.000010  0.000000e+00


ITERASI KE  26
                Relevancy  Redundancy          MRMR
varians_n    1.016248e+00    0.244017  4.164658e+00
stdevy0      1.037729e+00    0.320889  3.233921e+00
stdevy45     9.596722e-01    0.304312  3.153584e+00
stdevy90     9.019316e-01    0.291876  3.090119e+00
skewness     9.970007e-02    0