In [24]:
import pandas as pd
import numpy as np
import time

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from skrebate import ReliefF
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

df = pd.read_csv('extraksiSidikJari.csv')
len_row, len_col = df.shape
df = df.iloc[:, 1:]

In [3]:
df.head()

Unnamed: 0,mu,varians_n,deviasi,skewness,energi,entropi,smoothness,asm0,idm0,stdevy0,...,entropi90,energy90,asm135,idm135,stdevy135,korelasi135,kontras135,entropi135,energy135,Target
0,145.878641,0.028206,42.826449,2.317846,0.114321,5.502868,0.027432,0.057637,0.332372,9836.113792,...,6.289958,0.053554,0.04055,0.260301,9977.57043,5.5e-05,12312.538867,6.502044,0.04055,1
1,157.544397,0.031288,45.105789,0.728292,0.186542,4.746257,0.030339,0.138548,0.465802,10113.634109,...,5.233408,0.129908,0.122249,0.4147,9977.57043,5.5e-05,9158.559511,5.368045,0.122249,2
2,122.033879,0.023271,38.899822,6.477194,0.107866,5.576895,0.022742,0.042778,0.307723,10333.1473,...,6.47349,0.039466,0.029825,0.24646,9977.57043,5.5e-05,12070.446598,6.619654,0.029825,3
3,127.809567,0.025716,40.892211,4.81516,0.090897,5.841717,0.025071,0.042098,0.306074,9775.880965,...,6.746558,0.033565,0.032915,0.256447,9977.57043,5.5e-05,9972.608595,6.751549,0.032915,4
4,147.56159,0.029471,43.776078,1.830432,0.10928,5.582667,0.028627,0.059552,0.352693,9695.617897,...,6.583078,0.040549,0.028748,0.219472,9977.57043,5.5e-05,15205.186223,6.692705,0.028748,5


In [16]:
# Membaca data
X = np.array(df.iloc[:, :-1])
y = np.array(df.iloc[:, -1])

X_col = df.iloc[:, :-1].columns.tolist()

# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler to the dataset
scaler.fit(X)

# Transform the dataset using the scaler
X_std = scaler.transform(X)

X_std = pd.DataFrame(data=X_std, columns=X_col)

In [30]:
import numpy as np

def reliefF(X, y, k_neighbors=3):
    n_samples, n_features = X.shape
    weights = np.zeros(n_features)

    for i in range(n_samples):
        distances = np.sqrt(np.sum((X - X[i])**2, axis=1))
        nearest_indices = np.argsort(distances)[1:k_neighbors+1]
        nearest_same_class = nearest_indices[y[nearest_indices] == y[i]]
        nearest_diff_class = nearest_indices[y[nearest_indices] != y[i]]
        
        # Update feature weights
        if len(nearest_same_class) > 0:
            nearest_same_dist = distances[nearest_same_class]
            for j in range(n_features):
                weights[j] -= np.sum(np.abs(X[i, j] - X[nearest_same_class, j])) / (k_neighbors * np.mean(nearest_same_dist))
        
        if len(nearest_diff_class) > 0:
            nearest_diff_dist = distances[nearest_diff_class]
            for j in range(n_features):
                weights[j] += np.sum(np.abs(X[i, j] - X[nearest_diff_class, j])) / (k_neighbors * np.mean(nearest_diff_dist))
    
    return weights

feature_weights = reliefF(X, y)

combines = {'Features':df.iloc[:, :-1].columns, 'Weights':feature_weights}
weight_f = pd.DataFrame(combines)
weight_f = weight_f.sort_values('Weights', ascending=False)


In [31]:
weight_f.head(100)

Unnamed: 0,Features,Weights
32,kontras135,19.99524
11,kontras0,17.84603
18,kontras45,17.77476
25,kontras90,14.97657
23,stdevy90,9.879209
16,stdevy45,9.495293
9,stdevy0,9.309977
0,mu,0.5301213
2,deviasi,0.1269797
3,skewness,0.08677431


In [56]:
list_feat = weight_f['Features'].tolist()
print(list_feat)

['kontras135', 'kontras0', 'kontras45', 'kontras90', 'stdevy90', 'stdevy45', 'stdevy0', 'mu', 'deviasi', 'skewness', 'entropi45', 'entropi0', 'entropi135', 'entropi90', 'entropi', 'idm45', 'idm90', 'idm0', 'idm135', 'asm0', 'energy0', 'energi', 'energy90', 'asm90', 'asm45', 'energy45', 'asm135', 'energy135', 'varians_n', 'smoothness', 'korelasi0', 'korelasi45', 'korelasi90', 'stdevy135', 'korelasi135']


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.4)

accs = {}
timer = {}
for num_feat in range(len_col-1):
    X_selected = X_train[list_feat[:num_feat+1]]
    
    # Train a random forest model on the training data
    model = RandomForestClassifier(random_state = 42)
    start_time = time.time()
    model.fit(X_selected, y_train)
    end_time = time.time()

    # Evaluate the model on the testing data
    X_selected_test = X_test[list_feat[:num_feat+1]]
    y_pred = model.predict(X_selected_test)
    acc = accuracy_score(y_test, y_pred)
    
    times = end_time - start_time
    
    timer[f'k = {num_feat+1}'] = times
    accs[f'k = {num_feat+1}'] = acc

In [54]:
print(f"Accuracy using all columns= {accs[f'k = {len_col-1}']}")
print(f"Elapsed Time = {timer[f'k = {len_col-1}']}")
print()
print(f"Accuracy using 3 best columns= {accs[f'k = 3']}")
print(f"Elapsed Time = {timer[f'k = 3']}")
print()
print(f"Accuracy using 10 best columns= {accs[f'k = 10']}")
print(f"Elapsed Time = {timer[f'k = 10']}")
print()
best_k = max(accs, key=accs.get)
print(f"Best {best_k} with Accuracy = {accs[best_k]}")
print(f"Elapsed Time = {timer[best_k]}")
print()

Accuracy using all columns= 0.3
Elapsed Time = 0.18304872512817383

Accuracy using 3 best columns= 0.35
Elapsed Time = 0.18988299369812012

Accuracy using 10 best columns= 0.375
Elapsed Time = 0.18204116821289062

Best k = 4 with Accuracy = 0.375
Elapsed Time = 0.1906278133392334



In [55]:
worsts = [3,10]
for worst in worsts:
    # USING 3 Worst columns & USING 10 Worst columns
    X_select = X_selected[list_feat[-worst:]]

    start_time = time.time()
    # Train a random forest model on the training data
    model = RandomForestClassifier(random_state = 42)
    model.fit(X_select, y_train)
    end_time = time.time()

    # Evaluate the model on the testing data
    X_select_test = X_selected_test[list_feat[-worst:]]
    y_pred = model.predict(X_select_test)
    acc = accuracy_score(y_test, y_pred)

    times = end_time - start_time

    print(f"Accuracy using {worst} worst columns= {acc}")
    print(f"Elapsed Time = {times}")
    print()

Accuracy using 3 worst columns= 0.175
Elapsed Time = 0.1933002471923828

Accuracy using 10 worst columns= 0.225
Elapsed Time = 0.18997740745544434

