In [165]:
import pandas as pd
import numpy as np
import time

from sklearn.preprocessing import StandardScaler
from skrebate import ReliefF
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

df = pd.read_csv('extraksiSidikJari.csv')
len_row, len_col = df.shape
df = df.iloc[:, 1:]
df.fillna(df.mode().iloc[0], inplace=True) # Replace NaN values with mode of corresponding column
# use the factorize() function to convert the column of strings to integers
# non_int_columns = df.select_dtypes(include=['object']).columns.tolist()
# for i in non_int_columns:
#     df[i] = pd.factorize(df[i])[0] + 1
    
# Delete duplicate rows
df = df.drop_duplicates()

In [168]:
print(len_row, len_col)

100 37


In [6]:
label_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

df['class'] = df['class'].replace(label_mapping)

label_mapping = {'M': 1, 'F': 2}

df['gender'] = df['gender'].replace(label_mapping)

In [169]:
df.head()

Unnamed: 0,mu,varians_n,deviasi,skewness,energi,entropi,smoothness,asm0,idm0,stdevy0,...,korelasi90,kontras90,entropi90,energy90,asm135,idm135,kontras135,entropi135,energy135,Target
0,145.878641,0.028206,42.826449,2.317846,0.114321,5.502868,0.027432,0.057637,0.332372,9836.113792,...,6.5e-05,7281.685486,6.289958,0.053554,0.04055,0.260301,12312.538867,6.502044,0.04055,1
1,157.544397,0.031288,45.105789,0.728292,0.186542,4.746257,0.030339,0.138548,0.465802,10113.634109,...,7e-05,6264.299347,5.233408,0.129908,0.122249,0.4147,9158.559511,5.368045,0.122249,2
2,122.033879,0.023271,38.899822,6.477194,0.107866,5.576895,0.022742,0.042778,0.307723,10333.1473,...,5.5e-05,9027.148515,6.47349,0.039466,0.029825,0.24646,12070.446598,6.619654,0.029825,3
3,127.809567,0.025716,40.892211,4.81516,0.090897,5.841717,0.025071,0.042098,0.306074,9775.880965,...,5.7e-05,8691.077101,6.746558,0.033565,0.032915,0.256447,9972.608595,6.751549,0.032915,4
4,147.56159,0.029471,43.776078,1.830432,0.10928,5.582667,0.028627,0.059552,0.352693,9695.617897,...,4.3e-05,11148.095429,6.583078,0.040549,0.028748,0.219472,15205.186223,6.692705,0.028748,5


In [167]:
df = df.drop(['stdevy135','korelasi135'], axis=1)

In [170]:
# Membaca data
X = np.array(df.iloc[:, :-1])
y = np.array(df.iloc[:, -1])

X_col = df.iloc[:, :-1].columns.tolist()

# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler to the dataset
scaler.fit(X)

# Transform the dataset using the scaler
X_std = scaler.transform(X)

df_X_std = pd.DataFrame(data=X_std, columns=X_col)

In [171]:
def reliefF(X, y, k_neighbors=3):
    # Get the number of samples and features
    n_samples, n_features = X.shape
    
    # Initialize an array to store the feature weights
    weights = np.zeros(n_features)
    
    # Compute class probabilities
    class_probs = np.bincount(y) / len(y)
    
    # Get maximum and minimum values for each feature
    max_val = np.max(X, axis=0)
    min_val = np.min(X, axis=0)
    
    # Set the number of iterations to the number of samples
    m = n_samples
    
    # Initialize a list to store the indices of selected samples
    selected_indices = []
    
    # Iterate m times to select random samples
    for i in range(m):
        # Find valid indices that have not been selected
        valid_indices = np.setdiff1d(np.arange(n_samples), selected_indices)
        
        # Choose a random index from the valid indices
        random_index = np.random.choice(valid_indices)
        
        # Add the selected index to the list of selected indices
        selected_indices.append(random_index)
        
        # Calculate distances between the randomly selected sample and all other samples
        distances = np.sum(np.abs(X - X[random_index]), axis=1)
        
        # Sort the distances to find the indices of the nearest neighbors
        nearest_indices = np.argsort(distances)
        
        # Get indices of k_neighbors nearest samples from the same class
        nearest_same_class = nearest_indices[(y[nearest_indices] == y[random_index]) & (nearest_indices != random_index)][:k_neighbors]
        
        # Get indices of k_neighbors nearest samples from a different class
        nearest_diff_class = nearest_indices[(y[nearest_indices] != y[random_index])][:k_neighbors]
        
        # Update feature weights for hits 
        for j in range(n_features):
            # Compute the weights for hits (nearest neighbors from the same class)
            nearest_same_dist = distances[nearest_same_class]
            maxmin = (max_val[j] - min_val[j]) if (max_val[j] - min_val[j]) != 0 else 1
            weights_same = np.abs(X[random_index, j] - X[nearest_same_class, j]) / maxmin
            weights[j] -= np.sum(weights_same) / (k_neighbors * m)
            
            # Compute the weights for misses (nearest neighbors from different classes)
            weights_diff = 0
            for C, class_prob in enumerate(class_probs):
                if C != y[random_index]:
                    miss_prob = class_prob / (1 - class_probs[y[random_index]])
                    nearest_diff_dist = distances[nearest_diff_class]
                    miss_weight = np.abs(X[random_index, j] - X[nearest_diff_class, j])
                    weights_diff += miss_prob * np.sum(miss_weight) / (k_neighbors * m)
            
            weights[j] += weights_diff / (k_neighbors * m)
    
    # Return the computed feature weights
    return weights


In [172]:
# Assuming X_std and y are defined
feature_weights = reliefF(X, y)

combines = {'Features': df.iloc[:, :-1].columns, 'Weights': feature_weights}
weight_f = pd.DataFrame(combines)
weight_f = weight_f.sort_values('Weights', ascending=False)


In [173]:
weight_f.head(100)

Unnamed: 0,Features,Weights
18,kontras45,3.648022
30,kontras135,3.395117
11,kontras0,3.084248
25,kontras90,2.494686
23,stdevy90,1.126367
9,stdevy0,1.101112
16,stdevy45,1.092566
17,korelasi45,-0.072963
24,korelasi90,-0.0906
0,mu,-0.120025


In [182]:
list_feat = weight_f['Features'].tolist()
X_train, X_test, y_train, y_test = train_test_split(df_X_std, y, test_size = 0.3)

accs = {}
timer = {}
for num_feat in range(len_col-1):
    X_selected = X_train[list_feat[:num_feat+1]]
    
    # Train a random forest model on the training data
    model = RandomForestClassifier(random_state = 42)
    start_time = time.time()
    model.fit(X_selected, y_train)
    end_time = time.time()

    # Evaluate the model on the testing data
    X_selected_test = X_test[list_feat[:num_feat+1]]
    y_pred = model.predict(X_selected_test)
    acc = accuracy_score(y_test, y_pred)
    
    times = end_time - start_time
    
    timer[f'k = {num_feat+1}'] = times
    accs[f'k = {num_feat+1}'] = acc

In [183]:
print(f"Accuracy using all columns= {accs[f'k = {len_col-1}']}")
print(f"Elapsed Time = {timer[f'k = {len_col-1}']}")
print()
print(f"Accuracy using 3 best columns= {accs[f'k = 3']}")
print(f"Elapsed Time = {timer[f'k = 3']}")
print()
print(f"Accuracy using 10 best columns= {accs[f'k = 10']}")
print(f"Elapsed Time = {timer[f'k = 10']}")
print()
best_k = max(accs, key=accs.get)
print(f"Best {best_k} with Accuracy = {accs[best_k]}")
print(f"Elapsed Time = {timer[best_k]}")
print()

Accuracy using all columns= 0.4666666666666667
Elapsed Time = 0.18749165534973145

Accuracy using 3 best columns= 0.36666666666666664
Elapsed Time = 0.17126893997192383

Accuracy using 10 best columns= 0.4
Elapsed Time = 0.22539114952087402

Best k = 17 with Accuracy = 0.5666666666666667
Elapsed Time = 0.21403217315673828



In [184]:
worsts = [3,10]
for worst in worsts:
    # USING 3 Worst columns & USING 10 Worst columns
    X_select = X_selected[list_feat[-worst:]]

    
    # Train a random forest model on the training data
    model = RandomForestClassifier(random_state = 42)
    start_time = time.time()
    model.fit(X_select, y_train)
    end_time = time.time()

    # Evaluate the model on the testing data
    X_select_test = X_selected_test[list_feat[-worst:]]
    y_pred = model.predict(X_select_test)
    acc = accuracy_score(y_test, y_pred)

    times = end_time - start_time

    print(f"Accuracy using {worst} worst columns= {acc}")
    print(f"Elapsed Time = {times}")
    print()

Accuracy using 3 worst columns= 0.3333333333333333
Elapsed Time = 0.18242645263671875

Accuracy using 10 worst columns= 0.3
Elapsed Time = 0.1740589141845703

