In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [15]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import joblib
from sklearn.model_selection import KFold

In [16]:
df = pd.read_csv(r"/kaggle/input/feature-common-language/audio_features_partial.csv")
df.head()

Unnamed: 0,chroma_stft,rms,spectral_centroid,spectral_bandwidth,spectral_rolloff,zero_crossing_rate,mfcc_1,mfcc_2,mfcc_3,mfcc_4,...,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,mfcc_20,label,file_path
0,0.712224,0.05574,2989.050985,2193.800068,5666.294643,0.275144,-202.32579,27.614292,4.094784,5.301181,...,4.752774,-0.985637,-6.752584,-4.679379,-5.478848,-0.866508,-1.919669,-0.634521,Arabic,/kaggle/input/preprocess-common-language/proce...
1,0.598403,0.074004,2372.315827,2065.56102,4795.64951,0.178041,-137.41476,59.931843,5.077963,-5.712012,...,-12.469353,-2.134825,-10.989368,-1.460541,-4.485021,-0.408789,-8.211143,-5.170048,Arabic,/kaggle/input/preprocess-common-language/proce...
2,0.756316,0.046051,3274.178654,2196.474265,5870.572917,0.353054,-233.02031,22.69155,10.057923,3.829097,...,-1.657243,-5.409642,-4.017134,-6.744406,-1.69763,-0.387302,0.829549,1.29211,Arabic,/kaggle/input/preprocess-common-language/proce...
3,0.588983,0.061376,1948.418292,2049.242741,4186.921296,0.137682,-199.0449,80.80687,31.45138,-1.297673,...,-4.202263,0.065943,-9.312079,-7.16406,-3.08204,-8.046175,-3.083879,-2.018449,Arabic,/kaggle/input/preprocess-common-language/proce...
4,0.647222,0.069638,1705.618989,1824.714129,3730.709877,0.09554,-325.47556,92.37382,17.725632,31.867613,...,3.169903,4.538502,-1.073114,-1.204524,-0.108214,-4.80346,-2.882802,-1.455632,Arabic,/kaggle/input/preprocess-common-language/proce...


In [17]:
df.duplicated().sum()

0

In [18]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit LabelEncoder with your actual labels
label_encoder.fit(df['label'])

# Transform actual labels to numeric labels
df['numeric_labels'] = label_encoder.transform(df['label'])

In [19]:
X = df.drop(columns=['label','numeric_labels','file_path'])
y = df['numeric_labels']

In [20]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [21]:
from collections import Counter

# Jumlah kelas sebelum SMOTE
print("Jumlah kelas sebelum SMOTE:")
print(Counter(y))

# Jumlah kelas setelah SMOTE
print("\nJumlah kelas setelah SMOTE:")
print(Counter(y_resampled))

Jumlah kelas sebelum SMOTE:
Counter({2: 3570, 39: 3177, 19: 3012, 6: 3003, 27: 2955, 41: 2784, 24: 2766, 0: 2751, 20: 2733, 37: 2691, 10: 2550, 33: 2547, 18: 2520, 21: 2484, 42: 2472, 8: 2442, 40: 2367, 26: 2331, 30: 2283, 28: 2277, 23: 2268, 32: 2238, 7: 2202, 31: 2196, 43: 2181, 15: 2073, 44: 2073, 5: 2016, 34: 1977, 9: 1962, 25: 1947, 12: 1932, 1: 1914, 16: 1890, 29: 1884, 14: 1863, 3: 1830, 17: 1830, 4: 1797, 11: 1773, 35: 1758, 36: 1752, 38: 1737, 22: 1695, 13: 1623})

Jumlah kelas setelah SMOTE:
Counter({0: 3570, 4: 3570, 9: 3570, 27: 3570, 18: 3570, 24: 3570, 23: 3570, 13: 3570, 20: 3570, 11: 3570, 16: 3570, 40: 3570, 28: 3570, 15: 3570, 21: 3570, 8: 3570, 31: 3570, 29: 3570, 5: 3570, 7: 3570, 41: 3570, 44: 3570, 3: 3570, 2: 3570, 14: 3570, 43: 3570, 36: 3570, 26: 3570, 34: 3570, 12: 3570, 37: 3570, 35: 3570, 42: 3570, 33: 3570, 17: 3570, 25: 3570, 6: 3570, 30: 3570, 39: 3570, 22: 3570, 10: 3570, 1: 3570, 19: 3570, 32: 3570, 38: 3570})


In [22]:
### split data 80% 20%

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.2, random_state=42,
                                                    stratify=y_resampled)

In [23]:
from sklearn.preprocessing import StandardScaler

# Inisialisasi StandardScaler
scaler = StandardScaler()

# Fit scaler pada data training dan transform kedua set data
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

In [24]:
pd.DataFrame(X_train_normalized)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,-0.125270,-0.462618,-0.063722,0.962710,0.546778,-0.530245,-0.215304,-0.176643,1.412404,0.383635,...,0.193716,0.060471,0.656303,-0.022266,0.317852,-0.352507,-1.010202,-1.367641,-1.260068,-1.444051
1,-0.447086,-0.164635,-0.081286,0.298297,0.031393,-0.255482,0.851859,-0.201491,-0.137365,0.157049,...,-1.101902,-1.626334,0.223691,0.817619,-1.296437,-0.614610,-0.134165,-3.339167,-0.671473,0.503199
2,-0.136721,0.693945,-0.652197,0.593003,-0.029296,-1.373536,0.804041,0.107182,0.606830,0.598271,...,-0.054259,-1.238785,-1.046840,-2.431179,-2.554992,-2.492506,-0.366025,-0.626432,0.076768,0.491064
3,-1.041236,1.421366,-0.560796,0.131813,-0.337850,-0.264703,0.680879,-0.053193,0.069079,0.717965,...,-1.921328,0.441973,0.176668,-2.161943,0.177355,0.315965,-1.888467,0.640302,0.116856,-1.150069
4,-0.923043,-0.815273,0.428687,0.336918,0.459523,0.063171,0.295056,-0.775594,-1.244932,-2.012682,...,0.224048,0.497766,0.512246,-0.309098,-0.438342,0.097450,0.281906,0.694139,0.955619,-1.263811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128515,-1.069659,0.820296,-1.411504,-1.652472,-1.413538,-0.950462,-2.531171,1.026397,-0.282676,2.174811,...,-1.828143,0.285871,0.049287,0.720958,0.020781,-1.278091,-0.628007,0.212564,-1.642634,-2.222433
128516,0.000360,-0.463296,-0.023205,0.176157,-0.034989,-0.226584,0.809894,-0.064593,-0.741426,-1.760898,...,0.442690,0.649209,-0.596835,-1.256466,-0.773914,-1.425531,-1.128419,-0.704997,-0.171163,0.474619
128517,-2.370420,1.866536,-1.352947,-1.059491,-1.572022,-0.799900,-0.373726,1.257349,0.578946,0.872449,...,0.865331,-0.488842,0.075970,0.050136,-1.373727,0.271099,-0.194054,-0.750243,-1.434839,0.372517
128518,0.146890,-0.653630,0.677426,0.721152,0.754992,0.370645,0.048390,-0.953738,0.222994,0.610338,...,-0.512452,-1.230825,0.102356,0.297172,-0.461706,-1.169560,-0.183656,0.060295,0.011697,0.617881


In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.metrics import make_scorer, accuracy_score
from tqdm.auto import tqdm

In [28]:
# Inisialisasi KNN
knn = KNeighborsClassifier(n_neighbors=1)

# Parameter grid
param_grid = {
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'p': [1, 2],  # Hanya relevan untuk metrik Minkowski
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Buat GridSearchCV tanpa custom scorer
grid_search = GridSearchCV(
    knn, 
    param_grid, 
    cv=3, 
    n_jobs=1,  # Run sequentially
    verbose=1  # This will show progress
)


# Fit RandomizedSearchCV
grid_search.fit(X_train, y_train)

# Tampilkan parameter terbaik
print("Parameter terbaik:", grid_search.best_params_)
print("Skor terbaik:", grid_search.best_score_)

# Buat DataFrame dari hasil
results = pd.DataFrame(grid_search.cv_results_)

# Urutkan berdasarkan mean test score dari yang tertinggi ke terendah
results = results.sort_values('mean_test_score', ascending=False)

# Tampilkan kolom yang relevan
columns_to_display = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
print(results[columns_to_display].to_string(index=False))

# Gunakan model terbaik untuk prediksi
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)

# Evaluasi model terbaik
print("Akurasi model terbaik:", accuracy_score(y_test, y_pred))
print("\nLaporan Klasifikasi:")
print(classification_report(y_test, y_pred))

# Simpan hasil ke CSV jika diperlukan
results[columns_to_display].to_csv('grid_search_results.csv', index=False)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Parameter terbaik: {'algorithm': 'auto', 'metric': 'manhattan', 'p': 1, 'weights': 'uniform'}
Skor terbaik: 0.6061235605353251
                                                                          params  mean_test_score  std_test_score  rank_test_score
     {'algorithm': 'auto', 'metric': 'manhattan', 'p': 2, 'weights': 'distance'}         0.606124        0.000058                1
      {'algorithm': 'auto', 'metric': 'manhattan', 'p': 2, 'weights': 'uniform'}         0.606124        0.000058                1
     {'algorithm': 'auto', 'metric': 'manhattan', 'p': 1, 'weights': 'distance'}         0.606124        0.000058                1
      {'algorithm': 'auto', 'metric': 'manhattan', 'p': 1, 'weights': 'uniform'}         0.606124        0.000058                1
     {'algorithm': 'auto', 'metric': 'minkowski', 'p': 1, 'weights': 'distance'}         0.606124        0.000058                1
      {'algorithm': 'auto