In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import joblib
from sklearn.model_selection import KFold

In [3]:
df = pd.read_csv(r"/kaggle/input/feature-common-language/audio_features_partial.csv")
df.head()

Unnamed: 0,chroma_stft,rms,spectral_centroid,spectral_bandwidth,spectral_rolloff,zero_crossing_rate,mfcc_1,mfcc_2,mfcc_3,mfcc_4,...,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,mfcc_20,label,file_path
0,0.712224,0.05574,2989.050985,2193.800068,5666.294643,0.275144,-202.32579,27.614292,4.094784,5.301181,...,4.752774,-0.985637,-6.752584,-4.679379,-5.478848,-0.866508,-1.919669,-0.634521,Arabic,/kaggle/input/preprocess-common-language/proce...
1,0.598403,0.074004,2372.315827,2065.56102,4795.64951,0.178041,-137.41476,59.931843,5.077963,-5.712012,...,-12.469353,-2.134825,-10.989368,-1.460541,-4.485021,-0.408789,-8.211143,-5.170048,Arabic,/kaggle/input/preprocess-common-language/proce...
2,0.756316,0.046051,3274.178654,2196.474265,5870.572917,0.353054,-233.02031,22.69155,10.057923,3.829097,...,-1.657243,-5.409642,-4.017134,-6.744406,-1.69763,-0.387302,0.829549,1.29211,Arabic,/kaggle/input/preprocess-common-language/proce...
3,0.588983,0.061376,1948.418292,2049.242741,4186.921296,0.137682,-199.0449,80.80687,31.45138,-1.297673,...,-4.202263,0.065943,-9.312079,-7.16406,-3.08204,-8.046175,-3.083879,-2.018449,Arabic,/kaggle/input/preprocess-common-language/proce...
4,0.647222,0.069638,1705.618989,1824.714129,3730.709877,0.09554,-325.47556,92.37382,17.725632,31.867613,...,3.169903,4.538502,-1.073114,-1.204524,-0.108214,-4.80346,-2.882802,-1.455632,Arabic,/kaggle/input/preprocess-common-language/proce...


In [4]:
df.duplicated().sum()

0

In [5]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit LabelEncoder with your actual labels
label_encoder.fit(df['label'])

# Transform actual labels to numeric labels
df['numeric_labels'] = label_encoder.transform(df['label'])

In [6]:
X = df.drop(columns=['label','numeric_labels','file_path'])
y = df['numeric_labels']

In [7]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [8]:
from collections import Counter

# Jumlah kelas sebelum SMOTE
print("Jumlah kelas sebelum SMOTE:")
print(Counter(y))

# Jumlah kelas setelah SMOTE
print("\nJumlah kelas setelah SMOTE:")
print(Counter(y_resampled))

Jumlah kelas sebelum SMOTE:
Counter({2: 3570, 39: 3177, 19: 3012, 6: 3003, 27: 2955, 41: 2784, 24: 2766, 0: 2751, 20: 2733, 37: 2691, 10: 2550, 33: 2547, 18: 2520, 21: 2484, 42: 2472, 8: 2442, 40: 2367, 26: 2331, 30: 2283, 28: 2277, 23: 2268, 32: 2238, 7: 2202, 31: 2196, 43: 2181, 15: 2073, 44: 2073, 5: 2016, 34: 1977, 9: 1962, 25: 1947, 12: 1932, 1: 1914, 16: 1890, 29: 1884, 14: 1863, 3: 1830, 17: 1830, 4: 1797, 11: 1773, 35: 1758, 36: 1752, 38: 1737, 22: 1695, 13: 1623})

Jumlah kelas setelah SMOTE:
Counter({0: 3570, 4: 3570, 9: 3570, 27: 3570, 18: 3570, 24: 3570, 23: 3570, 13: 3570, 20: 3570, 11: 3570, 16: 3570, 40: 3570, 28: 3570, 15: 3570, 21: 3570, 8: 3570, 31: 3570, 29: 3570, 5: 3570, 7: 3570, 41: 3570, 44: 3570, 3: 3570, 2: 3570, 14: 3570, 43: 3570, 36: 3570, 26: 3570, 34: 3570, 12: 3570, 37: 3570, 35: 3570, 42: 3570, 33: 3570, 17: 3570, 25: 3570, 6: 3570, 30: 3570, 39: 3570, 22: 3570, 10: 3570, 1: 3570, 19: 3570, 32: 3570, 38: 3570})


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.15, random_state=42,
                                                    stratify=y_resampled)

In [10]:
from sklearn.preprocessing import StandardScaler

# Inisialisasi StandardScaler
scaler = StandardScaler()

# Fit scaler pada data training dan transform kedua set data
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

In [11]:
pd.DataFrame(X_train_normalized)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.582034,-0.406956,-0.531578,0.290200,-0.229217,-0.810377,0.751004,0.309937,0.150633,0.245647,...,-0.954706,-1.645642,-0.738238,-0.625199,-0.328946,-1.619852,-1.118937,-0.381337,-0.036970,-0.709218
1,1.460424,-0.838015,1.086674,1.006211,1.027872,0.961832,-0.119324,-0.695537,0.713919,-0.193797,...,1.591395,0.035205,0.895463,0.519906,0.952624,0.910742,0.839988,0.347857,1.085478,1.424219
2,0.053463,1.206179,-1.425506,-0.800742,-1.422655,-1.136345,-0.735763,1.107606,0.744095,1.590166,...,0.900279,0.503818,-0.335897,-0.575068,0.071210,0.356524,0.366233,0.547966,0.021219,0.455161
3,0.429476,0.161088,0.280372,0.327998,0.306816,0.215003,1.401919,-0.276785,-0.403517,0.347474,...,0.089280,-0.362908,-0.171992,0.350655,-0.017171,0.420310,0.590368,-0.279936,0.517072,0.032433
4,-0.766364,-0.418856,0.040892,0.673203,0.374364,-0.422581,0.370732,-0.317502,0.521316,-0.223565,...,-0.031470,-0.213825,-0.061401,-0.893875,-0.128419,-0.636235,-1.691928,-2.193984,-0.690640,-1.756480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136547,-0.326621,-0.363583,0.890655,0.978043,1.090333,0.705216,0.739100,-0.741826,-0.068819,-0.613114,...,-0.239633,0.211900,-0.334186,-1.044798,0.019457,-0.685120,-0.554165,0.609366,-0.652576,-0.150498
136548,0.145052,-1.101250,0.583430,0.723028,0.744013,0.441511,-0.151909,-0.666811,0.014441,-0.297896,...,-0.192679,0.010025,0.101929,-0.745057,0.451718,1.320971,0.494942,1.124170,1.218560,-0.209068
136549,-2.113605,-0.751136,-0.196731,0.967075,0.588841,-1.066331,-0.223444,-0.199056,0.923485,0.074654,...,0.137754,0.039006,0.383723,-0.246038,0.538013,-0.178155,-0.104413,-1.023905,-1.373848,-1.612716
136550,0.474722,0.917518,-0.078406,0.201561,0.228375,-0.644907,0.623080,-0.541088,-0.380098,1.282401,...,0.323344,2.360557,1.968953,0.935470,-0.310959,2.269708,1.095870,0.035977,0.673887,1.690333


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
import numpy as np

np.random.seed(42)

# Base models
rf = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='log2',
    max_depth=None,
    bootstrap=False,
    random_state=42
)
knn = KNeighborsClassifier(
    n_neighbors=1)

dt = DecisionTreeClassifier(
	random_state=42)

# Final estimator
final_estimator = LogisticRegression(multi_class='multinomial', max_iter=2000)

# Function to get out-of-fold predictions
def get_oof_pred(model, X, y, cv=3):
    oof_pred = cross_val_predict(model, X, y, cv=cv, method='predict_proba')
    return oof_pred

# Get out-of-fold predictions for base models
rf_oof = get_oof_pred(rf, X_train_normalized, y_train)
knn_oof = get_oof_pred(knn, X_train_normalized, y_train)
dt_oof = get_oof_pred(dt, X_train_normalized, y_train)

# Combine out-of-fold predictions
X_train_meta = np.hstack([rf_oof, knn_oof, dt_oof])

# Fit final estimator
final_estimator.fit(X_train_meta, y_train)

# Fit base models on entire training data
rf.fit(X_train_normalized, y_train)
knn.fit(X_train_normalized, y_train)
dt.fit(X_train_normalized, y_train)

# Get predictions for test data
rf_test = rf.predict_proba(X_test_normalized)
knn_test = knn.predict_proba(X_test_normalized)
dt_test = dt.predict_proba(X_test_normalized)

# Combine test predictions
X_test_meta = np.hstack([rf_test, knn_test, dt_test])

# Final prediction
final_pred = final_estimator.predict(X_test_meta)

# Calculate accuracy
accuracy = accuracy_score(y_test, final_pred)
print(f"Accuracy of the stacking model: {accuracy:.5f}")

Accuracy of the stacking model: 0.88514


In [13]:
from sklearn.metrics import accuracy_score, classification_report

# Generate and print classification report
print("\nClassification Report:")
print(classification_report(y_test, final_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.82      0.86       536
           1       0.86      0.88      0.87       536
           2       0.91      0.88      0.89       536
           3       0.84      0.87      0.85       535
           4       0.88      0.86      0.87       535
           5       0.89      0.86      0.87       536
           6       0.86      0.74      0.80       535
           7       0.89      0.94      0.91       536
           8       0.89      0.87      0.88       535
           9       0.89      0.95      0.92       535
          10       0.88      0.82      0.85       536
          11       0.86      0.85      0.85       536
          12       0.86      0.87      0.86       535
          13       0.88      0.93      0.90       535
          14       0.90      0.86      0.88       536
          15       0.88      0.95      0.91       535
          16       0.94      0.96      0.95       535
   