In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import joblib
from sklearn.model_selection import KFold

In [3]:
df = pd.read_csv(r"/kaggle/input/feature-common-language/audio_features_partial.csv")
df.head()

Unnamed: 0,chroma_stft,rms,spectral_centroid,spectral_bandwidth,spectral_rolloff,zero_crossing_rate,mfcc_1,mfcc_2,mfcc_3,mfcc_4,...,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,mfcc_20,label,file_path
0,0.712224,0.05574,2989.050985,2193.800068,5666.294643,0.275144,-202.32579,27.614292,4.094784,5.301181,...,4.752774,-0.985637,-6.752584,-4.679379,-5.478848,-0.866508,-1.919669,-0.634521,Arabic,/kaggle/input/preprocess-common-language/proce...
1,0.598403,0.074004,2372.315827,2065.56102,4795.64951,0.178041,-137.41476,59.931843,5.077963,-5.712012,...,-12.469353,-2.134825,-10.989368,-1.460541,-4.485021,-0.408789,-8.211143,-5.170048,Arabic,/kaggle/input/preprocess-common-language/proce...
2,0.756316,0.046051,3274.178654,2196.474265,5870.572917,0.353054,-233.02031,22.69155,10.057923,3.829097,...,-1.657243,-5.409642,-4.017134,-6.744406,-1.69763,-0.387302,0.829549,1.29211,Arabic,/kaggle/input/preprocess-common-language/proce...
3,0.588983,0.061376,1948.418292,2049.242741,4186.921296,0.137682,-199.0449,80.80687,31.45138,-1.297673,...,-4.202263,0.065943,-9.312079,-7.16406,-3.08204,-8.046175,-3.083879,-2.018449,Arabic,/kaggle/input/preprocess-common-language/proce...
4,0.647222,0.069638,1705.618989,1824.714129,3730.709877,0.09554,-325.47556,92.37382,17.725632,31.867613,...,3.169903,4.538502,-1.073114,-1.204524,-0.108214,-4.80346,-2.882802,-1.455632,Arabic,/kaggle/input/preprocess-common-language/proce...


In [4]:
df.duplicated().sum()

0

In [5]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit LabelEncoder with your actual labels
label_encoder.fit(df['label'])

# Transform actual labels to numeric labels
df['numeric_labels'] = label_encoder.transform(df['label'])

In [6]:
X = df.drop(columns=['label','numeric_labels','file_path'])
y = df['numeric_labels']

In [7]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [8]:
from collections import Counter

# Jumlah kelas sebelum SMOTE
print("Jumlah kelas sebelum SMOTE:")
print(Counter(y))

# Jumlah kelas setelah SMOTE
print("\nJumlah kelas setelah SMOTE:")
print(Counter(y_resampled))

Jumlah kelas sebelum SMOTE:
Counter({2: 3570, 39: 3177, 19: 3012, 6: 3003, 27: 2955, 41: 2784, 24: 2766, 0: 2751, 20: 2733, 37: 2691, 10: 2550, 33: 2547, 18: 2520, 21: 2484, 42: 2472, 8: 2442, 40: 2367, 26: 2331, 30: 2283, 28: 2277, 23: 2268, 32: 2238, 7: 2202, 31: 2196, 43: 2181, 15: 2073, 44: 2073, 5: 2016, 34: 1977, 9: 1962, 25: 1947, 12: 1932, 1: 1914, 16: 1890, 29: 1884, 14: 1863, 3: 1830, 17: 1830, 4: 1797, 11: 1773, 35: 1758, 36: 1752, 38: 1737, 22: 1695, 13: 1623})

Jumlah kelas setelah SMOTE:
Counter({0: 3570, 4: 3570, 9: 3570, 27: 3570, 18: 3570, 24: 3570, 23: 3570, 13: 3570, 20: 3570, 11: 3570, 16: 3570, 40: 3570, 28: 3570, 15: 3570, 21: 3570, 8: 3570, 31: 3570, 29: 3570, 5: 3570, 7: 3570, 41: 3570, 44: 3570, 3: 3570, 2: 3570, 14: 3570, 43: 3570, 36: 3570, 26: 3570, 34: 3570, 12: 3570, 37: 3570, 35: 3570, 42: 3570, 33: 3570, 17: 3570, 25: 3570, 6: 3570, 30: 3570, 39: 3570, 22: 3570, 10: 3570, 1: 3570, 19: 3570, 32: 3570, 38: 3570})


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.3, random_state=42,
                                                    stratify=y_resampled)

In [10]:
from sklearn.preprocessing import StandardScaler

# Inisialisasi StandardScaler
scaler = StandardScaler()

# Fit scaler pada data training dan transform kedua set data
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

In [11]:
pd.DataFrame(X_train_normalized)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,-0.364935,-0.247849,-0.890410,0.214314,-0.561075,-1.112044,0.618754,0.807991,0.786938,-1.582598,...,-0.372540,-0.425312,0.083558,0.188499,0.269625,-0.357414,-0.257548,-0.406506,-0.116368,-0.662683
1,-1.561425,-0.344321,-0.135153,0.424011,0.131912,-0.623506,0.329402,-0.386126,-0.340000,-0.284849,...,-0.261647,-0.655708,-0.219533,-0.482159,-1.300348,-0.336036,-0.162236,-1.568699,-0.613665,-0.441593
2,0.665183,-0.612016,0.750516,0.902349,0.803726,0.496568,1.061305,-0.548987,1.098359,0.939015,...,1.132156,0.817791,0.687609,-0.440398,0.416434,0.405453,1.682787,1.369895,0.290245,-1.050559
3,-1.030974,2.093594,-1.470936,-1.994267,-1.551235,-1.101425,-0.378292,1.297652,-1.920052,3.791689,...,-1.754882,1.290520,-1.386529,-1.329693,0.619006,0.477908,-0.643397,-0.029769,0.436081,-0.172504
4,0.194780,1.141861,-0.860718,-1.013396,-1.069192,-0.552857,-0.181194,1.020571,-1.213755,-0.296157,...,-0.063017,-0.148142,-1.784751,1.201975,-1.022463,0.221809,-0.767619,0.941025,-1.583400,2.624626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112450,-0.133160,-0.669898,0.272018,0.602705,0.497571,-0.091507,0.424125,-0.346555,0.084319,-1.099335,...,-0.470875,-0.623940,-0.557836,-0.275097,-0.082015,-0.352531,0.565829,-0.353458,-0.051545,0.009822
112451,-0.262421,-0.089078,0.276614,-0.028210,0.196182,0.112838,1.198047,-0.642695,-0.807168,0.112235,...,0.262296,0.962201,-1.422439,-1.717809,0.246690,-0.858461,0.601352,0.470246,0.015822,0.495168
112452,2.366706,-1.512846,1.166286,0.784500,0.992960,1.409006,-0.042223,-0.704344,-0.166065,-0.228819,...,1.524984,0.901985,0.707583,0.547474,0.926508,0.417688,1.032225,1.036761,0.784477,0.441887
112453,-0.996360,-0.528871,0.698901,0.757775,0.812551,0.348989,0.063959,-0.708739,1.677042,0.055551,...,-0.501684,-0.195061,0.241821,-0.083057,-0.283776,0.640558,-0.922673,-0.655744,-0.824914,-1.049133


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
import numpy as np

np.random.seed(42)

# Base models
rf = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='log2',
    max_depth=None,
    bootstrap=False,
    random_state=42
)
knn = KNeighborsClassifier(
    n_neighbors=1)

dt = DecisionTreeClassifier(
	random_state=42)

# Final estimator
final_estimator = LogisticRegression(multi_class='multinomial', max_iter=2000)

# Function to get out-of-fold predictions
def get_oof_pred(model, X, y, cv=3):
    oof_pred = cross_val_predict(model, X, y, cv=cv, method='predict_proba')
    return oof_pred

# Get out-of-fold predictions for base models
rf_oof = get_oof_pred(rf, X_train_normalized, y_train)
knn_oof = get_oof_pred(knn, X_train_normalized, y_train)
dt_oof = get_oof_pred(dt, X_train_normalized, y_train)

# Combine out-of-fold predictions
X_train_meta = np.hstack([rf_oof, knn_oof, dt_oof])

# Fit final estimator
final_estimator.fit(X_train_meta, y_train)

# Fit base models on entire training data
rf.fit(X_train_normalized, y_train)
knn.fit(X_train_normalized, y_train)
dt.fit(X_train_normalized, y_train)

# Get predictions for test data
rf_test = rf.predict_proba(X_test_normalized)
knn_test = knn.predict_proba(X_test_normalized)
dt_test = dt.predict_proba(X_test_normalized)

# Combine test predictions
X_test_meta = np.hstack([rf_test, knn_test, dt_test])

# Final prediction
final_pred = final_estimator.predict(X_test_meta)

# Calculate accuracy
accuracy = accuracy_score(y_test, final_pred)
print(f"Accuracy of the stacking model: {accuracy:.5f}")

Accuracy of the stacking model: 0.84783


In [13]:
from sklearn.metrics import accuracy_score, classification_report

# Generate and print classification report
print("\nClassification Report:")
print(classification_report(y_test, final_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.77      0.81      1071
           1       0.80      0.84      0.82      1071
           2       0.88      0.81      0.84      1071
           3       0.83      0.83      0.83      1071
           4       0.85      0.85      0.85      1071
           5       0.83      0.82      0.82      1071
           6       0.83      0.65      0.73      1071
           7       0.85      0.93      0.89      1071
           8       0.87      0.84      0.85      1071
           9       0.89      0.92      0.90      1071
          10       0.82      0.77      0.79      1071
          11       0.82      0.82      0.82      1071
          12       0.80      0.82      0.81      1071
          13       0.88      0.92      0.90      1071
          14       0.85      0.83      0.84      1071
          15       0.85      0.90      0.88      1071
          16       0.92      0.96      0.94      1071
   