In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from joblib import dump
import time



# Names of the 30 feature columns used by every model below:
# 13 MFCC columns, 12 chroma columns, then five spectral/energy columns.
features = (
    [f'MFCC_{i}' for i in range(1, 14)]
    + [f'Chroma_{i}' for i in range(1, 13)]
    + ['Spectral_Centroid', 'Spectral_Bandwidth', 'Rolloff', 'Zero_Crossing_Rate', 'RMSE']
)


def _split_features_labels(frame):
    """Return ``(X, y)`` for one split: the feature columns and the 'Speaker_ID' column as arrays."""
    return frame[features].values, frame['Speaker_ID'].values


# NOTE(review): df_train_final / df_dev_final / df_test_final are not defined
# in this cell — presumably built earlier in the notebook; confirm before a
# fresh Restart & Run All.
X_train, y_train = _split_features_labels(df_train_final)
X_dev, y_dev = _split_features_labels(df_dev_final)
X_test, y_test = _split_features_labels(df_test_final)


# Standardise the features. The scaling statistics are learned from the
# training split only and then re-applied unchanged to dev and test.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_dev_scaled = scaler.transform(X_dev)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so the same transformation can be reloaded later.
dump(scaler, "scaler_all_features_final.pkl")


# Map speaker IDs to consecutive integer class indices.
# The encoder is fitted on the union of all three splits so that transforming
# dev/test labels cannot fail on an ID that is missing from the training split.
le = LabelEncoder()
all_labels = np.concatenate([y_train, y_dev, y_test])
le.fit(all_labels)

y_train_enc = le.transform(y_train)
y_dev_enc   = le.transform(y_dev)
y_test_enc  = le.transform(y_test)

# Fitting on the union hides speakers that never occur in training: no model
# can predict them, and they leave gaps in the training class indices.
# Report them instead of letting them pass silently.
unseen_labels = np.setdiff1d(np.concatenate([y_dev, y_test]), y_train)
if unseen_labels.size:
    print("Warning: labels present in dev/test but not in train:", unseen_labels)

# Persist the encoder as well: without it, integer predictions from the saved
# models cannot be mapped back to Speaker_ID values in a new session
# (use le.inverse_transform on the predictions after reloading).
dump(le, "label_encoder_final.pkl")

print("Unique classes learned:", le.classes_)


# --- Linear-kernel SVM: train, time, score on dev, persist -----------------
print("\nTraining SVM_linear...")
svm_linear = SVC(C=1.0, kernel='linear', probability=True, random_state=42)

start_time = time.time()
svm_linear.fit(X_train_scaled, y_train_enc)
svm_elapsed = time.time() - start_time
print(f"SVM_linear trained in {svm_elapsed:.2f} seconds.")

# Dev-set metrics are computed on the encoded (integer) labels.
y_dev_pred = svm_linear.predict(X_dev_scaled)
svm_dev_accuracy = accuracy_score(y_dev_enc, y_dev_pred)
print("SVM_linear Dev Accuracy:", svm_dev_accuracy)
print(
    "SVM_linear Dev Classification Report:",
    classification_report(y_dev_enc, y_dev_pred),
    sep="\n",
)

dump(svm_linear, "svm_linear_model_final.pkl")
print("SVM_linear model saved as 'svm_linear_model_final.pkl'.\n")


# --- Random forest (100 trees): train, time, score on dev, persist ---------
print("Training RandomForest...")
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

start_time = time.time()
rf_model.fit(X_train_scaled, y_train_enc)
rf_elapsed = time.time() - start_time
print(f"RandomForest trained in {rf_elapsed:.2f} seconds.")

# Dev-set metrics are computed on the encoded (integer) labels.
y_dev_pred = rf_model.predict(X_dev_scaled)
rf_dev_accuracy = accuracy_score(y_dev_enc, y_dev_pred)
print("RandomForest Dev Accuracy:", rf_dev_accuracy)
print(
    "RandomForest Dev Classification Report:",
    classification_report(y_dev_enc, y_dev_pred),
    sep="\n",
)

dump(rf_model, "rf_model_final.pkl")
print("RandomForest model saved as 'rf_model_final.pkl'.\n")


# --- XGBoost: train, time, score on dev, persist ---------------------------
# `use_label_encoder` is intentionally not passed: the installed XGBoost does
# not use it and only emits a 'Parameters: { "use_label_encoder" } are not
# used' warning. The labels given to fit() are already integer-encoded.
xgb_model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=42)
print("Training XGBoost...")
start_time = time.time()
xgb_model.fit(X_train_scaled, y_train_enc)
print(f"XGBoost trained in {time.time() - start_time:.2f} seconds.")

# Dev-set metrics are computed on the encoded (integer) labels.
y_dev_pred = xgb_model.predict(X_dev_scaled)
print("XGBoost Dev Accuracy:", accuracy_score(y_dev_enc, y_dev_pred))
print("XGBoost Dev Classification Report:")
print(classification_report(y_dev_enc, y_dev_pred))

dump(xgb_model, "xgb_model_final.pkl")
print("XGBoost model saved as 'xgb_model_final.pkl'.\n")

# Final status line for the whole training cell (SVM, RandomForest, XGBoost).
print("\n✅ All models have been trained, evaluated, and saved using all 30 features!")


Unique classes learned: ['103' '1034' '1040' '1069' '1081' '1088' '1098' '1116' '118' '1183'
 '1235' '1246' '125' '1263' '1334' '1355' '1363' '1447' '1455' '150'
 '1502' '1553' '1578' '1594' '1624' '163' '1723' '1737' '1743' '1841'
 '1867' '1898' '19' '1926' '196' '1963' '1970' '198' '1992' '200' '2002'
 '2007' '201' '2092' '211' '2136' '2159' '2182' '2196' '226' '2289' '229'
 '233' '2384' '2391' '2416' '2436' '248' '250' '2514' '2518' '254' '26'
 '2691' '27' '2764' '2817' '2836' '2843' '289' '2893' '2910' '2911' '2952'
 '298' '2989' '302' '307' '311' '3112' '3168' '32' '3214' '322' '3235'
 '3240' '3242' '3259' '328' '332' '3374' '3436' '3440' '3486' '3526'
 '3607' '3664' '3699' '3723' '374' '3807' '3830' '3857' '3879' '39' '3947'
 '3982' '3983' '40' '4014' '4018' '403' '405' '4051' '4088' '412' '4137'
 '4160' '4195' '4214' '426' '4267' '4297' '4340' '4362' '4397' '4406'
 '441' '4441' '445' '446' '4481' '458' '460' '4640' '4680' '4788' '481'
 '4813' '4830' '4853' '4859' '4898' '5022' '

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost trained in 33.48 seconds.
XGBoost Dev Accuracy: 0.7843280315190427
XGBoost Dev Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.76      0.78        21
           1       0.81      0.68      0.74        19
           2       0.69      0.69      0.69        16
           3       0.80      0.76      0.78        21
           4       0.87      0.77      0.82        26
           5       0.58      0.50      0.54        44
           6       0.73      0.89      0.80        18
           7       0.85      0.88      0.87        26
           8       0.66      0.68      0.67        28
           9       0.79      0.85      0.81        13
          10       0.62      0.72      0.67        25
          11       0.89      0.74      0.81        23
          12       0.57      0.74      0.65        27
          13       0.88      0.68      0.77        22
          14       0.71      0.52      0.60        42
          15       0.82  

In [None]:

from joblib import load

# Reload the persisted classifiers from disk, keyed by the name used in the
# printed report. Insertion order determines the evaluation order.
_model_files = {
    "SVM": "svm_linear_model_final.pkl",
    "RandomForest": "rf_model_final.pkl",
    "XGBoost": "xgb_model_final.pkl",
}
models = {label: load(path) for label, path in _model_files.items()}

# Evaluate on test set
# NOTE(review): X_test_scaled and y_test_enc are not defined in this cell;
# they come from the training cell, which must have been run first.
for model_name, model in models.items():
    print(f"\nEvaluating {model_name} on Test Set...")
    y_test_pred = model.predict(X_test_scaled)
    test_accuracy = accuracy_score(y_test_enc, y_test_pred)
    test_report = classification_report(y_test_enc, y_test_pred)
    print(f"{model_name} Test Accuracy:", test_accuracy)
    print(f"{model_name} Test Classification Report:\n", test_report)



Evaluating SVM on Test Set...
SVM Test Accuracy: 0.9185012392477038
SVM Test Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.85      0.92        20
           1       0.88      0.79      0.83        19
           2       1.00      0.94      0.97        16
           3       1.00      1.00      1.00        22
           4       0.92      0.92      0.92        26
           5       0.93      0.87      0.90        46
           6       1.00      0.95      0.97        19
           7       0.93      1.00      0.96        25
           8       0.77      0.85      0.81        27
           9       0.92      0.92      0.92        13
          10       0.62      0.60      0.61        25
          11       0.92      0.92      0.92        24
          12       0.77      0.86      0.81        28
          13       0.88      1.00      0.94        22
          14       0.83      0.91      0.87        44
          15       0.95      1.00