In [1]:
###################### Speaker Classification using SVM #################

import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [2]:
# Step 1: Feature Extraction
def extract_features(file_path, mfcc=True, chroma=True, mel=True, sr=22050):
    """Load one audio file and summarise it as a flat list of averaged features.

    Each enabled feature matrix is averaged along axis 1 (the frame axis of
    librosa's feature matrices), leaving one value per coefficient/bin/band.

    Args:
        file_path: Path handed to ``librosa.load``.
        mfcc: Include the per-coefficient means of the MFCCs.
        chroma: Include the per-bin means of the STFT chromagram.
        mel: Include the per-band means of the mel spectrogram.
        sr: Sampling rate passed to ``librosa.load`` and to every extractor.

    Returns:
        A Python list with the means of the enabled groups, always in the
        order MFCC, chroma, mel. (The recorded run of this notebook shows 160
        values per file when all three groups are enabled.)
    """
    signal, _ = librosa.load(file_path, sr=sr)

    # (enabled?, extractor) pairs, listed in the order their means are appended.
    extractors = (
        (mfcc, librosa.feature.mfcc),
        (chroma, librosa.feature.chroma_stft),
        (mel, librosa.feature.melspectrogram),
    )

    summary = []
    for enabled, extractor in extractors:
        if not enabled:
            continue
        frames = extractor(y=signal, sr=sr)
        summary.extend(np.mean(frames, axis=1))
    return summary


In [3]:
# Load audio files and extract features
def load_data(file_paths, sr):
    """Extract features and speaker labels for a collection of audio files.

    Args:
        file_paths: Iterable of audio file paths. Each file name is assumed to
            follow the pattern "<speaker_id>_<other_info>.wav".
        sr: Sampling rate forwarded to ``extract_features``.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X`` holds one feature row per
        file, ``y`` holds the speaker id parsed from each file name.
    """
    X = []
    y = []
    for file_path in file_paths:
        # sr must be passed by keyword: the second positional parameter of
        # extract_features is the `mfcc` flag, so a positional sr would be
        # treated as that flag and the requested sampling rate ignored.
        X.append(extract_features(file_path, sr=sr))
        # Take the speaker id from the file name only; basename copes with the
        # platform's path separator, unlike splitting on "/".
        file_name = os.path.basename(file_path)
        y.append(file_name.split("_")[0])
    return np.array(X), np.array(y)


In [4]:
import os
# Function to collect paths of all .wav files in a directory
def collect_audio_paths(directory):
    """Recursively collect the paths of all .wav files under ``directory``.

    Args:
        directory: Root directory to walk.

    Returns:
        A list of file paths (``root`` joined with the file name) for every
        file whose name ends in ".wav" (case-insensitive), in a deterministic
        sorted traversal order. Empty if the directory has no such files or
        does not exist.
    """
    audio_paths = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend into sub-directories in a
        # stable order, so a seeded train/test split is reproducible.
        dirs.sort()
        for file in sorted(files):
            # Skip non-audio files (hidden OS files, checkpoints, ...) that
            # would otherwise be handed to the audio loader.
            if file.lower().endswith(".wav"):
                audio_paths.append(os.path.join(root, file))
    return audio_paths

In [14]:
# Load audio files and corresponding labels
# Directory containing .wav files
audio_directory = "audio_files/"
# Sampling Rate
sr = 22050

class_names = ["cyhh", "haoyu", "stranger"]
# Collect paths of .wav files
file_paths = collect_audio_paths(audio_directory)
# Fail early with a clear message instead of an obscure shape/split error
# further down when the directory is missing or empty.
if not file_paths:
    raise FileNotFoundError(f"No audio files found under {audio_directory!r}")

X, y = load_data(file_paths, sr)
print("Shape of X:", X.shape)
print(X)
# Split data into training and testing sets.
# This split of the unscaled features is only used to report shapes here;
# the training step splits again after scaling.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(y_train.shape)

# Number of features per sample
input_shape = X_train.shape[1]
print(input_shape)

Shape of X: (81, 160)
[[-5.24045471e+02  5.64239349e+01 -1.28164454e+01 ...  2.56934451e-09
   1.29407440e-09  1.21273241e-10]
 [-4.00629059e+02  9.98563309e+01 -2.24793739e+01 ...  3.50395934e-09
   1.96301198e-09  1.71816200e-10]
 [-3.86967804e+02  8.67731247e+01 -2.87876492e+01 ...  3.45428663e-09
   1.90756988e-09  1.59032662e-10]
 ...
 [-4.15776093e+02  9.71693039e+01 -5.00135803e+01 ...  2.41103604e-09
   1.49688217e-09  3.46428414e-10]
 [-4.09536560e+02  9.68211746e+01 -3.64263458e+01 ...  2.01503658e-09
   1.20051968e-09  1.13576856e-10]
 [-4.48099243e+02  7.34625473e+01 -2.37659512e+01 ...  2.01631378e-09
   1.11475174e-09  9.93241045e-11]]
(64, 160)
(64,)
160


In [15]:
# Step 3: Feature Scaling
# Fit the scaler on the training rows only, so that statistics of the test
# rows do not leak into the features the model is trained on.
# Splitting the row indices with the same test_size/random_state as the
# model-training split selects exactly the rows that end up in X_train there
# (the shuffle depends only on the number of samples and the seed).
train_idx, test_idx = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)
scaler = StandardScaler()
scaler.fit(X[train_idx])
# Apply the training-set mean/variance to every row; the same fitted scaler
# is reused for new samples at inference time.
X_scaled = scaler.transform(X)

In [16]:
# Step 4: Train a Classification Model (SVM Example)
# NOTE: labels are the speaker ids parsed from the file names, so this is a
# multi-class problem whenever more than two ids are present (class_names
# lists three), not a binary one.
# Re-split the scaled features; this overwrites the X_train/X_test/y_train/
# y_test produced by the earlier split of the unscaled data (same test_size
# and random_state).
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
svm_clf = SVC(kernel='linear', random_state=42)
# Last expression of the cell, so the notebook displays the fitted estimator.
svm_clf.fit(X_train, y_train)

SVC(kernel='linear', random_state=42)

In [17]:
# Step 5: Evaluation (Optional)
# Score the classifier on the held-out 20% split produced in the training step.
# NOTE(review): in the recorded run the dataset has 81 samples (64 for training),
# so this accuracy is computed on only 17 samples; treat it as a rough
# indication, not a reliable estimate.
y_pred = svm_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# The label says "Validation", but X_test is the only held-out set in this notebook.
print("Validation Accuracy:", accuracy)

Validation Accuracy: 1.0


In [36]:
# Step 6: Inference
# Path of the new audio sample to classify
new_audio_file = "test_audio/haoyu_audio_4.wav"
# Extract features at the configured sampling rate so the sample is processed
# with the same settings as the training configuration.
new_audio_features = extract_features(new_audio_file, sr=sr)
# Reuse the scaler fitted during training; it expects a 2-D input, hence the
# single-row list.
new_audio_features_scaled = scaler.transform([new_audio_features])
predicted_label = svm_clf.predict(new_audio_features_scaled)
# NOTE: the SVM can only return labels it saw during training, so the else
# branch is reached only when a training label is missing from class_names;
# it is not a rejection mechanism for genuinely unknown speakers.
if predicted_label[0] in class_names:
    print("Predicted Speaker:", predicted_label[0])
else:
    print("Predicted Speaker: Stranger")


Predicted Speaker: cyhh
