In [39]:
import librosa
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score


In [40]:
def extract_features(y, sr):
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfccs_delta = librosa.feature.delta(mfccs)
    mfccs_delta2 = librosa.feature.delta(mfccs, order=2)
    combined_features = np.hstack((np.mean(mfccs.T, axis=0), np.mean(mfccs_delta.T, axis=0), np.mean(mfccs_delta2.T, axis=0)))
    return combined_features


In [41]:
def augment_data(y, sr):
    noise = np.random.randn(len(y))
    y_noise = y + 0.005 * noise

    y_shift = np.roll(y, sr // 10)

    y_stretch = librosa.effects.time_stretch(y, rate=1.1)

    return [y, y_noise, y_shift, y_stretch]


In [42]:
def load_data(data_path):
    features = []
    labels = []
    for speaker in os.listdir(data_path):
        speaker_path = os.path.join(data_path, speaker)
        if os.path.isdir(speaker_path):
            for file in os.listdir(speaker_path):
                file_path = os.path.join(speaker_path, file)
                if file_path.endswith('.mp3'):
                    y, sr = librosa.load(file_path, sr=None)
                    augmented_audios = augment_data(y, sr)
                    for audio in augmented_audios:
                        mfccs = extract_features(audio, sr)
                        features.append(mfccs)
                        labels.append(speaker)
    return np.array(features), np.array(labels)

data_path = '/content/drive/MyDrive/Audio dataset'
X, y = load_data(data_path)


In [43]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


In [44]:
model1 = SVC(kernel='linear', probability=True)
model2 = RandomForestClassifier(n_estimators=100)
ensemble_model = VotingClassifier(estimators=[('svc', model1), ('rf', model2)], voting='soft')


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=0)

ensemble_model.fit(X_train, y_train)


In [46]:
scores = cross_val_score(ensemble_model, X, y_encoded, cv=5)
print(f"Cross-validation accuracy: {np.mean(scores)}")


Cross-validation accuracy: 0.7499999999999999


In [47]:
def match_voice(input_audio_path, model, label_encoder, scaler):
    y, sr = librosa.load(input_audio_path, sr=None)
    input_features = extract_features(y, sr).reshape(1, -1)
    input_features = scaler.transform(input_features)
    prediction = model.predict(input_features)
    prediction_prob = model.predict_proba(input_features)
    print(prediction_prob)
    if max(prediction_prob[0]) > 0.6:
        label = label_encoder.inverse_transform(prediction)[0]
        return label
    else:
        return "no match"


In [48]:
input_audio_path = '/content/Dinesh1.mp3'
result = match_voice(input_audio_path, ensemble_model, label_encoder, scaler)
print(f"Result: {result}")


[[0.48566016 0.19709141 0.31724844]]
Result: no match
