In [137]:
# pip install audiomentations

In [138]:
# pip install audiomentations


In [139]:
pip install pyrubberband


Note: you may need to restart the kernel to use updated packages.


In [10]:
import librosa
import numpy as np
from librosa.core.spectrum import _spectrogram
import os
import soundfile as sf
from tqdm import tqdm
from audiomentations import AddBackgroundNoise, PolarityInversion
from audiomentations import Compose, AddGaussianNoise, PitchShift, HighPassFilter, AddGaussianSNR,AddShortNoises
import random
from sklearn import mixture
import pickle
import pyrubberband

In [11]:
import warnings
warnings.filterwarnings('ignore')


In [12]:
speaker_folders = [
    "Benjamin_Netanyau",
    "Jens_Stoltenberg",
    "Julia_Gillard",
    "Magaret_Tarcher",
    "Nelson_Mandela"
]


In [13]:
def combine_and_save_audio(path,speaker):
    files = os.listdir(path)
    # files = sorted(files)
    # print(sorted(files))
    combined_audio = []
    temp_sr = 0
    for i in range(len(files)):
        # print(files[i])
        file_path = path + '/' + str(i) + '.wav'
        x,sr = librosa.load(file_path)
        temp_sr = sr
        combined_audio.extend(x)
    output_file_path = '../ML_project/combined_audios/' + speaker + '.wav'
    sf.write(output_file_path,combined_audio,temp_sr)


In [14]:
for i in tqdm(os.listdir('../ML_project/16000_pcm_speeches')):
    combine_and_save_audio('../ML_project/16000_pcm_speeches/'+i,i)

100%|██████████| 5/5 [00:35<00:00,  7.20s/it]


In [15]:
def combined_audio_with_noises(path,speaker):
    x,sr = librosa.load(path)
    transform = AddBackgroundNoise(
        sounds_path="../ML_project/_background_noise_",
        min_snr_in_db=3.0,
        max_snr_in_db=30.0,
        noise_transform=PolarityInversion(),
        p=1.0)
    augmented_sound = transform(x, sample_rate=sr)
    output_file_path = '../ML_project/combindes_audio_with_noises/' + speaker + '.wav'
    sf.write(output_file_path,augmented_sound,sr)


In [16]:
from scipy.spatial import distance

def calculate_features(x, sr):
    x_pre_emp = librosa.effects.preemphasis(x)
    x_mfcc = librosa.feature.mfcc(y=x_pre_emp,sr=sr,n_mfcc=13)
    delta_mfcc = librosa.feature.delta(x_mfcc)
    dissimilarities = distance.cdist(x_mfcc.T, x_mfcc.T, 'euclidean')
    ldb_features = np.mean(dissimilarities, axis=1)
    combined = np.vstack((x_mfcc,delta_mfcc, ldb_features))
    return combined.T

In [147]:
# def calculate_features(x,sr=22050):
#     x_pre_emp = librosa.effects.preemphasis(x)
#     x_mfcc = librosa.feature.mfcc(y=x_pre_emp,sr=sr,n_mfcc=13)
#     delta_mfcc = librosa.feature.delta(x_mfcc)
#     double_delta_mfcc = librosa.feature.delta(delta_mfcc)
#     combined = np.vstack((x_mfcc,delta_mfcc,double_delta_mfcc))
#     return combined.T


In [148]:
# random.uniform(3,10)

In [17]:
def augment_audio_extract_features(path, n_steps, rate, flag_value):
    x, sr = librosa.load(path)
    
    augmented_pitch_shift = librosa.effects.pitch_shift(x, sr=sr, n_steps=n_steps)
    augmented_time_stretch = librosa.effects.time_stretch(x, rate = rate)

    transform_BGN = AddBackgroundNoise(
        sounds_path="../ML_project/_background_noise_",
        min_snr_in_db=3.0,
        max_snr_in_db=30.0,
        noise_transform=PolarityInversion(),
        p=1.0)
    
    augmented_noise = transform_BGN(x, sample_rate=sr)
    
    add_gaussian_white_noise_transform = AddGaussianSNR(
        min_snr_db=5.0,
        max_snr_db=40.0,
        p=0.2
    )
    
    add_short_noises_transform = AddShortNoises(
        sounds_path="../ML_project/combined_audios",
        min_snr_in_db=3.0,
        max_snr_in_db=30.0,
        noise_rms="relative_to_whole_input",
        min_time_between_sounds=2.0,
        max_time_between_sounds=8.0,
        noise_transform=PolarityInversion(),
        p=0.5
    )
    
    add_gaussian_white_noise = add_gaussian_white_noise_transform(x, sr)
    add_short_noises = add_short_noises_transform(x, sr)

    features_x = calculate_features(x, sr)
    pitch_shift_features = calculate_features(augmented_pitch_shift, sr)
    
    # Concatenate the original and time-stretched signals
    concatenated_time_stretch = np.concatenate([x, augmented_time_stretch])
    
    # Calculate features for the concatenated signal
    time_stretch_features = calculate_features(concatenated_time_stretch, sr)
    
    augmented_noise_feature = calculate_features(augmented_noise, sr)

    if flag_value > 0.5:
        white_noise_features = calculate_features(add_gaussian_white_noise, sr)
        short_voices_feature = calculate_features(add_short_noises, sr)
        return np.vstack((features_x, white_noise_features, pitch_shift_features, 
                          time_stretch_features, augmented_noise_feature, short_voices_feature))
    return np.vstack((features_x, pitch_shift_features, time_stretch_features, augmented_noise_feature))


In [18]:
def augment_test_audio_randomly(path,n_steps,rate):
    x,sr = librosa.load(path)
    augmented_pitch_shift = librosa.effects.pitch_shift(x, sr=sr, n_steps=n_steps)


    augmented_time_stretch =librosa.effects.time_stretch(x, rate = rate)


    transform_BGN = AddBackgroundNoise(
        sounds_path="../ML_project/_background_noise_",
        min_snr_in_db=3.0,
        max_snr_in_db=30.0,
        noise_transform=PolarityInversion(),
        p=1.0)
    augmented_noise = transform_BGN(x, sample_rate=sr)
    add_gaussian_white_noise_transform = AddGaussianSNR(
        min_snr_db=5.0,
        max_snr_db=40.0,
        p=0.2
    )
    add_short_noises_transform = AddShortNoises(
        sounds_path="../ML_project/combined_audios",
        min_snr_in_db=3.0,
        max_snr_in_db=30.0,
        noise_rms="relative_to_whole_input",
        min_time_between_sounds=2.0,
        max_time_between_sounds=8.0,
        noise_transform=PolarityInversion(),
        p=0.5
    )
    add_gaussian_white_noise = add_gaussian_white_noise_transform(x,sr)
    add_short_noises = add_short_noises_transform(x,sr)
    
    features_x = calculate_features(x,sr)
    pitch_shift_features = calculate_features(augmented_pitch_shift,sr)
    time_stretch_features = calculate_features(augmented_time_stretch,sr)
    augmented_noise_feature = calculate_features(augmented_noise,sr)

    white_noise_features = calculate_features(add_gaussian_white_noise,sr)
    short_voices_feature = calculate_features(add_short_noises,sr)
    a = np.random.randint(1,6)
    if a==1:
        return features_x
    elif a==2:
        return white_noise_features
    elif a==3:
        return short_voices_feature
    elif a==4:
        return pitch_shift_features
    elif a==5:
        return time_stretch_features
    return augmented_noise_feature

In [22]:
feature_space = []
feature_labels = []
for Class in os.listdir('../ML_project/16000_pcm_speeches'):
    print(Class)
    folder_data = os.listdir('../ML_project/16000_pcm_speeches'+'/'+Class)
    # split_20_data = int(len(folder_data) * 0.2)
    # split_80 = len(folder_data) - split_20_data
    # list_20 = random.sample(folder_data, split_20_data)
    # list_80 = [elem for elem in folder_data if elem not in list_20]
    # data_dict[i] = folder_data
    # test_dict[i] = list_80
    
    for file in folder_data:
        n_steps = random.uniform(-4,4)
        rate = random.uniform(0.7,1.3)
        flag_value = random.uniform(0,1)
        file_path = '../ML_project/16000_pcm_speeches' + '/' + Class + '/' + file
        curr_fea = augment_audio_extract_features(file_path,n_steps=n_steps,rate=rate,\
                                                flag_value=flag_value)
        feature_space.append(curr_fea)
        feature_labels.append(Class)

Benjamin_Netanyau
Jens_Stoltenberg
Julia_Gillard
Magaret_Tarcher
Nelson_Mandela


In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature_space, feature_labels, test_size=0.2)

In [24]:
def data_shape_correction(data_set):
    for i in range(len(data_set)):
        data_set[i] = data_set[i].T[:,:209]
    return data_set

X_train = data_shape_correction(X_train)
chotu = X_train[0].shape[1]
badu = X_train[0].shape[1]
for i in X_train:
    if chotu > i.shape[1]:
        chotu = i.shape[1]
    if badu < i.shape[1]:
        badu = i.shape[1]
print(chotu)
print(badu)

209
209


In [25]:
print(X_train[0].shape)

(27, 209)


In [26]:
# for i in range(len(X_train)):
#     X_train[i] = X_train[i].T[:209]
    
# print(len(X_train[4]))
X_train = np.array(X_train)
X_train_flattened = X_train.reshape(X_train.shape[0], -1)
from sklearn.svm import SVC
classifier = SVC(kernel='rbf')
print(type(X_train_flattened))
classifier.fit(X_train_flattened, y_train)


<class 'numpy.ndarray'>


In [156]:
X_train_flattened.shape

(6000, 5643)

In [None]:
X_test = np.sample

In [27]:
# Use the trained SVM to predict labels for the test set
X_test = data_shape_correction(X_test)
X_test = np.array(X_test)
X_test_flattened = X_test.reshape(X_test.shape[0], -1)
print((X_train.shape))
y_pred = classifier.predict(X_test_flattened)

(6000, 27, 209)


In [28]:
from sklearn.metrics import accuracy_score
# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy*100:.2f}%')

Accuracy: 96.60%


In [29]:
import pickle

# Assuming 'model' is your trained model
with open('svm_mfcc_ldb_column_sliced_209.pkl', 'wb') as file:
    pickle.dump(classifier, file)


In [31]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix
# Accuracy
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {acc:.4f}")

# Precision
precision = precision_score(y_true=y_test, y_pred=y_pred, average='weighted')
print(f"Precision: {precision:.4f}")

# Recall
recall = recall_score(y_true=y_test, y_pred=y_pred, average='weighted')
print(f"Recall: {recall:.4f}")

# F1 Score
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
print(f"F1 Score: {f1:.4f}")

# Classification Report
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", class_report)

# Confusion Matrix
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9660
Precision: 0.9661
Recall: 0.9660
F1 Score: 0.9659
Classification Report:
                    precision    recall  f1-score   support

Benjamin_Netanyau       0.94      0.94      0.94       296
 Jens_Stoltenberg       0.98      0.93      0.96       302
    Julia_Gillard       0.96      0.98      0.97       299
  Magaret_Tarcher       0.95      0.97      0.96       326
   Nelson_Mandela       0.99      1.00      1.00       277

         accuracy                           0.97      1500
        macro avg       0.97      0.97      0.97      1500
     weighted avg       0.97      0.97      0.97      1500

Confusion Matrix:
 [[279   5   3   9   0]
 [  9 282   7   4   0]
 [  1   0 294   2   2]
 [  7   1   1 317   0]
 [  0   0   0   0 277]]


In [153]:
# pip install pyrubberband


In [None]:
# model_path = '../ML_project/Models2'
# models = [pickle.load(open(model_path+'/'+fname,'rb')) for fname in os.listdir(model_path)]
# speakers   = [fname.split('.')[0] for fname in os.listdir(model_path)] 

In [None]:
y_test = []
y_pred = []
for Class,x in tqdm(test_dict.items()):
    for file in x:
      n_steps = random.uniform(-4,4)
      rate = random.uniform(0.7,1.3)
      file_path = '../ML_project/16000_pcm_speeches' + '/' + Class + '/' + file
      curr_fea = augment_test_audio_randomly(file_path,n_steps,rate)
      # print(curr_fea.shape)
      y_test.append(Class)
      log_likelihood = np.zeros(len(models)) 
    #   curr_fea = curr_fea.reshape(1,-1)
      for i in range(len(models)):
          gmm = models[i]         #checking with each model one by one
          scores = np.array(gmm.score(curr_fea))
          log_likelihood[i] = scores.sum()
    
      winner = np.argmax(log_likelihood)
      y_pred.append(speakers[winner])

100%|██████████| 5/5 [08:25<00:00, 101.08s/it]


In [None]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_true=y_test,y_pred=y_pred)
print(acc)

0.9906666666666667


# SVM