In [22]:
import librosa
import numpy as np
from librosa.core.spectrum import _spectrogram
import os
import soundfile as sf
from tqdm import tqdm
from audiomentations import AddBackgroundNoise, PolarityInversion
from audiomentations import Compose, AddGaussianNoise, PitchShift, HighPassFilter,\
    AddGaussianSNR,AddShortNoises
import random
from sklearn import mixture
import pickle
import pyrubberband

In [32]:
import warnings
warnings.filterwarnings('ignore')


In [2]:
speaker_folders = [
    "Benjamin_Netanyau",
    "Jens_Stoltenberg",
    "Julia_Gillard",
    "Magaret_Tarcher",
    "Nelson_Mandela"
]


In [3]:
def combine_and_save_audio(path,speaker):
    files = os.listdir(path)
    # files = sorted(files)
    # print(sorted(files))
    combined_audio = []
    temp_sr = 0
    for i in range(len(files)):
        # print(files[i])
        file_path = path + '/' + str(i) + '.wav'
        x,sr = librosa.load(file_path)
        temp_sr = sr
        combined_audio.extend(x)
    output_file_path = '../ML_project/combined_audios/' + speaker + '.wav'
    sf.write(output_file_path,combined_audio,temp_sr)


In [28]:
for i in tqdm(os.listdir('../ML_project/16000_pcm_speeches')):
    combine_and_save_audio('../ML_project/16000_pcm_speeches/'+i,i)

100%|██████████| 5/5 [00:52<00:00, 10.52s/it]


In [4]:
def combined_audio_with_noises(path,speaker):
    x,sr = librosa.load(path)
    transform = AddBackgroundNoise(
        sounds_path="../ML_project/_background_noise_",
        min_snr_in_db=3.0,
        max_snr_in_db=30.0,
        noise_transform=PolarityInversion(),
        p=1.0)
    augmented_sound = transform(x, sample_rate=sr)
    output_file_path = '../ML_project/combindes_audio_with_noises/' + speaker + '.wav'
    sf.write(output_file_path,augmented_sound,sr)




In [35]:
def extract_features(path):
    x,sr = librosa.load(path)
    x_pre_emp = librosa.effects.preemphasis(x)
    x_mfcc = librosa.feature.mfcc(y=x_pre_emp,sr=sr,n_mfcc=13)
    # gfccs  = gfcc(x_pre_emp, num_ceps=13,fs=sr)
    # chroma = librosa.feature.chroma_stft(y=x_pre_emp,sr=sr,n_chroma=12)
    # print(x_mfcc.shape)
    # # print(gfccs.shape)
    # print(chroma.shape)
    delta_mfcc = librosa.feature.delta(x_mfcc)
    double_delta_mfcc = librosa.feature.delta(delta_mfcc)
    # delta_gfcc = librosa.feature.delta(gfccs)
    # double_delta_gfcc = librosa.feature.delta(delta_gfcc)
    # delta_chroma = librosa.feature.delta(chroma)
    # double_delta_chroma = librosa.feature.delta(delta_chroma)
    combined = np.vstack((x_mfcc,delta_mfcc,double_delta_mfcc))
                        #   chroma.T,delta_chroma.T,double_delta_chroma.T)) 
    # print(combined.shape)'../Kaggle_challenge2/train/bed/00176480_nohash_0.wav'
    return combined.T

In [5]:
def calculate_features(x,sr=22050):
    x_pre_emp = librosa.effects.preemphasis(x)
    x_mfcc = librosa.feature.mfcc(y=x_pre_emp,sr=sr,n_mfcc=13)
    delta_mfcc = librosa.feature.delta(x_mfcc)
    double_delta_mfcc = librosa.feature.delta(delta_mfcc)
    combined = np.vstack((x_mfcc,delta_mfcc,double_delta_mfcc))
    return combined.T


In [6]:
# random.uniform(3,10)

In [26]:
def augment_audio_extract_features(path,n_steps,rate,flag_value):
    x,sr = librosa.load(path)
    # x_pre_emp = librosa.effects.preemphasis(x)
    
    
    augmented_pitch_shift = pyrubberband.pyrb.pitch_shift(x,n_steps=n_steps,sr=sr)
    augmented_time_stretch = pyrubberband.pyrb.time_stretch(x,rate=rate,sr=sr)

    transform_BGN = AddBackgroundNoise(
        sounds_path="../ML_project/_background_noise_",
        min_snr_in_db=3.0, 
        max_snr_in_db=30.0,
        noise_transform=PolarityInversion(),
        p=1.0)
    augmented_noise = transform_BGN(x, sample_rate=sr)
    add_gaussian_white_noise_transform = AddGaussianSNR(
        min_snr_db=5.0,
        max_snr_db=40.0,
        p=0.2
    )
    add_short_noises_transform = AddShortNoises(
        sounds_path="../ML_project/combined_audios",
        min_snr_in_db=3.0,
        max_snr_in_db=30.0,
        noise_rms="relative_to_whole_input",
        min_time_between_sounds=2.0,
        max_time_between_sounds=8.0,
        noise_transform=PolarityInversion(),
        p=0.5
    )
    add_gaussian_white_noise = add_gaussian_white_noise_transform(x,sr)
    add_short_noises = add_short_noises_transform(x,sr)
    
    features_x = calculate_features(x,sr)
    pitch_shift_features = calculate_features(augmented_pitch_shift,sr)
    time_stretch_features = calculate_features(augmented_time_stretch,sr)
    augmented_noise_feature = calculate_features(augmented_noise,sr)
    # print(features_x.shape)
    # print(pitch_shift_features.shape)
    # print(time_stretch_features.shape)
    
    # augmented_noise_feature = calculate_features()
    if(flag_value>0.5):
        white_noise_features = calculate_features(add_gaussian_white_noise,sr)
        short_voices_feature = calculate_features(add_short_noises,sr)
        return np.vstack((features_x,white_noise_features,pitch_shift_features,\
                         time_stretch_features,augmented_noise_feature,short_voices_feature))
    return np.vstack((features_x,pitch_shift_features,time_stretch_features,augmented_noise_feature))

    


    # sf.write('../ML_project/temp.wav',augmented_noise,sr)



In [42]:
def augment_test_audio_randomly(path,n_steps,rate):
    x,sr = librosa.load(path)
    augmented_pitch_shift = pyrubberband.pyrb.pitch_shift(x,n_steps=n_steps,sr=sr)
    augmented_time_stretch = pyrubberband.pyrb.time_stretch(x,rate=rate,sr=sr)

    transform_BGN = AddBackgroundNoise(
        sounds_path="../ML_project/_background_noise_",
        min_snr_in_db=3.0,
        max_snr_in_db=30.0,
        noise_transform=PolarityInversion(),
        p=1.0)
    augmented_noise = transform_BGN(x, sample_rate=sr)
    add_gaussian_white_noise_transform = AddGaussianSNR(
        min_snr_db=5.0,
        max_snr_db=40.0,
        p=0.2
    )
    add_short_noises_transform = AddShortNoises(
        sounds_path="../ML_project/combined_audios",
        min_snr_in_db=3.0,
        max_snr_in_db=30.0,
        noise_rms="relative_to_whole_input",
        min_time_between_sounds=2.0,
        max_time_between_sounds=8.0,
        noise_transform=PolarityInversion(),
        p=0.5
    )
    add_gaussian_white_noise = add_gaussian_white_noise_transform(x,sr)
    add_short_noises = add_short_noises_transform(x,sr)
    
    features_x = calculate_features(x,sr)
    pitch_shift_features = calculate_features(augmented_pitch_shift,sr)
    time_stretch_features = calculate_features(augmented_time_stretch,sr)
    augmented_noise_feature = calculate_features(augmented_noise,sr)
    # print(features_x.shape)
    # print(pitch_shift_features.shape)
    # print(time_stretch_features.shape)
    
    # augmented_noise_feature = calculate_features()
    # if(flag_value>0.5):
    white_noise_features = calculate_features(add_gaussian_white_noise,sr)
    short_voices_feature = calculate_features(add_short_noises,sr)
    #     return np.vstack((features_x,white_noise_features,pitch_shift_features,\
    #                      time_stretch_features,augmented_noise_feature,short_voices_feature))
    a = np.random.randint(1,6)
    if a==1:
        return features_x
    elif a==2:
        return white_noise_features
    elif a==3:
        return short_voices_feature
    elif a==4:
        return pitch_shift_features
    elif a==5:
        return time_stretch_features
    return augmented_noise_feature

In [29]:
# temp_feature = augment_audio_extract_features('../ML_project/16000_pcm_speeches/Benjamin_Netanyau/18.wav',4,1.2,0.3)
# temp_feature.shape

In [21]:
# for i in tqdm(os.listdir('../ML_project/combined_audios')):
#     combined_audio_with_noises('../ML_project/combined_audios/'+i,i)

In [23]:
train_dict = {}
test_dict = {}
for i in os.listdir('../ML_project/16000_pcm_speeches'):
    folder_data = os.listdir('../ML_project/16000_pcm_speeches'+'/'+i)
    split_80 = int(len(folder_data) * 0.8)
    split_20 = len(folder_data) - split_80
    list_80 = random.sample(folder_data, split_80)
    list_20 = [elem for elem in folder_data if elem not in list_80]
    train_dict[i] = list_80
    test_dict[i] = list_20

In [33]:
from tqdm import tqdm
for Class,x in tqdm(train_dict.items()):
   count = 0
   features = np.asarray(())
   for file in x:
      n_steps = random.uniform(-4,4)
      rate = random.uniform(0.7,1.3)
      flag_value = random.uniform(0,1)
      file_path = '../ML_project/16000_pcm_speeches' + '/' + Class + '/' + file
      curr_fea = augment_audio_extract_features(file_path,n_steps=n_steps,rate=rate,\
                                                flag_value=flag_value)
      # print(curr_fea.T.shape)
      if(count == 0):
         features = curr_fea
         count+=1
      else:
         features = np.vstack((features, curr_fea))
   gmm1 = mixture.GaussianMixture(n_components = 48, covariance_type='full',n_init = 3,max_iter=10000,init_params='k-means++')
   gmm1.fit(features)

   picklefile = '../ML_project/Models2/'+Class+'.gmm'
   pickle.dump(gmm1, open(picklefile, 'wb'))

100%|██████████| 5/5 [9:23:29<00:00, 6761.92s/it]  


In [34]:
model_path = '../ML_project/Models2'
models = [pickle.load(open(model_path+'/'+fname,'rb')) for fname in os.listdir(model_path)]
speakers   = [fname.split('.')[0] for fname in os.listdir(model_path)] 

In [43]:
y_test = []
y_pred = []
for Class,x in tqdm(test_dict.items()):
    for file in x:
      n_steps = random.uniform(-4,4)
      rate = random.uniform(0.7,1.3)
      file_path = '../ML_project/16000_pcm_speeches' + '/' + Class + '/' + file
      curr_fea = augment_test_audio_randomly(file_path,n_steps,rate)
      # print(curr_fea.shape)
      y_test.append(Class)
      log_likelihood = np.zeros(len(models)) 
    #   curr_fea = curr_fea.reshape(1,-1)
      for i in range(len(models)):
          gmm = models[i]         #checking with each model one by one
          scores = np.array(gmm.score(curr_fea))
          log_likelihood[i] = scores.sum()
    
      winner = np.argmax(log_likelihood)
      y_pred.append(speakers[winner])

100%|██████████| 5/5 [08:25<00:00, 101.08s/it]


In [44]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_true=y_test,y_pred=y_pred)
print(acc)

0.9906666666666667
