In [None]:
import IPython.display as ipd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import librosa
import librosa.display
import os
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import Dense,ReLU,Activation,Conv2D,Lambda,Input,BatchNormalization
from keras.layers.advanced_activations import LeakyReLU 
from keras import optimizers

In [None]:
def create_dataset(num_samples, voice_dir, l, amp, x, y, gap_len=7000, seed=None):
    """Build paired (corrupted, clean) STFT examples from the audio files in a directory.

    Each file is loaded at its native sampling rate, cropped or zero-padded to
    exactly ``l`` samples and peak-normalised to ``amp``. The STFT of that
    signal is the target. A copy of the signal with 4 to 6 randomly placed
    runs of ``gap_len`` zeroed samples is then transformed to give the input.

    Parameters
    ----------
    num_samples : int
        Maximum number of files to read from ``voice_dir``.
    voice_dir : str
        Directory containing the audio files.
    l : int
        Length, in samples, every clip is cropped or padded to.
    amp : float
        Peak absolute amplitude after normalisation.
    x, y : list
        Lists that receive the corrupted and clean STFT matrices. They are
        appended to in place and also returned.
    gap_len : int, optional
        Number of samples zeroed for each gap (default 7000).
    seed : int or None, optional
        If given, gap positions are drawn from a private
        ``np.random.RandomState(seed)``; otherwise the global ``np.random``
        state is used.

    Returns
    -------
    tuple
        ``(x, y)``, the same list objects that were passed in.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # os.listdir returns entries in arbitrary order; sort so the same files
    # are selected on every run.
    voice_files = sorted(os.listdir(voice_dir))[:num_samples]

    for i, voice_file in enumerate(voice_files):
        # sr=None keeps the file's own sampling rate; len(v) / Fs is the duration in seconds.
        v, Fs = librosa.load(os.path.join(voice_dir, voice_file), sr=None)

        # Force every clip to exactly l samples.
        if len(v) >= l:
            v = v[:l]
        else:
            v_t = np.zeros(l)
            v_t[:len(v)] = v
            v = v_t

        # Normalise by the peak magnitude (not the signed maximum) and leave
        # an all-zero clip untouched instead of dividing by zero.
        peak = np.max(np.abs(v))
        if peak > 0:
            v = v * amp / peak
        y_t = librosa.stft(v)

        # Punch 4, 5 or 6 silent gaps at random start positions.
        m = rng.randint(0, len(v), size=rng.choice([4, 5, 6]))
        for j in m:
            v[j:j + gap_len] = 0

        x_t = librosa.stft(v)
        x.append(x_t)
        y.append(y_t)

        # Lightweight progress indicator: print every tenth index.
        if i % 10 == 0:
            print(i, end=' ')

    return (x, y)
            

In [None]:
# Dataset configuration.
num_samples=25   # maximum number of clips read from voice_dir
l = 250000       # every clip is cropped / zero-padded to this many samples
amp=1            # amplitude the clips are normalised to
voice_dir='/kaggle/input/common-voice/cv-valid-train/cv-valid-train'
SEED=42          # fixes the random gap positions so a full re-run reproduces the same data
x=[]
y=[]

import warnings

# create_dataset draws gap positions from np.random, so seed it right before
# the build to make Restart & Run All deterministic.
np.random.seed(SEED)

# Warnings raised while loading and transforming the audio are silenced for this call only.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    x, y = create_dataset(num_samples, voice_dir, l, amp, x, y)

In [None]:
def _show_signal(label, spec, rate=48000):
    """Print ``label``, then show waveform, spectrogram and audio player for one STFT matrix.

    Parameters
    ----------
    label : str
        Heading printed before the figures.
    spec : np.ndarray
        STFT matrix; it is inverted with ``librosa.istft`` to get the waveform.
    rate : int, optional
        Sampling rate used for both audio playback and the spectrogram axes.

    Returns
    -------
    np.ndarray
        The reconstructed time-domain signal.
    """
    print(label)
    wave = librosa.istft(spec)

    plt.figure(figsize=(8,2))
    plt.plot(wave)
    plt.show()

    fig, ax = plt.subplots()
    # specshow assumes sr=22050 unless told otherwise; pass the playback rate
    # so the time and frequency axes agree with the audio that is played.
    img = librosa.display.specshow(librosa.amplitude_to_db(np.abs(spec),ref=np.max),
                                   sr=rate, y_axis='log', x_axis='time', ax=ax)
    ax.set_title('Power spectrogram')
    fig.colorbar(img, ax=ax, format="%+2.0f dB")
    plt.show()

    ipd.display(ipd.Audio(data=wave, rate=rate))
    return wave


# Index of the example to inspect.
n=17
ip = _show_signal('RAW', x[n])     # corrupted input
op = _show_signal('IDEAL', y[n])   # clean target


In [None]:
def conv_network(input_shape=(1025,489,1)):
    """Build a fully convolutional spectrogram-to-spectrogram model.

    The network is a stack of eight ``Conv2D -> BatchNormalization ->
    LeakyReLU(0.2)`` stages. All convolutions use stride 1 with "SAME"
    padding, so the output has the same height and width as the input and a
    single channel.

    Parameters
    ----------
    input_shape : tuple, optional
        ``(height, width, channels)`` of one input example. The default
        ``(1025, 489, 1)`` presumably corresponds to ``librosa.stft`` with its
        default settings on a 250000-sample clip (confirm if the clip length
        or STFT parameters change).

    Returns
    -------
    keras.models.Model
        The uncompiled model.
    """
    # (filters, kernel_size) of each stage, in order from input to output.
    stages = [(64, 9), (64, 9), (32, 9), (16, 9), (8, 6), (4, 6), (2, 6), (1, 6)]

    ip = Input(shape=input_shape)
    x = ip
    for filters, kernel_size in stages:
        x = Conv2D(filters, kernel_size=kernel_size, strides=(1,1), padding="SAME")(x)
        x = BatchNormalization()(x)
        x = LeakyReLU(0.2)(x)

    model = Model(inputs=ip, outputs=x)
    return model

In [None]:
# Build the convolutional network and print its layer-by-layer summary.
cn=conv_network()
cn.summary()

In [None]:
# Draw the model graph and write it to Model1.png (plot_model is aliased as `pl`).
# NOTE(review): plot_model presumably needs pydot and graphviz installed; confirm in the target environment.
from keras.utils import plot_model as pl
pl(cn, to_file='Model1.png')

In [None]:
# Stack the per-clip STFT matrices collected by create_dataset into single arrays.
# NOTE(review): librosa.stft returns complex matrices while the network is
# real-valued; confirm how the backend converts them (magnitude vs. real part).
x=np.array(x)
y=np.array(y)

# `lr` is a deprecated alias for this argument; `learning_rate` is the supported keyword.
opt=optimizers.Adam(learning_rate=0.004)

# NOTE(review): 'accuracy' is not a meaningful metric for an MSE regression; it is
# kept because the plotting cell below reads history.history['accuracy'].
cn.compile(optimizer=opt, loss='mean_squared_error',metrics=['accuracy'])

# The last 10% of the examples are held out for validation.
history=cn.fit(x,y,batch_size=8,epochs=60,verbose=1,validation_split=0.1)

In [None]:
# Plot the training curve for each recorded quantity, plus the validation
# curve when one was recorded, so every legend entry matches a plotted line.
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    labels = ['train']

    val_key = 'val_' + metric
    if val_key in history.history:
        plt.plot(history.history[val_key])
        labels.append('test')

    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(labels, loc='upper left')
    plt.show()

In [None]:
def _show_spectrogram(label, spec, wave, rate=48000):
    """Print ``label``, draw the dB-scaled spectrogram of ``spec`` and show a player for ``wave``.

    Parameters
    ----------
    label : str
        Heading printed before the figure.
    spec : np.ndarray
        STFT matrix (complex or real); only its magnitude is plotted.
    wave : np.ndarray
        Time-domain signal offered for playback.
    rate : int, optional
        Sampling rate used for both the spectrogram axes and audio playback.
    """
    print(label)
    fig, ax = plt.subplots()
    # Take the magnitude explicitly and pass the sampling rate: specshow would
    # otherwise assume 22050 Hz and mis-scale both axes.
    img = librosa.display.specshow(librosa.amplitude_to_db(np.abs(spec),ref=np.max),
                                   sr=rate, y_axis='log', x_axis='time', ax=ax)
    ax.set_title('Power spectrogram')
    fig.colorbar(img, ax=ax, format="%+2.0f dB")
    plt.show()
    ipd.display(ipd.Audio(data=wave, rate=rate))


# Example to inspect. The requested index (179) lies beyond the clips loaded
# above, so it is clamped to the last available example instead of raising IndexError.
k=min(179, len(x)-1)
i=x[k]   # corrupted input STFT
v=y[k]   # clean target STFT

# The model expects a leading batch axis; its output is reshaped back to the STFT shape.
t=np.expand_dims(i, axis=0)
o=cn.predict(t)
o=np.reshape(o, i.shape)

# Back to the time domain for listening.
op=librosa.istft(o)
ip=librosa.istft(i)
val=librosa.istft(v)

_show_spectrogram('RAW', i, ip)
_show_spectrogram('PREDICTION', o, op)
_show_spectrogram('IDEAL', v, val)