In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv

import matplotlib.pyplot as plt
from IPython.display import Audio

from scipy.io import wavfile

In [None]:
import scipy.signal
import numpy as np
import librosa
#from noisereduce.plotting import plot_reduction_steps
from tqdm.autonotebook import tqdm
import warnings
import copy


def _stft(y, n_fft, hop_length, win_length, use_tensorflow=False):
    """Compute the STFT of ``y`` with the selected backend.

    Arguments:
        y -- waveform to transform
        n_fft -- FFT size forwarded to the backend
        hop_length -- hop between successive frames, forwarded to the backend
        win_length -- analysis window length, forwarded to the backend

    Keyword Arguments:
        use_tensorflow {bool} -- dispatch to ``_stft_tensorflow`` instead of
            ``librosa.stft`` (default: {False})

    Returns:
        whatever the chosen backend returns for the transform of ``y``
    """
    if not use_tensorflow:
        return librosa.stft(
            y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=True
        )
    return _stft_tensorflow(y, n_fft, hop_length, win_length)


def _istft(y, n_fft, hop_length, win_length, use_tensorflow=False):
    """Invert an STFT back to a waveform with the selected backend.

    Arguments:
        y -- complex STFT matrix as produced by ``_stft``
        n_fft -- FFT size (only forwarded to the tensorflow backend)
        hop_length -- hop between successive frames
        win_length -- synthesis window length

    Keyword Arguments:
        use_tensorflow {bool} -- dispatch to ``_istft_tensorflow`` instead of
            ``librosa.istft`` (default: {False})

    Returns:
        the reconstructed waveform returned by the chosen backend
    """
    if use_tensorflow:
        # _stft_tensorflow returns its result transposed, so transpose back
        # before handing the matrix to the tensorflow inverse.
        return _istft_tensorflow(y.T, n_fft, hop_length, win_length)
    # hop_length / win_length are passed by keyword: recent librosa releases
    # reject them as positional arguments.
    return librosa.istft(y, hop_length=hop_length, win_length=win_length)


def _stft_librosa(y, n_fft, hop_length, win_length):
    """Compute the STFT of ``y`` with ``librosa.stft`` using centered frames."""
    stft_options = dict(
        n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=True
    )
    return librosa.stft(y=y, **stft_options)


def _istft_librosa(y, hop_length, win_length):
    """Invert the STFT matrix ``y`` to a waveform with ``librosa.istft``.

    ``hop_length`` and ``win_length`` are passed by keyword because recent
    librosa releases reject them as positional arguments.
    """
    return librosa.istft(y, hop_length=hop_length, win_length=win_length)


def _stft_tensorflow(y, n_fft, hop_length, win_length):
    """Compute the STFT of ``y`` with ``tf.signal.stft`` (Hann window, end padding).

    The tensorflow result is converted to a numpy array and transposed before
    it is returned (presumably to match the layout ``librosa.stft`` produces --
    TODO confirm).
    """
    frames = tf.signal.stft(
        y,
        frame_length=win_length,
        frame_step=hop_length,
        fft_length=n_fft,
        window_fn=tf.signal.hann_window,
        pad_end=True,
    )
    return frames.numpy().T


def _istft_tensorflow(y, n_fft, hop_length, win_length):
    """Invert an STFT with ``tf.signal.inverse_stft`` and return a numpy array.

    ``y`` is cast to complex64 before it is handed to tensorflow.
    """
    spectrogram = y.astype(np.complex64)
    waveform = tf.signal.inverse_stft(spectrogram, win_length, hop_length, n_fft)
    return waveform.numpy()


def _amp_to_db(x):
    """Convert amplitudes to decibels via librosa (ref=1.0, amin=1e-20, top_db=80.0)."""
    to_db = librosa.core.amplitude_to_db
    return to_db(x, ref=1.0, amin=1e-20, top_db=80.0)


def _db_to_amp(x):
    """Convert decibel values back to amplitudes via librosa (ref=1.0)."""
    to_amplitude = librosa.core.db_to_amplitude
    return to_amplitude(x, ref=1.0)


def update_pbar(pbar, message):
    """Relabel a progress bar with ``message`` and advance it by one step.

    Does nothing when ``pbar`` is None.
    """
    if pbar is None:
        return
    pbar.set_description(message)
    pbar.update(1)


def _smoothing_filter(n_grad_freq, n_grad_time):
    """Generates a filter to smooth the mask for the spectrogram
        
    Arguments:
        n_grad_freq {[type]} -- [how many frequency channels to smooth over with the mask.]
        n_grad_time {[type]} -- [how many time channels to smooth over with the mask.]
    """

    smoothing_filter = np.outer(
        np.concatenate(
            [
                np.linspace(0, 1, n_grad_freq + 1, endpoint=False),
                np.linspace(1, 0, n_grad_freq + 2),
            ]
        )[1:-1],
        np.concatenate(
            [
                np.linspace(0, 1, n_grad_time + 1, endpoint=False),
                np.linspace(1, 0, n_grad_time + 2),
            ]
        )[1:-1],
    )
    smoothing_filter = smoothing_filter / np.sum(smoothing_filter)
    return smoothing_filter


def mask_signal(sig_stft, sig_mask):
    """Attenuate time/frequency regions of a spectrogram according to a mask.

    Arguments:
        sig_stft -- spectrogram of the signal
        sig_mask -- mask to apply; each entry is the fraction to remove
            (0 keeps the bin unchanged, 1 zeroes it)

    Returns:
        the spectrogram multiplied element-wise by ``1 - sig_mask``
    """
    keep_fraction = 1 - sig_mask
    return sig_stft * keep_fraction


def convolve_gaussian(sig_mask, smoothing_filter, use_tensorflow=False):
    """Convolve a mask (or any 2-D image) with a smoothing filter.

    Arguments:
        sig_mask -- the signal mask
        smoothing_filter -- the 2-D filter to convolve with

    Keyword Arguments:
        use_tensorflow {bool} -- use ``tf.nn.conv2d`` instead of
            ``scipy.signal.fftconvolve`` (default: {False})

    Returns:
        the smoothed mask as a numpy array
    """
    if not use_tensorflow:
        return scipy.signal.fftconvolve(sig_mask, smoothing_filter, mode="same")

    # The tensorflow path rescales the filter by a factor derived from its
    # width before convolving; the factor is kept exactly as in the scipy-free
    # branch of the original implementation.
    gain = (np.shape(smoothing_filter)[1] - 1) / 2 + 1
    kernel = (smoothing_filter * gain)[:, :, tf.newaxis, tf.newaxis].astype("float32")
    img = sig_mask[:, :, tf.newaxis, tf.newaxis].astype("float32")
    smoothed = tf.nn.conv2d(img, kernel, strides=[1, 1, 1, 1], padding="SAME")
    return smoothed.numpy().squeeze()


def load_tensorflow(verbose=False):
    """Load tensorflow if it is available.

    Used as a backend for fft and convolution. On success the module is bound
    to the global name ``tf`` so the tensorflow-backed helpers can use it.

    Keyword Arguments:
        verbose {bool} -- print the GPUs tensorflow can see (default: {False})

    Returns:
        bool -- whether to use tensorflow (False, with a warning, when it is
        missing, fails to load, or is older than 2.0)
    """
    try:
        tf_module = __import__("tensorflow")
        globals()["tf"] = tf_module

        if verbose:
            available_gpus = tf_module.config.experimental.list_physical_devices("GPU")
            print("GPUs available: {}".format(available_gpus))
        # Parse the whole major component so that e.g. "10.0" is not read as 1.
        major_version = int(tf_module.__version__.split(".")[0])
    except ImportError:
        warnings.warn(
            "Tensorflow is not installed, reverting to non-tensorflow backend"
        )
        return False
    except Exception as err:
        # Stay best-effort for any other load failure, but report what
        # actually went wrong (and do not trap KeyboardInterrupt/SystemExit).
        warnings.warn(
            "Tensorflow could not be loaded ({}), "
            "reverting to non-tensorflow backend".format(err)
        )
        return False

    if major_version < 2:
        warnings.warn(
            "Tensorflow version is below 2.0, reverting to non-tensorflow backend"
        )
        return False
    return True


def reduce_noise(
    audio_clip,
    noise_clip=None,
    n_grad_freq=2,
    n_grad_time=4,
    n_fft=2048,
    win_length=2048,
    hop_length=512,
    n_std_thresh=1.5,
    prop_decrease=1.0,
    pad_clipping=True,
    use_tensorflow=False,
    verbose=False,
):
    """Remove noise from audio based upon a clip containing only noise
    Args:
        audio_clip (array): Waveform of audio
        noise_clip (array): Waveform containing only noise. When None, the noise statistics are computed from `audio_clip` itself.
        n_grad_freq (int): how many frequency channels to smooth over with the mask.
        n_grad_time (int): how many time channels to smooth over with the mask.
        n_fft (int): FFT size handed to the STFT; windowed frames are zero-padded to this length.
        win_length (int): Each frame of audio is windowed by `window()`. The window will be of length `win_length` and then padded with zeros to match `n_fft`.
        hop_length (int): number of audio samples between STFT columns.
        n_std_thresh (int): how many standard deviations louder than the mean dB of the noise (at each frequency level) to be considered signal
        prop_decrease (float): To what extent should you decrease noise (1 = all, 0 = none)
        pad_clipping (bool): Pad the signals with zeros to ensure that the reconstructed data is equal length to the data
        use_tensorflow (bool): Use tensorflow as a backend for convolution and fft to speed up computation
        verbose (bool): Whether to show a progress bar and plot the steps of the algorithm (plots are skipped with a warning when `plot_reduction_steps` is not available)
    Returns:
        array: The recovered signal with noise subtracted
    """
    # load tensorflow if you are using it as a backend
    if use_tensorflow:
        use_tensorflow = load_tensorflow(verbose)

    pbar = tqdm(total=7) if verbose else None

    # STFT over signal
    update_pbar(pbar, "STFT on signal")

    # pad signal with zeros to avoid extra frames being clipped if desired
    if pad_clipping:
        nsamp = len(audio_clip)
        audio_clip = np.pad(audio_clip, [0, hop_length], mode="constant")

    sig_stft = _stft(
        audio_clip, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
    )
    # spectrogram of signal in dB
    sig_stft_db = _amp_to_db(np.abs(sig_stft))

    update_pbar(pbar, "STFT on noise")
    # STFT over noise
    if noise_clip is None:
        # No separate noise clip: estimate the noise statistics from the
        # signal itself. The spectrogram is only read below, so no copy is
        # needed.
        noise_stft_db = sig_stft_db
    else:
        noise_stft = _stft(
            noise_clip, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
        )
        noise_stft_db = _amp_to_db(np.abs(noise_stft))  # convert to dB

    # Calculate statistics over noise (one value per frequency row)
    mean_freq_noise = np.mean(noise_stft_db, axis=1)
    std_freq_noise = np.std(noise_stft_db, axis=1)
    noise_thresh = mean_freq_noise + std_freq_noise * n_std_thresh

    update_pbar(pbar, "Generate mask")

    # Per-frequency threshold as a column vector; it broadcasts across the
    # time axis of the spectrogram in the comparison below.
    db_thresh = noise_thresh[:, np.newaxis]
    # The mask marks bins that fall BELOW the threshold, i.e. the bins that
    # mask_signal will attenuate.
    sig_mask = sig_stft_db < db_thresh

    update_pbar(pbar, "Smooth mask")
    # Create a smoothing filter for the mask in time and frequency
    smoothing_filter = _smoothing_filter(n_grad_freq, n_grad_time)

    # convolve the mask with a smoothing filter
    sig_mask = convolve_gaussian(sig_mask, smoothing_filter, use_tensorflow)
    sig_mask = sig_mask * prop_decrease

    update_pbar(pbar, "Apply mask")
    # mask the signal
    sig_stft_amp = mask_signal(sig_stft, sig_mask)

    update_pbar(pbar, "Recover signal")
    # recover the signal
    recovered_signal = _istft(
        sig_stft_amp, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
    )
    # fix the recovered signal length if padding signal
    if pad_clipping:
        # ``size`` is passed by keyword: recent librosa releases reject it as
        # a positional argument.
        recovered_signal = librosa.util.fix_length(recovered_signal, size=nsamp)

    if verbose:
        # The plotting helper is optional; look it up at call time so that a
        # missing import degrades to a warning instead of a NameError.
        plotter = globals().get("plot_reduction_steps")
        if plotter is None:
            warnings.warn("plot_reduction_steps is not available, skipping plots")
        else:
            # The spectrogram of the result is only needed for plotting.
            recovered_spec = _amp_to_db(
                np.abs(
                    _stft(
                        recovered_signal,
                        n_fft,
                        hop_length,
                        win_length,
                        use_tensorflow=use_tensorflow,
                    )
                )
            )
            plotter(
                noise_stft_db,
                mean_freq_noise,
                std_freq_noise,
                noise_thresh,
                smoothing_filter,
                sig_stft_db,
                sig_mask,
                recovered_spec,
            )

    if pbar is not None:
        pbar.close()
    return recovered_signal

  """


In [None]:
!git clone https://github.com/resemble-ai/Resemblyzer.git

Cloning into 'Resemblyzer'...
remote: Enumerating objects: 602, done.[K
remote: Total 602 (delta 0), reused 0 (delta 0), pack-reused 602[K
Receiving objects: 100% (602/602), 101.46 MiB | 37.35 MiB/s, done.
Resolving deltas: 100% (107/107), done.


In [None]:
cd Resemblyzer

/content/Resemblyzer


In [None]:
!pip install webrtcvad

Collecting webrtcvad
  Downloading webrtcvad-2.0.10.tar.gz (66 kB)
[K     |████████████████████████████████| 66 kB 2.7 MB/s 
[?25hBuilding wheels for collected packages: webrtcvad
  Building wheel for webrtcvad (setup.py) ... [?25l[?25hdone
  Created wheel for webrtcvad: filename=webrtcvad-2.0.10-cp37-cp37m-linux_x86_64.whl size=72370 sha256=b08f37c7abcdff68a324d33eeefd5073ae2d626e51595bbdcbdd1528a08d578a
  Stored in directory: /root/.cache/pip/wheels/11/f9/67/a3158d131f57e1c0a7d8d966a707d4a2fb27567a4fe47723ad
Successfully built webrtcvad
Installing collected packages: webrtcvad
Successfully installed webrtcvad-2.0.10


In [None]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
from scipy import signal
def ensure_sample_rate(original_sample_rate, waveform,
                       desired_sample_rate=16000):
  """Return ``(desired_sample_rate, waveform)``, resampling only when needed.

  When the two rates already match, the waveform is returned untouched.
  Otherwise it is resampled with ``scipy.signal.resample`` to the number of
  samples that keeps its duration at the desired rate.
  """
  if original_sample_rate == desired_sample_rate:
    return desired_sample_rate, waveform
  duration_seconds = float(len(waveform)) / original_sample_rate
  desired_length = int(round(duration_seconds * desired_sample_rate))
  return desired_sample_rate, signal.resample(waveform, desired_length)

In [None]:
import tensorflow_hub as hub
# Load the YAMNET model.
model = hub.load('https://tfhub.dev/google/yamnet/1')

In [None]:
import os
from resemblyzer import preprocess_wav, VoiceEncoder
from pathlib import Path
import soundfile as sf
from pydub import AudioSegment
from sklearn.decomposition import PCA
# PCA reducer; note that it is re-fitted on every file inside the loop below.
pca = PCA(n_components=2, random_state=42)
PATH = '/content/drive/MyDrive'
# Define data path
data_path = PATH + '/speech'
data_dir_list = os.listdir(data_path)
featset = []  # one flattened feature vector per audio file
labl = []     # name of the sub-folder each file came from
for dataset in data_dir_list:
    dataset_dir = os.path.join(data_path, dataset)
    # A real newline here; the previous '\\n' printed a literal backslash-n.
    print('Loaded the audio of dataset-' + '{}\n'.format(dataset))
    for file_name in os.listdir(dataset_dir):
        wav_fpath = os.path.join(dataset_dir, file_name)
        wav = preprocess_wav(wav_fpath)
        recov = reduce_noise(wav)
        # The file is converted to wav and re-read only to obtain its sample
        # rate; the decoded samples themselves are not used afterwards.
        mp3_file = AudioSegment.from_file(wav_fpath)
        mp3_file.export('newSong.wav', format="wav")
        wav_data, sample_rate1 = sf.read('newSong.wav', dtype=np.int16)
        # NOTE(review): sample_rate1 is the rate of the file on disk, while
        # `recov` comes from preprocess_wav, which may already have resampled
        # the audio -- confirm the two rates really match.
        sample_rate, wav_data = ensure_sample_rate(sample_rate1, recov)
        # NOTE(review): `recov` is derived from preprocess_wav output, not from
        # int16 samples; confirm that dividing by the int16 maximum is intended.
        waveform = wav_data / tf.int16.max
        scores, embeddings, spectrogram = model(waveform)
        out = np.array(embeddings)
        # Fit a 2-component PCA on this file's transposed embedding matrix and
        # flatten the projection into a single feature vector.
        t = pca.fit_transform(np.transpose(out))
        t = np.reshape(t, (t.shape[0] * t.shape[1]))
        featset.append(t)
        labl.append(dataset)

Loaded the audio of dataset-Spe2\n
Loaded the audio of dataset-Spe3\n
Loaded the audio of dataset-Spe1\n


In [None]:
features=np.array(featset)
labels=np.array(labl)

In [None]:
np.unique(labels)

array(['Spe1', 'Spe2', 'Spe3'], dtype='<U4')

In [None]:
# First-level (binary) target: 0 for 'Spe1', 1 for every other speaker.
lab = np.where(labels == 'Spe1', 0, 1).astype(np.uint8)

In [None]:
np.unique(lab)

array([0, 1], dtype=uint8)

In [None]:
import pickle

# Persist the PCA object. It is re-fitted for every file during feature
# extraction, so what is saved here is the fit from the last processed file.
with open("pca_model.pkl", "wb") as pca_file:
    pickle.dump(pca, pca_file)

Classifier model on the VoxConverse dataset

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn. model_selection import train_test_split
# First-level classifier: predicts `lab` (0 = 'Spe1', 1 = any other speaker)
# from the flattened per-file feature vectors.
clf1=RandomForestClassifier()
# Hold out 20% of the samples for evaluation.
# NOTE(review): no random_state is passed to the split or to the forest, so
# the reported scores presumably vary from run to run.
X_train,X_test,y_train,y_test=train_test_split(features,lab,shuffle=True,test_size=0.2)
clf1.fit(X_train,y_train)
pred=clf1.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,pred))
print(confusion_matrix(y_test,pred))

              precision    recall  f1-score   support

           0       1.00      0.57      0.73         7
           1       0.82      1.00      0.90        14

    accuracy                           0.86        21
   macro avg       0.91      0.79      0.82        21
weighted avg       0.88      0.86      0.84        21

[[ 4  3]
 [ 0 14]]


**Testing phase**

In [None]:
from resemblyzer import preprocess_wav, VoiceEncoder
from pathlib import Path
import soundfile as sf
#from pydub import AudioSegment

#give the file path to your audio file
audio_file_path = '/content/drive/MyDrive/BWAI Challenge 2 Dataset/Test Dataset/Test 19.wav'
wav_fpath = Path(audio_file_path)

wav = preprocess_wav(wav_fpath)
recov=reduce_noise(wav)


In [None]:
from pydub import AudioSegment

# Import the .mp3 file
mp3_file = AudioSegment.from_file(wav_fpath)

# Export the .mp3 file as wav
mp3_file.export('newSong.wav', format="wav")
wav_data, sample_rate1 = sf.read('newSong.wav', dtype=np.int16)

In [None]:
import tensorflow_hub as hub
# Load the model.
model = hub.load('https://tfhub.dev/google/yamnet/1')

In [None]:
  sample_rate, wav_data = ensure_sample_rate(sample_rate1, recov)
  duration = len(wav_data)/sample_rate
  waveform = wav_data / tf.int16.max
  scores, embeddings, spectrogram = model(waveform)

In [None]:
# Turn the embeddings of the single test file into one feature row, using the
# same transpose / PCA / flatten steps as the training loop.
spec=np.array(embeddings)
# NOTE(review): the PCA is re-fitted on this file rather than reusing a
# previously fitted projection -- confirm this is intended.
out=pca.fit_transform(np.transpose(spec))
# Shape (1, n_features) so the row can be passed straight to predict().
out=np.reshape(out,(1,out.shape[0]*out.shape[1]))

In [None]:
clf3.predict(out)

array([1])

In [None]:
import pickle
# Save the first-level (binary) classifier; the context manager makes sure the
# file is flushed and closed.
with open('model_binary_new.pkl', 'wb') as model_file:
    pickle.dump(clf2, model_file)

In [None]:
import pickle
# NOTE: unpickling can execute arbitrary code -- only load model files that
# you created yourself or otherwise trust.
with open('/content/drive/MyDrive/model_binary_new.pkl', 'rb') as model_file:
    clf2 = pickle.load(model_file)

In [None]:
import os
from resemblyzer import preprocess_wav, VoiceEncoder
from pathlib import Path
import soundfile as sf
from pydub import AudioSegment
from sklearn.decomposition import PCA
# Fresh PCA object; it is fitted anew on each test file in the loop below.
pca = PCA(n_components=2, random_state=42)

PATH = '/content/drive/MyDrive/BWAI Challenge 2 Dataset'
# Folder that holds the test recordings
data_path = PATH + '/Test Dataset'
data_dir_list = os.listdir(data_path)
test_featset = []  # one flattened feature vector per test file
for dataset in data_dir_list:
    # Every directory entry is treated as one audio file of the test set.
    wav_fpath = f"{data_path}/{dataset}"
    print(dataset)
    wav = preprocess_wav(wav_fpath)
    recov = reduce_noise(wav)
    # Convert to wav and read it back; only the sample rate is used afterwards.
    mp3_file = AudioSegment.from_file(wav_fpath)
    mp3_file.export('newSong.wav', format="wav")
    wav_data, sample_rate1 = sf.read('newSong.wav', dtype=np.int16)
    sample_rate, wav_data = ensure_sample_rate(sample_rate1, recov)
    duration = len(wav_data) / sample_rate
    waveform = wav_data / tf.int16.max
    scores, embeddings, spectrogram = model(waveform)
    out = np.array(embeddings)
    # Project the transposed embeddings onto two components and flatten.
    t = pca.fit_transform(out.T)
    t = t.reshape(t.shape[0] * t.shape[1])
    test_featset.append(t)

Test 17.wav
Test 15.wav
Test 16.wav
Test 11.wav
Test 14.wav
Test 12.wav
Test 1.wav
Test 13.wav
Test 21.wav
Test 25.wav
Test 20.wav
Test 2.wav
Test 23.wav
Test 18.wav
Test 24.wav
Test 19.wav
Test 4.wav
Test 22.wav
Test 3.wav
Test 5.wav
Test 6.wav
Test 8.wav
Test 9.wav
Test 7.wav
Test 10.wav


In [None]:
test_feats=np.array(test_featset)

In [None]:
new_feat=np.concatenate([features,test_feats[:7]],axis=0)

In [None]:
y1=[1,1,1,1,1,1,0]  # Labels of the first seven test samples (paired with test_feats[:7])

In [None]:
new_lab=np.concatenate([lab,y1],axis=0)

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn. model_selection import train_test_split
clf2=RandomForestClassifier()
X_train,X_test,y_train,y_test=train_test_split(new_feat,new_lab,shuffle=True,test_size=0.2)
clf2.fit(X_train,y_train)
pred=clf2.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,pred))
print(confusion_matrix(y_test,pred))

              precision    recall  f1-score   support

           0       1.00      0.43      0.60         7
           1       0.80      1.00      0.89        16

    accuracy                           0.83        23
   macro avg       0.90      0.71      0.74        23
weighted avg       0.86      0.83      0.80        23

[[ 3  4]
 [ 0 16]]


In [None]:
test_lab=[0,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,]

In [None]:
pred=clf2.predict(test_feats[7:])

In [None]:
pred

array([0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1])

In [None]:
t_lab=[1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1]

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(t_lab,pred))
print(confusion_matrix(t_lab,pred))

              precision    recall  f1-score   support

           0       0.20      1.00      0.33         1
           1       1.00      0.76      0.87        17

    accuracy                           0.78        18
   macro avg       0.60      0.88      0.60        18
weighted avg       0.96      0.78      0.84        18

[[ 1  0]
 [ 4 13]]


Second level

In [None]:
# Second-level training set: every training sample that is not 'Spe1', plus
# the first six test samples. The rows are selected by label instead of a
# hard-coded offset, so the result does not depend on the order in which
# os.listdir returned the speaker folders.
new_feat1=np.concatenate([features[labels!='Spe1'],test_feats[:6]],axis=0)
y1_1=[0,1,0,0,0,0]  # Label of first six samples of test set

In [None]:
# Second-level target: 0 for 'Spe2', 1 for 'Spe3'. 'Spe1' samples are left
# out, so the length follows from the data instead of a hard-coded 77 and the
# entries stay in the same relative order as the non-'Spe1' samples.
second_level_labels = labels[labels != 'Spe1']
lab = (second_level_labels == 'Spe3').astype(np.uint8)

In [None]:
new_lab1=np.concatenate([lab,y1_1],axis=0)

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn. model_selection import train_test_split
clf3=RandomForestClassifier()
X_train,X_test,y_train,y_test=train_test_split(new_feat1,new_lab1,shuffle=True,test_size=0.5)
clf3.fit(X_train,y_train)
pred=clf3.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,pred))
print(confusion_matrix(y_test,pred))

              precision    recall  f1-score   support

           0       0.81      0.65      0.72        26
           1       0.57      0.75      0.65        16

    accuracy                           0.69        42
   macro avg       0.69      0.70      0.69        42
weighted avg       0.72      0.69      0.69        42

[[17  9]
 [ 4 12]]


In [None]:
t1=test_feats[7:12]
t2=test_feats[13:]
feat_t=np.concatenate([t1,t2],axis=0)

In [None]:
pred=clf3.predict(feat_t)

In [None]:
pred

array([1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [None]:
# NOTE(review): this list has 16 labels, but the prediction shown above for
# feat_t has 17 entries -- one ground-truth label appears to be missing, so
# the report below cannot be computed until the list is completed.
t_lab=[0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0]

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(t_lab,pred))
print(confusion_matrix(t_lab,pred))

In [None]:
import pickle
# Save the second-level classifier; the context manager makes sure the file is
# flushed and closed.
with open('model_multi_new.pkl', 'wb') as model_file:
    pickle.dump(clf3, model_file)

In [None]:
import pickle
# NOTE: unpickling can execute arbitrary code -- only load model files that
# you created yourself or otherwise trust.
with open('/content/drive/MyDrive/model_multi_new.pkl', 'rb') as model_file:
    clf3 = pickle.load(model_file)