In [None]:
!rm -rf ./recordings ./working
!mkdir ./recordings ./working

In [None]:
import pandas as pd
from tqdm import tqdm
import scipy.io
import numpy as np
import h5py
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# Load the held-out features and labels from Google Drive (the drive has to be
# mounted at /content/drive before this runs).
# NOTE(review): pickle files can execute arbitrary code when loaded -- only
# read pickles that come from a trusted source.
X_test, y_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '/content/drive/MyDrive/BTP Dataset/X_test.pkl',
        '/content/drive/MyDrive/BTP Dataset/y_test.pkl',
    )
)

# Relabel every row as "<original label>_index". The same labels are assigned
# to both frames positionally, so X_test and y_test must share row order.
indexes = [f'{i}_index' for i in X_test.index]
for frame in (X_test, y_test):
    frame.index = indexes


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import os
import wave
import pylab
from pathlib import Path
from scipy import signal
from scipy.io import wavfile
from sklearn.metrics import confusion_matrix
import itertools
from scipy.io.wavfile import write

# Paths to input and output data.
# INPUT_DIR is scanned for WAV recordings; OUTPUT_DIR receives the generated
# 'audio-images' spectrogram folders.
INPUT_DIR = './recordings/'
OUTPUT_DIR = './working/'
# Sample rate (frames per second) passed to scipy.io.wavfile.write when the
# test signals are exported as WAV files.
fs = 23437

In [None]:
!rm -rf rec_test
!mkdir rec_test
# Export every test signal as a 16-bit PCM WAV file named "<row label>.wav".
DIR = "./rec_test/"
for i in X_test.index:
  data = X_test.loc[i]
  # Peak-normalise so the loudest sample maps to the int16 maximum.
  peak = np.max(np.abs(data))
  if peak > 0:
    scaled = np.int16(data / peak * 32767)
  else:
    # A silent row has a peak of 0; dividing by it would produce NaN and an
    # undefined int16 cast, so write plain silence instead. (The comparison is
    # also False for a NaN peak, which lands here too.)
    scaled = np.zeros(len(data), dtype=np.int16)
  write(DIR+f'{i}.wav', fs, scaled)

In [None]:
# X_train

In [None]:
# y_train.shape

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): an earlier cell already reads pickles from
# /content/drive/MyDrive, so on a fresh runtime this cell has to be executed
# before it -- consider moving the mount to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Plot the first 5 WAV files as a waveform and a frequency spectrum.
# NOTE(review): `parent_list` is not defined in any cell visible here; it is
# presumably a sequence of WAV file names inside INPUT_DIR -- confirm that it
# is created before this cell runs.

# Frames per second assumed for the recordings; also the number of frames
# read from each file (i.e. at most about one second of audio).
sample_rate = 23437

# Cap at the number of available entries so fewer than 5 files do not raise.
for i in range(min(5, len(parent_list))):
    # The context manager closes the WAV file as soon as the frames are read.
    with wave.open(os.path.join(INPUT_DIR, parent_list[i]), 'r') as signal_wave:
        sig = np.frombuffer(signal_wave.readframes(sample_rate), dtype=np.int16)

    plt.figure(figsize=(12,12))

    # Top panel: raw waveform.
    plot_a = plt.subplot(211)
    plot_a.set_title(parent_list[i])
    plot_a.plot(sig)
    plot_a.set_xlabel('sample rate * time')
    plot_a.set_ylabel('energy')

    # Bottom panel: spectrogram of the same samples.
    plot_b = plt.subplot(212)
    plot_b.specgram(sig, NFFT=1024, Fs=sample_rate, noverlap=900)
    plot_b.set_xlabel('Time')
    plot_b.set_ylabel('Frequency')

plt.show()

You can clearly see the difference in the energy distribution when different words are being pronounced. These distinct characteristics also show up in the spectrograms of the recordings, which will enable us to transform what was originally an audio problem into an image problem.

# Convert audio files to spectrograms

We can now do the actual conversion of every audio sample. The code below uses the wave and pylab libraries to extract the sound information and frame rate from each sample and save the result as a spectrogram. After this, we have simply converted our audio problem into an image problem.

In [None]:
# print(X_train)

                0          1          2          3          4          5      \
2241_index  21.591516  22.659648  21.972992  23.422599  24.261845  23.422599   
1459_index   8.010986   5.416953   8.010986   7.171740   8.926528   9.689479   
468_index   27.389944  26.321813  28.076600  24.872206  20.981155  20.065614   
3272_index   7.553216  11.749447  14.038300  12.130922  12.741283  17.013809   
1580_index -47.302968 -49.210346 -47.226673 -47.226673 -46.082246 -44.632639   
...               ...        ...        ...        ...        ...        ...   
1095_index -22.735943 -19.684138 -25.558862 -20.065614 -18.616007 -18.463416   
1130_index  -2.975509  -2.288853  -2.746624   0.457771  -1.068132   2.136263   
1294_index   7.400626   8.010986   9.155413  12.359808  12.207218   9.079118   
860_index   13.122759  11.444266  13.656825   8.239872  10.910201  12.130922   
3174_index  -3.891051  -5.340658  -9.384298 -10.452430 -10.910201 -12.207218   

                6          7          8

In [None]:
# print(y_train)

            0
2241_index  1
1459_index  0
468_index   0
3272_index  1
1580_index  0
...        ..
1095_index  0
1130_index  0
1294_index  0
860_index   0
3174_index  1

[2657 rows x 1 columns]


In [None]:
# Utility function to get sound and frame rate info
def get_wav_info(wav_file):
    """Read a WAV file and return its samples together with its frame rate.

    Args:
        wav_file: Path (or file-like object) accepted by ``wave.open``.

    Returns:
        A tuple ``(sound_info, frame_rate)``. ``sound_info`` is a 1-D ``int16``
        array built from the raw bytes of all frames in the file;
        ``frame_rate`` is the sampling rate stored in the WAV header.

    Raises:
        wave.Error: If the file is not a WAV file that ``wave`` can read.
    """
    # The context manager guarantees the file is closed even if reading fails.
    with wave.open(wav_file, 'r') as wav:
        frames = wav.readframes(-1)  # a negative count reads every frame
        frame_rate = wav.getframerate()
    # NOTE(review): the bytes are always interpreted as 16-bit samples and
    # channels are not separated -- assumes 16-bit mono input; confirm.
    sound_info = np.frombuffer(frames, dtype=np.int16)
    return sound_info, frame_rate

def _export_spectrograms(wav_dir, label_for, class_prefix):
    """Render every WAV file in ``wav_dir`` to a spectrogram PNG, grouped by class.

    Each image is written to
    ``<OUTPUT_DIR>/audio-images/<class_prefix><label>/<file stem>.png``.
    Files whose PNG already exists are skipped, so re-running is cheap.

    Args:
        wav_dir: Directory scanned (non-recursively) for ``.wav`` files.
        label_for: Callable mapping a file stem to its class label. It is only
            called for files that are actually found.
        class_prefix: Prefix of the per-class folder name, e.g. ``'class_'``.
    """
    images_root = os.path.join(OUTPUT_DIR, 'audio-images')
    for filename in tqdm(os.listdir(wav_dir)):
        # Match on the extension; a substring test would also pick up
        # unrelated names that merely contain "wav".
        if not filename.endswith('.wav'):
            continue
        file_path = os.path.join(wav_dir, filename)
        file_stem = Path(file_path).stem
        dist_dir = os.path.join(images_root, f'{class_prefix}{label_for(file_stem)}')
        file_dist_path = os.path.join(dist_dir, file_stem)
        if os.path.exists(file_dist_path + '.png'):
            continue
        os.makedirs(dist_dir, exist_ok=True)
        sound_info, frame_rate = get_wav_info(file_path)
        try:
            pylab.specgram(sound_info, Fs=frame_rate)
            pylab.savefig(f'{file_dist_path}.png')
        finally:
            # Always release the figure, otherwise a failed save leaks it.
            pylab.close()


# For every recording, make a spectrogram and save it as
# audio-images/<class folder>/<file stem>.png
os.makedirs(os.path.join(OUTPUT_DIR, 'audio-images'), exist_ok=True)

# The label lookups are wrapped in lambdas so the label frames are only touched
# when a WAV file is actually processed.
# NOTE(review): `y_train` is not defined in the cells visible here; the first
# call only works if INPUT_DIR is empty or `y_train` was loaded elsewhere.
_export_spectrograms(INPUT_DIR, lambda stem: y_train.loc[stem].iloc[0], 'class_')
_export_spectrograms(DIR, lambda stem: y_test.loc[stem].iloc[0], 'test_class_')


  Z = 10. * np.log10(spec)
100%|██████████| 665/665 [02:46<00:00,  3.99it/s]


In [None]:
!zip -r dataset_test.zip ./working

  adding: working/ (stored 0%)
  adding: working/audio-images/ (stored 0%)
  adding: working/audio-images/test_class_0/ (stored 0%)
  adding: working/audio-images/test_class_0/1336_index.png (deflated 1%)
  adding: working/audio-images/test_class_0/141_index.png (deflated 1%)
  adding: working/audio-images/test_class_0/1206_index.png (deflated 1%)
  adding: working/audio-images/test_class_0/218_index.png (deflated 1%)
  adding: working/audio-images/test_class_0/387_index.png (deflated 1%)
  adding: working/audio-images/test_class_0/70_index.png (deflated 1%)
  adding: working/audio-images/test_class_0/194_index.png (deflated 1%)
  adding: working/audio-images/test_class_0/254_index.png (deflated 1%)
  adding: working/audio-images/test_class_0/73_index.png (deflated 1%)
  adding: working/audio-images/test_class_0/196_index.png (deflated 1%)
  adding: working/audio-images/test_class_0/874_index.png (deflated 1%)
  adding: working/audio-images/test_class_0/332_index.png (deflated 1%)
  ad

In [None]:
!du -h /content/dataset_test.zip


140M	/content/dataset_test.zip


In [None]:
!cp /content/dataset_test.zip /content/drive/MyDrive