# Preprocessing of audio files
This notebook prepares the recorded audio data set: it finds empty (zero-byte) recordings and removes their entries from the label file.

## Importing required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pyaudio
import os
import json

%matplotlib inline

## Getting audio file names and their sizes
Some files have a size of 0 bytes, i.e. they are empty; these were caused by recording errors.

In [2]:
# Absolute path of the current working directory; the other paths in this notebook are built on it.
path = os.path.abspath(os.curdir)

In [3]:
# Folder the audio files are read from, located under the working directory captured in `path`.
file_path = os.path.join(path, 'audioFiles')
file_path

'/run/media/rajm/programs/Projects/major-project-processing/pythonFiles/audioFiles'

In [4]:
# Keep only the entries whose extension is exactly `.wav`.
# os.path.splitext is used rather than splitting on '.', because a split-based check
# also accepts an extension-less entry that is literally named "wav".
# os.listdir returns names in arbitrary order, so the result is sorted to keep every
# later, index-aligned cell reproducible across runs and file systems.
file_list = sorted(x for x in os.listdir(file_path) if os.path.splitext(x)[1] == '.wav')
file_list[:5]

['00538947-3390-4449-88ee-c3e719aa0261.wav',
 '00678de1-0f90-4cca-9361-3ccef5351104.wav',
 '01a78071-c22a-4303-bfc6-ff20858c7cc5.wav',
 '0291ed0f-bfa7-4d9d-8feb-34791aab8f64.wav',
 '02d7037d-7f08-4b2e-b7f0-35ee5589b06e.wav']

In [5]:
# Size in bytes of each audio file, in the same order as file_list.
file_size = [os.path.getsize(os.path.join(file_path, name)) for name in file_list]
file_size[:5]

[70842, 88626, 0, 30485, 26866]

## Loading Labels

In [6]:
# Location of the label file, resolved against the working directory captured in `path`.
json_file = os.path.join(path, 'labels/data.json')
json_file

'/run/media/rajm/programs/Projects/major-project-processing/pythonFiles/labels/data.json'

In [7]:
# Load the label records into a DataFrame; later cells match audio files against its
# `fileName` column.
df = pd.read_json(json_file)
df.head()

Unnamed: 0,_id,userId,sentenceId,fileName,gender
0,60e2d7ed552fd6002e30b8fc,FzlNJ/+dBGudrigpiH+9U9MlEqeHvOslfj66OxVrv4g=,4608,96b10244-694c-460e-bc1a-e60210c94844,male
1,60e2d811552fd6002e30b8fd,L5WMnUqwFRlZUFg48A4DRu9dYwP9srB5s2cqsA/rDZg=,7367,dd42d217-11b5-4107-b8c5-8c60939db63c,male
2,60e2d958552fd6002e30b8fe,zcRQLjrvRyhg0PDjjhxlGJ1PoM7deRWnvlx08Ja1Wl4=,8460,95ccf7f4-b198-4623-bcf8-45deb7f914e7,male
3,60e2d95a552fd6002e30b8ff,zcRQLjrvRyhg0PDjjhxlGJ1PoM7deRWnvlx08Ja1Wl4=,8460,2e2458e0-c243-44b8-aa4a-2ebc0c032214,male
4,60e2d95b552fd6002e30b900,4UO9IETvAMYKoPU5GhL4DRjqb5rNgF1FpAKkyXQ9v/c=,6969,77fb2572-65c2-47b9-968b-4e38778242c5,male


In [8]:
# Names of the recordings that contain data, i.e. whose size is not zero bytes.
# file_list and file_size are index-aligned, so they can be walked in lockstep.
file_removing_zero = [name for name, size in zip(file_list, file_size) if size != 0]
file_removing_zero[:5]

['00538947-3390-4449-88ee-c3e719aa0261.wav',
 '00678de1-0f90-4cca-9361-3ccef5351104.wav',
 '0291ed0f-bfa7-4d9d-8feb-34791aab8f64.wav',
 '02d7037d-7f08-4b2e-b7f0-35ee5589b06e.wav',
 '039df766-0e79-416c-9dfd-893e113d1709.wav']

In [9]:
# Report the total number of recordings and how many of them are non-empty.
# The second figure is the count of files that passed the zero-byte filter, so the
# label says "without zero size" rather than "with zero".
print('Total File :- {}, Total File without zero size :- {}'.format(len(file_list), len(file_removing_zero)))

Total File :- 520, Total File with zero :- 440


In [10]:
# Names of the empty (zero-byte) recordings whose labels have to be discarded.
# file_list and file_size are index-aligned, so they can be walked in lockstep.
file_to_be_removed = [name for name, size in zip(file_list, file_size) if size == 0]
file_to_be_removed[:5]

['01a78071-c22a-4303-bfc6-ff20858c7cc5.wav',
 '03de4569-7a81-48c8-beb1-a57871cfed13.wav',
 '03f84036-bc84-409d-be57-2681623e0c15.wav',
 '1a725c4a-dc57-4173-a4a4-068e189b4fe3.wav',
 '1f62b445-ea56-4efb-bbe0-0a069ae9283d.wav']

In [11]:
# Collect the label-row indices that belong to the zero-byte recordings.
# Each audio file name is compared, with its extension stripped, against `fileName`.
list_of_index_to_be_removed = []
for filename in file_to_be_removed:
    # splitext strips only the final extension, so a name containing extra dots stays intact.
    file = os.path.splitext(filename)[0]
    # extend() flattens the matches directly: a file with no label row contributes nothing
    # instead of raising, and a file labelled more than once contributes every matching row.
    list_of_index_to_be_removed.extend(df.index[df['fileName'] == file].tolist())
print(list_of_index_to_be_removed)

[280, 181, 109, 110, 41, 239, 257, 3, 32, 134, 310, 362, 221, 222, 45, 187, 244, 227, 276, 279, 223, 55, 245, 256, 56, 4, 242, 218, 131, 282, 144, 342, 133, 166, 225, 281, 235, 353, 277, 340, 284, 283, 238, 145, 230, 234, 186, 253, 309, 229, 233, 232, 243, 231, 228, 341, 286, 338, 40, 220, 252, 188, 288, 275, 224, 237, 216, 236, 53, 287, 217, 308, 285, 363, 226, 364, 180, 339, 132, 311]


In [12]:
# Build a new frame without the label rows of the empty recordings; `df` is left untouched.
df1 = df.drop(index=list_of_index_to_be_removed)
df1.head()

Unnamed: 0,_id,userId,sentenceId,fileName,gender
0,60e2d7ed552fd6002e30b8fc,FzlNJ/+dBGudrigpiH+9U9MlEqeHvOslfj66OxVrv4g=,4608,96b10244-694c-460e-bc1a-e60210c94844,male
1,60e2d811552fd6002e30b8fd,L5WMnUqwFRlZUFg48A4DRu9dYwP9srB5s2cqsA/rDZg=,7367,dd42d217-11b5-4107-b8c5-8c60939db63c,male
2,60e2d958552fd6002e30b8fe,zcRQLjrvRyhg0PDjjhxlGJ1PoM7deRWnvlx08Ja1Wl4=,8460,95ccf7f4-b198-4623-bcf8-45deb7f914e7,male
5,60e2d964552fd6002e30b901,4UO9IETvAMYKoPU5GhL4DRjqb5rNgF1FpAKkyXQ9v/c=,3595,744f492e-8e7f-4a1e-a13e-d65a0f0716c4,male
6,60e2d979552fd6002e30b902,4UO9IETvAMYKoPU5GhL4DRjqb5rNgF1FpAKkyXQ9v/c=,1056,fcba2c6c-2ef1-48c6-974f-9bb07ab03309,male


In [17]:
# Number of label rows dropped = row count before filtering minus row count after.
print('{} is the number of zero byte files.'.format(len(df) - len(df1)))

80 is the number of zero byte files.


The labels of the zero-byte files have been removed.

In [18]:
# Export the filtered labels as CSV without the row index.
# The destination is built from `path`, the same base directory the label file is read
# from, so the output location does not depend on the working directory at this point.
df1.to_csv(os.path.join(path, 'labels/filtered.csv'), index=False)