In [246]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile
import subprocess

import random
import tensorflow as tf
from tensorflow.keras import layers, models
from python_speech_features.base import logfbank as mfb
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

from librosa.effects import trim
from librosa import get_duration

from tqdm import tqdm
from prettytable import PrettyTable

In [247]:
# GLOBAL PATH VARIABLE FOR AUDIO / TXT FILES:
path = 'respiratory-sound-database/audio_and_txt_files/'

In [248]:
def get_audio_files():
    folder = os.listdir(path)

    wav_files, txt_files = [], []
    for file in folder:
        if file.endswith('_16.wav'):
            wav_files.append(file)
        elif file.endswith('.txt'):
            txt_files.append(file)
    
    print("Sanity Check for Array Lengths: {}, {}".format(len(wav_files), len(txt_files)))
    return wav_files, txt_files

In [249]:
def extract_info(filename):
    splits = filename.split('_')
    print(splits)

In [250]:
# separate files by audio and txt
wav_files, txt_files = get_audio_files()
wav_files = sorted(wav_files)
txt_files = sorted(txt_files)

Sanity Check for Array Lengths: 830, 830


In [251]:
# extract information from the wav files' filename
# 0:[patient #], 1:[recording index], 2:[chest location], 3:[acquisition mode], 4:[recording equipment]
stripped_file_info = [re.sub('_16\.wav$', '', file) for file in wav_files]
file_info = [file.split('_') for file in stripped_file_info]

In [252]:
# load the extracted data into a dataframe
demog_data = pd.read_csv('demographic-info.csv')
diag_data = pd.read_csv('respiratory-sound-database/patient_diagnosis.csv', names=['Patient Number', 'Diagnosis'])
data = pd.DataFrame(data=file_info, 
                    columns=['Patient Number', 'Recording Index', 'Chest Location', 'Acquisition Mode', 'Recording Equipment'])

data['Patient Number'] = data['Patient Number'].astype(int)
data['txt_file'] = txt_files
data['audio_file'] = wav_files

In [253]:
# put info from demog_data into data
age, sex, a_BMI, c_weight, c_height, diagnosis = [], [], [], [], [], []
demog_size = demog_data['Age'].size
size = data['Patient Number'].size

In [254]:
# TODO: Make this more efficient
# conversions = {'COPD':0, 'Pneumonia':1, 'Healthy':2, 'URTI':3, 'Bronchiectasis':4, 'Bronchiolitis':5, 'LRTI':6, 'Asthma':7}
conversion = {'COPD':0, 'Pneumonia':1, 'Healthy':2, 'URTI':3}

for j in tqdm(range(0, size)):
    for i in range(0, demog_size):
        if data['Patient Number'][j] == demog_data['Patient Number'][i]:
            age.append(demog_data['Age'][i])
            sex.append(demog_data['Sex'][i])
            a_BMI.append(demog_data['Adult BMI'][i])
            c_weight.append(demog_data['Child Weight'][i])
            c_height.append(demog_data['Child Height'][i])
        if data['Patient Number'][j] == diag_data['Patient Number'][i]:
            if diag_data['Diagnosis'][i] in conversion.keys():
                diagnosis.append(conversions[diag_data['Diagnosis'][i]])
            else:
                diagnosis.append("DELETE")
        pass

100%|██████████| 830/830 [00:04<00:00, 172.24it/s]


In [255]:
data['Age'], data['Sex'], data['Adult BMI'], data['Child Weight'], data['Child Height'], data['Diagnosis'] = age, sex, a_BMI, c_weight, c_height, diagnosis

In [256]:
# delete all cases of 'DELETE' in diagnosis (balancing data)
data = data[data.Diagnosis != 'DELETE']

In [257]:
print("New total number of samples is {}.".format(len(data.index)))

New total number of samples is 798.


In [258]:
# reinitialize data.index
data.index = [i for i in range(0, len(data.index))]

In [259]:
# Checking percent of data that is COPD (0) vs. the other classes (Pneumonia = 1, Healthy = 2, URTI = 3)
total_samples = len(data.index)
num_COPD = len(data[data.Diagnosis == 0])
num_pneum = len(data[data.Diagnosis == 1])
num_healthy = len(data[data.Diagnosis == 2])
num_URTI = len(data[data.Diagnosis == 3])

print("Proportion of samples of COPD: {}/{} = {}".format(num_COPD, total_samples, num_COPD / total_samples))
print("Proportion of samples of Pneumonia: {}/{} = {}".format(num_pneum, total_samples, num_pneum / total_samples))
print("Proportion of samples of Healthy: {}/{} = {}".format(num_healthy, total_samples, num_healthy / total_samples))
print("Proportion of samples of URTI: {}/{} = {}".format(num_URTI, total_samples, num_URTI / total_samples))

Proportion of samples of COPD: 703/798 = 0.8809523809523809
Proportion of samples of Pneumonia: 37/798 = 0.046365914786967416
Proportion of samples of Healthy: 35/798 = 0.043859649122807015
Proportion of samples of URTI: 23/798 = 0.02882205513784461


In [260]:
less_num_COPD = int(num_COPD / 10)
sub_value = num_COPD - less_num_COPD
downsampled_total = total_samples - sub_value

print("Proportion of samples of COPD: {}/{} = {}".format(less_num_COPD, downsampled_total, less_num_COPD / downsampled_total))
print("Proportion of samples of Pneumonia: {}/{} = {}".format(num_pneum, downsampled_total, num_pneum / downsampled_total))
print("Proportion of samples of Healthy: {}/{} = {}".format(num_healthy, downsampled_total, num_healthy / downsampled_total))
print("Proportion of samples of URTI: {}/{} = {}".format(num_URTI, downsampled_total, num_URTI / downsampled_total))

print("\nTo achieve these proportions, we need to remove {} samples of COPD. We are downsampling COPD by a factor of {}.".format(sub_value, less_num_COPD))

Proportion of samples of COPD: 70/165 = 0.42424242424242425
Proportion of samples of Pneumonia: 37/165 = 0.22424242424242424
Proportion of samples of Healthy: 35/165 = 0.21212121212121213
Proportion of samples of URTI: 23/165 = 0.1393939393939394

To achieve these proportions, we need to remove 633 samples of COPD. We are downsampling COPD by a factor of 70.


In [261]:
# downsample COPD by the above definitions, randomly selecting samples with diagnosis COPD
downsampling_factor = less_num_COPD
num_samples_to_remove = sub_value

indices_to_delete = []
for i in range(0, num_samples_to_remove):
    deleted = False
    while deleted == False:
        index = random.randint(0, len(data.index) - 1)
        if data['Diagnosis'][index] == 0:
            if index not in indices_to_delete:
                indices_to_delete.append(index)
                deleted = True

In [262]:
data = data.drop(indices_to_delete, axis=0)

In [263]:
# reset index
data.index = [i for i in range(0, len(data.index))]

In [264]:
# Confirming that we have the same or similar probabilities as before
total_samples = len(data.index)
num_COPD = len(data[data.Diagnosis == 0])
num_pneum = len(data[data.Diagnosis == 1])
num_healthy = len(data[data.Diagnosis == 2])
num_URTI = len(data[data.Diagnosis == 3])

print("Proportion of samples of COPD: {}/{} = {}".format(num_COPD, total_samples, num_COPD / total_samples))
print("Proportion of samples of Pneumonia: {}/{} = {}".format(num_pneum, total_samples, num_pneum / total_samples))
print("Proportion of samples of Healthy: {}/{} = {}".format(num_healthy, total_samples, num_healthy / total_samples))
print("Proportion of samples of URTI: {}/{} = {}".format(num_URTI, total_samples, num_URTI / total_samples))

Proportion of samples of COPD: 70/165 = 0.42424242424242425
Proportion of samples of Pneumonia: 37/165 = 0.22424242424242424
Proportion of samples of Healthy: 35/165 = 0.21212121212121213
Proportion of samples of URTI: 23/165 = 0.1393939393939394


In [265]:
# get MFBs for each file
mfbs = []

for file in tqdm(data['audio_file']):
    sample_rate, samples = wavfile.read(path + file)
    trimmed, trimmed_shape = trim(np.array(samples).astype(float))
    m = mfb(trimmed, samplerate=sample_rate)
    mfbs.append(m)

100%|██████████| 165/165 [00:03<00:00, 48.21it/s]


In [266]:
data['MFB'] = mfbs

In [267]:
# find minimum shape to trim other samples
min_shape = min([m.shape for m in data['MFB']])
print(min_shape)

(1979, 26)


In [268]:
# trim all other samples
data['MFB'] = [m[:1979,:] for m in data['MFB']]

In [269]:
# confirm that trimming worked by checking max shape
print(max([m.shape for m in data['MFB']]))

(1979, 26)


In [271]:
# pickle data to a file for use in training
data.to_pickle('train_data_mfbs_downsampled_70.pkl')