# U-net Denoising demo

In [80]:
# libs
from scipy.io import wavfile
import numpy as np
import pandas as pd
import os
import re

# Steps
- 1. load the data
- 2. load in ARCA23k noise dataset:
  - "Child speech and kid speaking", "hiss", "crumpling and crinkling", "cough", "sneeze"
- 3. FFT
- 4. design unet
- 5. feed into unet
- 6. see results


In [34]:
# steps
# (done) 0. Download ARCA23k and the CirCor DigiScope phonocardiogram dataset
# 1. load the data
# 2. load in ARCA23k noise dataset:
# # "Child speech and kid speaking", "hiss", "crumpling and crinkling", "cough", "sneeze"
# 3. FFT
# 4. design unet
# 5. feed into unet
# 6. see results

## Dirs
# Every dataset lives under one root folder. The default is the original
# author's machine; set the PCG_DATASETS_DIR environment variable to point the
# notebook at another location without editing this cell. An unset or empty
# variable falls back to the default.
datasets_root = (
    os.environ.get('PCG_DATASETS_DIR')
    or '/Users/donu/Desktop/S25/ELEC 594/datasets'
).rstrip('/') + '/'

# Each path keeps its trailing slash because later cells build file names by
# plain string concatenation (e.g. pcg_datapath + 'training_data.csv').
pcg_datapath = datasets_root + 'the-circor-digiscope-phonocardiogram-dataset-1.0.3/'
arcalabels = datasets_root + 'ARCA23K_ground_truth/'
arcawavs = datasets_root + 'ARCA23K_audio/'
bentley_dset = datasets_root + 'bentley-dset/'

## 1. Loading the data

### 1.a Exploring PCG data

In [26]:
# Load the metadata table (training_data.csv) that sits in the CirCor
# dataset folder; the following cells explore its contents.
pcg_data = pd.read_csv(f'{pcg_datapath}training_data.csv')

In [27]:
# Size of the loaded table as (rows, columns).
pcg_data.shape

(942, 23)

In [28]:
# List the column names available in the loaded table.
pcg_data.columns

Index(['Patient ID', 'Recording locations:', 'Age', 'Sex', 'Height', 'Weight',
       'Pregnancy status', 'Murmur', 'Murmur locations',
       'Most audible location', 'Systolic murmur timing',
       'Systolic murmur shape', 'Systolic murmur grading',
       'Systolic murmur pitch', 'Systolic murmur quality',
       'Diastolic murmur timing', 'Diastolic murmur shape',
       'Diastolic murmur grading', 'Diastolic murmur pitch',
       'Diastolic murmur quality', 'Outcome', 'Campaign', 'Additional ID'],
      dtype='object')

In [29]:
# Count how many rows carry each distinct value of the Murmur column.
pcg_data.loc[:, 'Murmur'].value_counts()

Murmur
Absent     695
Present    179
Unknown     68
Name: count, dtype: int64

In [30]:
# Show only the rows labelled 'Absent' or 'Present' in the Murmur column.
# The filtered frame is displayed, not stored, so pcg_data is unchanged.
pcg_data.loc[pcg_data['Murmur'].isin(['Absent', 'Present'])]

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
1,9979,AV+PV+TV+MV,Child,Female,103.0,13.1,False,Present,AV+MV+PV+TV,TV,...,High,Harsh,,,,,,Abnormal,CC2015,
3,13918,AV+PV+TV+MV,Child,Male,98.0,15.9,False,Present,TV,TV,...,Low,Blowing,,,,,,Abnormal,CC2015,
4,14241,AV+PV+TV+MV,Child,Male,87.0,11.2,False,Present,AV+MV+PV+TV,PV,...,Low,Harsh,,,,,,Abnormal,CC2015,
5,14998,AV+PV+TV+MV,Child,Male,,,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937,85340,AV+PV+TV+MV,Child,Male,105.0,16.6,False,Absent,,,...,,,,,,,,Normal,CC2015,
938,85341,AV+PV+TV+MV,Child,Male,92.0,15.2,False,Absent,,,...,,,,,,,,Normal,CC2015,
939,85343,AV+PV+TV+MV,Child,Female,97.0,13.5,False,Present,MV+TV,TV,...,Low,Blowing,,,,,,Abnormal,CC2015,
940,85345,AV+PV,Child,Female,132.0,38.1,False,Absent,,,...,,,,,,,,Normal,CC2015,


### 1.b Exploring the dataset used by the U-Net paper

In [87]:
# Load the label table (set_b.csv) from the Bentley dataset folder.
# NOTE: this rebinds pcg_data, replacing the table loaded in section 1.a.
pcg_data = pd.read_csv(f'{bentley_dset}set_b.csv')

In [88]:
# Count how many rows carry each distinct value of the label column.
pcg_data.loc[:, 'label'].value_counts()

label
normal        320
murmur         95
extrastole     46
Name: count, dtype: int64

In [89]:
# set aside extrastole for now
##pcg_data = pcg_data.loc[pcg_data['label'].eq('normal') | pcg_data['label'].eq('murmur')]

### 1.b.1 - Clean up horrible data formatting (only run this once)

In [94]:
#for i in range(pcg_data.shape[0]):
#  # delete "Btraining"
#  #pcg_data.loc[i,'fname'] = 'setb/'+pcg_data.iloc[i,:]['fname'].split('/')[-1][10:]
#  filename = pcg_data.iloc[i,:]['fname'].split('/')[-1]
#  cut_idx = re.search(r'\d',filename).start()
# pcg_data.loc[i,'fname'] = pcg_data.loc[i,'fname'].replace('setb','set_b')
#  pcg_data.loc[i,'fname'] = 'setb/'+filename[cut_idx:]
  

In [90]:
# Display the table to inspect its current contents.
# Note to self: in pcg data, remove everything before the first hyphen --
# actually, do this for everything.
pcg_data

Unnamed: 0,dataset,fname,label,sublabel
0,b,setb/127_1306764300147_C2.wav,extrastole,
1,b,setb/128_1306344005749_A.wav,extrastole,
2,b,setb/130_1306347376079_D.wav,extrastole,
3,b,setb/134_1306428161797_C1.wav,extrastole,
4,b,setb/138_1306762146980_B.wav,extrastole,
...,...,...,...,...
651,b,setb/284_1311168471850_A.wav,normal,noisynormal
652,b,setb/284_1311168471850_B.wav,normal,noisynormal
653,b,setb/285_1311169246969_C.wav,normal,noisynormal
654,b,setb/296_1311682952647_C.wav,normal,noisynormal


In [83]:
# write to csv
##pcg_data.to_csv(bentley_dset+'set_b.csv',index=False)

In [85]:
#for fname in os.listdir(bentley_dset+'set_b/'):
#  # renaming
#  cutidx = re.search(r'\d',fname).start()
#  os.rename(bentley_dset+'set_b/'+fname,bentley_dset+'set_b/'+fname[cutidx:])
  

### 1.b.2 - Find Corresponding wav for each label

In [96]:
# Read the recording that belongs to every row of the label table and keep it
# next to its label, so the label -> audio mapping outlives this cell instead
# of being thrown away after each read.
# wavfile.read raises when a file cannot be opened or parsed, so reaching the
# end of the loop also confirms that every fname in the table resolves to a
# readable wav file under bentley_dset.
bentley_recordings = []
for filepath_local, label in zip(pcg_data['fname'], pcg_data['label']):
  # wavfile.read returns the sample rate followed by the sample array.
  sample_rate, samples = wavfile.read(bentley_dset + filepath_local)
  bentley_recordings.append({
    'fname': filepath_local,
    'label': label,
    'sample_rate': sample_rate,
    'samples': samples,
  })
# Checkpoint here: every row's wav file could be read.

In [None]:
# checkpoint 1. done so far:
## downloaded the datasets (synthetic noise, circor, and bentley - the U-net one)
## renamed the horribly named files and .csv column values in the bentley dataset
# able to read the corresponding wavfile for each row in the bentley dataset (i.e. map murmurs to wav!)