# Notebook 1: Data Initial Cleaning

In [None]:
# import all required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# will need this to read wav file
from scipy.io.wavfile import read

### Read in `set_a_df`

Read in the first dataset. The naming convention of the files on disk is different from the one in the CSV, which is used as the data dictionary.

The following group of cells perform the following operations:
1. Read files and drop unlabeled test set used on Kaggle
2. Define a function `read_in_files` to read in files, ingest their raw audio, and output the raw audio to a dataframe
3. Run the function `read_in_files` on the files in set_a

In [None]:
# Load the set_a data dictionary and keep only the rows that carry a label;
# the unlabeled rows are the test set used on Kaggle. All remaining files are
# named in the CSV as they are on the hard drive.
set_a_df = (
    pd.read_csv('453_923_bundle_archive/set_a.csv')
    .dropna(subset=['label'])
)
set_a_df.head()

In [None]:
# Define a function `read_in_files` to read in files, ingest their raw audio, 
# and output the raw audio to a dataframe

def read_in_files(file_name_dataframe, data_dir="453_923_bundle_archive/"):
    '''
        Read the audio files listed in a dataframe and return
        their raw signals, together with their labels, as one
        wide dataframe.

        Parameters
        ----------
        file_name_dataframe : pandas.DataFrame
            Must contain a `fname` column (path of each wav
            file relative to `data_dir`) and a `label` column.
            Its index may be anything; rows are matched to the
            output by position.
        data_dir : str, optional
            Directory the `fname` values are relative to.

        Returns
        -------
        pandas.DataFrame
            One row per input row, in the same order, with a
            fresh 0..n-1 index. Shorter signals are left-padded
            with zeros to the length of the longest one, so if
            the longest signal contains 10,000 measurements the
            result has 10,001 columns (10,000 measurements +
            1 label).

        Each file's samples must be a 1-D array (mono audio);
        a multi-channel file raises ValueError when its samples
        are copied into the padded row.
    '''
    # Imported here so the function does not depend on an `os`
    # import existing elsewhere in the notebook
    import os

    # Step 1: read every file exactly once and keep its samples
    signals = []
    for fname in file_name_dataframe['fname']:
        _sample_rate, samples = read(os.path.join(data_dir, fname))
        signals.append(np.array(samples, dtype=float))

    max_size = max((signal.shape[0] for signal in signals), default=0)
    print(f"Longest file has {max_size} measurements")

    # Step 2: right-align each signal in a zero-filled row that is
    # as wide as the longest file, so every row has the same number
    # of columns
    padded_signals = np.zeros((len(signals), max_size))
    for row_number, signal in enumerate(signals):
        # An explicit start offset (instead of a negative index) keeps
        # a zero-length signal from selecting the whole row
        padded_signals[row_number, max_size - signal.shape[0]:] = signal

    file_name_as_numbers_dataframe = pd.DataFrame(data=padded_signals)

    # Assign labels by position. Assigning the Series itself would
    # align on index values, which mismatches or drops labels whenever
    # the input index is not 0..n-1 (e.g. after rows were filtered out)
    file_name_as_numbers_dataframe['label'] = file_name_dataframe['label'].to_numpy()
    return file_name_as_numbers_dataframe


In [None]:
# Run the function `read_in_files` on the files in set_a.
# The result has one row per file listed in `set_a_df`: the zero-padded
# audio samples followed by a `label` column.
set_a_as_number_df = read_in_files(set_a_df)
set_a_as_number_df.head()

### Read in `set_b_df`

Read in the second dataset. The naming convention of the files on disk is different from the one in the CSV, which is used as the data dictionary. It is actually worse than in the first dataset, so before reading the files we need to clean
the naming in the CSV file.

For example, a file named `Btraining_extrastole_127_1306764300147_C.wav` in the CSV is actually named `extrastole__127_1306764300147_C.wav` on the hard drive

The following group of cells perform the following operations:
1. Read files and drop unlabeled test set used on Kaggle
2. Perform string operations to match names to their filename on the hard drive
3. Run the function `read_in_files` on the files in set_b

In [None]:
# Load the set_b data dictionary and keep only the rows that carry a label;
# the unlabeled rows are the test set used on Kaggle.
set_b_df = (
    pd.read_csv('453_923_bundle_archive/set_b.csv')
    .dropna(subset=['label'])
)
print(set_b_df.shape)

set_b_df.head()

In [None]:
# Perform string operations to match names to their filename on the hard drive.
# Every replacement states explicitly whether its pattern is a regex, because
# the default of `regex` in `Series.str.replace` differs between pandas versions.

# The CSV prefixes names with 'Btraining_'; the files on disk do not have it
fname_on_disk = set_b_df['fname'].str.replace('Btraining_', '', regex=False)

# On disk the label prefix is followed by a double underscore. The negative
# lookahead skips prefixes that are already doubled, so re-running this cell
# does not keep adding underscores.
for label_prefix in ('normal', 'murmur', 'extrastole'):
    fname_on_disk = fname_on_disk.str.replace(
        label_prefix + r'_(?!_)', label_prefix + '__', regex=True
    )

# The "noisy" file names keep single underscores, so undo the doubling there
for doubled, single in (
    ('normal__noisynormal__', 'normal_noisynormal_'),
    ('murmur__noisymurmur__', 'murmur_noisymurmur_'),
):
    fname_on_disk = fname_on_disk.str.replace(doubled, single, regex=False)

# Build a new frame instead of writing a column into the filtered one in place
set_b_df = set_b_df.assign(fname=fname_on_disk)

set_b_df.head()

In [None]:
# Run the function `read_in_files` on the files in set_b.
# The result has one row per file listed in `set_b_df`: the zero-padded
# audio samples followed by a `label` column.
set_b_as_number_df = read_in_files(set_b_df)
set_b_as_number_df.head()

### Combine both dataframes 

Let's combine both dataframes and then call the function which will read in the data

In [None]:
# Show the first rows of set_a_df again before it is combined with set_b_df
set_a_df.head()

In [None]:
# Show the first rows of set_b_df again before it is combined with set_a_df
set_b_df.head()

In [None]:
# Stack set_a on top of set_b and renumber the rows 0..n-1, discarding the
# index values each frame carried over from its own CSV
combined_df = pd.concat([set_a_df, set_b_df], axis=0, ignore_index=True)
print(combined_df.shape)
combined_df.head()

In [None]:
# Read the audio of every file listed in the combined frame; the result has
# one row per file: the zero-padded samples followed by a `label` column
combined_as_number_df = read_in_files(combined_df)
combined_as_number_df.head()

In [None]:
# THIS CELL WILL RUN FOR A VERY LONG TIME
#combined_as_number_df.to_csv('data/audio_as_csv.csv', index=False)

In [None]:
# THIS CELL WILL RUN FOR A VERY LONG TIME
#combined_as_number_df_test = pd.read_csv('data/audio_as_csv.csv')
#combined_as_number_df_test.head()

Writing and reading the dataframe from disk takes much too long. It's better to simply read the individual files and rebuild the dataframe each time.

I will define a function named `read_original_data` and place it in a file called `boris_util.py`