In [39]:
# Import dependencies
import os
import pandas as pd

In [77]:
import wfdb
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

In [41]:
voiced_dataset = "resources/voiced_dataset/"

In [42]:
# Get all the files in the directory
files = os.listdir(voiced_dataset)

# Create lists to check dataset
voice_info = []
voice_file = []

# Loop through each file
for file in files:
    # Look through voice files only
    if file.startswith("voice"):
        
        # Isolate the info text files
        if file.endswith("-info.txt"):
            voice_id = file.split("-")[0]
            voice_info.append(voice_id)
        
        # Isolate the voice text files
        elif file.endswith(".txt"):
            voice_id = file.split(".")[0]
            voice_file.append(voice_id)

# Confirm each sample has a signal and info file
print(f'Info File: {len(voice_info)}')
print(f'Signal File: {len(voice_file)}')

Info File: 208
Signal File: 208


In [64]:
# Read in the raw data as a signal using wfdb
for file in files:
    if file.startswith("voice") and file.endswith("-info.txt"):
        voice_id = file.split("-")[0]
        record_name = voiced_dataset + voice_id
        
        record = wfdb.rdrecord(record_name)
        
        # Access the signal data
        signal_data = record.p_signal
        
        # Access the metadata from the header
        sampling_frequency = record.fs
        signal_names = record.sig_name
        
        # print(f'Sampling frequency: {sampling_frequency} Hz') # all 8000 Hz
        # print(f'Signal name: {signal_names}') # all voice files
        
        # Plot the signal data
        # plt.figure(figsize=(12, 6))
        # plt.plot(signal_data)
        # plt.title('Signal Data')
        # plt.xlabel('Sample Index')
        # plt.ylabel('Amplitude')
        # plt.show()

In [80]:
# Initialise the list to hold the dictionaries
metadata_list = []

# Parse the data in the info files
for file in files:
    # Initialise the dictionary to store the info
    voice_metadata = dict()

    # Look through info files only
    if file.startswith("voice") and file.endswith("-info.txt"):
        # Read the text file
        with open(voiced_dataset + file, 'r') as file:
            for line in file:
                # print(line.strip())
                
                # Split each line into a key, value pair using delimiter
                key, value = map(str.strip, line.split('\t'))
                
                # Ignore the empty lines by checking whitespaces
                if not line.strip():
                    continue
                else:
                    # Remove the colon
                    key = key.replace(":", "")
                    
                    # Load the data to a dictionary
                    voice_metadata[key] = value
            
            # Append the dictionary to the list
            metadata_list.append(voice_metadata)

# Display the list of dictionaries
metadata_list

[{'ID': 'voice100',
  'Age': '24',
  'Gender': 'm',
  'Diagnosis': 'healthy',
  'Occupation status': 'NU',
  'Voice Handicap Index (VHI) Score': '0',
  'Reflux Symptom Index (RSI) Score': '5',
  'Smoker': 'no',
  'Number of cigarettes smoked per day': 'NU',
  'Alcohol consumption': 'casual drinker',
  'Number of glasses containing alcoholic beverage drinked in a day': 'NU',
  "Amount of water's litres drink every day": '1,5',
  'Eating habits': '',
  'Carbonated beverages': 'almost always',
  'Amount of glasses drinked in a day': 'NU',
  'Tomatoes': 'never',
  'Coffee': 'always',
  'Number of cups of coffee drinked in a day': '3',
  'Chocolate': 'sometimes',
  'Gramme of chocolate eaten in  a day': 'NU',
  'Soft cheese': 'almost always',
  'Gramme of soft cheese eaten in a day': 'NU',
  'Citrus fruits': 'never',
  'Number of citrus fruits eaten in a day': 'NU'},
 {'ID': 'voice101',
  'Age': '60',
  'Gender': 'm',
  'Diagnosis': 'healthy',
  'Occupation status': 'NU',
  'Voice Handicap 

In [82]:
# Convert the info contents to a DataFrame
metadata_df = pd.DataFrame(metadata_list)
metadata_df.head()

Unnamed: 0,ID,Age,Gender,Diagnosis,Occupation status,Voice Handicap Index (VHI) Score,Reflux Symptom Index (RSI) Score,Smoker,Number of cigarettes smoked per day,Alcohol consumption,...,Amount of glasses drinked in a day,Tomatoes,Coffee,Number of cups of coffee drinked in a day,Chocolate,Gramme of chocolate eaten in a day,Soft cheese,Gramme of soft cheese eaten in a day,Citrus fruits,Number of citrus fruits eaten in a day
0,voice100,24,m,healthy,NU,0,5,no,NU,casual drinker,...,NU,never,always,3,sometimes,NU,almost always,NU,never,NU
1,voice101,60,m,healthy,NU,80,10,no,NU,nondrinker,...,NU,sometimes,always,4,sometimes,NU,sometimes,NU,never,NU
2,voice192,22,m,hyperkinetic dysphonia,Cook,0,10,no,NU,nondrinker,...,NU,sometimes,always,NU,always,NU,sometimes,NU,almost always,NU
3,voice193,46,f,hyperkinetic dysphonia,Housewife,0,36,yes,15,casual drinker,...,NU,sometimes,always,2,sometimes,NU,sometimes,NU,sometimes,NU
4,voice008,51,f,reflux laryngitis,Researcher,19,15,no,NU,casual drinker,...,NU,almost always,always,2,almost always,20g,sometimes,100 gr,almost always,1
