# Fetch swallow dataset
(It is best to do this part locally and then upload the output folder to Google Drive - the dataset is pretty big initially.)

Download the sEMG dataset for swallows: https://rdr.ucl.ac.uk/articles/dataset/sEMG_of_Swallowing_Coughing_and_Speech/24297766

In [2]:
#IMPORTS

#machine learning libs
import torch
import torch.nn as nn
import numpy as np

#dsp libraries
import torchaudio
import librosa

#file handling libs
import os
import csv
import shutil
import pandas as pd

#visualisation libs
from PIL import Image
import matplotlib.pyplot as plt

# Step 1: Filter for just swallow dataset from sEMG

In [3]:
def bring_all_to1(source_directory, output_directory):
    """
    Flatten every "swallow" CSV found under a directory tree into one folder.

    Walks ``source_directory`` recursively and copies each file whose name ends
    in ``.csv`` and contains "swallow" (case-insensitive) into
    ``output_directory``. Only the base filename is kept, so two source files
    that share a name in different subfolders map to the same destination and
    the later copy overwrites the earlier one.

    The output folder may live inside the source tree (or be the source folder
    itself). It is skipped during the walk, so re-running the function never
    tries to copy a file onto itself.

    Args:
        source_directory (str): Path to the directory to search for files.
        output_directory (str): Path to the directory where the files will be
            copied. Created if it does not exist.

    Returns:
        None
    """
    os.makedirs(output_directory, exist_ok=True)
    output_abs = os.path.abspath(output_directory)

    for root, dirs, files in os.walk(source_directory):
        # Editing `dirs` in place tells os.walk not to descend into the output
        # folder, whose files would otherwise be copied onto themselves.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != output_abs
        ]

        # Only reachable when source and output are the same folder: its own
        # files are already in place, but its subfolders still get visited.
        if os.path.abspath(root) == output_abs:
            continue

        for filename in files:
            if filename.endswith('.csv') and "swallow" in filename.lower():
                src_file_path = os.path.join(root, filename)
                dest_file_path = os.path.join(output_directory, filename)
                shutil.copy2(src_file_path, dest_file_path)  # keeps file metadata

    print("Copied all swallow files")

# Example use
# directory = "./sEMG_of_Swallow_Cough_and_Speech/Processed"
# output_directory = "./Orig_CSV_Files"
# bring_all_to1(directory, output_directory)


In [4]:
def check_unique_audio_filenames(directory, extensions=('.wav', '.mp3')):
    """
    Report whether any filename occurs more than once under a directory tree.

    Only the base filename is compared, so the same name in two different
    subfolders counts as a duplicate. Files whose names do not end with one of
    ``extensions`` are ignored.

    Args:
        directory (str): Root of the tree to scan (subdirectories included).
        extensions (tuple, optional): Filename suffixes to consider.
            Default is ('.wav', '.mp3').

    Returns:
        bool: True if every matching filename is unique, False otherwise.
            The outcome is also printed, along with each duplicated name.
    """
    seen = set()       # every matching name encountered so far
    repeated = set()   # names encountered at least twice

    for _root, _dirs, names in os.walk(directory):
        for name in names:
            if not name.endswith(extensions):
                continue
            # First sighting goes into `seen`, any later one into `repeated`.
            bucket = repeated if name in seen else seen
            bucket.add(name)

    if not repeated:
        print("All filenames are unique.")
        return True

    print("Duplicate filenames found:")
    for name in repeated:
        print(name)
    return False

# Directory to check for unique filenames
directory = "./Orig_CSV_Files"

# The folder holds CSV files, so the checker's default audio extensions
# ('.wav', '.mp3') would match nothing and always report "unique".
# NOTE(review): a flattened folder can only contain duplicates inside
# subfolders; to detect name clashes that flattening would overwrite, also run
# this on the source tree before copying.
are_filenames_unique = check_unique_audio_filenames(directory, extensions=('.csv',))


All filenames are unique.


# Step 2: change CSV File to contain only contact microphone values and labels

Original CSV columns: **EMG-submental, EMG-intercostal, EMG-diaphragm, pneumotachometry, contact microphone, class labels**

Target CSV columns: **contact microphone, class labels**

In [None]:
import os
import pandas as pd

def keep_last_two_columns_in_csv(directory):
    """
    Trim every CSV file in a directory down to its last two columns, in place.

    Only regular files whose names end in ``.csv`` are touched; sub-folders and
    other files in the directory are ignored. Files with fewer than two columns
    are left unchanged. Each processed file is overwritten with the trimmed
    table (no index column is written).

    Args:
        directory (str): Path to the directory containing the CSV files to be processed.

    Returns:
        None
    """
    for filename in os.listdir(directory):
        csv_file_path = os.path.join(directory, filename)

        # Skip anything pandas should not be asked to parse (folders, notes,
        # hidden OS files, ...).
        if not filename.endswith('.csv') or not os.path.isfile(csv_file_path):
            continue

        df = pd.read_csv(csv_file_path)

        # Nothing to trim when the file has fewer than two columns.
        if len(df.columns) < 2:
            continue

        df.iloc[:, -2:].to_csv(csv_file_path, index=False)

    print("Done")

# Example usage
# directory = "./Orig_CSV_Files"  # Specify the directory containing the CSV files
# keep_last_two_columns_in_csv(directory)


# Step 3: Remove all unnecessary labels

Original label setup:
- 0 - Null (anything outside the other classes)
- 1 - Swallow phase 1 (preparation activity for swallowing such as chewing, sipping etc.)
- 2 - Swallow phase 2 (swallow reflex, larynx elevation following submental muscle contraction)
- 3 - Cough
- 4 - Speech

Target label setup:
- 0 - Null (anything outside the other classes)
- 1 - Swallow phase 2 (swallow reflex, larynx elevation following submental muscle contraction)
- 2 - Cough
- Removed labels:
    - Swallow phase 1 (preparation activity for swallowing such as chewing, sipping etc.)
    - Speech

In [None]:
def update_labels_in_csv(directory, output_directory=None):
    """
    Remap the class labels stored in the last column of every CSV file.

    Mapping applied to the last column:
        2 -> 1, 3 -> 2, and every other value (including 0, 1 and 4) -> 0.

    WARNING: the mapping is not idempotent. Applying it to a file that was
    already remapped turns 2 into 1 and 1 into 0. By default the files are
    overwritten in place, so call this once per dataset, or pass
    ``output_directory`` to keep the originals untouched.

    Args:
        directory (str): Folder containing the CSV files to relabel. Only names
            ending in ``.csv`` are processed.
        output_directory (str, optional): Folder to write the relabelled files
            to (created if missing). When None (default), each file is
            overwritten in place.

    Returns:
        None
    """
    if output_directory is not None:
        os.makedirs(output_directory, exist_ok=True)

    for filename in os.listdir(directory):
        if not filename.endswith('.csv'):
            continue

        csv_file_path = os.path.join(directory, filename)
        df = pd.read_csv(csv_file_path)
        last_column = df.columns[-1]
        labels = df[last_column]

        # Start from "everything is class 0", then mark the two kept classes.
        # Boolean masks replace a per-row Python lambda on multi-million-row files.
        remapped = pd.Series(0, index=labels.index, dtype="int64")
        remapped[labels == 2] = 1
        remapped[labels == 3] = 2
        df[last_column] = remapped

        if output_directory is None:
            target_path = csv_file_path
        else:
            target_path = os.path.join(output_directory, filename)
        df.to_csv(target_path, index=False)

    print("Processed all files")


# NOTE: update_labels_in_csv overwrites each CSV in this folder, so run this cell
# only once per dataset; a second pass would remap the already-remapped labels
# again (2 -> 1 and 1 -> 0).
directory = "./Orig_CSV_Files" # New folder in which all files are stored
update_labels_in_csv(directory)


#### Count current total seconds of audio that are swallows and coughs from all audio files
All audio files were recorded at 2kHz

- Total seconds with label None: 1672.462
- Total seconds with label Swallow: 187.7
- Total seconds with label Cough: 81.3455

In [5]:
#count total frames for each label
def count_labels_in_csv(directory):
    """
    Count how many rows carry label 0, 1 and 2 across all CSV files in a folder.

    The label is read from the last column of each file. Only filenames ending
    in ``.csv`` are considered. The three totals are printed and returned.

    Args:
        directory (str): Folder containing the CSV files.

    Returns:
        tuple: (rows with label 0, rows with label 1, rows with label 2).
    """
    totals = [0, 0, 0]  # position i holds the running count for label i

    csv_names = (name for name in os.listdir(directory) if name.endswith('.csv'))
    for name in csv_names:
        frame = pd.read_csv(os.path.join(directory, name))
        labels = frame[frame.columns[-1]]  # last column is taken as the label column
        for label in range(3):
            totals[label] += (labels == label).sum()

    for label, total in enumerate(totals):
        print(f"Total rows with label {label}: {total}")

    return tuple(totals)

# Folder holding the flattened, relabelled CSV files
directory = "./Orig_CSV_Files"
count_label_0, count_label_1, count_label_2 = count_labels_in_csv(directory)

# One row is one sample; dividing the row counts by 2000 (samples per second)
# converts them to seconds.
count_seconds_0, count_seconds_1, count_seconds_2 = (
    rows / 2000 for rows in (count_label_0, count_label_1, count_label_2)
)

print(f"Total seconds with label None: {count_seconds_0}")
print(f"Total seconds with label Swallow: {count_seconds_1}")
print(f"Total seconds with label Cough: {count_seconds_2}")

Total rows with label 0: 1991894
Total rows with label 1: 359481
Total rows with label 2: 0
Total seconds with label None: 995.947
Total seconds with label Swallow: 179.7405
Total seconds with label Cough: 0.0
