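To process another class, use find-and-replace to change every occurrence of COPD (folder path, condition label and output CSV name) to healthy or asthma, then re-run the notebook. Leave the file names in the final merge cell unchanged, since that cell reads the CSVs of all three classes.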

In [None]:
import librosa
import librosa.display
import os
import numpy as np
import pandas as pd

In [None]:
# Count the WAV recordings in the class folder, printing each name found.
folder_path = 'COPD/'
wav_names = [entry for entry in os.listdir(folder_path) if entry.endswith(".wav")]
for filename in wav_names:
    print(f"Found WAV file: {filename}")
ctr = len(wav_names)
print("no. of files", ctr)

In [None]:
# Load every WAV in the folder and report its sample count and sampling rate.
folder_path = 'COPD/'
ctr1 = 0
for filename in os.listdir(folder_path):
    if not filename.endswith(".wav"):
        continue
    full_file_path = os.path.join(folder_path, filename)
    # sr=None keeps the file's own sampling rate instead of resampling it.
    audio_data, sample_rate = librosa.load(full_file_path, sr=None)
    print(f"Audio data shape: {audio_data.shape}")
    print(f"Sample rate: {sample_rate}")
    ctr1 += 1
print("the no. of files", ctr1)

Audio data shape: (120000,)
Sample rate: 4000
Audio data shape: (882000,)
Sample rate: 44100
Audio data shape: (78528,)
Sample rate: 4000
Audio data shape: (120000,)
Sample rate: 4000
Audio data shape: (50112,)
Sample rate: 4000
Audio data shape: (73920,)
Sample rate: 4000
Audio data shape: (86720,)
Sample rate: 4000
Audio data shape: (102336,)
Sample rate: 4000
Audio data shape: (31424,)
Sample rate: 4000
Audio data shape: (72896,)
Sample rate: 4000
Audio data shape: (67008,)
Sample rate: 4000
Audio data shape: (65472,)
Sample rate: 4000
Audio data shape: (49856,)
Sample rate: 4000
Audio data shape: (74688,)
Sample rate: 4000
Audio data shape: (64960,)
Sample rate: 4000
Audio data shape: (62912,)
Sample rate: 4000
Audio data shape: (67776,)
Sample rate: 4000
Audio data shape: (69056,)
Sample rate: 4000
Audio data shape: (78528,)
Sample rate: 4000
Audio data shape: (50112,)
Sample rate: 4000
Audio data shape: (58304,)
Sample rate: 4000
Audio data shape: (80320,)
Sample rate: 4000
Audio

**Creating Frames and Applying a Hamming Window**

In [None]:
# Slice every recording into overlapping fixed-length frames and collect them.
folder_path = 'COPD/'

frame_size = 1024
hop_size = 512

all_frames = []

wav_names = (entry for entry in os.listdir(folder_path) if entry.endswith(".wav"))
for filename in wav_names:
    audio_file = os.path.join(folder_path, filename)
    audio_data, sample_rate = librosa.load(audio_file, sr=None)

    if len(audio_data) >= frame_size:
        # Transposed so that each row holds one frame of frame_size samples.
        frames = librosa.util.frame(audio_data, frame_length=frame_size, hop_length=hop_size).T
        all_frames.extend(frames)
    else:
        # WARNING: this permanently deletes the recording from disk, so the
        # later feature cells never see files shorter than one frame.
        print(f"Deleting {filename} - Audio duration too short.")
        os.remove(audio_file)


In [None]:
# Apply a Hamming window to every collected frame. `frames` only holds the
# frames of the last file the framing loop processed, so the window is applied
# to the stacked `all_frames` instead. The reshape keeps the result
# two-dimensional (n_frames, frame_size) even when no frames were collected.
windowed_frames = np.asarray(all_frames).reshape(-1, frame_size) * np.hamming(frame_size)

In [None]:
# Discrete Fourier transform of the windowed frames, taken along axis 1.
fft_frames = np.fft.fft(windowed_frames, axis=1)

**Mel-Frequency Cepstral Coefficients**

In [None]:
# MFCC parameters
n_mfcc = 13  # number of coefficients requested from librosa.feature.mfcc
hop_length = 512  # hop between analysis frames, in samples
n_fft = 1024  # FFT window length, in samples
all_mfccs = []  # NOTE(review): not referenced by any other cell in this notebook

In [None]:
# Build the base feature table: one row per recording holding its file name,
# its class label and the time-average of each of its MFCCs (columns 0..12).
#
# The table previously flattened the zero-padded (n_mfcc, n_frames) matrices
# row by row and kept columns 0..12, which are the first 13 time frames of
# coefficient 0 only. Averaging over frames yields one value per coefficient
# and needs no padding.
#
# NOTE(review): recordings are loaded at their native rate (sr=None). If the
# rates differ between files, a fixed n_fft covers a different duration per
# file; consider resampling to a common rate.
file_ids = []
conditions = []
mfcc_data = []

n_mfcc = 13
condition_label = "COPD"

for filename in os.listdir(folder_path):
    if not filename.endswith(".wav"):
        continue

    audio_data, sample_rate = librosa.load(os.path.join(folder_path, filename), sr=None)
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=n_mfcc, hop_length=hop_length, n_fft=n_fft)

    file_ids.append(filename)
    conditions.append(condition_label)
    # mfccs is (n_mfcc, n_frames); axis=1 averages each coefficient over time.
    mfcc_data.append(mfccs.mean(axis=1))

# The reshape keeps the matrix (n_files, n_mfcc) even when the folder is empty.
mfcc_array = np.array(mfcc_data).reshape(-1, n_mfcc)

df = pd.DataFrame(data=mfcc_array, columns=list(range(n_mfcc)))

# Put the identifying columns first: ID, Condition, then the coefficients.
df.insert(0, "Condition", conditions)
df.insert(0, "ID", file_ids)

df.to_csv("COPD_cough_sound_features.csv", index=False)

In [None]:
# Reload the saved feature table from disk and preview its first rows.
df = pd.read_csv('COPD_cough_sound_features.csv')
df.head()

Unnamed: 0,ID,Condition,0,1,2,3,4,5,6,7,8,9,10,11,12
0,195_c2.wav,COPD,-502.16647,-387.6392,-283.3648,-288.13077,-390.53928,-439.0188,-472.5774,-403.4862,-458.9426,-476.58978,-492.50092,-502.1649,-477.74066
1,172_c6.wav,COPD,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,1.427765,11.803424,29.25869,32.88694,27.433182
2,139_c1.wav,COPD,-491.23718,-480.7266,-470.3338,-464.5107,-490.07947,-412.65945,-341.5661,-352.1225,-467.7008,-488.91656,-514.46655,-533.5657,-519.4302
3,109_c6.wav,COPD,-446.0489,-410.0174,-315.77515,-373.10248,-385.96143,-301.6228,-443.59958,-490.3183,-448.6804,-416.61975,-441.0785,-464.54895,-441.43225
4,118_c3.wav,COPD,-384.6302,-359.7019,-391.5107,-492.9893,-347.17148,-321.56885,-372.28412,-506.4485,-432.78928,-452.33557,-459.54395,-455.213,-373.3252


**Spectral Centroid**

In [None]:
# Mean spectral centroid of each recording, stored as a new feature column.
# Iterating over df["ID"] (rather than listing the folder a second time) ties
# every value to its own row, even if the folder contents or the listing
# order changed since the table was built.
spectral_centroids = []

for filename in df["ID"]:
    audio_data, sample_rate = librosa.load(os.path.join(folder_path, filename), sr=None)

    spectral_centroid = librosa.feature.spectral_centroid(y=audio_data, sr=sample_rate, hop_length=hop_length)

    # Collapse the per-frame centroids into a single value for the file.
    spectral_centroids.append(np.mean(spectral_centroid))

df["Spectral Centroid"] = spectral_centroids

df.to_csv("COPD_cough_sound_features.csv", index=False)

In [None]:
# Preview the table with the new 'Spectral Centroid' column.
df.head()

Unnamed: 0,ID,Condition,0,1,2,3,4,5,6,7,8,9,10,11,12,Spectral Centroid
0,195_c2.wav,COPD,-502.16647,-387.6392,-283.3648,-288.13077,-390.53928,-439.0188,-472.5774,-403.4862,-458.9426,-476.58978,-492.50092,-502.1649,-477.74066,123.565732
1,172_c6.wav,COPD,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,1.427765,11.803424,29.25869,32.88694,27.433182,760.072546
2,139_c1.wav,COPD,-491.23718,-480.7266,-470.3338,-464.5107,-490.07947,-412.65945,-341.5661,-352.1225,-467.7008,-488.91656,-514.46655,-533.5657,-519.4302,144.922725
3,109_c6.wav,COPD,-446.0489,-410.0174,-315.77515,-373.10248,-385.96143,-301.6228,-443.59958,-490.3183,-448.6804,-416.61975,-441.0785,-464.54895,-441.43225,90.437312
4,118_c3.wav,COPD,-384.6302,-359.7019,-391.5107,-492.9893,-347.17148,-321.56885,-372.28412,-506.4485,-432.78928,-452.33557,-459.54395,-455.213,-373.3252,114.007344


In [None]:
# Flag recordings whose mean spectral centroid is exactly zero (1), else 0.
is_silent = df['Spectral Centroid'].eq(0)
df['Silence'] = is_silent.astype(int)

In [None]:
# Show the first 16 rows, including the 'Silence' flag.
df.head(16)

Unnamed: 0,ID,Condition,0,1,2,3,4,5,6,7,8,9,10,11,12,Spectral Centroid,Silence
0,195_c2.wav,COPD,-502.16647,-387.6392,-283.3648,-288.13077,-390.53928,-439.0188,-472.5774,-403.4862,-458.9426,-476.58978,-492.50092,-502.1649,-477.74066,123.565732,0
1,172_c6.wav,COPD,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,1.427765,11.803424,29.25869,32.88694,27.433182,760.072546,0
2,139_c1.wav,COPD,-491.23718,-480.7266,-470.3338,-464.5107,-490.07947,-412.65945,-341.5661,-352.1225,-467.7008,-488.91656,-514.46655,-533.5657,-519.4302,144.922725,0
3,109_c6.wav,COPD,-446.0489,-410.0174,-315.77515,-373.10248,-385.96143,-301.6228,-443.59958,-490.3183,-448.6804,-416.61975,-441.0785,-464.54895,-441.43225,90.437312,0
4,118_c3.wav,COPD,-384.6302,-359.7019,-391.5107,-492.9893,-347.17148,-321.56885,-372.28412,-506.4485,-432.78928,-452.33557,-459.54395,-455.213,-373.3252,114.007344,0
5,104_c3.wav,COPD,-388.5859,-370.29153,-532.49255,-486.6878,-520.3562,-440.98712,-551.6132,-557.14124,-574.6848,-583.3087,-557.01715,-541.4152,-521.9107,108.814034,0
6,175_c5.wav,COPD,-605.3874,-554.5158,-513.2715,-573.0767,-500.68304,-438.03073,-439.70734,-365.74918,-411.73743,-433.5634,-339.9859,-459.35208,-483.6197,138.15469,0
7,104_c2.wav,COPD,-449.53156,-371.8938,-367.07578,-461.74634,-432.5731,-418.87833,-460.24323,-477.7756,-430.76407,-385.11682,-401.2796,-362.20447,-472.35275,187.859628,0
8,124_c6.wav,COPD,-270.891,-289.95966,-411.81372,-309.05798,-367.5699,-389.1186,-425.40622,-448.86478,-475.5593,-268.87527,-327.5224,-371.0781,-425.37094,126.080224,0
9,124_c3.wav,COPD,-487.30246,-472.6468,-510.56808,-359.8449,-505.825,-261.70206,-354.82812,-458.23755,-430.12604,-475.15375,-483.09662,-500.1792,-529.5053,104.513342,0


In [None]:
# Remove the temporary 'Silence' flag from df in place; it was only needed
# for inspection.
del df['Silence']

In [None]:
# Confirm that the 'Silence' column is gone.
df.head()

Unnamed: 0,ID,Condition,0,1,2,3,4,5,6,7,8,9,10,11,12,Spectral Centroid
0,195_c2.wav,COPD,-502.16647,-387.6392,-283.3648,-288.13077,-390.53928,-439.0188,-472.5774,-403.4862,-458.9426,-476.58978,-492.50092,-502.1649,-477.74066,123.565732
1,172_c6.wav,COPD,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,1.427765,11.803424,29.25869,32.88694,27.433182,760.072546
2,139_c1.wav,COPD,-491.23718,-480.7266,-470.3338,-464.5107,-490.07947,-412.65945,-341.5661,-352.1225,-467.7008,-488.91656,-514.46655,-533.5657,-519.4302,144.922725
3,109_c6.wav,COPD,-446.0489,-410.0174,-315.77515,-373.10248,-385.96143,-301.6228,-443.59958,-490.3183,-448.6804,-416.61975,-441.0785,-464.54895,-441.43225,90.437312
4,118_c3.wav,COPD,-384.6302,-359.7019,-391.5107,-492.9893,-347.17148,-321.56885,-372.28412,-506.4485,-432.78928,-452.33557,-459.54395,-455.213,-373.3252,114.007344


**Removing Silent Files**

In [None]:
# Drop rows where the 'Spectral Centroid' is 0, i.e. files that are silent.
# The index is renumbered to 0..n-1 so that later index-aligned operations
# (such as pd.concat(..., axis=1)) still line up row for row, and so that df
# is a new frame rather than a slice of the old one.
df = df[df['Spectral Centroid'] != 0].reset_index(drop=True)

**Spectral Rolloff**

In [None]:
def _mean_rolloff(wav_name):
    """Return the mean spectral roll-off of one recording in folder_path."""
    signal, rate = librosa.load(os.path.join(folder_path, wav_name), sr=None)
    per_frame = librosa.feature.spectral_rolloff(y=signal, sr=rate)[0]
    return np.mean(per_frame)


# One roll-off value per row of df, in row order.
spectral_rolloffs = [_mean_rolloff(wav_name) for wav_name in df['ID']]

df['Spectral Roll-off'] = spectral_rolloffs

df.to_csv("COPD_cough_sound_features.csv", index=False)

In [None]:
# Preview the table with the new 'Spectral Roll-off' column.
df.head(5)

Unnamed: 0,ID,Condition,0,1,2,3,4,5,6,7,8,9,10,11,12,Spectral Centroid,Spectral Roll-off
0,195_c2.wav,COPD,-502.16647,-387.6392,-283.3648,-288.13077,-390.53928,-439.0188,-472.5774,-403.4862,-458.9426,-476.58978,-492.50092,-502.1649,-477.74066,123.565732,256.665559
1,172_c6.wav,COPD,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,-510.57242,1.427765,11.803424,29.25869,32.88694,27.433182,760.072546,930.826757
2,139_c1.wav,COPD,-491.23718,-480.7266,-470.3338,-464.5107,-490.07947,-412.65945,-341.5661,-352.1225,-467.7008,-488.91656,-514.46655,-533.5657,-519.4302,144.922725,296.10136
3,109_c6.wav,COPD,-446.0489,-410.0174,-315.77515,-373.10248,-385.96143,-301.6228,-443.59958,-490.3183,-448.6804,-416.61975,-441.0785,-464.54895,-441.43225,90.437312,162.616356
4,118_c3.wav,COPD,-384.6302,-359.7019,-391.5107,-492.9893,-347.17148,-321.56885,-372.28412,-506.4485,-432.78928,-452.33557,-459.54395,-455.213,-373.3252,114.007344,221.938776


**Zero Crossing Rate**

In [None]:
# Mean zero-crossing rate of each recording, in the row order of df.
zero_crossing_rates = []

for wav_name in df['ID']:
    # The sampling rate is not needed for this feature.
    signal, _ = librosa.load(os.path.join(folder_path, wav_name), sr=None)
    per_frame_zcr = librosa.feature.zero_crossing_rate(signal)[0]
    zero_crossing_rates.append(np.mean(per_frame_zcr))

df['Zero Crossing Rate'] = zero_crossing_rates

df.to_csv("COPD_cough_sound_features.csv", index=False)

**Delta MFCC (mean)**

In [None]:
# Mean delta-MFCC of each recording, stored as one feature column.
#
# The mean is taken over the recording's own delta coefficients. Averaging a
# vector that was zero-padded to the longest recording (as was done before)
# scales each value by that file's length relative to the longest file in the
# folder, which makes the feature depend on the other recordings.
delta_mfccs_list = []

for filename in df['ID']:
    file_path = os.path.join(folder_path, filename)

    audio_data, sample_rate = librosa.load(file_path, sr=None)

    mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13)
    delta_mfccs = librosa.feature.delta(mfccs)

    # np.mean averages over all coefficients and frames of this file only.
    delta_mfccs_list.append(np.mean(delta_mfccs))

df['Delta MFCC Mean'] = delta_mfccs_list

**Wavelet Mean & STD**

In [None]:
import pywt

# Mean and standard deviation of each recording's level-5 'db4' wavelet
# coefficients, pooled over all decomposition levels.
wavelet_stats = []

for wav_name in df['ID']:
    signal, _ = librosa.load(os.path.join(folder_path, wav_name), sr=None)

    # Concatenate the coefficient arrays of every level into one vector.
    pooled = np.hstack(pywt.wavedec(signal, 'db4', level=5))
    wavelet_stats.append((np.mean(pooled), np.std(pooled)))

wavelet_mean_list = [mean for mean, _ in wavelet_stats]
wavelet_std_list = [std for _, std in wavelet_stats]

df['Wavelet Mean'] = wavelet_mean_list
df['Wavelet STD'] = wavelet_std_list

**Linear Predictive Coding (mean)**

In [None]:
# Mean of the order-10 linear-prediction coefficients of each recording.
lpc_order = 10


def _mean_lpc(wav_name):
    """Return the mean of the LPC coefficients of one recording."""
    signal, _ = librosa.load(os.path.join(folder_path, wav_name), sr=None)
    return np.mean(librosa.lpc(signal, order=lpc_order))


lpc_coeffs_list = [_mean_lpc(wav_name) for wav_name in df['ID']]

df['LPC Mean'] = lpc_coeffs_list

df.to_csv("COPD_cough_sound_features.csv", index=False)

**Chroma STFT (Short-Time Fourier Transform)**

In [None]:
# Time-averaged chroma vector (12 pitch classes) of each recording, appended
# to the feature table as columns Chroma_0 .. Chroma_11.
chroma_stft_features = []

for filename in df['ID']:
    file_path = os.path.join(folder_path, filename)
    audio_data, sample_rate = librosa.load(file_path, sr=None)

    chroma_stft = librosa.feature.chroma_stft(y=audio_data, sr=sample_rate)
    # axis=1 averages over frames, leaving one value per chroma bin.
    chroma_stft_mean = np.mean(chroma_stft, axis=1)
    chroma_stft_features.append(chroma_stft_mean)

# pd.concat(axis=1) aligns rows by index label, so the chroma table must carry
# df's own index. With a fresh 0..n-1 index, rows would be misaligned and
# all-NaN rows appended whenever df's index has gaps (e.g. after silent rows
# were filtered out).
chroma_stft_df = pd.DataFrame(
    chroma_stft_features,
    columns=[f'Chroma_{i}' for i in range(12)],
    index=df.index,
)

df = pd.concat([df, chroma_stft_df], axis=1)

df.to_csv("COPD_cough_sound_features.csv", index=False)

**Root Mean Square Energy**

In [None]:
# Mean root-mean-square energy of each recording, in the row order of df.
rmse_features = []

for wav_name in df['ID']:
    signal, _ = librosa.load(os.path.join(folder_path, wav_name), sr=None)
    frame_energy = librosa.feature.rms(y=signal)
    rmse_features.append(np.mean(frame_energy))

df['RMSE'] = rmse_features

df.to_csv("COPD_cough_sound_features.csv", index=False)

In [None]:
# Merge the per-class feature tables into a single dataset file.
file_path1 = 'healthy_cough_sound_features.csv'
file_path2 = 'asthma_cough_sound_features.csv'
file_path3 = 'COPD_cough_sound_features.csv'

df1, df2, df3 = (pd.read_csv(csv_path) for csv_path in (file_path1, file_path2, file_path3))

# ignore_index renumbers the rows 0..n-1 across the three classes.
combined_df = pd.concat([df1, df2, df3], ignore_index=True)

# NOTE(review): unlike the per-class CSVs, this file is written with its row
# index as an extra unnamed first column; confirm the next stage expects it.
combined_df.to_csv('pre_smote_dataset.csv')