# Importing libraries and data

In [None]:
import os
import glob

# Notebook-level switch; no other visible cell reads it.
# NOTE(review): presumably meant to skip the inference ("Try out") cells — confirm or remove.
TRAINING_ONLY = True

In [None]:
import kagglehub

# Pull the latest version of the dataset through kagglehub and keep the
# directory it reports in `path`.
DATASET_HANDLE = "vbookshelf/respiratory-sound-database"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

In [None]:
# Root of the downloaded dataset. Prefer the directory kagglehub reported in the
# download cell, so a newer dataset version or a different cache location does
# not silently yield zero files. Fall back to the previously hard-coded Colab
# location when the download cell has not been run in this session.
try:
    dataset_path = path
except NameError:
    dataset_path = '/root/.cache/kagglehub/datasets/vbookshelf/respiratory-sound-database/versions/2'

audio_path = os.path.join(dataset_path, 'respiratory_sound_database', 'Respiratory_Sound_Database', 'audio_and_txt_files')

# glob returns files in filesystem order; sorting keeps the row order of every
# table built from this list (and therefore the seeded splits) reproducible.
audio_files = sorted(glob.glob(os.path.join(audio_path, '**/*.wav'), recursive=True))

# print(glob.glob(os.path.join(dataset_path, '*'), recursive=True))

print(f"Found {len(audio_files)} audio files.")
if not audio_files:
    print(f"Warning: no .wav files found under {audio_path}; check dataset_path.")

# EDA

# Data Preparation

In [None]:
import os
import glob
import random
import numpy as np
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Per-patient demographics: space-separated text file without a header row.
# NOTE: the variable is named `patient_diagnosis` for historical reasons but
# points at the demographics file; the next cell rebinds it to the diagnosis CSV.
patient_diagnosis = os.path.join(dataset_path, 'demographic_info.txt')
col_names = ['patient_id', 'age', 'sex', 'adult_bmi', 'child_weight', 'child_height']
df_demo = pd.read_csv(patient_diagnosis, sep=" ", header=None, names=col_names)


# Load the disease diagnosis information.
# This used to read demographic_info.txt, whose second column is the age (see
# col_names above), so the mapping held ages instead of diagnoses. Read the
# comma-separated patient_diagnosis.csv instead. Keys stay strings, as before.
diagnosis_mapping = {}
diagnosis_file = os.path.join(dataset_path, 'respiratory_sound_database', 'Respiratory_Sound_Database', 'patient_diagnosis.csv')
with open(diagnosis_file, 'r') as f:
    for line in f:
        parts_diag = [field.strip() for field in line.split(',')]
        # Skip blank or malformed rows instead of storing partial entries.
        if len(parts_diag) >= 2 and parts_diag[0]:
            patient_id = parts_diag[0]
            diagnosis = parts_diag[1]
            diagnosis_mapping[patient_id] = diagnosis

In [None]:
# Two-column CSV (no header row) pairing each patient ID with a diagnosis.
patient_diagnosis = os.path.join(
    dataset_path,
    'respiratory_sound_database',
    'Respiratory_Sound_Database',
    'patient_diagnosis.csv',
)
diag_columns = ['patient_id', 'diagnosis']
df_diag = pd.read_csv(patient_diagnosis, names=diag_columns, header=None)

# Number of patients per diagnosis.
print(df_diag['diagnosis'].value_counts())

In [None]:
def extract_features(file_path, sr=22050, n_mfcc=13):
    """
    Summarise one audio file as a flat dict of per-file statistics.

    The file is loaded with librosa at sample rate `sr`. For MFCCs
    (`n_mfcc` coefficients) and chroma, the mean and standard deviation of
    each row are stored; for spectral centroid, zero crossing rate and
    spectral bandwidth, a single mean and standard deviation are stored.

    Returns None, after printing a message, when the file cannot be loaded
    or contains no samples.
    """
    try:
        signal, sr = librosa.load(file_path, sr=sr)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None

    if signal.size == 0:
        print(f"File {file_path} is empty.")
        return None

    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

    # Features reduced to one mean/std pair over all of their values.
    scalar_sources = (
        ('spec_centroid', librosa.feature.spectral_centroid(y=signal, sr=sr)),
        ('zcr', librosa.feature.zero_crossing_rate(signal)),
        ('spec_bandwidth', librosa.feature.spectral_bandwidth(y=signal, sr=sr)),
    )

    chroma = librosa.feature.chroma_stft(y=signal, sr=sr)

    # Key insertion order is kept exactly as before: MFCC pairs, then the
    # scalar features, then chroma pairs.
    features = {}

    mfcc_mean, mfcc_std = np.mean(mfcc, axis=1), np.std(mfcc, axis=1)
    for k in range(1, n_mfcc + 1):
        features[f'mfcc_{k}_mean'] = mfcc_mean[k - 1]
        features[f'mfcc_{k}_std'] = mfcc_std[k - 1]

    for prefix, values in scalar_sources:
        features[f'{prefix}_mean'] = np.mean(values)
        features[f'{prefix}_std'] = np.std(values)

    chroma_mean, chroma_std = np.mean(chroma, axis=1), np.std(chroma, axis=1)
    for k in range(1, chroma.shape[0] + 1):
        features[f'chroma_{k}_mean'] = chroma_mean[k - 1]
        features[f'chroma_{k}_std'] = chroma_std[k - 1]

    return features

In [None]:
# Augmentation functions (no redefinition of time_stretch)
def pitch_shift(y, sr, n_steps=4):
    """Return the signal `y` shifted in pitch by `n_steps` semitones."""
    # The sample rate goes in by keyword, not positionally.
    shifted = librosa.effects.pitch_shift(y, n_steps=n_steps, sr=sr)
    return shifted

def add_noise(y, noise_level=0.005):
    """Return `y` plus standard-normal noise scaled by `noise_level`.

    The noise is drawn from NumPy's global random state, one sample per
    element of `y`.
    """
    return y + noise_level * np.random.standard_normal(size=len(y))

def change_volume(y, gain=1.5):
    """Return the signal `y` multiplied by the factor `gain`."""
    scaled = gain * y
    return scaled

def apply_random_augmentation(y, sr):
    """Apply at most one randomly chosen augmentation to `y` and return the result.

    One of time stretch, pitch shift, added noise, volume change or "no
    change" is picked uniformly; the parameter of the chosen transform is
    then drawn at random as well.
    """
    chosen = random.choice(["time_stretch", "pitch_shift", "add_noise", "change_volume", None])

    if chosen is None:
        # "No augmentation" was drawn: hand the signal back untouched.
        return y

    if chosen == "time_stretch":
        # librosa is called directly; stretch rate is drawn from [0.8, 1.5].
        stretch_rate = random.uniform(0.8, 1.5)
        return librosa.effects.time_stretch(y, rate=stretch_rate)

    if chosen == "pitch_shift":
        # Whole-semitone shift drawn from -5..5 inclusive.
        semitones = random.randint(-5, 5)
        return pitch_shift(y, sr, n_steps=semitones)

    if chosen == "add_noise":
        level = random.uniform(0.001, 0.01)
        return add_noise(y, noise_level=level)

    # Only "change_volume" remains; gain is drawn from [0.5, 2.0].
    factor = random.uniform(0.5, 2.0)
    return change_volume(y, gain=factor)

In [36]:
# Parallel lists: feature_list[i] holds the feature dict of a recording and
# labels[i] its class label.
feature_list = []
labels = []

# Diagnoses that are kept as class labels (only "Healthy" is renamed). Any other
# diagnosis, and any patient without a diagnosis row, is labelled "unknown";
# those rows are filtered out when the feature DataFrame is built.
LABEL_BY_DIAGNOSIS = {
    'Healthy': 'healthy',
    'COPD': 'COPD',
    'LRTI': 'LRTI',
    'URTI': 'URTI',
    'Bronchiectasis': 'Bronchiectasis',
    'Pneumonia': 'Pneumonia',
    'Bronchiolitis': 'Bronchiolitis',
}

# Build the patient -> diagnosis lookup once instead of filtering df_diag for
# every file. keep='first' mirrors the previous behaviour of taking the first
# matching row.
diagnosis_by_patient = (
    df_diag.drop_duplicates(subset='patient_id', keep='first')
    .set_index('patient_id')['diagnosis']
    .to_dict()
)

for file in audio_files:
    feats = extract_features(file)
    if feats is None:
        # Unreadable or empty recording: contributes neither features nor label.
        continue

    # The patient ID is the first underscore-separated part of the file name.
    patient_id = os.path.basename(file).split('_')[0]
    try:
        diagnosis = diagnosis_by_patient.get(int(patient_id))
    except ValueError:
        # Non-numeric prefix: treat like a patient with no diagnosis instead of
        # aborting the whole extraction run.
        diagnosis = None

    if diagnosis is None:
        label = "unknown"
        print(f"Warning: Missing diagnosis for patient {patient_id}. Label set to 'unknown'.")
    else:
        label = LABEL_BY_DIAGNOSIS.get(diagnosis, "unknown")

    feature_list.append(feats)
    labels.append(label)

In [40]:
# One row per recording: the extracted feature columns plus its class label.
df_features = pd.DataFrame(feature_list).assign(label=labels)

# Discard recordings whose diagnosis could not be mapped to a known class.
has_known_label = df_features['label'].ne('unknown')
df_features = df_features.loc[has_known_label]

# Debug: shape and label distribution of what is left after the filter above.
print("Feature DataFrame shape:", df_features.shape)
print("-"*50)
print(df_features['label'].value_counts())
# print("-"*50)
# print(df_features.head())

Feature DataFrame shape: (919, 57)
--------------------------------------------------
label
COPD              793
Pneumonia          37
healthy            35
URTI               23
Bronchiectasis     16
Bronchiolitis      13
LRTI                2
Name: count, dtype: int64


In [None]:
# Create a DataFrame from augmented features and labels.
# NOTE(review): augmented_feature_list / augmented_labels are not built in any
# visible cell; they have to be produced (presumably with
# apply_random_augmentation) before this cell can run.
df_augmented_features = pd.DataFrame(augmented_feature_list)
df_augmented_features['label'] = augmented_labels

# Check the augmented feature DataFrame shape
print("Augmented Feature DataFrame shape:", df_augmented_features.shape)
print("-" * 50)
print(df_augmented_features['label'].value_counts())

# Combine augmented data with the original data
df_combined = pd.concat([df_features, df_augmented_features], ignore_index=True)

# Shuffle the combined dataset
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Full feature matrix, label vector and a fitted label encoder. Later cells
# (cross-validation, classification reports, the inference helper) refer to
# X, y and le, which were never defined anywhere in the notebook.
X = df_combined.drop('label', axis=1)
y = df_combined['label']
le = LabelEncoder().fit(y)

# Split into train-test sets.
# NOTE(review): augmented copies of a recording can end up on both sides of
# this split, which can inflate test scores — consider splitting by
# recording/patient before augmenting.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=42)

# Check new label distribution after augmentation
print("Combined label distribution after augmentation:")
print(df_combined['label'].value_counts())

# Model Building

## RandomForest Classification

In [None]:
import os
import glob
import numpy as np
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

In [None]:
# Random Forest with deliberately restrictive hyperparameters.
# Earlier alternative kept for reference:
#   RandomForestClassifier(n_estimators=100, random_state=42)
rf_settings = {
    'n_estimators': 50,        # reduced number of trees
    'max_depth': 10,           # cap tree depth to limit overfitting
    'min_samples_split': 6,    # a node needs this many samples to be split
    'min_samples_leaf': 4,     # every leaf keeps at least this many samples
    'max_features': 0.5,       # each split considers half of the features
    'bootstrap': True,         # each tree is fit on a bootstrap sample
    'random_state': 42,
}
clf = RandomForestClassifier(**rf_settings)
clf.fit(X_train, y_train)

In [None]:
# Score the fitted forest on the held-out split.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Test Accuracy: {accuracy*100:.2f}%")

In [None]:
# GridSearchCV is only imported in a later (SVM) cell; import it here so this
# cell also works when the notebook is run top to bottom.
from sklearn.model_selection import GridSearchCV

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
print("-"*50)

# Classification Report.
# The labels are already strings, so the report names its rows itself; the
# previous target_names=le.classes_ relied on an encoder `le` that no cell
# defines.
cr = classification_report(y_test, y_pred)
print("Classification Report:")
print(cr)
print("-"*50)

# Perform 5-fold cross-validation on the full combined dataset. The features
# and labels are taken from df_combined because the previously referenced
# X / y were never defined.
X_all = df_combined.drop('label', axis=1)
y_all = df_combined['label']
rf = RandomForestClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(rf, X_all, y_all, cv=5, scoring='accuracy')
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

# Hyperparameter grid for the Random Forest, searched on the training split only.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")

## SVM

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [None]:
# Linear-kernel SVM placed behind a StandardScaler in one pipeline.
# Alternative left by the author: SVC(kernel='rbf', probability=True, random_state=42)
linear_svc = SVC(kernel='linear', probability=True, C=0.1, random_state=42)
svm_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc', linear_svc),
])

# Values of C to try; the "svc__" prefix addresses the pipeline step named 'svc'.
# Alternatives left by the author: C in [0.1, 1, 10, 100], gamma in [0.001, 0.01, 0.1, 1].
param_grid = {'svc__C': [0.1, 1, 10]}

# 5-fold cross-validated grid search on the training split, using all cores.
grid_search = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
print("Best cross validation accuracy:", grid_search.best_score_)

In [None]:
# Keep the best pipeline found by the grid search, then score the search
# object's predictions on the held-out split.
svm = grid_search.best_estimator_
y_pred_svm = grid_search.predict(X_test)
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)

print(f"SVM Test Accuracy: {accuracy_svm*100:.2f}%")

In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_svm)
print("Confusion Matrix:")
print(cm)

# Classification Report.
# The labels are already strings, so the report names its rows itself; the
# previous target_names=le.classes_ relied on an encoder `le` that no cell
# defines.
cr = classification_report(y_test, y_pred_svm)
print("Classification Report:")
print(cr)

# Evaluate the best SVM pipeline using cross-validation on the training set
cv_scores_svm = cross_val_score(grid_search.best_estimator_, X_train, y_train, cv=5, scoring='accuracy')
print("SVM Cross-validation scores:", cv_scores_svm)
print("Mean SVM CV score:", cv_scores_svm.mean())

## Ensemble

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Base learners of the stack. The SVM is wrapped with a StandardScaler, the
# same way as in the stand-alone SVM cell: the feature columns live on very
# different scales, and the SVC was previously fed unscaled data here.
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)),
    ('svm', Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(kernel='linear', C=0.1, random_state=42)),
    ])),
]

# Logistic regression combines the base learners' outputs; the iteration cap is
# raised so the solver is not cut off before it converges.
stack_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(max_iter=1000),
)
stack_model.fit(X_train, y_train)

# Held-out accuracy of the stacked model.
y_pred_stack = stack_model.predict(X_test)
accuracy_stack = accuracy_score(y_test, y_pred_stack)
print(f"Stacking Model Test Accuracy: {accuracy_stack*100:.2f}%")

In [None]:
# from sklearn.ensemble import VotingClassifier

# # Create an ensemble classifier with soft voting
# ensemble_model = VotingClassifier(
#     estimators=[('rf', clf), ('svm', svm)],
#     voting='soft',
#     weights=[3, 2]
# )

# # Train the ensemble model on the training data
# ensemble_model.fit(X_train, y_train)

# # Predict on the test set and evaluate accuracy
# y_pred_ensemble = ensemble_model.predict(X_test)
# accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
# print(f"Ensemble Model Test Accuracy: {accuracy_ensemble*100:.2f}%")

# Try out

In [None]:
def predict_lung_cancer_probability(wav_file, ensemble_model, feature_columns, label_encoder):
    """Return the model's probability that a recording is not healthy.

    NOTE(review): despite the name, the labels built in this notebook are
    respiratory diagnoses ('healthy', 'COPD', 'Pneumonia', ...), not lung
    cancer. The name is kept so existing calls keep working.

    Args:
        wav_file: path of the .wav file to score.
        ensemble_model: fitted classifier exposing `predict_proba`.
        feature_columns: training feature columns, in training order; features
            missing from the extraction are filled with 0.
        label_encoder: object whose `classes_` lists the class names.
            Assumes these are in the same order as the columns of
            `predict_proba` — TODO confirm for the model passed in.

    Returns:
        The probability of the 'diseased' class when that class exists;
        otherwise 1 minus the probability of the 'healthy' class. None when
        feature extraction fails.

    Raises:
        ValueError: if `classes_` contains neither 'diseased' nor 'healthy'.
    """
    # Extract features using the same function as for training
    feats = extract_features(wav_file)
    if feats is None:
        print(f"Error extracting features from {wav_file}.")
        return None

    # Single-row frame aligned to the training columns.
    input_data = pd.DataFrame([feats]).reindex(columns=feature_columns, fill_value=0)

    probas = ensemble_model.predict_proba(input_data)
    class_names = list(label_encoder.classes_)

    if 'diseased' in class_names:
        return probas[0, class_names.index('diseased')]

    # Multi-class labels have no 'diseased' class; the previous lookup raised
    # IndexError here. "Not healthy" is the complement of the 'healthy' class.
    if 'healthy' in class_names:
        return 1.0 - probas[0, class_names.index('healthy')]

    raise ValueError(
        "label_encoder.classes_ contains neither 'diseased' nor 'healthy': "
        f"{class_names}"
    )

In [None]:
# Feature columns in training order (everything except the label column), and
# the recordings to score.
feature_columns = df_features.columns.drop("label")
wav_tests = glob.glob('/content/test/*.wav')

In [None]:
# The VotingClassifier cell that would define `ensemble_model` is commented
# out, and no cell creates `le`. Use them when they exist; otherwise fall back
# to the fitted stacking model and an encoder fitted on the training labels.
try:
    inference_model = ensemble_model
except NameError:
    inference_model = stack_model

try:
    inference_encoder = le
except NameError:
    inference_encoder = LabelEncoder().fit(y_train)

# NOTE(review): the message says "lung cancer", but the classes in this
# notebook are respiratory diagnoses — confirm the intended wording.
for test in wav_tests:
    probability = predict_lung_cancer_probability(test, inference_model, feature_columns, inference_encoder)
    if probability is not None:
        print(f"Probability of lung cancer: {probability*100:.2f}%")

In [None]:
from sklearn.calibration import CalibratedClassifierCV

# The VotingClassifier cell that would define `ensemble_model` is commented
# out, and no cell creates `le`. Use them when they exist; otherwise fall back
# to the stacking model and an encoder fitted on the training labels.
try:
    model_to_calibrate = ensemble_model
except NameError:
    model_to_calibrate = stack_model

try:
    calibration_encoder = le
except NameError:
    calibration_encoder = LabelEncoder().fit(y_train)

# Wrap the ensemble classifier with isotonic calibration (5-fold).
calibrated_ensemble = CalibratedClassifierCV(model_to_calibrate, cv=5, method='isotonic')
calibrated_ensemble.fit(X_train, y_train)

# Now predict probabilities on new samples.
# NOTE(review): the message says "lung cancer", but the classes in this
# notebook are respiratory diagnoses — confirm the intended wording.
for test in wav_tests:
    probability = predict_lung_cancer_probability(test, calibrated_ensemble, feature_columns, calibration_encoder)
    if probability is not None:
        print(f"Calibrated probability of lung cancer: {probability*100:.2f}%")