In [1]:
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight
import joblib


In [2]:

# Locations of the pre-processed audio chunk arrays (relative to the working directory).
healthy_npy = "healthy_processed_audio.npy"
unhealthy_npy = "unhealthy_processed_audio.npy"

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects, which
# can execute code from a tampered file. Only load these .npy files from a trusted source.
healthy_processed_audio = np.load(healthy_npy, allow_pickle=True)
unhealthy_processed_audio = np.load(unhealthy_npy, allow_pickle=True)

# Quick sanity check on what was loaded for each class.
print("Healthy audio chunks shape:", healthy_processed_audio.shape)
print("Unhealthy audio chunks shape:", unhealthy_processed_audio.shape)


Healthy audio chunks shape: (42382, 10000)
Unhealthy audio chunks shape: (13934, 10000)


In [3]:
def extract_features(audio_chunks, sr=22050, n_mfcc=13):
    """Summarise every audio chunk as the mean and std of its MFCCs.

    Parameters
    ----------
    audio_chunks : iterable of array-like
        Audio chunks; each one is passed as ``y`` to ``librosa.feature.mfcc``.
    sr : int, default 22050
        Sampling rate handed to librosa.
        NOTE(review): presumably the rate the chunks were recorded/resampled at — confirm.
    n_mfcc : int, default 13
        Number of MFCC coefficients computed per chunk.

    Returns
    -------
    numpy.ndarray
        One row per chunk with ``2 * n_mfcc`` columns: the per-coefficient means
        followed by the per-coefficient standard deviations (both taken along
        axis 1 of the MFCC matrix). Empty input yields shape ``(0, 2 * n_mfcc)``
        so the result can still be stacked with another class's feature matrix.
    """
    n_columns = 2 * n_mfcc

    # Short-circuit sized-but-empty inputs; generators have no len() and fall through.
    try:
        n_chunks = len(audio_chunks)
    except TypeError:
        n_chunks = None
    if n_chunks == 0:
        return np.empty((0, n_columns))

    features = []
    for chunk in tqdm(audio_chunks, desc="Extracting MFCCs"):
        mfcc = librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=n_mfcc)
        # Collapse axis 1 into two summary statistics per coefficient.
        mfcc_mean = np.mean(mfcc, axis=1)
        mfcc_std = np.std(mfcc, axis=1)
        features.append(np.concatenate([mfcc_mean, mfcc_std]))

    # An exhausted/empty iterator would otherwise produce a 1-D array of shape (0,).
    if not features:
        return np.empty((0, n_columns))
    return np.array(features)

# --- 1. Feature extraction: one MFCC summary vector per chunk, per class ---
print("Extracting features for Healthy...")
X_healthy = extract_features(healthy_processed_audio)

print("Extracting features for Unhealthy...")
X_unhealthy = extract_features(unhealthy_processed_audio)

# --- 2. Design matrix and labels (0 = healthy, 1 = unhealthy) ---
X = np.vstack((X_healthy, X_unhealthy))
y = np.concatenate((
    np.zeros(len(X_healthy), dtype=int),
    np.ones(len(X_unhealthy), dtype=int),
))

# --- 3. Stratified 80/20 hold-out split ---
# NOTE(review): the split is done per chunk. If several chunks originate from the same
# recording (cannot tell from here), related chunks may land on both sides of the split
# and inflate test scores; a group-aware split would be safer — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- 4. Standardise using statistics learned from the training portion only ---
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- 5. Baseline: plain logistic regression ---
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# --- 6. Hold-out metrics ---
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Extracting features for Healthy...


Extracting MFCCs: 100%|██████████████████| 42382/42382 [04:21<00:00, 162.37it/s]


Extracting features for Unhealthy...


Extracting MFCCs: 100%|██████████████████| 13934/13934 [01:19<00:00, 175.78it/s]



Accuracy: 0.859375

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.94      0.91      8477
           1       0.76      0.63      0.69      2787

    accuracy                           0.86     11264
   macro avg       0.82      0.78      0.80     11264
weighted avg       0.85      0.86      0.85     11264


Confusion Matrix:
 [[7932  545]
 [1039 1748]]


In [4]:
# The classes are imbalanced, so three ways of compensating are compared below,
# each with the same logistic-regression model and the same hold-out set.


def _fit_and_report(header, model, X_fit, y_fit, X_eval, y_eval):
    """Fit `model` on (X_fit, y_fit), predict on X_eval, print the metrics, return the predictions."""
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)

    print(header)
    print("Accuracy:", accuracy_score(y_eval, predictions))
    print(classification_report(y_eval, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_eval, predictions))
    return predictions


# -----------------------------
# Option A: re-weight classes inside the loss (class_weight="balanced")
# -----------------------------
clf_balanced = LogisticRegression(max_iter=1000, class_weight="balanced")
y_pred_balanced = _fit_and_report(
    "\n=== Logistic Regression (class_weight balanced) ===",
    clf_balanced, X_train, y_train, X_test, y_test,
)

# -----------------------------
# Option B: SMOTE oversampling (applied to the training data only)
# -----------------------------
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

clf_smote = LogisticRegression(max_iter=1000)
y_pred_smote = _fit_and_report(
    "\n=== Logistic Regression with SMOTE Oversampling ===",
    clf_smote, X_train_sm, y_train_sm, X_test, y_test,
)

# -----------------------------
# Option C: random undersampling (applied to the training data only)
# -----------------------------
undersample = RandomUnderSampler(random_state=42)
X_train_us, y_train_us = undersample.fit_resample(X_train, y_train)

clf_us = LogisticRegression(max_iter=1000)
y_pred_us = _fit_and_report(
    "\n=== Logistic Regression with Undersampling ===",
    clf_us, X_train_us, y_train_us, X_test, y_test,
)



=== Logistic Regression (class_weight balanced) ===
Accuracy: 0.8425071022727273
              precision    recall  f1-score   support

           0       0.95      0.84      0.89      8477
           1       0.64      0.85      0.73      2787

    accuracy                           0.84     11264
   macro avg       0.79      0.85      0.81     11264
weighted avg       0.87      0.84      0.85     11264

Confusion Matrix:
 [[7110 1367]
 [ 407 2380]]

=== Logistic Regression with SMOTE Oversampling ===
Accuracy: 0.8440163352272727
              precision    recall  f1-score   support

           0       0.95      0.84      0.89      8477
           1       0.64      0.85      0.73      2787

    accuracy                           0.84     11264
   macro avg       0.79      0.85      0.81     11264
weighted avg       0.87      0.84      0.85     11264

Confusion Matrix:
 [[7130 1347]
 [ 410 2377]]

=== Logistic Regression with Undersampling ===
Accuracy: 0.8427734375
              preci

In [7]:
# === Random Forest ===
print("\n=== Random Forest ===")
rf_params = {
    "n_estimators": 300,          # number of trees
    "max_depth": 20,              # limit tree depth
    "min_samples_split": 10,      # split only if enough samples
    "min_samples_leaf": 5,        # prevent overfitting on small leaves
    "class_weight": "balanced",   # handle imbalance
    "n_jobs": -1,
    "random_state": 42,
}
clf_rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

# === XGBoost ===
print("\n=== XGBoost ===")
# Ratio of class-0 to class-1 training samples, used to up-weight the positive class.
n_negative = int(np.count_nonzero(y_train == 0))
n_positive = int(np.count_nonzero(y_train == 1))
xgb_params = {
    "n_estimators": 400,          # number of boosting rounds
    "max_depth": 8,               # tree depth
    "learning_rate": 0.05,        # smaller LR with more estimators
    "subsample": 0.8,             # random subsampling for robustness
    "colsample_bytree": 0.8,      # feature subsampling
    "scale_pos_weight": n_negative / n_positive,  # handle imbalance
    "eval_metric": "logloss",
    "n_jobs": -1,
    "random_state": 42,
}
clf_xgb = XGBClassifier(**xgb_params).fit(X_train, y_train)
y_pred_xgb = clf_xgb.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))



=== Random Forest ===
Accuracy: 0.9627130681818182
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      8477
           1       0.90      0.96      0.93      2787

    accuracy                           0.96     11264
   macro avg       0.94      0.96      0.95     11264
weighted avg       0.96      0.96      0.96     11264

Confusion Matrix:
 [[8175  302]
 [ 118 2669]]

=== XGBoost ===
Accuracy: 0.9818892045454546
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      8477
           1       0.95      0.98      0.96      2787

    accuracy                           0.98     11264
   macro avg       0.97      0.98      0.98     11264
weighted avg       0.98      0.98      0.98     11264

Confusion Matrix:
 [[8333  144]
 [  60 2727]]


In [10]:
# Hold-out predictions to summarise. "LogReg" is the first, unweighted logistic
# regression (y_pred), not one of the imbalance-corrected variants.
predictions_by_model = {
    "LogReg": y_pred,
    "RandomForest": y_pred_rf,
    "XGBoost": y_pred_xgb,
}

# Map each model name to (accuracy, classification report as a nested dict).
results = {
    model_name: (
        accuracy_score(y_test, model_preds),
        classification_report(y_test, model_preds, output_dict=True),
    )
    for model_name, model_preds in predictions_by_model.items()
}


In [13]:
import joblib

# Output files for the fitted artefacts (relative to the working directory).
model_path = "xgboost_heart_sound_model.pkl"
scaler_path = "scaler_heart_sound.pkl"

# Persist the trained XGBoost classifier.
joblib.dump(clf_xgb, model_path)
print(f"Model saved to {model_path}")

# Persist the fitted StandardScaler as well: the classifier was trained on
# scaled features, so the same transform has to be applied at inference time.
joblib.dump(scaler, scaler_path)
print(f"Scaler saved to {scaler_path}")


Model saved to xgboost_heart_sound_model.pkl
