In [1]:
import numpy as np
import pandas as pd
import librosa
import json
import os
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
import keras
from keras.utils import to_categorical
from keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

## Setting the DataFrame and extracting the audio path and JSON metadata

In [2]:
# setting the dataset folder
folder_path = r"public_dataset"

# Creating lists to store the files

files = []
cough_scores = []
timestamps = []

# loop through all files in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".wav"):
        # store the wav file path
        wav_path = os.path.join(folder_path, file_name)
        files.append(wav_path)

        # Get the corresponding JSON file 
        json_name = file_name.replace(".wav", ".json")
        json_path = os.path.join(folder_path, json_name)

        # Read the labels from the JSON
        if os.path.exists(json_path):
            with open(json_path, "r") as f:
                # store the json data in the data dictionary
                data = json.load(f)

                cough_scores.append(float(data["cough_detected"]))
                timestamps.append(data["datetime"])
        else:
            cough_scores.append(0.0)
            timestamps.append("unknown")
            
# Creating the dataframe
df = pd.DataFrame({
    "file_path" : files,
    "cough_confidence" : cough_scores,
    "timestamp" : timestamps
})

df.head()

Unnamed: 0,file_path,cough_confidence,timestamp
0,public_dataset\00014dcc-0f06-4c27-8c7b-737b18a...,0.0155,2020-11-25T18:58:50.488301+00:00
1,public_dataset\00039425-7f3a-42aa-ac13-834aaa2...,0.9609,2020-04-13T21:30:59.801831+00:00
2,public_dataset\0007c6f1-5441-40e6-9aaf-a761d8f...,0.1643,2020-10-18T15:38:38.205870+00:00
3,public_dataset\0009eb28-d8be-4dc1-92bb-907e53b...,0.9301,2020-04-12T04:02:18.159383+00:00
4,public_dataset\0012c608-33d0-4ef7-bde3-75a0b1a...,0.0482,2020-04-15T01:03:59.029326+00:00


In [None]:
def extract_mfcc_2d(wav_path,
                    sr = 16000,
                    n_mfcc = 40,
                    n_fft = 1024, 
                    hop_length = 512, 
                    duration = None, 
                    max_frames = None,
                    normalise = True
                   ):
    """
    We Extract 2D MFCC (time x n_mfcc) from a wav file.

    Parameters:
    - wav_path: path to the .wav file
    - sr: target sample rate
    - n_mfcc: number of mfccs coefficient to be extracted
    - n_fft: FFT window size (samples per each frame)
    - hop_length: hop length(samples) between frames
    """
    if duration is not None:
        target_samples = int(sr * duration)
        y, _ = librosa.load(wav_path, sr = sr, mono = True, duration = duration)
        if len(y) < target_samples:
             y = np.pad(y, (0, target_samples - len(y)), mode='constant')
        else:
            y, _ = librosa.load(wav_path, sr = sr, mono = True)

    # (n_features, n_frames)
    mfcc = librosa.feature.mfcc(y = y, sr = sr, n_mfcc = n_mfcc, n_fft = n_fft,
                                hop_length = hop_length)

    if normalise:
        eps = 1e-9
        mean = np.mean(mfcc, axis = 1, keepdims = True)
        std = np.std(mfcc, axis = 1, keepdims = True)
        mfcc = (mfcc - mean) / (std + eps)

    # Transpose to (time_frames, n_features)                            
    mfcc = mfcc.T  # shape: (n_frames, n_features)
    
    if max_frames is not None:
        T, F = mfcc.shape
        if T < max_frames:
            pad_width = ((0, max_frames - T), (0, 0))
            mfcc = np.pad(mfcc, pad_width=pad_width, mode='constant', constant_values=0.0)
        elif T > max_frames:
            mfcc = mfcc[:max_frames, :] 

    return mfcc

In [4]:
def extract_features(df,
                     duration = 9,
                     sr = 16000,
                     n_mfcc = 40,
                     n_fft = 1024, 
                     hop_length = 512,
                     normalise = True,
                    ):
    target_samples = int( sr * duration)
    expected_frames = 1 + int(np.floor((target_samples - n_fft) / float(hop_length))) if target_samples > n_fft else 1
    
    n_files = len(df)
    X = np.zeros((n_files, expected_frames, n_mfcc), dtype = np.float32)
    y = (df["cough_confidence"] >= 0.5).astype(np.float32).values

    for i, wav_path in enumerate(tqdm(df["file_path"], desc = "Extracting MFCCs")):
        try:
            mfcc = extract_mfcc_2d(
                wav_path,
                sr = sr,
                n_mfcc = n_mfcc,
                n_fft = n_fft,
                hop_length = hop_length,
                duration = duration,
                max_frames=expected_frames,
                normalise = normalise
            )
            X[i] = mfcc
        except Exception as e:
            print(f"Warning: failed to process {wav_path}: {e}")

    return X, y

X, y = extract_features(df, duration = 9.0)

Extracting MFCCs: 100%|██████████| 27550/27550 [12:16<00:00, 37.41it/s]


In [5]:
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

print(f"\nClass distribution:")

print(f"Cough samples (1): {np.sum(y == 1)}")
print(f"Non-cough samples (0): {np.sum(y == 0)}")

X shape: (27550, 280, 40)
y shape: (27550,)

Class distribution:
Cough samples (1): 18146
Non-cough samples (0): 9404


In [6]:

print('Version:', tf.__version__); print('GPU Available:', len(tf.config.list_physical_devices('GPU')) > 0)

Version: 2.10.0
GPU Available: True


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 42,
    stratify = y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"validating set: {X_val.shape}")


Training set: (17632, 280, 40)
Test set: (5510, 280, 40)
validating set: (4408, 280, 40)


In [8]:
y_train_onehot = to_categorical(y_train, num_classes=2)
y_val_onehot   = to_categorical(y_val,   num_classes=2)
y_test_onehot  = to_categorical(y_test,  num_classes=2)

print(f"Training set: {y_train_onehot.shape}")
print(f"Validation set: {y_val_onehot.shape}")
print(f"Test set: {y_test_onehot.shape}")

Training set: (17632, 2)
Validation set: (4408, 2)
Test set: (5510, 2)


In [None]:
def build_tinyml_cnn(input_shape, num_classes=2):
    model = models.Sequential([
        layers.Input(shape=input_shape),

        # Conv Block 1: Tiny but functional
        layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling1D(pool_size=2),

        # Conv Block 2
        layers.Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPool1D(pool_size=2),

        layers.Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPool1D(pool_size=2),

        # Collapse to vector
        layers.GlobalAveragePooling1D(),

        # Tiny Dense classifier
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.2), 
        layers.Dense(16, activation="relu"),
        layers.Dropout(0.1),

        layers.Dense(num_classes, activation='softmax')
    ], name="cough_cnn")
    return model

In [10]:
def compile_model(model, learning_rate = 3e-4):
    model.compile(
        optimizer = keras.optimizers.Adam(learning_rate = learning_rate),
        loss = keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
        metrics=["accuracy",
                 keras.metrics.Precision(name="Precision"),
                 keras.metrics.Recall(name="recall")]
    )
    return model

In [11]:
classes = np.unique(y_train)
weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train
)

class_weight = {int(c): float(w) for c, w in zip(classes, weights)}
print("Class weights:", class_weight)

Class weights: {0: 1.4649385177799934, 1: 0.7590838643017048}


In [12]:
import tensorflow as tf

# Set memory growth for GPU to avoid taking all memory
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Enable memory growth for each GPU
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled.")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(f"Warning: {e}")

GPU memory growth enabled.


In [13]:
print(f"2. Final training data shape for model.fit: {X_train.shape}")

2. Final training data shape for model.fit: (17632, 280, 40)


In [14]:
def train_model(model,
                X_train, y_train_onehot,
                X_val, y_val_onehot,
                class_weight=None,
                batch_size=16,
                epochs=50,
                save_path=None):

    with tf.device('/cpu:0'):
        train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train_onehot))
        train_dataset = train_dataset.shuffle(buffer_size=1024)
        train_dataset = train_dataset.batch(batch_size)
        train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

        val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val_onehot))
        val_dataset = val_dataset.batch(batch_size)
        val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)

    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=8,
            restore_best_weights=True),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor="val_loss",
            factor=0.5,
            patience=4,
            min_lr=1e-6)
    ]

    # ✅ FIX 2: Add this line to monitor GPU usage
    tf.debugging.set_log_device_placement(True)  # Set to True for detailed logs

    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=epochs,
        callbacks=callbacks,
        class_weight=class_weight,
        verbose=1
    )

    if save_path:
        model.save(save_path)
        print(f"Model saved to: {save_path}")

    return history

In [15]:
print(f"2. Final training data shape for model.fit: {X_train.shape}")

2. Final training data shape for model.fit: (17632, 280, 40)


In [16]:
input_shape = X_train.shape[1:]
model = build_tinyml_cnn(input_shape=input_shape, num_classes=2)
compile_model(model, learning_rate=3e-4)
history = train_model(model,
                      X_train, y_train_onehot,
                      X_val, y_val_onehot,
                      class_weight = class_weight,
                      batch_size=32,
                      epochs=50,
                      save_path="cough_cnn.h5")     

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Model saved to: cough_cnn.h5


In [17]:
def evaluate_model(model, X_test, y_test_onehot, y_test_labels, batch_size=32, show_report=True):
    """Forces evaluation onto CPU to avoid GPU memory issues."""
    
    # 1. Force TensorFlow operations to CPU
    with tf.device('/cpu:0'):
        # Create dataset on CPU
        test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test_onehot))
        test_dataset = test_dataset.batch(batch_size)
        
        # Move model weights to CPU temporarily
        with tf.device('/cpu:0'):
            # Evaluate
            eval_metrics = model.evaluate(test_dataset, verbose=0)
            print("Evaluation (loss, accuracy, precision, recall):", eval_metrics)
            
            # Predict
            y_pred_prob = model.predict(test_dataset, verbose=0)
    
    y_pred = np.argmax(y_pred_prob, axis=1)
    
    if show_report:
        print("\nClassification report:")
        print(classification_report(y_test_labels, y_pred, digits=4))
        print("Confusion matrix:")
        print(confusion_matrix(y_test_labels, y_pred))

    return eval_metrics, y_pred

In [18]:
eval_metrics, y_pred = evaluate_model(model, X_test, y_test_onehot, y_test)

Evaluation (loss, accuracy, precision, recall): [0.36378079652786255, 0.8952813148498535, 0.8952813148498535, 0.8952813148498535]

Classification report:
              precision    recall  f1-score   support

         0.0     0.8414    0.8543    0.8478      1881
         1.0     0.9239    0.9165    0.9202      3629

    accuracy                         0.8953      5510
   macro avg     0.8826    0.8854    0.8840      5510
weighted avg     0.8957    0.8953    0.8955      5510

Confusion matrix:
[[1607  274]
 [ 303 3326]]


In [19]:
def get_file_size(file_path):
    """Get file size in bytes"""
    return os.path.getsize(file_path)

def convert_bytes(bytes_size, unit):
    """Convert bytes to specified unit (KB, MB, GB)"""
    units = {"KB": 1024, "MB": 1024**2, "GB": 1024**3}
    return bytes_size / units[unit]

# Now use it
file_size_mb = convert_bytes(get_file_size("cough_cnn.h5"), "MB")
print(f"Model size: {file_size_mb:.2f} MB")

Model size: 0.55 MB


In [20]:
import numpy as np

# Suppose your X_train is already loaded in memory
np.save("X_train.npy", X_train)
print("X_train saved as X_train.npy")


X_train saved as X_train.npy


In [25]:
import tensorflow as tf
import numpy as np
import os

model = tf.keras.models.load_model("cough_cnn.h5")
print("Keras input shape:", model.input_shape) 
print("Keras output shape:", model.output_shape)
print("Keras dtype:", model.dtype)

# --- Representative generator using model.input_shape ---
input_shape = model.input_shape  # tuple (None, dim1, dim2?) etc
# create a safe generator that uses X_train (you already have X_train)
def representative_data_gen():
    for i in range(100):
        sample = X_train[i].astype(np.float32)
        # Ensure sample includes batch dimension
        if sample.ndim == len(input_shape) - 1:
            sample = sample.reshape((1,) + sample.shape)
        elif sample.ndim == len(input_shape):
            sample = sample.reshape((1,) + sample.shape[1:])
        yield [sample]

# Convert to TFLite INT8 (FULL integer quantization, INT8 I/O)
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_data_gen
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

tflite_model = converter.convert()
open("cough_cnn_int8.tflite","wb").write(tflite_model)
print("Wrote cough_cnn_int8.tflite (size KB):", os.path.getsize("cough_cnn_int8.tflite")/1024)


Keras input shape: (None, 280, 40)
Keras output shape: (None, 2)
Keras dtype: float32




INFO:tensorflow:Assets written to: C:\Users\Aman\AppData\Local\Temp\tmpqwzsgytb\assets


INFO:tensorflow:Assets written to: C:\Users\Aman\AppData\Local\Temp\tmpqwzsgytb\assets


Wrote cough_cnn_int8.tflite (size KB): 58.625


In [26]:
interpreter = tf.lite.Interpreter(model_path="cough_cnn_int8.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()[0]
output_details = interpreter.get_output_details()[0]

print("Input shape:", input_details["shape"])
print("Input quantization:", input_details["quantization"])   # (scale, zero_point)
print("Output shape:", output_details["shape"])
print("Output quantization:", output_details["quantization"]) # (scale, zero_point)
print("Output dtype:", output_details["dtype"])


Input shape: [  1 280  40]
Input quantization: (0.11736483126878738, 2)
Output shape: [1 2]
Output quantization: (0.00390625, -128)
Output dtype: <class 'numpy.int8'>
