# Cough Detection (2s Windows, Weak Labels)

This notebook creates a **new 2-second training pipeline** from the existing 9-second recordings.
It does **not** modify your original notebook.

Approach:
- split data at **file level** first (to avoid leakage)
- mine 2s windows from each file
- auto-label confident windows using weak labels + energy/onset ranking
- train and export an INT8 TFLite model for ESP32


In [1]:
import os
import json
import random
from pathlib import Path

import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.utils.class_weight import compute_class_weight
from keras.utils import to_categorical
from tqdm import tqdm

SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

DATASET_DIR = Path("../public_dataset")

# Audio/MFCC config
SR = 16000
CLIP_SEC = 2.0
CLIP_SAMPLES = int(SR * CLIP_SEC)
WINDOW_HOP_SEC = 0.5
WINDOW_HOP_SAMPLES = int(SR * WINDOW_HOP_SEC)

N_MFCC = 40
N_MELS = 128
N_FFT = 1024
HOP_LENGTH = 512

# Weak-label mining config
POS_FILE_MIN = 0.80   # file-level cough_confidence >= this -> positive file
NEG_FILE_MAX = 0.20   # file-level cough_confidence <= this -> negative file

POS_WINDOWS_PER_FILE = 3
NEG_WINDOWS_PER_FILE = 4

print("TensorFlow:", tf.__version__)
print("Dataset dir:", DATASET_DIR.resolve())


TensorFlow: 2.10.0
Dataset dir: E:\minor-project\public_dataset


In [2]:
# Build file-level dataframe
rows = []

for wav_path in sorted(DATASET_DIR.glob("*.wav")):
    json_path = wav_path.with_suffix(".json")
    cough_confidence = 0.0
    timestamp = "unknown"

    if json_path.exists():
        with open(json_path, "r", encoding="utf-8") as f:
            meta = json.load(f)
        cough_confidence = float(meta.get("cough_detected", 0.0))
        timestamp = str(meta.get("datetime", "unknown"))

    rows.append({
        "file_path": str(wav_path),
        "cough_confidence": cough_confidence,
        "timestamp": timestamp,
        "weak_file_label": 1 if cough_confidence >= 0.5 else 0,
    })

df_files = pd.DataFrame(rows)
print("Total files:", len(df_files))
print(df_files["weak_file_label"].value_counts(dropna=False))
df_files.head()


Total files: 22771
weak_file_label
1    15041
0     7730
Name: count, dtype: int64


Unnamed: 0,file_path,cough_confidence,timestamp,weak_file_label
0,..\public_dataset\2d1d8b3f-4bfc-4bbd-b32e-1a03...,0.0,unknown,0
1,..\public_dataset\2d22665d-5ca0-47f1-80ad-ebfa...,0.8796,2020-08-19T18:12:19.693957+00:00,1
2,..\public_dataset\2d25616a-2bee-492e-a3ff-7c20...,0.0099,2020-04-14T18:37:54.848372+00:00,0
3,..\public_dataset\2d26833b-337e-4d31-b566-f0d2...,0.9817,2020-05-18T19:56:19.359410+00:00,1
4,..\public_dataset\2d28744c-a394-438b-8999-0542...,0.8822,2020-04-14T15:04:24.762414+00:00,1


In [3]:
# File-level split first (critical to avoid train/test leakage)
train_files, test_files = train_test_split(
    df_files,
    test_size=0.20,
    random_state=SEED,
    stratify=df_files["weak_file_label"],
)

train_files, val_files = train_test_split(
    train_files,
    test_size=0.20,
    random_state=SEED,
    stratify=train_files["weak_file_label"],
)

print("Train files:", len(train_files))
print("Val files:  ", len(val_files))
print("Test files: ", len(test_files))


Train files: 14572
Val files:   3644
Test files:  4555


In [4]:
def _normalize_0_1(x: np.ndarray) -> np.ndarray:
    x = np.asarray(x, dtype=np.float32)
    if len(x) == 0:
        return x
    mn, mx = float(np.min(x)), float(np.max(x))
    if mx - mn < 1e-12:
        return np.zeros_like(x)
    return (x - mn) / (mx - mn)


def _window_starts(num_samples: int, clip_samples: int, hop_samples: int):
    if num_samples <= clip_samples:
        return [0]
    n = 1 + int((num_samples - clip_samples) // hop_samples)
    return [i * hop_samples for i in range(n)]


def mine_windows_from_file(
    wav_path: str,
    file_score: float,
    split_name: str,
    sr: int = SR,
    clip_samples: int = CLIP_SAMPLES,
    hop_samples: int = WINDOW_HOP_SAMPLES,
    pos_file_min: float = POS_FILE_MIN,
    neg_file_max: float = NEG_FILE_MAX,
    pos_k: int = POS_WINDOWS_PER_FILE,
    neg_k: int = NEG_WINDOWS_PER_FILE,
):
    y, _ = librosa.load(wav_path, sr=sr, mono=True)
    starts = _window_starts(len(y), clip_samples, hop_samples)

    segments = []
    rms_scores = []
    onset_scores = []

    for s in starts:
        seg = y[s:s + clip_samples]
        if len(seg) < clip_samples:
            seg = np.pad(seg, (0, clip_samples - len(seg)), mode="constant")

        rms = float(np.sqrt(np.mean(seg * seg) + 1e-12))
        onset_env = librosa.onset.onset_strength(y=seg, sr=sr)
        onset = float(np.mean(onset_env)) if len(onset_env) else 0.0

        segments.append((s, seg))
        rms_scores.append(rms)
        onset_scores.append(onset)

    rms_n = _normalize_0_1(np.array(rms_scores, dtype=np.float32))
    onset_n = _normalize_0_1(np.array(onset_scores, dtype=np.float32))
    event_score = 0.7 * rms_n + 0.3 * onset_n

    rows = []

    if file_score >= pos_file_min:
        # Positive file: keep only top-k most event-like windows as cough
        k = min(pos_k, len(segments))
        top_idx = np.argsort(-event_score)[:k]
        for idx in top_idx:
            s, _ = segments[int(idx)]
            rows.append({
                "file_path": wav_path,
                "start_sample": int(s),
                "start_sec": float(s / sr),
                "label": 1,
                "split": split_name,
                "file_score": float(file_score),
                "event_score": float(event_score[int(idx)]),
            })

    elif file_score <= neg_file_max:
        # Negative file: sample a few windows as non-cough
        k = min(neg_k, len(segments))
        rng = np.random.default_rng(SEED + hash((wav_path, split_name)) % 10_000)
        pick = rng.choice(len(segments), size=k, replace=False)
        for idx in pick:
            s, _ = segments[int(idx)]
            rows.append({
                "file_path": wav_path,
                "start_sample": int(s),
                "start_sec": float(s / sr),
                "label": 0,
                "split": split_name,
                "file_score": float(file_score),
                "event_score": float(event_score[int(idx)]),
            })

    # Mid-confidence files are ignored to keep labels cleaner
    return rows


In [5]:
def build_window_manifest(df_split: pd.DataFrame, split_name: str) -> pd.DataFrame:
    all_rows = []
    for r in tqdm(df_split.itertuples(index=False), total=len(df_split), desc=f"Mining {split_name} windows"):
        file_rows = mine_windows_from_file(
            wav_path=r.file_path,
            file_score=float(r.cough_confidence),
            split_name=split_name,
        )
        all_rows.extend(file_rows)

    out = pd.DataFrame(all_rows)
    if len(out) == 0:
        raise RuntimeError(f"No windows mined for split={split_name}. Relax thresholds.")
    return out


train_manifest = build_window_manifest(train_files, "train")
val_manifest = build_window_manifest(val_files, "val")
test_manifest = build_window_manifest(test_files, "test")

print("Train windows:", len(train_manifest))
print(train_manifest["label"].value_counts())
print("Val windows:", len(val_manifest))
print(val_manifest["label"].value_counts())
print("Test windows:", len(test_manifest))
print(test_manifest["label"].value_counts())


Mining train windows: 100%|██████████| 14572/14572 [18:43<00:00, 12.97it/s]
Mining val windows: 100%|██████████| 3644/3644 [12:33<00:00,  4.83it/s]  
Mining test windows: 100%|██████████| 4555/4555 [11:26<00:00,  6.64it/s]

Train windows: 36114
label
1    23375
0    12739
Name: count, dtype: int64
Val windows: 8937
label
1    5829
0    3108
Name: count, dtype: int64
Test windows: 11161
label
1    7215
0    3946
Name: count, dtype: int64





In [6]:
def audio_segment_to_mfcc(
    segment: np.ndarray,
    sr: int = SR,
    n_mfcc: int = N_MFCC,
    n_mels: int = N_MELS,
    n_fft: int = N_FFT,
    hop_length: int = HOP_LENGTH,
    normalise: bool = True,
):
    mfcc = librosa.feature.mfcc(
        y=segment,
        sr=sr,
        n_mfcc=n_mfcc,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length,
        center=True,
        htk=False,
    )

    if normalise:
        eps = 1e-9
        mean = np.mean(mfcc, axis=1, keepdims=True)
        std = np.std(mfcc, axis=1, keepdims=True)
        mfcc = (mfcc - mean) / (std + eps)

    mfcc = mfcc.T.astype(np.float32)  # (frames, n_mfcc)
    return mfcc


def expected_frames(clip_samples: int = CLIP_SAMPLES, n_fft: int = N_FFT, hop_length: int = HOP_LENGTH):
    if clip_samples <= n_fft:
        return 1
    return 1 + int(np.floor((clip_samples - n_fft) / float(hop_length)))


EXPECTED_FRAMES = expected_frames()
print("Expected frames per 2s clip:", EXPECTED_FRAMES)


def build_feature_tensor(manifest: pd.DataFrame):
    X = np.zeros((len(manifest), EXPECTED_FRAMES, N_MFCC), dtype=np.float32)
    y = manifest["label"].astype(np.int32).values

    # tiny cache to avoid re-loading the same wav for multiple windows
    cache = {}

    for i, row in enumerate(tqdm(manifest.itertuples(index=False), total=len(manifest), desc="Extract MFCC")):
        wav_path = row.file_path
        if wav_path not in cache:
            audio, _ = librosa.load(wav_path, sr=SR, mono=True)
            cache[wav_path] = audio

        audio = cache[wav_path]
        s = int(row.start_sample)
        seg = audio[s:s + CLIP_SAMPLES]
        if len(seg) < CLIP_SAMPLES:
            seg = np.pad(seg, (0, CLIP_SAMPLES - len(seg)), mode="constant")

        mfcc = audio_segment_to_mfcc(seg)

        # Safety pad/trim
        if mfcc.shape[0] < EXPECTED_FRAMES:
            pad = np.zeros((EXPECTED_FRAMES - mfcc.shape[0], N_MFCC), dtype=np.float32)
            mfcc = np.concatenate([mfcc, pad], axis=0)
        elif mfcc.shape[0] > EXPECTED_FRAMES:
            mfcc = mfcc[:EXPECTED_FRAMES]

        X[i] = mfcc

    return X, y


X_train, y_train = build_feature_tensor(train_manifest)
X_val, y_val = build_feature_tensor(val_manifest)
X_test, y_test = build_feature_tensor(test_manifest)

print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val:  ", X_val.shape, "y_val:  ", y_val.shape)
print("X_test: ", X_test.shape, "y_test: ", y_test.shape)


Expected frames per 2s clip: 61


Extract MFCC: 100%|██████████| 36114/36114 [04:47<00:00, 125.61it/s]
Extract MFCC: 100%|██████████| 8937/8937 [01:20<00:00, 111.19it/s]
Extract MFCC: 100%|██████████| 11161/11161 [01:35<00:00, 117.41it/s]


X_train: (36114, 61, 40) y_train: (36114,)
X_val:   (8937, 61, 40) y_val:   (8937,)
X_test:  (11161, 61, 40) y_test:  (11161,)


In [7]:
# Optional MFCC-level augmentation (train only)
def time_shift(mfcc, max_shift=4):
    shift = tf.random.uniform([], -max_shift, max_shift + 1, dtype=tf.int32)
    return tf.roll(mfcc, shift=shift, axis=0)


def add_noise(mfcc, noise_level=0.01):
    noise = tf.random.normal(tf.shape(mfcc), stddev=noise_level)
    return mfcc + noise


def random_gain(mfcc, min_gain=0.9, max_gain=1.1):
    gain = tf.random.uniform([], min_gain, max_gain)
    return mfcc * gain


def augment_mfcc(mfcc, label):
    if tf.random.uniform([]) < 0.5:
        mfcc = time_shift(mfcc, max_shift=4)
    if tf.random.uniform([]) < 0.5:
        mfcc = random_gain(mfcc, 0.9, 1.1)
    mfcc = add_noise(mfcc, noise_level=0.01)
    return mfcc, label


num_classes = 2
y_train_oh = to_categorical(y_train, num_classes=num_classes)
y_val_oh = to_categorical(y_val, num_classes=num_classes)
y_test_oh = to_categorical(y_test, num_classes=num_classes)

classes = np.unique(y_train)
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight = {int(c): float(w) for c, w in zip(classes, weights)}
print("class_weight:", class_weight)


def build_model(input_shape):
    inp = keras.Input(shape=input_shape)
    x = keras.layers.Conv1D(24, 5, padding="same", activation="relu")(inp)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.MaxPooling1D(2)(x)

    x = keras.layers.Conv1D(48, 3, padding="same", activation="relu")(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.MaxPooling1D(2)(x)

    x = keras.layers.Conv1D(96, 3, padding="same", activation="relu")(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.GlobalAveragePooling1D()(x)

    x = keras.layers.Dense(32, activation="relu")(x)
    x = keras.layers.Dropout(0.2)(x)
    out = keras.layers.Dense(2, activation="softmax")(x)

    model = keras.Model(inp, out, name="cough_cnn_2s")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=3e-4),
        loss=keras.losses.CategoricalCrossentropy(label_smoothing=0.05),
        metrics=["accuracy"],
    )
    return model


model = build_model(input_shape=X_train.shape[1:])
model.summary()

train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train_oh))
train_ds = train_ds.shuffle(4096, seed=SEED)
train_ds = train_ds.map(augment_mfcc, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.batch(32).prefetch(tf.data.AUTOTUNE)

val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val_oh)).batch(32).prefetch(tf.data.AUTOTUNE)

callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=8, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=4, min_lr=1e-6),
]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=50,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=1,
)


class_weight: {0: 1.417458199230709, 1: 0.7724919786096257}
Model: "cough_cnn_2s"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 61, 40)]          0         
                                                                 
 conv1d (Conv1D)             (None, 61, 24)            4824      
                                                                 
 batch_normalization (BatchN  (None, 61, 24)           96        
 ormalization)                                                   
                                                                 
 max_pooling1d (MaxPooling1D  (None, 30, 24)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 30, 48)            3504      
                                                            

In [8]:
# Threshold tuning on validation set
val_probs = model.predict(X_val, batch_size=64, verbose=0)[:, 1]

cand = np.linspace(0.05, 0.95, 19)
best_thr = 0.5
best_f1 = -1.0

for thr in cand:
    pred = (val_probs >= thr).astype(np.int32)
    f1 = f1_score(y_val, pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thr = float(thr)

print(f"Best validation threshold: {best_thr:.2f}, F1: {best_f1:.4f}")

test_probs = model.predict(X_test, batch_size=64, verbose=0)[:, 1]
y_pred = (test_probs >= best_thr).astype(np.int32)

print("\nClassification report (test):")
print(classification_report(y_test, y_pred, digits=4))
print("Confusion matrix (test):")
print(confusion_matrix(y_test, y_pred))


Best validation threshold: 0.35, F1: 0.9505

Classification report (test):
              precision    recall  f1-score   support

           0     0.9094    0.8806    0.8948      3946
           1     0.9358    0.9520    0.9439      7215

    accuracy                         0.9268     11161
   macro avg     0.9226    0.9163    0.9193     11161
weighted avg     0.9265    0.9268    0.9265     11161

Confusion matrix (test):
[[3475  471]
 [ 346 6869]]


In [9]:
# Save model + export INT8 TFLite
model_h5_path = Path("cough_cnn_2s_weak.h5")
model_tflite_path = Path("cough_cnn_2s_weak_int8.tflite")

model.save(model_h5_path)
print("Saved:", model_h5_path)


def representative_data_gen():
    n = min(200, len(X_train))
    idx = np.linspace(0, len(X_train) - 1, num=n, dtype=int)
    for i in idx:
        yield [X_train[i:i+1].astype(np.float32)]


converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_data_gen
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

tflite_model = converter.convert()
model_tflite_path.write_bytes(tflite_model)

print("Saved:", model_tflite_path, "size KB:", model_tflite_path.stat().st_size / 1024.0)


Saved: cough_cnn_2s_weak.h5




INFO:tensorflow:Assets written to: C:\Users\Aman\AppData\Local\Temp\tmp8g1l27b7\assets


INFO:tensorflow:Assets written to: C:\Users\Aman\AppData\Local\Temp\tmp8g1l27b7\assets


Saved: cough_cnn_2s_weak_int8.tflite size KB: 41.5390625


In [10]:
# Inspect quantization params and print Arduino constants you must update
interpreter = tf.lite.Interpreter(model_path=str(model_tflite_path))
interpreter.allocate_tensors()

inp = interpreter.get_input_details()[0]
out = interpreter.get_output_details()[0]

print("Input shape:", inp["shape"])
print("Input quantization:", inp["quantization"])
print("Output shape:", out["shape"])
print("Output quantization:", out["quantization"])

print("\n---- Arduino constants to update ----")
print("NUM_FRAMES =", int(inp["shape"][1]))
print("NUM_MFCCS =", int(inp["shape"][2]))
print("MODEL_INPUT_SCALE =", float(inp["quantization"][0]))
print("MODEL_INPUT_ZERO_POINT =", int(inp["quantization"][1]))
print("OUTPUT_SCALE =", float(out["quantization"][0]))
print("OUTPUT_ZERO_POINT =", int(out["quantization"][1]))
print("COUGH_INDEX = 1  # if training labels are 0=non-cough, 1=cough")


Input shape: [ 1 61 40]
Input quantization: (0.058460138738155365, -4)
Output shape: [1 2]
Output quantization: (0.00390625, -128)

---- Arduino constants to update ----
NUM_FRAMES = 61
NUM_MFCCS = 40
MODEL_INPUT_SCALE = 0.058460138738155365
MODEL_INPUT_ZERO_POINT = -4
OUTPUT_SCALE = 0.00390625
OUTPUT_ZERO_POINT = -128
COUGH_INDEX = 1  # if training labels are 0=non-cough, 1=cough


In [11]:
# Save manifests for traceability
train_manifest.to_csv("train_windows_2s.csv", index=False)
val_manifest.to_csv("val_windows_2s.csv", index=False)
test_manifest.to_csv("test_windows_2s.csv", index=False)

print("Saved window manifests:")
print("- train_windows_2s.csv")
print("- val_windows_2s.csv")
print("- test_windows_2s.csv")


Saved window manifests:
- train_windows_2s.csv
- val_windows_2s.csv
- test_windows_2s.csv
