# ESP32 5s Transfer Learning (Tuned)
This version keeps one final Keras model file (`cough_cnn_5s_transfer_esp32.h5`), then exports int8 TFLite + header.

In [21]:
import os
# GPU-stable setup (must run before TensorFlow starts using GPU).
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"

import random
from pathlib import Path

import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm

import tensorflow as tf
from tensorflow import keras

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Configure memory growth for all visible GPUs.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        try:
            tf.config.experimental.set_memory_growth(gpu, True)
        except Exception as e:
            print('Could not set memory growth:', e)

SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

print('TensorFlow:', tf.__version__)
print('Visible GPUs:', tf.config.list_physical_devices('GPU'))


TensorFlow: 2.10.0


In [22]:
# -----------------------------
# Config
# -----------------------------
BASE_MODEL_PATH = Path('./cough_cnn_5s_base.h5')
ESP32_ROOT = Path('./esp32_dataset')
OUTPUT_MODEL_H5 = 'cough_cnn_5s_transfer_esp32.h5'

# Audio / MFCC (must match Arduino)
SR = 16000
SRC_SECONDS = 2.0
TARGET_SECONDS = 5.0
SRC_SAMPLES = int(SR * SRC_SECONDS)
TARGET_SAMPLES = int(SR * TARGET_SECONDS)

N_MFCC = 40
N_MELS = 128
N_FFT = 1024
HOP_LENGTH = 512
EXPECTED_FRAMES = 1 + int(np.floor((TARGET_SAMPLES - N_FFT) / float(HOP_LENGTH)))

# Split
TEST_SIZE = 0.15
VAL_SIZE_FROM_TRAIN = 0.1765  # around 15% of total

# Synthesis
TRAIN_VERSIONS_PER_SAMPLE = 6
VAL_VERSIONS_PER_SAMPLE = 2
TEST_VERSIONS_PER_SAMPLE = 2

# Training
BATCH_SIZE = 32
HEAD_EPOCHS = 12
FINE_TUNE_EPOCHS = 24
HEAD_LR = 3e-4
FINE_TUNE_LR = 6e-5

if not BASE_MODEL_PATH.exists():
    raise FileNotFoundError(f'Base model not found: {BASE_MODEL_PATH.resolve()}')

cough_dir = ESP32_ROOT / 'cough'
non_cough_dir = ESP32_ROOT / 'non_cough'
if not cough_dir.exists() or not non_cough_dir.exists():
    raise FileNotFoundError('Expected esp32_dataset/cough and esp32_dataset/non_cough')

rows = []
rows += [{'wav_path': str(p.resolve()), 'label': 1} for p in sorted(cough_dir.glob('*.wav'))]
rows += [{'wav_path': str(p.resolve()), 'label': 0} for p in sorted(non_cough_dir.glob('*.wav'))]
df = pd.DataFrame(rows)

if len(df) == 0:
    raise RuntimeError('No wav files found in esp32_dataset.')

train_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=SEED, stratify=df['label'])
train_df, val_df = train_test_split(train_df, test_size=VAL_SIZE_FROM_TRAIN, random_state=SEED, stratify=train_df['label'])

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print('Total files:', len(df))
print('Train counts:', train_df['label'].value_counts().to_dict())
print('Val counts:  ', val_df['label'].value_counts().to_dict())
print('Test counts: ', test_df['label'].value_counts().to_dict())


Total files: 624
Train counts: {0: 225, 1: 211}
Val counts:   {0: 49, 1: 45}
Test counts:  {0: 49, 1: 45}


In [23]:
# -----------------------------
# Audio helpers and synthesis
# -----------------------------
def pad_or_trim(y, target_len):
    if len(y) < target_len:
        y = np.pad(y, (0, target_len - len(y)))
    elif len(y) > target_len:
        y = y[:target_len]
    return y.astype(np.float32)

def load_2s(path):
    y, _ = librosa.load(path, sr=SR, mono=True, offset=0.0, duration=SRC_SECONDS)
    return pad_or_trim(y, SRC_SAMPLES)

def load_any(path):
    y, _ = librosa.load(path, sr=SR, mono=True)
    return y.astype(np.float32)

def rms(x):
    return float(np.sqrt(np.mean(np.square(x), dtype=np.float64) + 1e-10))

# Use train non-cough pool as realistic background domain
TRAIN_BG_POOL = train_df.loc[train_df['label'] == 0, 'wav_path'].tolist()
if len(TRAIN_BG_POOL) == 0:
    raise RuntimeError('No non-cough files in train split to build background pool.')

def sample_bg_5s(rng):
    p = TRAIN_BG_POOL[int(rng.integers(0, len(TRAIN_BG_POOL)))]
    y = load_any(p)
    if len(y) >= TARGET_SAMPLES:
        start = int(rng.integers(0, len(y) - TARGET_SAMPLES + 1))
        return y[start:start + TARGET_SAMPLES].astype(np.float32)
    reps = int(np.ceil(TARGET_SAMPLES / len(y)))
    return np.tile(y, reps)[:TARGET_SAMPLES].astype(np.float32)

def synthesize_5s(src_2s, label, rng, training=True):
    y5 = sample_bg_5s(rng)
    y5 *= rng.uniform(0.8, 1.2)

    # Place source clip at random position
    start = int(rng.integers(0, TARGET_SAMPLES - SRC_SAMPLES + 1))
    event = src_2s.copy()

    if int(label) == 1:
        # Positive samples: keep cough clearly audible
        snr_db = float(rng.uniform(12.0, 26.0) if training else rng.uniform(14.0, 22.0))
        event *= rng.uniform(0.9, 1.25)
    else:
        # Hard negatives: loud non-cough events should stay non-cough
        snr_db = float(rng.uniform(2.0, 20.0) if training else rng.uniform(4.0, 16.0))
        event *= rng.uniform(0.8, 1.5)

    local_bg = y5[start:start + SRC_SAMPLES]
    target_rms = max(rms(local_bg), 1e-6) * (10.0 ** (snr_db / 20.0))
    event = event * (target_rms / max(rms(event), 1e-6))
    y5[start:start + SRC_SAMPLES] += event

    if training:
        # Mild random disturbances for robustness
        if rng.random() < 0.35:
            y5 += rng.uniform(0.0005, 0.01) * rng.normal(0.0, 1.0, len(y5)).astype(np.float32)
        if rng.random() < 0.2:
            burst_len = int(rng.integers(int(0.004 * SR), int(0.02 * SR)))
            bstart = int(rng.integers(0, max(1, len(y5) - burst_len)))
            y5[bstart:bstart + burst_len] += rng.uniform(-0.25, 0.25)

    return np.clip(y5, -1.0, 1.0).astype(np.float32)


In [24]:
# -----------------------------
# MFCC + dataset build
# -----------------------------
def extract_mfcc_2d(y):
    mfcc = librosa.feature.mfcc(
        y=y,
        sr=SR,
        n_mfcc=N_MFCC,
        n_mels=N_MELS,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        htk=False,
    ).T.astype(np.float32)

    if mfcc.shape[0] < EXPECTED_FRAMES:
        pad = np.zeros((EXPECTED_FRAMES - mfcc.shape[0], N_MFCC), dtype=np.float32)
        mfcc = np.vstack([mfcc, pad])
    elif mfcc.shape[0] > EXPECTED_FRAMES:
        mfcc = mfcc[:EXPECTED_FRAMES, :]

    # Match Arduino MFCC standardization
    mean = np.mean(mfcc, axis=0, keepdims=True)
    std = np.std(mfcc, axis=0, keepdims=True) + 1e-6
    mfcc = (mfcc - mean) / std

    return mfcc.astype(np.float32)

def spec_augment(mfcc, rng):
    out = mfcc.copy()
    t = out.shape[0]
    f = out.shape[1]

    # one time mask
    t_mask = int(rng.integers(0, max(2, int(0.08 * t))))
    if t_mask > 0:
        t0 = int(rng.integers(0, max(1, t - t_mask)))
        out[t0:t0 + t_mask, :] = 0.0

    # one freq mask
    f_mask = int(rng.integers(0, max(2, int(0.12 * f))))
    if f_mask > 0:
        f0 = int(rng.integers(0, max(1, f - f_mask)))
        out[:, f0:f0 + f_mask] = 0.0

    return out

def build_split(split_df, versions_per_sample, seed, training=False):
    rng = np.random.default_rng(seed)
    total = len(split_df) * int(versions_per_sample)

    X = np.zeros((total, EXPECTED_FRAMES, N_MFCC), dtype=np.float32)
    y = np.zeros((total,), dtype=np.int32)

    idx = 0
    for row in tqdm(split_df.itertuples(index=False), total=len(split_df)):
        src_2s = load_2s(row.wav_path)
        label = int(row.label)

        for _ in range(int(versions_per_sample)):
            y5 = synthesize_5s(src_2s, label, rng, training=training)
            mfcc = extract_mfcc_2d(y5)
            if training and rng.random() < 0.45:
                mfcc = spec_augment(mfcc, rng)
            X[idx] = mfcc
            y[idx] = label
            idx += 1

    return X[:idx], y[:idx]

X_train, y_train = build_split(train_df, TRAIN_VERSIONS_PER_SAMPLE, SEED, training=True)
X_val, y_val = build_split(val_df, VAL_VERSIONS_PER_SAMPLE, SEED + 1, training=False)
X_test, y_test = build_split(test_df, TEST_VERSIONS_PER_SAMPLE, SEED + 2, training=False)

print('X_train:', X_train.shape, 'X_val:', X_val.shape, 'X_test:', X_test.shape)
print('Train classes:', {0: int(np.sum(y_train == 0)), 1: int(np.sum(y_train == 1))})


100%|██████████| 436/436 [00:25<00:00, 17.30it/s]
100%|██████████| 94/94 [00:02<00:00, 40.86it/s]
100%|██████████| 94/94 [00:01<00:00, 48.67it/s]

X_train: (2616, 155, 40) X_val: (188, 155, 40) X_test: (188, 155, 40)
Train classes: {0: 1350, 1: 1266}





In [25]:
# -----------------------------
# Transfer learning
# -----------------------------
base_model = keras.models.load_model(str(BASE_MODEL_PATH), compile=False)

# Reuse pretrained feature extractor up to global average pooling
backbone = keras.Model(
    inputs=base_model.input,
    outputs=base_model.get_layer('global_average_pooling1d').output,
    name='backbone'
)

# Freeze backbone for head warmup
for layer in backbone.layers:
    layer.trainable = False

x = keras.layers.Dropout(0.25)(backbone.output)
x = keras.layers.Dense(48, activation='relu', kernel_regularizer=keras.regularizers.l2(1e-4))(x)
x = keras.layers.Dropout(0.30)(x)
out = keras.layers.Dense(2, activation='softmax')(x)
model = keras.Model(backbone.input, out, name='cough_transfer_tuned')

class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=y_train)
class_weight = {0: float(class_weights[0]), 1: float(class_weights[1])}

y_train_oh = keras.utils.to_categorical(y_train, num_classes=2)
y_val_oh = keras.utils.to_categorical(y_val, num_classes=2)

callbacks = [
    keras.callbacks.ModelCheckpoint(
        OUTPUT_MODEL_H5,
        monitor='val_auc',
        mode='max',
        save_best_only=True,
        verbose=1
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_auc',
        mode='max',
        factor=0.5,
        patience=3,
        min_lr=1e-6,
        verbose=1
    ),
    keras.callbacks.EarlyStopping(
        monitor='val_auc',
        mode='max',
        patience=8,
        restore_best_weights=True,
        verbose=1
    )
]

# Phase 1: head training
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=HEAD_LR),
    loss='categorical_crossentropy',
    metrics=['accuracy', keras.metrics.AUC(name='auc')]
)

history_head = model.fit(
    X_train, y_train_oh,
    validation_data=(X_val, y_val_oh),
    epochs=HEAD_EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=1
)

# Phase 2: unfreeze upper backbone block
for layer in backbone.layers:
    if layer.name in {'conv1d_1', 'batch_normalization_1', 'conv1d_2', 'batch_normalization_2'}:
        layer.trainable = True

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=FINE_TUNE_LR),
    loss='categorical_crossentropy',
    metrics=['accuracy', keras.metrics.AUC(name='auc')]
)

history_ft = model.fit(
    X_train, y_train_oh,
    validation_data=(X_val, y_val_oh),
    epochs=FINE_TUNE_EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=1
)

# Final single model artifact
model = keras.models.load_model(OUTPUT_MODEL_H5, compile=False)
print('Saved best model:', Path(OUTPUT_MODEL_H5).resolve())


Epoch 1/12
Epoch 1: val_auc improved from -inf to 0.51409, saving model to cough_cnn_5s_transfer_esp32.h5
Epoch 2/12
Epoch 2: val_auc improved from 0.51409 to 0.55683, saving model to cough_cnn_5s_transfer_esp32.h5
Epoch 3/12
Epoch 3: val_auc improved from 0.55683 to 0.57983, saving model to cough_cnn_5s_transfer_esp32.h5
Epoch 4/12
Epoch 4: val_auc improved from 0.57983 to 0.59526, saving model to cough_cnn_5s_transfer_esp32.h5
Epoch 5/12
Epoch 5: val_auc improved from 0.59526 to 0.60532, saving model to cough_cnn_5s_transfer_esp32.h5
Epoch 6/12
Epoch 6: val_auc improved from 0.60532 to 0.60882, saving model to cough_cnn_5s_transfer_esp32.h5
Epoch 7/12
Epoch 7: val_auc improved from 0.60882 to 0.61677, saving model to cough_cnn_5s_transfer_esp32.h5
Epoch 8/12
Epoch 8: val_auc improved from 0.61677 to 0.62719, saving model to cough_cnn_5s_transfer_esp32.h5
Epoch 9/12
Epoch 9: val_auc improved from 0.62719 to 0.62958, saving model to cough_cnn_5s_transfer_esp32.h5
Epoch 10/12
Epoch 10: 

In [26]:
# -----------------------------
# Validation + test metrics
# -----------------------------
val_prob = model.predict(X_val, batch_size=BATCH_SIZE, verbose=0)
val_pred = np.argmax(val_prob, axis=1)
val_acc = float(np.mean(val_pred == y_val))
val_auc = float(roc_auc_score(y_val, val_prob[:, 1]))

test_prob = model.predict(X_test, batch_size=BATCH_SIZE, verbose=0)
test_pred = np.argmax(test_prob, axis=1)
test_acc = float(np.mean(test_pred == y_test))
test_auc = float(roc_auc_score(y_test, test_prob[:, 1]))

print(f'Val accuracy:  {val_acc:.4f}')
print(f'Val AUC:       {val_auc:.4f}')
print(f'Test accuracy: {test_acc:.4f}')
print(f'Test AUC:      {test_auc:.4f}')

print('\nTest classification report:')
print(classification_report(y_test, test_pred, digits=4))
print('Test confusion matrix:\n', confusion_matrix(y_test, test_pred))


Val accuracy:  0.7181
Val AUC:       0.7827
Test accuracy: 0.7606
Test AUC:      0.8553

Test classification report:
              precision    recall  f1-score   support

           0     0.7789    0.7551    0.7668        98
           1     0.7419    0.7667    0.7541        90

    accuracy                         0.7606       188
   macro avg     0.7604    0.7609    0.7605       188
weighted avg     0.7612    0.7606    0.7607       188

Test confusion matrix:
 [[74 24]
 [21 69]]


In [27]:
# -----------------------------
# Export int8 TFLite + header
# -----------------------------
TFLITE_PATH = Path('cough_cnn_5s_transfer_esp32_int8.tflite')

def representative_data_gen():
    n = min(300, len(X_train))
    idx = np.random.choice(len(X_train), size=n, replace=False)
    for i in idx:
        yield [X_train[i:i+1].astype(np.float32)]

converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_data_gen
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

quant_tflite = converter.convert()
TFLITE_PATH.write_bytes(quant_tflite)
print('Saved TFLite:', TFLITE_PATH.resolve())

interpreter = tf.lite.Interpreter(model_path=str(TFLITE_PATH))
interpreter.allocate_tensors()
in_d = interpreter.get_input_details()[0]
out_d = interpreter.get_output_details()[0]
print('Input quantization:', in_d['quantization'])
print('Output quantization:', out_d['quantization'])

def write_tflite_header(tflite_bytes, header_path, array_name):
    guard = ''.join(ch if ch.isalnum() else '_' for ch in header_path.stem.upper()) + '_H'
    per_line = 12

    lines = []
    lines.append(f'#ifndef {guard}')
    lines.append(f'#define {guard}')
    lines.append('')
    lines.append(f'const unsigned char {array_name}[] = {{')

    for i in range(0, len(tflite_bytes), per_line):
        chunk = tflite_bytes[i:i + per_line]
        lines.append('  ' + ', '.join(f'0x{b:02x}' for b in chunk) + ',')

    lines.append('};')
    lines.append(f'const unsigned int {array_name}_len = {len(tflite_bytes)};')
    lines.append('')
    lines.append(f'#endif  // {guard}')
    lines.append('')

    header_path.write_text('\n'.join(lines), encoding='utf-8')

ARRAY_NAME = 'cough_cnn_5s_transfer_esp32_int8_tflite'
HEADER_LOCAL = Path('model_data_5s_transfer.h')
write_tflite_header(quant_tflite, HEADER_LOCAL, ARRAY_NAME)
print('Saved header:', HEADER_LOCAL.resolve())

ARDUINO_HEADER = Path('../Arduino copy/main/model_data_5s_transfer.h')
if ARDUINO_HEADER.parent.exists():
    write_tflite_header(quant_tflite, ARDUINO_HEADER, ARRAY_NAME)
    print('Updated Arduino header:', ARDUINO_HEADER.resolve())

print('\nIn Arduino main.ino use:')
print('  #include "model_data_5s_transfer.h"')
print('  tf.begin(cough_cnn_5s_transfer_esp32_int8_tflite)')
print('  symbol length: cough_cnn_5s_transfer_esp32_int8_tflite_len')




INFO:tensorflow:Assets written to: C:\Users\Aman\AppData\Local\Temp\tmpa6llim0n\assets


INFO:tensorflow:Assets written to: C:\Users\Aman\AppData\Local\Temp\tmpa6llim0n\assets


Saved TFLite: E:\minor-project\model\cough_cnn_5s_transfer_esp32_int8.tflite
Input quantization: (0.06139513850212097, 13)
Output quantization: (0.00390625, -128)
Saved header: E:\minor-project\model\model_data_5s_transfer.h
Updated Arduino header: E:\minor-project\Arduino copy\main\model_data_5s_transfer.h

In Arduino main.ino use:
  #include "model_data_5s_transfer.h"
  tf.begin(cough_cnn_5s_transfer_esp32_int8_tflite)
  symbol length: cough_cnn_5s_transfer_esp32_int8_tflite_len
