# Deep-Learning Analysis of Smartphone and Electronic-Stethoscope Phonocardiograms for Detection of Reduced Left Ventricular Ejection Fraction

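This notebook rebuilds all derived artifacts and runs the experiments on Google Colab.
The default settings prioritize training speed by keeping code, data, cache, and results on the runtime's local `/content` storage; the final cell copies derived artifacts and results back to Drive.
Set `USE_LOCAL_DATA = False` to read the recordings directly from Drive if your dataset is too large for `/content`.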


In [None]:
# Mount Google Drive so the repository and dataset under MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Repository checkout on Drive; it is the source for the local copies made in
# the setup cell and the destination for the final sync-back cell.
DRIVE_REPO_DIR = '/content/drive/MyDrive/phonocardiogram-lvef-deeplearning'
# Local copy of the repository; the setup cell also makes it the working directory.
WORK_DIR = '/content/pcg_repo'
# Local staging area for heart_sounds/ and lvef.csv (used when USE_LOCAL_DATA is True).
DATA_DIR = '/content/pcg_data'
# Local root for experiment outputs (results/ and checkpoints/ are created beneath it).
RUNS_DIR = '/content/pcg_runs'
# True: copy the dataset to DATA_DIR first; False: read it directly from Drive.
USE_LOCAL_DATA = True  # fastest default
# True: the final cell copies RUNS_DIR to DRIVE_RUNS_DIR.
SYNC_BACK_TO_DRIVE = True
# True: the final cell copies metadata.csv, tf_stats.json and splits/ to DRIVE_REPO_DIR.
SYNC_DERIVED = True
DRIVE_RUNS_DIR = f"{DRIVE_REPO_DIR}/runs"


In [None]:
import os
import shutil
import subprocess

def rsync(src, dst, excludes=None):
    """Mirror ``src`` onto ``dst`` with ``rsync -a --delete``.

    The assembled command line is printed before it is executed.

    Args:
        src: Source path, passed to rsync unchanged.
        dst: Destination path, passed to rsync unchanged.
        excludes: Optional iterable of patterns; each one is forwarded as
            its own ``--exclude`` option. ``None`` or an empty value adds
            no exclusions.

    Raises:
        subprocess.CalledProcessError: If rsync exits with a non-zero status.
    """
    exclude_args = []
    for pattern in excludes or ():
        exclude_args.extend(('--exclude', pattern))

    argv = ['rsync', '-a', '--delete', *exclude_args, src, dst]
    print(' '.join(argv))
    subprocess.run(argv, check=True)

# Validate the Drive-side inputs BEFORE any copying. Checking afterwards is too
# late: with USE_LOCAL_DATA the copy steps themselves fail first (rsync exits
# non-zero, copy2 raises), so the clearer messages below would never be shown.
drive_lvef_csv = f"{DRIVE_REPO_DIR}/lvef.csv"
drive_heart_dir = f"{DRIVE_REPO_DIR}/heart_sounds"
if not os.path.isdir(DRIVE_REPO_DIR):
    raise FileNotFoundError(f'Missing repo dir: {DRIVE_REPO_DIR}')
if not os.path.exists(drive_lvef_csv):
    raise FileNotFoundError(f'Missing LVEF CSV: {drive_lvef_csv}')
if not os.path.isdir(drive_heart_dir):
    raise FileNotFoundError(f'Missing heart_sounds dir: {drive_heart_dir}')

# Mirror the repository into the local working directory, leaving out VCS
# data, bytecode caches and previously generated artifacts.
os.makedirs(WORK_DIR, exist_ok=True)
rsync(
    DRIVE_REPO_DIR + '/',
    WORK_DIR + '/',
    excludes=['.git', 'cache', 'splits', 'results', 'checkpoints', 'checkpoints_cpu', '__pycache__'],
)

if USE_LOCAL_DATA:
    # Stage the dataset on local disk and point the pipeline at the copies.
    os.makedirs(DATA_DIR, exist_ok=True)
    rsync(drive_heart_dir + '/', DATA_DIR + '/heart_sounds/')
    shutil.copy2(drive_lvef_csv, DATA_DIR + '/lvef.csv')
    LVEF_CSV = f"{DATA_DIR}/lvef.csv"
    HEART_DIR = f"{DATA_DIR}/heart_sounds"
else:
    # Read the dataset in place from Drive (already validated above).
    LVEF_CSV = drive_lvef_csv
    HEART_DIR = drive_heart_dir

# Later cells use relative paths (requirements.txt, splits/, metadata.csv),
# so they must run from inside the local repository copy.
os.makedirs(RUNS_DIR, exist_ok=True)
os.chdir(WORK_DIR)
print('WORK_DIR:', WORK_DIR)
print('LVEF_CSV:', LVEF_CSV)
print('HEART_DIR:', HEART_DIR)
print('RUNS_DIR:', RUNS_DIR)


In [None]:
import sys
from pathlib import Path
import torch

# Report the PyTorch build already present in the runtime.
print('torch:', torch.__version__)
print('cuda:', torch.version.cuda)
print('cuda available:', torch.cuda.is_available())

# Collect the requirements to install. Skipped lines:
#   - blank lines and '#' comment lines (pip would reject a comment passed as
#     a command-line requirement);
#   - anything starting with 'torch' (this prefix also covers 'torchaudio'),
#     so pip does not replace the torch build reported above.
req_lines = (line.strip() for line in Path('requirements.txt').read_text().splitlines())
reqs = [r for r in req_lines if r and not r.startswith(('#', 'torch'))]

if reqs:
    cmd = [sys.executable, '-m', 'pip', 'install'] + reqs
    print('Installing:', ' '.join(reqs))
    subprocess.run(cmd, check=True)
else:
    # `pip install` with no requirement exits non-zero, which check=True
    # would turn into an exception even though there is nothing to do.
    print('No additional requirements to install.')


If your filename pattern differs, edit `FILENAME_RE` / `DEVICE_MAP` in `src/data/build_metadata.py` before running the next cell.


In [None]:
# Run the metadata builder on the LVEF CSV and heart-sound directory selected
# in the setup cell; metadata.csv (relative to the working directory) is passed
# as its output path and is consumed by the split-generation cell.
!python -m src.data.build_metadata \
  --lvef_csv {LVEF_CSV} \
  --heart_dir {HEART_DIR} \
  --output_csv metadata.csv


In [None]:
# Generate the splits under splits/ from metadata.csv. Later cells read
# splits/metadata_train.csv, splits/metadata_val.csv and splits/metadata_test.csv,
# presumably produced by this step (confirm in src/data/make_patient_splits.py).
!python -m src.data.make_patient_splits \
  --metadata_csv metadata.csv \
  --output_dir splits

# Generate the cross-validation splits under splits/cv with 5 splits and a
# single repeat. The CV cell reads splits/cv/index.csv, presumably written here
# (confirm in src/data/make_patient_cv_splits.py).
!python -m src.data.make_patient_cv_splits \
  --metadata_csv metadata.csv \
  --output_dir splits/cv \
  --n_splits 5 \
  --n_repeats 1


In [None]:
# This toggle is also read by the caching cell to choose its --normalization value.
PER_DEVICE_STATS = False  # set True for per-device normalization
# Empty string when disabled, so nothing is appended to the command below.
per_device_flag = '--per_device' if PER_DEVICE_STATS else ''

# Compute statistics for the mfcc and gammatone representations; only the
# training split is passed in. NOTE(review): the final sync cell copies
# tf_stats.json, presumably written by this step; confirm in
# src/data/compute_stats.py.
!python -m src.data.compute_stats \
  --train_csv splits/metadata_train.csv \
  --representations mfcc gammatone \
  {per_device_flag}


In [None]:
# Local directory passed to the caching script as --cache_root.
CACHE_ROOT = '/content/pcg_cache'
# Keep the normalization mode consistent with the PER_DEVICE_STATS toggle
# set in the statistics cell.
NORMALIZATION = 'per_device' if PER_DEVICE_STATS else 'global'

# Run the caching script once per representation over the train/val/test
# split files.
for rep in ['mfcc', 'gammatone']:
    print(f'Caching {rep}...')
    !python -m src.data.precompute_cache \
      --representation {rep} \
      --normalization {NORMALIZATION} \
      --cache_root {CACHE_ROOT} \
      --splits splits/metadata_train.csv splits/metadata_val.csv splits/metadata_test.csv


In [None]:
# Optional QA report
# !mkdir -p reports
# !python -m src.data.qa_report \
#   --metadata_csv metadata.csv \
#   --output_json reports/qa_report.json \
#   --output_csv reports/qa_records.csv \
#   --fixed_duration 4.0


In [None]:
# Default: 5-fold CV for model selection
import subprocess
import sys

# --- Experiment configuration -------------------------------------------
REPRESENTATION = 'mfcc'
BACKBONE = 'mobilenetv2'
# Device filters: a falsy value (None / empty list) omits the matching option.
TRAIN_DEVICES = None  # e.g. ['iphone'] for within-device CV
VAL_DEVICES = None
TEST_DEVICES = None

# Each switch below adds its same-named command-line flag when True.
AUTO_POS_WEIGHT = True
TUNE_THRESHOLD = True
AMP = True
USE_CACHE = False  # CV splits are on-the-fly unless you precompute cache per fold

# --- Command line --------------------------------------------------------
# Options before the bare '--' are given to the CV runner itself; those after
# it are presumably forwarded to each fold's training run (confirm in
# src/experiments/run_cv.py).
cmd = [
    sys.executable, '-m', 'src.experiments.run_cv',
    '--cv_index', 'splits/cv/index.csv',
    '--results_dir', f'{RUNS_DIR}/results',
    '--output_dir', f'{RUNS_DIR}/checkpoints',
    '--',
    '--representation', REPRESENTATION,
    '--backbone', BACKBONE,
]

# Append the enabled boolean flags, in a fixed order.
cmd.extend(
    flag
    for enabled, flag in (
        (AUTO_POS_WEIGHT, '--auto_pos_weight'),
        (TUNE_THRESHOLD, '--tune_threshold'),
        (AMP, '--amp'),
        (USE_CACHE, '--use_cache'),
    )
    if enabled
)

# Append each device filter followed by its device names, when configured.
if TRAIN_DEVICES:
    cmd.extend(['--train_device_filter', *TRAIN_DEVICES])
if VAL_DEVICES:
    cmd.extend(['--val_device_filter', *VAL_DEVICES])
if TEST_DEVICES:
    cmd.extend(['--test_device_filter', *TEST_DEVICES])

# Echo the command, then run it; a non-zero exit status raises
# subprocess.CalledProcessError.
print('Running:', ' '.join(cmd))
subprocess.run(cmd, check=True)


In [None]:
# Optional: train a final within-device model (single run)
# Nothing below executes unless RUN_SINGLE is switched to True.
RUN_SINGLE = False

if RUN_SINGLE:
    import subprocess
    import sys

    REPRESENTATION = 'mfcc'
    BACKBONE = 'mobilenetv2'
    # A falsy filter (None / empty list) omits the matching option.
    TRAIN_DEVICES = None  # e.g. ['android_phone']
    VAL_DEVICES = None
    TEST_DEVICES = None

    # One training run on the fixed train/val/test split files, with outputs
    # placed under RUNS_DIR.
    cmd = [
        sys.executable, '-m', 'src.training.train',
        '--train_csv', 'splits/metadata_train.csv',
        '--val_csv', 'splits/metadata_val.csv',
        '--test_csv', 'splits/metadata_test.csv',
        '--representation', REPRESENTATION,
        '--backbone', BACKBONE,
        '--results_dir', f'{RUNS_DIR}/results',
        '--output_dir', f'{RUNS_DIR}/checkpoints',
        '--use_cache',
        '--auto_pos_weight',
        '--tune_threshold',
        '--amp',
        '--per_device_eval',
        '--save_predictions',
        '--save_history',
    ]

    # Append each device filter followed by its device names, when configured.
    if TRAIN_DEVICES:
        cmd.extend(['--train_device_filter', *TRAIN_DEVICES])
    if VAL_DEVICES:
        cmd.extend(['--val_device_filter', *VAL_DEVICES])
    if TEST_DEVICES:
        cmd.extend(['--test_device_filter', *TEST_DEVICES])

    # Echo the command, then run it; a non-zero exit status raises
    # subprocess.CalledProcessError.
    print('Running:', ' '.join(cmd))
    subprocess.run(cmd, check=True)


In [None]:
# Optional: cross-device evaluation from a saved checkpoint (no retraining)
# Nothing below executes unless RUN_EVAL_ONLY is switched to True.
RUN_EVAL_ONLY = False

if RUN_EVAL_ONLY:
    import os
    import subprocess
    import sys

    # '<run_name>' is a placeholder: replace it with the directory of the run
    # whose checkpoint should be evaluated.
    CHECKPOINT_PATH = f'{RUNS_DIR}/checkpoints/<run_name>/best.pth'
    TEST_DEVICES = ['iphone', 'digital_stethoscope']

    # Fail fast with an actionable message instead of launching the training
    # module with a path that does not exist (e.g. the unedited placeholder).
    if not os.path.isfile(CHECKPOINT_PATH):
        raise FileNotFoundError(
            f'Checkpoint not found: {CHECKPOINT_PATH} '
            "(replace '<run_name>' in CHECKPOINT_PATH with an existing run directory)"
        )

    # Same training entry point as a normal run, switched to evaluation via
    # --eval_only and given the checkpoint to load.
    cmd = [
        sys.executable,
        '-m',
        'src.training.train',
        '--eval_only',
        '--checkpoint_path',
        CHECKPOINT_PATH,
        '--train_csv',
        'splits/metadata_train.csv',
        '--val_csv',
        'splits/metadata_val.csv',
        '--test_csv',
        'splits/metadata_test.csv',
        '--results_dir',
        f'{RUNS_DIR}/results',
        '--per_device_eval',
        '--save_predictions',
    ]

    # Append the test-device filter followed by the device names, when configured.
    if TEST_DEVICES:
        cmd += ['--test_device_filter', *TEST_DEVICES]

    # Echo the command, then run it; a non-zero exit status raises
    # subprocess.CalledProcessError.
    print('Running:', ' '.join(cmd))
    subprocess.run(cmd, check=True)


In [None]:
# Persist experiment outputs first, so a problem with the derived artifacts
# below (e.g. a file that was never generated) cannot prevent results and
# checkpoints from reaching Drive.
if SYNC_BACK_TO_DRIVE:
    os.makedirs(DRIVE_RUNS_DIR, exist_ok=True)
    # Deliberately not the rsync() helper: it passes --delete, and this
    # notebook never populates RUNS_DIR from Drive, so mirroring would erase
    # runs saved to Drive by earlier sessions. Copy additively instead.
    runs_sync_cmd = ['rsync', '-a', RUNS_DIR + '/', DRIVE_RUNS_DIR + '/']
    print(' '.join(runs_sync_cmd))
    subprocess.run(runs_sync_cmd, check=True)
    print('Synced runs to drive:', DRIVE_RUNS_DIR)

# Copy the regenerated derived artifacts (paths are relative to the working
# directory) back into the Drive repository. splits/ is mirrored, so split
# files on Drive that were not regenerated here are removed.
if SYNC_DERIVED:
    shutil.copy2('metadata.csv', f"{DRIVE_REPO_DIR}/metadata.csv")
    shutil.copy2('tf_stats.json', f"{DRIVE_REPO_DIR}/tf_stats.json")
    rsync('splits/', f"{DRIVE_REPO_DIR}/splits/")
    print('Synced derived artifacts to drive.')
