# CODE-15% Dataset EDA

This notebook performs exploratory data analysis on the CODE-15% ECG dataset converted to WFDB format. 
We will validate the output, extract key metadata, explore label distribution, and visualize sample ECG signals.


## Imports and setup

In [1]:
# Import libraries
import os
import wfdb
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import pprint, re

In [2]:
# Path to the WFDB files converted from CODE-15%
data_dir = "data/code15_output/exams_part17"

# Collect all header files (*.hea) in that folder
hea_files = [f for f in os.listdir(data_dir) if f.endswith(".hea")]

# Quick sanity-check
print(f"Found {len(hea_files):,} .hea files")
print("First 5 headers:", hea_files[:5])

# Pick three random records to confirm the label format
for hea in random.sample(hea_files, 3):
    rec_name = hea[:-4]                               # strip ".hea"
    hdr = wfdb.rdheader(os.path.join(data_dir, rec_name))
    
    print(f"\n--- {rec_name}.hea ---")
    # Show the first few comment lines in the header
    for line in hdr.comments[:6]:
        print(line)

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'data/code15_output/exams_part17'

In [None]:
# Collect all record names (without the .hea extension)
record_names = [f[:-4] for f in os.listdir(data_dir) if f.endswith(".hea")]

# Function to extract the Chagas label from comments
def get_chagas_label(comments):
    """
    Return numeric label and the raw header line for inspection.
    """
    for line in comments:
        if 'Chagas label' in line:
            llow = line.lower()
            if 'true' in llow:
                return 1, line.strip()
            if 'false' in llow:
                return 0, line.strip()
    return None, None

# Sample three random records
for rec in random.sample(record_names, 3):
    hdr = wfdb.rdheader(os.path.join(data_dir, rec))
    label_num, raw_line = get_chagas_label(hdr.comments)
    print(f"{rec} → {raw_line}  → numeric label = {label_num}")

1278023 → Chagas label: False  → numeric label = 0
204852 → Chagas label: False  → numeric label = 0
684228 → Chagas label: False  → numeric label = 0


In [10]:
# Scan up to 2 000 records to find a 'True' label
found = None
for rec in record_names[:2000]:
    hdr = wfdb.rdheader(os.path.join(data_dir, rec))
    label_num, raw_line = get_chagas_label(hdr.comments)
    if label_num == 1:
        found = (rec, raw_line, label_num)
        break

if found:
    rec, raw_line, label_num = found
    print(f"{rec} → {raw_line}  → numeric label = {label_num}")
else:
    print("No positive (True) labels in first 2 000 records—data is heavily imbalanced.")


758600 → Chagas label: True  → numeric label = 1


In [11]:
# Build a DataFrame of record name + numeric Chagas label
sample_size = 5000            # keep small for a quick first run
records = []

for rec in record_names[:sample_size]:
    hdr         = wfdb.rdheader(os.path.join(data_dir, rec))
    label_num, _ = get_chagas_label(hdr.comments)
    records.append({'record': rec, 'label': label_num})

df = pd.DataFrame(records)
print(df.head())
print("\nLabel counts:\n", df['label'].value_counts(dropna=False))

    record  label
0   590673      0
1   214626      0
2  2936711      0
3  1175521      0
4  1073151      0

Label counts:
 label
0    4894
1     106
Name: count, dtype: int64


### Data preparation functions

In [None]:
def get_chagas_label(comments):
    """Extract Chagas label.
    Args: comments (list[str]) – header comment lines.
    Returns: (int|None, str|None) – numeric label 0/1 and raw line."""
    for line in comments:
        if 'Chagas label' in line:
            low = line.lower()
            if 'true' in low:  return 1, line.strip()
            if 'false' in low: return 0, line.strip()
    return None, None

def parse_age_sex(comments):
    """Extract age and sex.
    Args: comments (list[str]).
    Returns: (int|None, str|None) – age and sex."""
    age = sex = None
    for line in comments:
        if line.lower().startswith('age:'):
            m = re.search(r'\d+', line)
            age = int(m.group()) if m else None
        if line.lower().startswith('sex:'):
            sex = line.split(':')[-1].strip()
    return age, sex

In [None]:
# Test get_chagas_label and parse_age_sex on 10 random records
subset = random.sample(record_names, 10)
for rec in subset:
    hdr = wfdb.rdheader(os.path.join(data_dir, rec))
    lbl, raw = get_chagas_label(hdr.comments)
    age, sex = parse_age_sex(hdr.comments)
    print(f'{rec}: label={lbl}, age={age}, sex={sex} | {raw}')

3408212: label=0, age=62, sex=Female | Chagas label: False
1128109: label=0, age=29, sex=Female | Chagas label: False
2999302: label=0, age=83, sex=Female | Chagas label: False
2974531: label=0, age=30, sex=Female | Chagas label: False
274006: label=0, age=85, sex=Female | Chagas label: False
1407714: label=0, age=61, sex=Male | Chagas label: False
4196225: label=0, age=31, sex=Male | Chagas label: False
718428: label=0, age=30, sex=Female | Chagas label: False
689266: label=0, age=62, sex=Female | Chagas label: False
430967: label=0, age=71, sex=Female | Chagas label: False


In [18]:
# Build DataFrame of record-level metadata and save to CSV
records = []
for rec in record_names:
    hdr = wfdb.rdheader(os.path.join(data_dir, rec))
    label, _ = get_chagas_label(hdr.comments)
    age, sex  = parse_age_sex(hdr.comments)
    records.append({
        'record': rec,
        'label': label,
        'age': age,
        'sex': sex,
        'fs': hdr.fs,
        'length': hdr.sig_len,
        'path': f'{data_dir}/{rec}',
        'source': 'CODE15'
    })

meta = pd.DataFrame(records)
meta.to_csv('code15_metadata_full.csv', index=False)
print(f'saved {meta.shape[0]} rows to code15_metadata_full.csv')


saved 19901 rows to code15_metadata_full.csv


In [19]:
# Load the saved metadata CSV and show label counts
meta = pd.read_csv('code15_metadata_full.csv')
print(meta['label'].value_counts(dropna=False))

label
0    19499
1      402
Name: count, dtype: int64
