In [1]:
import numpy as np
import pandas as pd
import wfdb
import os
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import seaborn as snss
from pprint import pprint
from tqdm import tqdm
import sys
sys.path.append("../finetune/")
sys.path.append("../utils")

In [None]:
# set your meta path of mimic-ecg
meta_path = 'your_path/mimic-iv-ecg-diagnostic-electrocardiogram-matched-subset-1.0'
report_csv = pd.read_csv(f'{meta_path}/machine_measurements.csv', low_memory=False)
record_csv = pd.read_csv(f'{meta_path}/record_list.csv', low_memory=False)

In [None]:
def process_report(row):
    # Select the relevant columns and filter out NaNs
    report = row[['report_0', 'report_1', 'report_2', 'report_3', 'report_4', 
                  'report_5', 'report_6', 'report_7', 'report_8', 'report_9', 
                  'report_10', 'report_11', 'report_12', 'report_13', 'report_14', 
                  'report_15', 'report_16', 'report_17']].dropna()
    # Concatenate the report
    report = '. '.join(report)
    # Replace and preprocess text
    report = report.replace('EKG', 'ECG').replace('ekg', 'ecg')
    report = report.strip(' ***').strip('*** ').strip('***').strip('=-').strip('=')
    # Convert to lowercase
    report = report.lower()

    # concatenate the report if the report length is not 0
    total_report = ''
    if len(report.split()) != 0:
        total_report = report
        total_report = total_report.replace('\n', ' ')
        total_report = total_report.replace('\r', ' ')
        total_report = total_report.replace('\t', ' ')
        total_report += '.'
    if len(report.split()) == 0:
        total_report = 'empty'
    # Calculate the length of the report in words
    return len(report.split()), total_report

tqdm.pandas()
report_csv['report_length'], report_csv['total_report'] = zip(*report_csv.progress_apply(process_report, axis=1))
# Filter out reports with less than 4 words
report_csv = report_csv[report_csv['report_length'] >= 4]

# you should get 771693 here
print(report_csv.shape)

In [None]:
report_csv.reset_index(drop=True, inplace=True)
record_csv = record_csv[record_csv['study_id'].isin(report_csv['study_id'])]
record_csv.reset_index(drop=True, inplace=True)

In [None]:
# build an empty numpy array to store the data, we use int16 to save the space
temp_npy = np.zeros((len(record_csv), 12, 5000), dtype=np.int16)

for p in tqdm(record_csv['path']):
    # read the data
    ecg_path = os.path.join(meta_path, p)
    record = wfdb.rdsamp(ecg_path)[0]
    record = record.T
    # replace the nan with the neighbor 5 value mean
    # detect nan in each lead
    if np.isnan(record).sum() == 0 and np.isinf(record).sum() == 0:
        # normalize to 0-1
        record = (record - record.min()) / (record.max() - record.min())
        # scale the data
        record *= 1000
        # convert to int16
        record = record.astype(np.int16)
        # store the data
        temp_npy[record_csv[record_csv['path'] == p].index[0]] = record[:, :5000]

    else:
        if np.isinf(record).sum() == 0:
            for i in range(record.shape[0]):
                nan_idx = np.where(np.isnan(record[:, i]))[0]
                for idx in nan_idx:
                    record[idx, i] = np.mean(record[max(0, idx-6):min(idx+6, record.shape[0]), i])
        if np.isnan(record).sum() == 0:
            for i in range(record.shape[0]):
                inf_idx = np.where(np.isinf(record[:, i]))[0]
                for idx in inf_idx:
                    record[idx, i] = np.mean(record[max(0, idx-6):min(idx+6, record.shape[0]), i])

        # normalize to 0-1
        record = (record - record.min()) / (record.max() - record.min())
        # scale the data
        record *= 1000
        # convert to int16
        record = record.astype(np.int16)
        # store the data
        temp_npy[record_csv[record_csv['path'] == p].index[0]] = record[:, :5000]

In [None]:
# split to train and val
train_npy, train_csv, val_npy, val_csv = train_test_split(temp_npy, report_csv, test_size=0.02, random_state=42)

train_csv.reset_index(drop=True, inplace=True)
val_csv.reset_index(drop=True, inplace=True)

# save to your path
np.save("your_path_train.npy", train_npy)
np.save("your_path_val.npy", val_npy)
train_csv.to_csv("your_path_train.csv", index=False)
val_csv.to_csv("your_path_val.csv", index=False)

### Report

In [2]:
train_csv = pd.read_csv('/data/chenjian/ECG_MM/pretrain_data/val.csv',low_memory=False)

KeyError: 7

In [5]:
train_csv.head(10)

Unnamed: 0,subject_id,study_id,cart_id,ecg_time,report_0,report_1,report_2,report_3,report_4,report_5,...,p_onset,p_end,qrs_onset,qrs_end,t_end,p_axis,qrs_axis,t_axis,report_length,total_report
0,11296238,45602043,6896680,2189-07-25 08:11:00,Possible ectopic atrial rhythm.,Left axis deviation,Right bundle branch block,Inferior infarct - age undetermined,,Abnormal ECG,...,40,172,182,298,590,-36,-38,21,18,possible ectopic atrial rhythm.. left axis dev...
1,15098892,41283872,6672155,2183-04-07 21:44:00,Sinus bradycardia,Prolonged QT interval,,Borderline ECG,,,...,40,158,238,314,782,82,54,54,7,sinus bradycardia. prolonged qt interval. bord...
2,17684215,43875452,6790237,2167-01-25 23:49:00,Sinus rhythm.,Inferior T wave changes are nonspecific,,Borderline ECG,,,...,40,136,160,264,532,26,59,0,10,sinus rhythm.. inferior t wave changes are non...
3,18862100,46807732,6610295,2137-01-18 14:04:00,Atrial fibrillation with slow ventricular resp...,Right bundle branch block,Inferior T wave changes are nonspecific,,Abnormal ECG,,...,29999,29999,200,344,608,29999,62,-9,18,atrial fibrillation with slow ventricular resp...
4,14650506,40831044,6752764,2125-10-10 03:38:00,Sinus rhythm,,Normal ECG,,,,...,40,152,218,308,590,66,52,50,4,sinus rhythm. normal ecg.
5,10956924,49888070,6924910,2160-03-18 18:12:00,Atrial fibrillation.,Possible left anterior fascicular block,Anterolateral ST-T changes are nonspecific,,Abnormal ECG,,...,29999,29999,200,308,610,29999,-39,95,14,atrial fibrillation.. possible left anterior f...
6,12259873,46576969,6717310,2103-11-03 11:31:00,*** CONSIDER ACUTE ST ELEVATION MI ***,Atrial fibrillation with rapid ventricular res...,"Inferior ST elevation, CONSIDER ACUTE INFARCT",,Abnormal ECG,,...,29999,29999,200,306,560,29999,21,12,20,consider acute st elevation mi ***. atrial fib...
7,11013461,49823269,6672155,2168-11-08 13:14:00,Sinus rhythm.,Poor R wave progression - probable normal variant,Inferior ST-T changes may be due to myocardial...,,Abnormal ECG,,...,40,140,194,286,600,18,38,-24,21,sinus rhythm.. poor r wave progression - proba...
8,12523947,43486650,6228095,2185-10-03 13:56:00,Probable atrial fibrillation with slow ventric...,Prolonged QT interval,Possible anterior infarct - age undetermined,Inferior/lateral ST-T changes may be due to my...,,Abnormal ECG,...,29999,29999,200,306,838,29999,-18,-32,31,probable atrial fibrillation with slow ventric...
9,12433669,49526284,6903271,2112-06-04 00:55:00,--- Warning: Data quality may affect interpret...,Sinus bradycardia,Prolonged QT interval,Possible anterior infarct - age undetermined,Lateral T wave changes may be due to myocardia...,,...,40,172,222,318,730,52,3,99,30,warning: data quality may affect interpretati...
