In [1]:
import pandas as pd
import numpy as np
import ast

In [2]:
database = pd.read_csv("ptbxl_database.csv")

In [3]:
database.head()

Unnamed: 0,ecg_id,patient_id,age,sex,height,weight,nurse,site,device,recording_date,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
0,1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,...,True,,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr
1,2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,...,True,,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr
2,3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,...,True,,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr
3,4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,...,True,", II,III,AVF",,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr
4,5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,...,True,", III,AVR,AVF",,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr


In [5]:
database['scp_codes_dict'] = database['scp_codes'].apply(ast.literal_eval)
database['diagnosis'] = database['scp_codes_dict'].apply(lambda x: list(x.keys()))
print(database[['ecg_id','diagnosis','scp_codes_dict']].head())

   ecg_id          diagnosis                            scp_codes_dict
0       1  [NORM, LVOLT, SR]  {'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}
1       2      [NORM, SBRAD]              {'NORM': 80.0, 'SBRAD': 0.0}
2       3         [NORM, SR]                {'NORM': 100.0, 'SR': 0.0}
3       4         [NORM, SR]                {'NORM': 100.0, 'SR': 0.0}
4       5         [NORM, SR]                {'NORM': 100.0, 'SR': 0.0}


In [6]:

database['diagnosis_sorted'] = database['diagnosis'].apply(lambda x: sorted(x))
database['diagnosis_sorted'].head()

database[['ecg_id','diagnosis','diagnosis_sorted']].head()

# count unique
unique_list_count = database['diagnosis'].apply(tuple).nunique()
print("Number of unique diagnosis:", unique_list_count) # too many 

Number of unique diagnosis: 4320


In [7]:
data = database[['ecg_id','diagnosis_sorted']].copy()
data['diagnosis_combined'] = data['diagnosis_sorted'].apply(lambda x: ' '.join(x))


data.head()

Unnamed: 0,ecg_id,diagnosis_sorted,diagnosis_combined
0,1,"[LVOLT, NORM, SR]",LVOLT NORM SR
1,2,"[NORM, SBRAD]",NORM SBRAD
2,3,"[NORM, SR]",NORM SR
3,4,"[NORM, SR]",NORM SR
4,5,"[NORM, SR]",NORM SR


In [8]:
data['ecg_filename'] = data['ecg_id'].apply(lambda x: f"{x:05}_lr-0.png")
data.head()

Unnamed: 0,ecg_id,diagnosis_sorted,diagnosis_combined,ecg_filename
0,1,"[LVOLT, NORM, SR]",LVOLT NORM SR,00001_lr-0.png
1,2,"[NORM, SBRAD]",NORM SBRAD,00002_lr-0.png
2,3,"[NORM, SR]",NORM SR,00003_lr-0.png
3,4,"[NORM, SR]",NORM SR,00004_lr-0.png
4,5,"[NORM, SR]",NORM SR,00005_lr-0.png


In [9]:
import os 
import glob

In [13]:
directory_path = 'dat/connected_binarized/'
existing_pngs = set([os.path.basename(file) for file in glob.glob(f'{directory_path}/**/*.png', recursive = True)])
expected_pngs = set(data['ecg_filename'])

missing_pngs = expected_pngs - existing_pngs

print(missing_pngs)

set()


In [14]:
data_filtered = data[~data['ecg_filename'].isin(missing_pngs)]
print(f"Remaining records: {len(data_filtered)}")

Remaining records: 21799


In [16]:
print(data_filtered.shape)
print(data_filtered.head())

(21799, 4)
   ecg_id   diagnosis_sorted diagnosis_combined    ecg_filename
0       1  [LVOLT, NORM, SR]      LVOLT NORM SR  00001_lr-0.png
1       2      [NORM, SBRAD]         NORM SBRAD  00002_lr-0.png
2       3         [NORM, SR]            NORM SR  00003_lr-0.png
3       4         [NORM, SR]            NORM SR  00004_lr-0.png
4       5         [NORM, SR]            NORM SR  00005_lr-0.png


In [18]:
data_filtered_check = data_filtered.copy()
counts = data_filtered_check.groupby('diagnosis_combined').size().reset_index(name='count')

less_than_15 = counts[counts['count'] > 100]

len(less_than_15)

21

In [20]:
counts = data_filtered['diagnosis_combined'].value_counts()
to_drop = counts[counts < 100].index
df_filtered = data_filtered[~data_filtered['diagnosis_combined'].isin(to_drop)]
print(df_filtered.shape)
print(df_filtered.head())

(11706, 4)
   ecg_id diagnosis_sorted diagnosis_combined    ecg_filename
1       2    [NORM, SBRAD]         NORM SBRAD  00002_lr-0.png
2       3       [NORM, SR]            NORM SR  00003_lr-0.png
3       4       [NORM, SR]            NORM SR  00004_lr-0.png
4       5       [NORM, SR]            NORM SR  00005_lr-0.png
5       6       [NORM, SR]            NORM SR  00006_lr-0.png
