In [1]:
import pandas as pd 
import numpy as np


Data Pre-Processing. Need to clean the data to be fed into the CNN model

In [2]:
database = pd.read_csv("ptbxl_database.csv")
database.head()

Unnamed: 0,ecg_id,patient_id,age,sex,height,weight,nurse,site,device,recording_date,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
0,1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,...,True,,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr
1,2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,...,True,,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr
2,3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,...,True,,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr
3,4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,...,True,", II,III,AVF",,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr
4,5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,...,True,", III,AVR,AVF",,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr


In [3]:
import ast

In [4]:
# to simply classification process, let's just keep the codes, and get rid of the percentages.

database['scp_codes_dict'] = database['scp_codes'].apply(ast.literal_eval)
database['diagnosis'] = database['scp_codes_dict'].apply(lambda x: list(x.keys()))
print(database[['ecg_id','diagnosis','scp_codes_dict']].head())

   ecg_id          diagnosis                            scp_codes_dict
0       1  [NORM, LVOLT, SR]  {'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}
1       2      [NORM, SBRAD]              {'NORM': 80.0, 'SBRAD': 0.0}
2       3         [NORM, SR]                {'NORM': 100.0, 'SR': 0.0}
3       4         [NORM, SR]                {'NORM': 100.0, 'SR': 0.0}
4       5         [NORM, SR]                {'NORM': 100.0, 'SR': 0.0}


In [5]:
# convert diagnosis alphabetically. 

database['diagnosis_sorted'] = database['diagnosis'].apply(lambda x: sorted(x))
database['diagnosis_sorted'].head()

database[['ecg_id','diagnosis','diagnosis_sorted']].head()

# count unique
unique_list_count = database['diagnosis'].apply(tuple).nunique()
print("Number of unique diagnosis:", unique_list_count) # too many 


Number of unique diagnosis: 4320


In [6]:
# create a relevant dataframe to work with
data = database[['ecg_id','diagnosis_sorted']].copy()
data['diagnosis_combined'] = data['diagnosis_sorted'].apply(lambda x: ' '.join(x))


data.head()


Unnamed: 0,ecg_id,diagnosis_sorted,diagnosis_combined
0,1,"[LVOLT, NORM, SR]",LVOLT NORM SR
1,2,"[NORM, SBRAD]",NORM SBRAD
2,3,"[NORM, SR]",NORM SR
3,4,"[NORM, SR]",NORM SR
4,5,"[NORM, SR]",NORM SR


In [7]:
# match id to the picture label. 

data['ecg_filename'] = data['ecg_id'].apply(lambda x: f"{x:05}_lr-0.png")
data.head()


Unnamed: 0,ecg_id,diagnosis_sorted,diagnosis_combined,ecg_filename
0,1,"[LVOLT, NORM, SR]",LVOLT NORM SR,00001_lr-0.png
1,2,"[NORM, SBRAD]",NORM SBRAD,00002_lr-0.png
2,3,"[NORM, SR]",NORM SR,00003_lr-0.png
3,4,"[NORM, SR]",NORM SR,00004_lr-0.png
4,5,"[NORM, SR]",NORM SR,00005_lr-0.png


In [8]:
# identify which pngs are missing
import os
import glob

In [9]:
directory_path = "dat/"
existing_pngs = set([os.path.basename(file) for file in glob.glob(f'{directory_path}/**/*.png', recursive=True)])

expected_pngs = set(data['ecg_filename'])
missing_pngs = expected_pngs - existing_pngs

print(f"Number of missing PNGs: {len(missing_pngs)}")
# print(missing_pngs) # there is a lot because I didn't generate after 1300


Number of missing PNGs: 0


In [10]:
data_filtered = data[~data['ecg_filename'].isin(missing_pngs)]
print(f"Remaining records: {len(data_filtered)}")


Remaining records: 21799


In [11]:
print(data_filtered.shape)
print(data_filtered.head())

(21799, 4)
   ecg_id   diagnosis_sorted diagnosis_combined    ecg_filename
0       1  [LVOLT, NORM, SR]      LVOLT NORM SR  00001_lr-0.png
1       2      [NORM, SBRAD]         NORM SBRAD  00002_lr-0.png
2       3         [NORM, SR]            NORM SR  00003_lr-0.png
3       4         [NORM, SR]            NORM SR  00004_lr-0.png
4       5         [NORM, SR]            NORM SR  00005_lr-0.png


In [12]:
data_filtered_check = data_filtered.copy()
counts = data_filtered_check.groupby('diagnosis_combined').size().reset_index(name='count')

less_than_15 = counts[counts['count'] > 9]

len(less_than_15)

178

In [13]:
# let's set an arbitrary amount of minimum samples as 10.

min_samples = 10
counts = data_filtered['diagnosis_combined'].value_counts()
to_drop = counts[counts < min_samples].index
df_filtered = data_filtered[~data_filtered['diagnosis_combined'].isin(to_drop)]

df_filtered.shape


(15617, 4)

#### Data Augmentation / Additional Filtering. There isn't enough samples per category, so much artificially increase the number of samples in the smaller categories. 

In [14]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [15]:
# creating a master csv
# print(df_filtered[['diagnosis_combined','ecg_filename']].head())

def get_folder_path(filename):
    number = int(filename.split('_')[0])
    folder_number = (number // 1000) * 1000
    folder_name = f"100_{folder_number:05d}"
    return os.path.join("dat/connected_binarized/", folder_name, filename)

df_filtered['ecg_filepath'] = df_filtered['ecg_filename'].apply(get_folder_path)

print(df_filtered.head())


   ecg_id   diagnosis_sorted diagnosis_combined    ecg_filename  \
0       1  [LVOLT, NORM, SR]      LVOLT NORM SR  00001_lr-0.png   
1       2      [NORM, SBRAD]         NORM SBRAD  00002_lr-0.png   
2       3         [NORM, SR]            NORM SR  00003_lr-0.png   
3       4         [NORM, SR]            NORM SR  00004_lr-0.png   
4       5         [NORM, SR]            NORM SR  00005_lr-0.png   

                                       ecg_filepath  
0  dat/connected_binarized/100_00000\00001_lr-0.png  
1  dat/connected_binarized/100_00000\00002_lr-0.png  
2  dat/connected_binarized/100_00000\00003_lr-0.png  
3  dat/connected_binarized/100_00000\00004_lr-0.png  
4  dat/connected_binarized/100_00000\00005_lr-0.png  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['ecg_filepath'] = df_filtered['ecg_filename'].apply(get_folder_path)


In [16]:
# split into training and validation set
random_state = 42

train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

train_list = []
val_list = []
test_list = []

df_labeled = df_filtered.loc[:,['diagnosis_combined','ecg_filepath']]

for label in df_labeled['diagnosis_combined'].unique():
    subset = df_labeled[df_labeled['diagnosis_combined'] == label]
    
    subset = subset.sample(frac=1, random_state=random_state)
    n = len(subset)
    
    
    train_end = int(train_ratio * n)
    val_end = train_end + int(val_ratio * n)
    
    
    train_list.append(subset.iloc[:train_end])
    val_list.append(subset.iloc[train_end:val_end])
    test_list.append(subset.iloc[val_end:])


train_df = pd.concat(train_list).sample(frac=1, random_state=random_state)
val_df   = pd.concat(val_list).sample(frac=1, random_state=random_state)
test_df  = pd.concat(test_list).sample(frac=1, random_state=random_state)

train_df['ecg_filepath'] = train_df['ecg_filepath'].apply(lambda x: x.replace('\\', '/'))
val_df['ecg_filepath'] = val_df['ecg_filepath'].apply(lambda x: x.replace('\\', '/'))
test_df['ecg_filepath'] = test_df['ecg_filepath'].apply(lambda x: x.replace('\\', '/'))

print("Train samples:", len(train_df))
print("Validation samples:", len(val_df))
print("Test samples:", len(test_df))

print(train_df.head())
print(val_df.head())
print(test_df.head())


Train samples: 10859
Validation samples: 2254
Test samples: 2504
      diagnosis_combined                                      ecg_filepath
958        LVOLT NORM SR  dat/connected_binarized/100_00000/00971_lr-0.png
21439               PACE  dat/connected_binarized/100_21000/21478_lr-0.png
20022          NDT SARRH  dat/connected_binarized/100_20000/20061_lr-0.png
21603             NDT SR  dat/connected_binarized/100_21000/21642_lr-0.png
19379               PACE  dat/connected_binarized/100_19000/19418_lr-0.png
      diagnosis_combined                                      ecg_filepath
3882        LVOLT NDT SR  dat/connected_binarized/100_03000/03902_lr-0.png
13569             NDT SR  dat/connected_binarized/100_13000/13601_lr-0.png
21530          NDT STACH  dat/connected_binarized/100_21000/21569_lr-0.png
6639             NORM SR  dat/connected_binarized/100_06000/06660_lr-0.png
2459             NORM SR  dat/connected_binarized/100_02000/02472_lr-0.png
      diagnosis_combined           

In [17]:
# Not using this code because we want to stratify instead of using random split.
# from sklearn.model_selection import train_test_split
'''df_labeled = df_filtered[df_filtered['diagnosis_combined'].notnull()].copy()
print("Total labeled samples:", len(df_labeled))

train_df, test_df = train_test_split(
    df_labeled,
    test_size=0.2,  # 20% of the data for testing
    stratify=df_labeled['diagnosis_combined'],
    random_state=42
)

train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,  # 20% of the original train set goes to validation
    stratify=train_df['diagnosis_combined'],
    random_state=42
)

print("Train count:", len(train_df))
print("Validation count:", len(val_df))
print("Test count:", len(test_df))'''

'df_labeled = df_filtered[df_filtered[\'diagnosis_combined\'].notnull()].copy()\nprint("Total labeled samples:", len(df_labeled))\n\ntrain_df, test_df = train_test_split(\n    df_labeled,\n    test_size=0.2,  # 20% of the data for testing\n    stratify=df_labeled[\'diagnosis_combined\'],\n    random_state=42\n)\n\ntrain_df, val_df = train_test_split(\n    train_df,\n    test_size=0.2,  # 20% of the original train set goes to validation\n    stratify=train_df[\'diagnosis_combined\'],\n    random_state=42\n)\n\nprint("Train count:", len(train_df))\nprint("Validation count:", len(val_df))\nprint("Test count:", len(test_df))'

In [18]:
train_datagen = ImageDataGenerator(
    rotation_range=5,              # small rotations
    height_shift_range=(0.0, -0.4),  # upward shift only
    fill_mode='nearest'
)

val_datagen = ImageDataGenerator()
test_datagen = ImageDataGenerator()

In [19]:
batch_size = 32
target_size = (224, 224)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col="ecg_filepath",
    y_col="diagnosis_combined",
    target_size=target_size,
    batch_size=batch_size,
    class_mode="categorical",  
    shuffle=True
)

val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    x_col="ecg_filepath",
    y_col="diagnosis_combined",
    target_size=target_size,
    batch_size=batch_size,
    class_mode="categorical",
    shuffle=False
)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    x_col="ecg_filepath",
    y_col="diagnosis_combined",
    target_size=target_size,
    batch_size=batch_size,
    class_mode="categorical",
    shuffle=False
)

Found 10859 validated image filenames belonging to 178 classes.
Found 2254 validated image filenames belonging to 178 classes.
Found 2504 validated image filenames belonging to 178 classes.


## CNN model training

## Transfer Learnig

In [25]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras import Model

In [26]:
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
preds = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=preds)

for layer in base_model.layers:
    layer.trainable = False


model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [None]:
epochs = 10  # Adjust as needed

history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=epochs
)

Epoch 1/10
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m787s[0m 4s/step - accuracy: 0.4746 - loss: 3.2437 - val_accuracy: 0.5253 - val_loss: 2.6325
Epoch 2/10
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m716s[0m 3s/step - accuracy: 0.5049 - loss: 2.9502 - val_accuracy: 0.5253 - val_loss: 2.6921
Epoch 3/10
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m760s[0m 4s/step - accuracy: 0.5068 - loss: 2.9234 - val_accuracy: 0.5253 - val_loss: 2.5945
Epoch 4/10
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m896s[0m 4s/step - accuracy: 0.5070 - loss: 2.9162 - val_accuracy: 0.5253 - val_loss: 2.5822
Epoch 5/10
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - accuracy: 0.5071 - loss: 2.9021

KeyboardInterrupt: 

## Include classes only with n > 100

In [None]:
data_filtered_check = data_filtered.copy()
counts = data_filtered_check.groupby('diagnosis_combined').size().reset_index(name='count')

over_100 = counts[counts['count'] > 100]

len(over_100)

over_100

Unnamed: 0,diagnosis_combined,count
732,ABQRS IMI SR,264
820,ABQRS NORM SR,113
1703,CLBBB SR,108
2080,IMI SR,111
2281,IRBBB NORM SR,121
2303,IRBBB SR,132
2647,LAFB SR,204
2799,LVH SR VCLVH,126
2851,NDT SR,374
2861,NORM,146


In [None]:
data_filtered_check

Unnamed: 0,ecg_id,diagnosis_sorted,diagnosis_combined,ecg_filename
0,1,"[LVOLT, NORM, SR]",LVOLT NORM SR,00001_lr-0.png
1,2,"[NORM, SBRAD]",NORM SBRAD,00002_lr-0.png
2,3,"[NORM, SR]",NORM SR,00003_lr-0.png
3,4,"[NORM, SR]",NORM SR,00004_lr-0.png
4,5,"[NORM, SR]",NORM SR,00005_lr-0.png
...,...,...,...,...
13958,13995,"[NORM, SR]",NORM SR,13995_lr-0.png
13959,13996,"[NDT, NST_, SR]",NDT NST_ SR,13996_lr-0.png
13960,13997,"[AFIB, ASMI, INJAL, LOWT, PVC, STD_]",AFIB ASMI INJAL LOWT PVC STD_,13997_lr-0.png
13961,13998,"[ISC_, SR]",ISC_ SR,13998_lr-0.png


In [None]:
data_over_100 = df_filtered.loc[
    df_filtered['diagnosis_combined'].isin(over_100['diagnosis_combined'])
]

In [None]:
data_over_100

Unnamed: 0,ecg_id,diagnosis_sorted,diagnosis_combined,ecg_filename,ecg_filepath
1,2,"[NORM, SBRAD]",NORM SBRAD,00002_lr-0.png,dat/100_00000/00002_lr-0.png
2,3,"[NORM, SR]",NORM SR,00003_lr-0.png,dat/100_00000/00003_lr-0.png
3,4,"[NORM, SR]",NORM SR,00004_lr-0.png,dat/100_00000/00004_lr-0.png
4,5,"[NORM, SR]",NORM SR,00005_lr-0.png,dat/100_00000/00005_lr-0.png
5,6,"[NORM, SR]",NORM SR,00006_lr-0.png,dat/100_00000/00006_lr-0.png
...,...,...,...,...,...
13954,13991,"[NORM, SR]",NORM SR,13991_lr-0.png,dat/100_13000/13991_lr-0.png
13955,13992,"[LVH, SR, VCLVH]",LVH SR VCLVH,13992_lr-0.png,dat/100_13000/13992_lr-0.png
13956,13993,"[NORM, SR]",NORM SR,13993_lr-0.png,dat/100_13000/13993_lr-0.png
13958,13995,"[NORM, SR]",NORM SR,13995_lr-0.png,dat/100_13000/13995_lr-0.png


In [None]:
# split to training and test dataset. 
from sklearn.model_selection import train_test_split

In [None]:
df = data_over_100[['diagnosis_combined', 'ecg_filepath']]

train_val_df, test_df = train_test_split(df,
                                        test_size = .15,
                                        random_state = 1,
                                        stratify = df['diagnosis_combined'])

train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.18,              
    random_state=42,
    stratify=train_val_df['diagnosis_combined']
)

print("Train size:", len(train_df))
print("Validation size:", len(val_df))
print("Test size:", len(test_df))

Train size: 5046
Validation size: 1108
Test size: 1087


In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True
)

val_test_datagen = ImageDataGenerator(rescale=1./255)



In [None]:
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='ecg_filepath',
    y_col='diagnosis_combined',
    target_size=(224, 224),
    class_mode='categorical',  
    batch_size=32,
    shuffle=True
)

val_generator = val_test_datagen.flow_from_dataframe(
    dataframe=val_df,
    x_col='ecg_filepath',
    y_col='diagnosis_combined',
    target_size=(224, 224),
    class_mode='categorical',
    batch_size=32,
    shuffle=False
)
test_generator = val_test_datagen.flow_from_dataframe(
    dataframe=test_df,
    x_col='ecg_filepath',
    y_col='diagnosis_combined',
    target_size=(224, 224),
    class_mode='categorical',
    batch_size=32,
    shuffle=False
)

Found 5046 validated image filenames belonging to 16 classes.
Found 1108 validated image filenames belonging to 16 classes.
Found 1087 validated image filenames belonging to 16 classes.


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(16, activation='softmax')  
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
from PIL import Image

for path in train_df['ecg_filepath']:
    try:
        img = Image.open(path)
        img.verify()  # Ensure file is not corrupted
    except Exception as e:
        print(f"Problem with {path}: {e}")

In [None]:
model.compile(
    optimizer='adam', 
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [None]:
history = model.fit(
    train_generator,
    epochs=5,
    validation_data=val_generator,
    steps_per_epoch=len(train_generator),
    validation_steps=len(val_generator)
)

  self._warn_if_super_not_called()


Epoch 1/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1360s[0m 9s/step - accuracy: 0.6154 - loss: 2.2621 - val_accuracy: 0.6606 - val_loss: 1.5759
Epoch 2/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m728s[0m 5s/step - accuracy: 0.6635 - loss: 1.6295 - val_accuracy: 0.6606 - val_loss: 1.5352
Epoch 3/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m651s[0m 4s/step - accuracy: 0.6716 - loss: 1.5741 - val_accuracy: 0.6606 - val_loss: 1.5412
Epoch 4/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m735s[0m 5s/step - accuracy: 0.6636 - loss: 1.5930 - val_accuracy: 0.6606 - val_loss: 1.5416
Epoch 5/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m760s[0m 5s/step - accuracy: 0.6592 - loss: 1.5883 - val_accuracy: 0.6606 - val_loss: 1.5380


In [None]:
test_loss, test_acc = model.evaluate(
    test_generator,
    steps=len(test_generator)
)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1274s[0m 39s/step - accuracy: 0.6850 - loss: 1.4641
Test Loss: 1.5412803888320923
Test Accuracy: 0.6596136093139648


In [None]:
from tensorflow.keras.preprocessing import image
import numpy as np

In [None]:
img_path = "test_less_noise/ekg_chris_kim.png"

img = image.load_img(img_path, target_size=(224, 224))


img_array = image.img_to_array(img)


img_array = np.expand_dims(img_array, axis=0)

predictions = model.predict(img_array)

predicted_index = np.argmax(predictions, axis=1)[0]
predicted_confidence = predictions[0][predicted_index]

print("Predicted class index:", predicted_index)
print("Confidence:", predicted_confidence)
print("Prediction vector:", predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Predicted class index: 12
Confidence: 1.0
Prediction vector: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]]


In [None]:
predictions = model.predict(img_array)
class_names = train_generator.class_indices
idx_to_class = {v: k for k, v in class_names.items()}
print("Predicted class name:", idx_to_class[predicted_index])

Predicted class name: NORM SR
