In [5]:
import pandas as pd 
import kagglehub as kh
import os
from sklearn.model_selection import train_test_split

In [6]:
# Download latest version
# Asks kagglehub for the "shayanfazeli/heartbeat" dataset and keeps the returned
# location in `path`; later cells join CSV file names onto it, so it is used as
# a directory path.
# NOTE(review): presumably this needs network access / Kaggle credentials on the
# first run — confirm for a fresh-kernel "Run All".
path = kh.dataset_download("shayanfazeli/heartbeat")

In [7]:
def load_data(path):
    """Read the four heartbeat CSV files found in the directory ``path``.

    Every file is parsed with ``header=None``, so the first row is treated as
    data and the columns get integer labels.

    Parameters
    ----------
    path : str
        Directory containing ``mitbih_train.csv``, ``mitbih_test.csv``,
        ``ptbdb_normal.csv`` and ``ptbdb_abnormal.csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(mitbih_train, mitbih_test, ptbdb_normal, ptbdb_abnormal)``,
        in that order.
    """
    # Order matters: callers unpack the result positionally.
    csv_names = (
        "mitbih_train.csv",
        "mitbih_test.csv",
        "ptbdb_normal.csv",
        "ptbdb_abnormal.csv",
    )
    return tuple(
        pd.read_csv(os.path.join(path, csv_name), header=None)
        for csv_name in csv_names
    )

In [8]:
# Read the four raw CSV frames from the downloaded dataset directory.
(
    mitbih_train,
    mitbih_test,
    ptbdb_normal,
    ptbdb_abnormal,
) = load_data(path)

In [9]:
def replace_label_column(data, label):
    """Return a copy of ``data`` whose last column is filled with ``label``.

    The input frame is left untouched: overwriting it in place would silently
    change the caller's raw data and make notebook cells depend on how many
    times they have been executed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose last column holds the label to be replaced.
    label : scalar
        Value written into every row of the last column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape and column labels as ``data``.
    """
    relabelled = data.copy()
    # Positional assignment so it works whatever the last column is called.
    relabelled.iloc[:, -1] = label
    return relabelled

In [10]:
# Overwrite the last (label) column of each PTBDB frame with its class id:
# 6 marks the normal recordings, 7 the abnormal ones.
ptbdb_normal = replace_label_column(data=ptbdb_normal, label=6)
ptbdb_abnormal = replace_label_column(data=ptbdb_abnormal, label=7)

In [11]:
# Stack the normal and abnormal PTBDB rows into one frame with a fresh 0..n-1 index.
ptbdb_all = pd.concat((ptbdb_normal, ptbdb_abnormal), ignore_index=True)

# Hold out 20% of the PTBDB rows for testing (seed fixed at 42).
# Every column except the last is a feature; the last column is the label.
X_ptbdb_train, X_ptbdb_test, y_ptbdb_train, y_ptbdb_test = train_test_split(
    ptbdb_all.iloc[:, :-1],
    ptbdb_all.iloc[:, -1],
    test_size=0.2,
    random_state=42,
)

In [12]:
# Features: PTBDB rows first, then the MIT-BIH rows (all columns but the last);
# the row index is rebuilt from zero.
X_train = pd.concat(
    [X_ptbdb_train, mitbih_train.iloc[:, :-1]],
    ignore_index=True,
)
X_test = pd.concat(
    [X_ptbdb_test, mitbih_test.iloc[:, :-1]],
    ignore_index=True,
)

# Labels: same row order as the features above, taken from the last MIT-BIH column.
y_train = pd.concat(
    [y_ptbdb_train, mitbih_train.iloc[:, -1]],
    ignore_index=True,
)
y_test = pd.concat(
    [y_ptbdb_test, mitbih_test.iloc[:, -1]],
    ignore_index=True,
)

In [13]:
def drop_almost_zero_columns(X, threshold=0.8):
    """Return ``X`` without the columns that are mostly zeros.

    A column is removed when the fraction of its entries equal to 0 is greater
    than or equal to ``threshold``. ``X`` itself is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to filter.
    threshold : float, default 0.8
        Minimum fraction of zero entries at which a column is dropped.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the columns below the threshold.
    """
    row_count = len(X)
    zero_share = X.eq(0).sum() / row_count
    mostly_zero = [
        column for column, share in zero_share.items() if share >= threshold
    ]
    return X.drop(columns=mostly_zero)

In [14]:
# Drop the mostly-zero columns (zero fraction >= the function's default threshold).
# The set of columns is decided on the training data only and then applied
# unchanged to the test data: filtering each split on its own could leave them
# with different feature columns and would let test-set statistics influence
# feature selection.
X_train = drop_almost_zero_columns(X_train)
X_test = X_test.loc[:, X_train.columns]

# Both splits must expose exactly the same features, in the same order.
assert list(X_train.columns) == list(X_test.columns)