In [1]:
import pandas as pd
import os

In [2]:
# Step 2: Check current working directory
print("Current directory:", os.getcwd())

Current directory: /workspaces/ecg-arrhythmia-attention/notebooks


In [3]:
# Step 3: List files in your dataset folder
print("Files in data/mitdb_new:", os.listdir('../data/mitdb_new'))

Files in data/mitdb_new: ['ptbdb_abnormal.csv', 'mitbih_train.csv', 'ptbdb_normal.csv', 'mitbih_test.csv']


In [4]:
# Step 4: Load the datasets
train_df = pd.read_csv('../data/mitdb_new/mitbih_train.csv')
test_df  = pd.read_csv('../data/mitdb_new/mitbih_test.csv')

In [5]:
# Step 5: Quick check
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (87553, 188)
Test shape: (21891, 188)


In [6]:
# Step 6: Preview the first 5 rows
train_df.head()

Unnamed: 0,9.779411554336547852e-01,9.264705777168273926e-01,6.813725233078002930e-01,2.450980395078659058e-01,1.544117629528045654e-01,1.911764740943908691e-01,1.519607901573181152e-01,8.578431606292724609e-02,5.882352963089942932e-02,4.901960864663124084e-02,...,0.000000000000000000e+00.79,0.000000000000000000e+00.80,0.000000000000000000e+00.81,0.000000000000000000e+00.82,0.000000000000000000e+00.83,0.000000000000000000e+00.84,0.000000000000000000e+00.85,0.000000000000000000e+00.86,0.000000000000000000e+00.87,0.000000000000000000e+00.88
0,0.960114,0.863248,0.461538,0.196581,0.094017,0.125356,0.099715,0.088319,0.074074,0.082621,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.659459,0.186486,0.07027,0.07027,0.059459,0.056757,0.043243,0.054054,0.045946,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.925414,0.665746,0.541436,0.276243,0.196133,0.077348,0.071823,0.060773,0.066298,0.058011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.967136,1.0,0.830986,0.586854,0.356808,0.248826,0.14554,0.089202,0.117371,0.150235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.927461,1.0,0.626943,0.193437,0.094991,0.072539,0.043178,0.053541,0.093264,0.189983,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Step 7: Split features (X) and labels (y)
# Assuming the last column is the label
X_train = train_df.iloc[:, :-1].values
y_train = train_df.iloc[:, -1].values

X_test = test_df.iloc[:, :-1].values
y_test = test_df.iloc[:, -1].values

In [8]:
# Step 8: Check shapes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (87553, 187)
y_train shape: (87553,)
X_test shape: (21891, 187)
y_test shape: (21891,)


In [11]:
# Step 9: Scale features using StandardScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

In [13]:
# Step 10: Check scaled shapes
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)

X_train_scaled shape: (87553, 187)
X_test_scaled shape: (21891, 187)


In [15]:
import torch

In [16]:
# Step 11: Convert features to tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor  = torch.tensor(X_test_scaled, dtype=torch.float32)

In [17]:
# Step 12: Convert labels to tensors
y_train_tensor = torch.tensor(y_train, dtype=torch.long)  # y_train is already a numpy array
y_test_tensor  = torch.tensor(y_test, dtype=torch.long)

In [18]:
# Step 13: Check shapes
print("X_train_tensor shape:", X_train_tensor.shape)
print("y_train_tensor shape:", y_train_tensor.shape)
print("X_test_tensor shape:", X_test_tensor.shape)
print("y_test_tensor shape:", y_test_tensor.shape)

X_train_tensor shape: torch.Size([87553, 187])
y_train_tensor shape: torch.Size([87553])
X_test_tensor shape: torch.Size([21891, 187])
y_test_tensor shape: torch.Size([21891])


In [19]:
from torch.utils.data import TensorDataset, DataLoader

In [20]:
# Step 14: Create TensorDataset
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset  = TensorDataset(X_test_tensor, y_test_tensor)

In [21]:
# Step 15: Create DataLoaders
batch_size = 64  # you can change this later
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [22]:
# Step 16: Check
print("Number of training batches:", len(train_loader))
print("Number of testing batches:", len(test_loader))

Number of training batches: 1369
Number of testing batches: 343
