In [9]:
import os

print("Current directory:", os.getcwd())
print("Files/folders here:", os.listdir('.'))
print("Files in data/mitdb_new:", os.listdir('../data/mitdb_new'))  # note the relative path

Current directory: /workspaces/ecg-arrhythmia-attention/notebooks
Files/folders here: ['01_data_exploration.ipynb', '02_preprocessing.ipynb']
Files in data/mitdb_new: ['ptbdb_abnormal.csv', 'mitbih_train.csv', 'ptbdb_normal.csv', 'mitbih_test.csv']


In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch

# Load datasets
train_df = pd.read_csv('../data/mitdb_new/mitbih_train.csv')
test_df = pd.read_csv('../data/mitdb_new/mitbih_test.csv')

# Split features and labels
X_train = train_df.iloc[:, :-1].values
y_train = train_df.iloc[:, -1].values
X_test = test_df.iloc[:, :-1].values
y_test = test_df.iloc[:, -1].values

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Print shapes to confirm
print("X_train_tensor shape:", X_train_tensor.shape)
print("y_train_tensor shape:", y_train_tensor.shape)
print("X_test_tensor shape:", X_test_tensor.shape)
print("y_test_tensor shape:", y_test_tensor.shape)


X_train_tensor shape: torch.Size([87553, 187])
y_train_tensor shape: torch.Size([87553])
X_test_tensor shape: torch.Size([21891, 187])
y_test_tensor shape: torch.Size([21891])


In [1]:
import pandas as pd


In [3]:
import os

# Check current working directory
print("Current working directory:", os.getcwd())

# List all files/folders in current directory
print("Files/folders here:", os.listdir('.'))

# Check if data folder exists
if os.path.exists('data/mitdb_new'):
    print("Dataset folder found! Files inside:")
    print(os.listdir('data/mitdb_new'))
else:
    print("Dataset folder NOT found. You need to adjust the path.")


Current working directory: /workspaces/ecg-arrhythmia-attention/notebooks
Files/folders here: ['01_data_exploration.ipynb', '02_preprocessing.ipynb']
Dataset folder NOT found. You need to adjust the path.


In [4]:
import pandas as pd

# Use relative path from current notebook folder
train_df = pd.read_csv('../data/mitdb_new/mitbih_train.csv')
test_df = pd.read_csv('../data/mitdb_new/mitbih_test.csv')

# Check the first few rows
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print(train_df.head())

Train shape: (87553, 188)
Test shape: (21891, 188)
   9.779411554336547852e-01  9.264705777168273926e-01  \
0                  0.960114                  0.863248   
1                  1.000000                  0.659459   
2                  0.925414                  0.665746   
3                  0.967136                  1.000000   
4                  0.927461                  1.000000   

   6.813725233078002930e-01  2.450980395078659058e-01  \
0                  0.461538                  0.196581   
1                  0.186486                  0.070270   
2                  0.541436                  0.276243   
3                  0.830986                  0.586854   
4                  0.626943                  0.193437   

   1.544117629528045654e-01  1.911764740943908691e-01  \
0                  0.094017                  0.125356   
1                  0.070270                  0.059459   
2                  0.196133                  0.077348   
3                  0.356808        

In [5]:
# Assume the last column is the label (if not, adjust accordingly)
X_train = train_df.iloc[:, :-1].values  # all columns except last
y_train = train_df.iloc[:, -1].values   # last column

X_test = test_df.iloc[:, :-1].values
y_test = test_df.iloc[:, -1].values

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (87553, 187)
y_train shape: (87553,)
X_test shape: (21891, 187)
y_test shape: (21891,)


In [6]:
from sklearn.preprocessing import StandardScaler

# Initialize scaler
scaler = StandardScaler()

# Fit on training data and transform both train and test
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)

X_train_scaled shape: (87553, 187)
X_test_scaled shape: (21891, 187)


In [8]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert to tensors (no .values because y_train and y_test are already numpy arrays)
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Create TensorDatasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Check one batch
for X_batch, y_batch in train_loader:
    print("X_batch shape:", X_batch.shape)
    print("y_batch shape:", y_batch.shape)
    break


X_batch shape: torch.Size([64, 187])
y_batch shape: torch.Size([64])
