In [1]:
import os
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm

# Make sure you have this installed: pip install gdown
import gdown
import zipfile

# Make sure you have copied the 'src' folder from FMM-Head into your project
from src.datasets.datasetsLibrary import get_ptb_xl_fmm_dataset
from pipeline_improved import VAE1D, beta_cyclic, compute_scores
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, r2_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
def find_data_subfolder(subfolder_name, start_path='.'):
    current_path = os.path.abspath(start_path)
    while True:
        candidate = os.path.join(current_path, 'data', subfolder_name)
        if os.path.isdir(candidate):
            return candidate
        parent = os.path.dirname(current_path)
        if parent == current_path:
            break
        current_path = parent
    return None

DATA_DIR = find_data_subfolder('') # Finds the root 'data' folder

In [None]:
# This cell loads the FMM-enhanced dataset.
# If the data is not found in './data/ptb_xl_fmm', it will automatically download it.
# This may take a while the first time.

print("Loading FMM-enhanced PTB-XL dataset...")
# Corrected call (added lead=0)
data_dict = get_ptb_xl_fmm_dataset(
    datapath=DATA_DIR,
    num_leads=1,
    lead=0,  # <-- ADD THIS LINE
    num_waves=5,
    sequence_length=2048,
    delete_high_A=False
)
print("Dataset loaded successfully.")

# Extract the training and testing sets
X_train_raw = data_dict['train']['data']
y_train = data_dict['train']['labels']
coeffs_train = data_dict['train']['coefficients']

X_test_raw = data_dict['test']['data'] 
y_test = data_dict['test']['labels']
coeffs_test = data_dict['test']['coefficients']

# --- Create Multi-Channel Input ---
# The VAE will be trained on the signal (1 channel) + its FMM features (55 channels).
# We reshape the 55 coefficients to match the signal's time dimension (2048).
train_coeffs_reshaped = np.repeat(np.expand_dims(coeffs_train, axis=1), 2048, axis=1)
test_coeffs_reshaped = np.repeat(np.expand_dims(coeffs_test, axis=1), 2048, axis=1)

# Concatenate along the channel dimension
X_train = np.concatenate([X_train_raw, train_coeffs_reshaped], axis=2).astype(np.float32)
X_test = np.concatenate([X_test_raw, test_coeffs_reshaped], axis=2).astype(np.float32)

# --- Separate into Normal and Anomalous Sets ---
# We use the provided labels to create our final sets.
normal_class_id = data_dict['params']['normal_class']
train_normals = X_train[y_train == normal_class_id]
train_anomalies = X_train[y_train != normal_class_id]
test_normals = X_test[y_test == normal_class_id]
test_anomalies = X_test[y_test != normal_class_id]

# --- Convert to PyTorch Tensors and fix dimensions for Conv1D ---
# Conv1D expects (Batch, Channels, Length)
dev_norm_tensor = torch.tensor(train_normals).permute(0, 2, 1)
ano_dev_tensor = torch.tensor(train_anomalies).permute(0, 2, 1)
test_norm_tensor = torch.tensor(test_normals).permute(0, 2, 1)
ano_test_tensor = torch.tensor(test_anomalies).permute(0, 2, 1)

print(f"Shape of multi-channel normal data for VAE training: {dev_norm_tensor.shape}")

Loading FMM-enhanced PTB-XL dataset...
Loading "train" folder


  0%|          | 0/95868 [00:00<?, ?it/s]

/Users/naopersonal/Documents/Github/TP-final_ML/data/ptb_xl_fmm/train/sample_12579_beat_9





ValueError: 

In [None]:
#normals = load_sanos(PTB_DIR, CHAP_DIR)     
#ptb_df  = pd.read_csv(find_file(PTB_DIR, 'ptbxl_database.csv')) 

In [None]:
#anomalies = load_all_anomalies(PTB_DIR, CHAP_DIR, ptb_df)  # (N_ano,1,L) 

In [None]:
# VAE & Training Hyperparameters
latent_dim = 32
lr = 1e-3
n_blocks = 3
epochs = 50  # Increase this (e.g., to 100-150) for better results if you have time
batch_size = 32
beta_value = 1.0 # Using a constant beta of 1.0 is a good starting point
NUM_CHANNELS = dev_norm_tensor.shape[1] # Should be 56 (1 ECG + 55 FMM)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

In [None]:
# --- VAE Training on Normal Data Only ---
# The VAE learns the distribution of healthy, FMM-enhanced signals.
dev_ds = TensorDataset(dev_norm_tensor, torch.zeros(len(dev_norm_tensor)))
dev_loader = DataLoader(dev_ds, batch_size=batch_size, shuffle=True)

model = VAE1D(input_ch=NUM_CHANNELS, latent_dim=latent_dim, n_blocks=n_blocks).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

print("Starting VAE training...")
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for (x, _) in tqdm(dev_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False):
        x = x.to(device)
        mu, logv = model.encode(x)
        z = model.reparameterize(mu, logv)
        rec = model.decode(z)
        
        recon_loss = nn.functional.mse_loss(rec, x)
        kl_loss = (-0.5 * (1 + logv - mu.pow(2) - logv.exp()).sum()) / x.size(0)
        
        loss = recon_loss + beta_value * kl_loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    print(f"Epoch {epoch+1}/{epochs}, VAE Loss: {total_loss/len(dev_loader):.4f}")

# Save the trained VAE model
torch.save(model.state_dict(), 'fmm_vae_model.pth')
print("VAE model saved to fmm_vae_model.pth")

In [None]:
# --- XGBoost Classifier Training ---

# 1. Prepare a mixed development set for the classifier
dev_x_classifier = torch.cat([dev_norm_tensor, ano_dev_tensor], dim=0)
y_dev_classifier = torch.tensor(
    np.concatenate([np.zeros(len(dev_norm_tensor)), np.ones(len(ano_dev_tensor))]),
    dtype=torch.long
)
dev_loader_classifier = DataLoader(TensorDataset(dev_x_classifier, y_dev_classifier), batch_size=batch_size)

# 2. Extract features using the trained VAE
print("Extracting features from dev set for classifier training...")
errs_dev, zs_dev = compute_scores(model, dev_loader_classifier, device, beta=beta_value)
max_val = np.finfo(np.float32).max / 10
errs_dev = np.nan_to_num(errs_dev, nan=0.0, posinf=max_val, neginf=-max_val)
zs_dev = np.nan_to_num(zs_dev, nan=0.0, posinf=max_val, neginf=-max_val)
X_train_clf = np.hstack([errs_dev.reshape(-1, 1), zs_dev])
y_train_clf = y_dev_classifier.numpy()

# 3. Train the XGBoost Classifier
print("Training XGBoost classifier...")
clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
clf.fit(X_train_clf, y_train_clf)
print("Classifier training complete.")

In [None]:
# --- Final Evaluation on Unseen Test Set ---

# 1. Prepare the final, balanced test set
N_test = len(test_norm_tensor)
final_test_x = torch.cat([test_norm_tensor, ano_test_tensor[:N_test]], dim=0)
final_y_test = torch.tensor(
    np.concatenate([np.zeros(N_test), np.ones(N_test)]),
    dtype=torch.long
)
final_test_loader = DataLoader(TensorDataset(final_test_x, final_y_test), batch_size=batch_size)

# 2. Extract features from the test set
print("Extracting features from the unseen test set...")
errs_test, zs_test = compute_scores(model, final_test_loader, device, beta=beta_value)
errs_test = np.nan_to_num(errs_test, nan=0.0, posinf=max_val, neginf=-max_val)
zs_test = np.nan_to_num(zs_test, nan=0.0, posinf=max_val, neginf=-max_val)
X_test_clf = np.hstack([errs_test.reshape(-1, 1), zs_test])
y_test_clf = final_y_test.numpy()

# 3. Make predictions and evaluate
print("Evaluating classifier on the unseen test set...")
probs = clf.predict_proba(X_test_clf)[:, 1]
y_pred = (probs >= 0.5).astype(int)

metrics = {
    'roc_auc': roc_auc_score(y_test_clf, probs),
    'precision': precision_score(y_test_clf, y_pred),
    'recall': recall_score(y_test_clf, y_pred),
    'f1': f1_score(y_test_clf, y_pred),
    'accuracy': accuracy_score(y_test_clf, y_pred),
    'r2': r2_score(y_test_clf, y_pred) # R2 score on binarized predictions
}
print("\n--- Final Metrics on Unseen Test Set ---")
print(metrics)

Epoch 1/50, Loss: 3.2853
Epoch 2/50, Loss: 1.0379
Epoch 3/50, Loss: 1.0125
Epoch 4/50, Loss: 1.0056
Epoch 5/50, Loss: 1.0046
Epoch 6/50, Loss: 1.0032
Epoch 7/50, Loss: 1.0017
Epoch 8/50, Loss: 0.9973
Epoch 9/50, Loss: 0.9988
Epoch 10/50, Loss: 1.0093
Epoch 11/50, Loss: 1.0167
Epoch 12/50, Loss: 0.9975
Epoch 13/50, Loss: 0.9910
Epoch 14/50, Loss: 0.9896
Epoch 15/50, Loss: 0.9888
Epoch 16/50, Loss: 0.9887
Epoch 17/50, Loss: 0.9887
Epoch 18/50, Loss: 0.9887
Epoch 19/50, Loss: 0.9892
Epoch 20/50, Loss: 0.9906
Epoch 21/50, Loss: 0.9922
Epoch 22/50, Loss: 1.0031
Epoch 23/50, Loss: 0.9910
Epoch 24/50, Loss: 0.9905
Epoch 25/50, Loss: 0.9888
Epoch 26/50, Loss: 0.9879
Epoch 27/50, Loss: 0.9877
Epoch 28/50, Loss: 0.9877
Epoch 29/50, Loss: 0.9879
Epoch 30/50, Loss: 0.9876
Epoch 31/50, Loss: 0.9877
Epoch 32/50, Loss: 0.9880
Epoch 33/50, Loss: 0.9880
Epoch 34/50, Loss: 0.9882
Epoch 35/50, Loss: 0.9879
Epoch 36/50, Loss: 0.9879
Epoch 37/50, Loss: 0.9877
Epoch 38/50, Loss: 16.7783
Epoch 39/50, Loss: 1