In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath /kaggle/input.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install torch-geometric pywavelets wfdb scikit-learn matplotlib

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data, Dataset, Batch
from torch_geometric.nn import GCNConv, TopKPooling, global_mean_pool
from torch.utils.data import DataLoader
import wfdb
import pywt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Dataset
class PTBXLGraphDataset(Dataset):
    """Serve ECG recordings as fully connected graphs with one node per lead.

    Every item is a ``Data`` object in which

    * ``x`` has one row per lead: the z-scored, wavelet-denoised signal,
      cropped or zero-padded to ``signal_length`` samples,
    * ``edge_index`` links every lead to every other lead (no self-loops),
    * ``y`` is the integer label read from the ``superclass_id`` column.

    Args:
        meta_df: Table with one row per recording. The ``filename_lr`` column
            (record path relative to ``ecg_folder``) and the ``superclass_id``
            column are read from it.
        ecg_folder: Directory the ``filename_lr`` paths are joined onto.
        wavelet: Wavelet name passed to ``pywt.wavedec`` / ``pywt.waverec``.
        level: Decomposition level passed to ``pywt.wavedec``.
        threshold_factor: Fraction of each coefficient band's peak magnitude
            that is used as that band's soft threshold.
        signal_length: Number of samples kept per lead. Any model consuming
            ``x`` must be built for the same feature width.
    """

    # Node count the shared edge index is built for.
    NUM_LEADS = 12

    def __init__(self, meta_df, ecg_folder, wavelet='db4', level=4, threshold_factor=0.1, signal_length=1000):
        # Let the base class set up its own state before ours is added.
        super().__init__()
        self.meta = meta_df
        self.ecg_folder = ecg_folder
        self.wavelet = wavelet
        self.level = level
        self.threshold_factor = threshold_factor
        self.signal_length = signal_length

        # The graph topology is identical for every record, so build it once:
        # a directed edge for each ordered pair of distinct leads.
        edges = [
            (src, dst)
            for src in range(self.NUM_LEADS)
            for dst in range(self.NUM_LEADS)
            if src != dst
        ]
        self.edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return len(self.meta)

    @staticmethod
    def _fit_length(signal, length):
        """Crop ``signal`` to ``length`` samples or right-pad it with zeros."""
        if len(signal) >= length:
            return signal[:length]
        return np.pad(signal, (0, length - len(signal)), 'constant')

    def _denoise(self, lead):
        """Z-score one lead, soft-threshold its wavelet bands and reconstruct it."""
        # The epsilon keeps a flat (zero-variance) lead from dividing by zero.
        lead = (lead - np.mean(lead)) / (np.std(lead) + 1e-8)
        coeffs = pywt.wavedec(lead, self.wavelet, level=self.level)

        shrunk = []
        for band in coeffs:
            peak = np.max(np.abs(band))
            if peak > 0:
                band = pywt.threshold(band, self.threshold_factor * peak, mode='soft')
            # A band whose peak is not positive is passed through untouched.
            shrunk.append(band)

        # The reconstruction is not guaranteed to have the input's length, so
        # force every lead to the same width before stacking.
        return self._fit_length(pywt.waverec(shrunk, self.wavelet), self.signal_length)

    def __getitem__(self, idx):
        """Load, denoise and package the recording at positional index ``idx``.

        Raises:
            ValueError: If the record does not contain ``NUM_LEADS`` leads, in
                which case the precomputed edge index would not match ``x``.
        """
        row = self.meta.iloc[idx]
        record_path = os.path.join(self.ecg_folder, row['filename_lr'])
        signals, _ = wfdb.rdsamp(record_path)
        # Assumes rdsamp returns (samples, leads); transpose so that iteration
        # yields one lead at a time -- TODO confirm against the wfdb docs.
        leads = signals.T
        if leads.shape[0] != self.NUM_LEADS:
            raise ValueError(
                f"Expected {self.NUM_LEADS} leads in {record_path!r}, got {leads.shape[0]}"
            )

        features = np.stack([self._denoise(lead) for lead in leads])
        x = torch.tensor(features, dtype=torch.float32)
        y = torch.tensor(int(row['superclass_id']), dtype=torch.long)
        return Data(x=x, edge_index=self.edge_index, y=y)

# CNN Backbone
class ResNetECG(nn.Module):
    """Small 1-D convolutional encoder for 12-channel input.

    Despite the name there are no skip connections: the network is two strided
    Conv1d -> BatchNorm1d -> ReLU stages followed by global average pooling
    over the time axis, giving one 128-dimensional vector per input sample.
    """

    def __init__(self):
        super().__init__()
        stages = [
            # Stage 1: 12 -> 64 channels, halves the temporal resolution.
            nn.Conv1d(12, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            # Stage 2: 64 -> 128 channels, halves it again.
            nn.Conv1d(64, 128, kernel_size=5, stride=2, padding=2),
            nn.BatchNorm1d(128),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*stages)
        self.gap = nn.AdaptiveAvgPool1d(1)

    def forward(self, x):
        """Map a (batch, 12, length) tensor to a (batch, 128) embedding."""
        feature_map = self.conv(x)
        pooled = self.gap(feature_map)
        # Drop the length-1 time axis left behind by the adaptive pooling.
        return pooled.squeeze(-1)

# GNN Module
class ECGGNNModule(nn.Module):
    """Graph branch: two GCNConv + TopKPooling stages, mean readout, linear head.

    Args:
        in_channels: Width of each node's input feature vector.
        hidden_channels: Width of the hidden node features and of the
            graph-level embedding that is returned.
    """

    def __init__(self, in_channels=1000, hidden_channels=64):
        super().__init__()
        keep_ratio = 0.8  # ratio handed to both pooling layers
        self.gcn1 = GCNConv(in_channels, hidden_channels)
        self.pool1 = TopKPooling(hidden_channels, ratio=keep_ratio)
        self.gcn2 = GCNConv(hidden_channels, hidden_channels)
        self.pool2 = TopKPooling(hidden_channels, ratio=keep_ratio)
        self.lin = nn.Linear(hidden_channels, hidden_channels)

    def forward(self, x, edge_index, batch):
        """Return one embedding per graph described by ``batch``."""
        hidden = x
        stages = ((self.gcn1, self.pool1), (self.gcn2, self.pool2))
        for conv, pool in stages:
            hidden = F.relu(conv(hidden, edge_index))
            # Pooling also rewrites the edge list and the node-to-graph map,
            # so both are carried forward into the next stage.
            hidden, edge_index, _, batch, _, _ = pool(hidden, edge_index, None, batch)
        graph_embedding = global_mean_pool(hidden, batch)
        return self.lin(graph_embedding)

# Full ECGNN
class ECGNN(nn.Module):
    """Two-branch classifier that fuses a CNN view and a graph view of a record.

    The CNN branch sees each record as a 12-channel sequence and the GNN branch
    sees it as a graph of lead nodes; the two embeddings are concatenated and
    fed to a single linear layer that outputs ``num_classes`` logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.cnn = ResNetECG()
        self.gnn = ECGGNNModule()
        cnn_dim, gnn_dim = 128, 64  # embedding widths of the two branches
        self.fc = nn.Linear(cnn_dim + gnn_dim, num_classes)

    def forward(self, data):
        """Return class logits, one row per graph in the batched ``data``."""
        leads_per_record = 12
        # Node features arrive flattened across the whole batch; regroup them
        # per record so the CNN gets (num_graphs, 12, features_per_node).
        stacked = data.x.view(data.num_graphs, leads_per_record, -1)
        cnn_embedding = self.cnn(stacked)
        gnn_embedding = self.gnn(data.x, data.edge_index, data.batch)
        fused = torch.cat([cnn_embedding, gnn_embedding], dim=1)
        return self.fc(fused)

# Note: The following code assumes train_meta, val_meta, and test_meta are already defined
# You'll need to add your data loading code here

# Data Preparation
# Hold out 20% of the training rows for validation, stratified on the label so
# both parts keep the same class proportions.
train_meta, val_meta = train_test_split(
    train_meta, test_size=0.2, stratify=train_meta['superclass'], random_state=42
)

# Fit the encoder on the training labels only, so the integer ids are defined
# by the training split and merely applied to the other two.
superclass_le = LabelEncoder()
superclass_le.fit(train_meta['superclass'])

# assign() returns a new frame each time. This avoids writing a column into the
# frames handed back by train_test_split (which raises pandas'
# SettingWithCopyWarning) and avoids mutating the original test_meta object.
train_meta = train_meta.assign(superclass_id=superclass_le.transform(train_meta['superclass']))
val_meta = val_meta.assign(superclass_id=superclass_le.transform(val_meta['superclass']))
test_meta = test_meta.assign(superclass_id=superclass_le.transform(test_meta['superclass']))

# Datasets and Loaders
ecg_folder = '/kaggle/input/ptb-xl-dataset/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1'
train_ds, val_ds, test_ds = (
    PTBXLGraphDataset(split_meta, ecg_folder)
    for split_meta in (train_meta, val_meta, test_meta)
)

# Settings shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=32, collate_fn=Batch.from_data_list)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **loader_kwargs)

# Model Setup
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_classes = len(superclass_le.classes_)
model = ECGNN(num_classes=num_classes).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Training Loop with Periodic Validation
NUM_EPOCHS = 20
VAL_EVERY = 10  # validation runs on every VAL_EVERY-th epoch

for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    total_loss, correct, total = 0.0, 0, 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()

        n_samples = data.y.size(0)
        # Weight each batch loss by its size so the epoch figure is a
        # per-sample average. Summing raw batch losses would grow with the
        # number of batches and over-weight a short final batch. This assumes
        # `criterion` returns a batch mean (its default reduction).
        total_loss += loss.item() * n_samples
        pred = out.argmax(dim=1)
        correct += (pred == data.y).sum().item()
        total += n_samples
    train_loss = total_loss / total
    train_acc = correct / total
    print(f"Epoch {epoch} | Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")

    if epoch % VAL_EVERY == 0:
        # eval() switches layers such as BatchNorm to inference behaviour;
        # no_grad() skips gradient bookkeeping during the forward passes.
        model.eval()
        val_correct, val_total = 0, 0
        with torch.no_grad():
            for data in val_loader:
                data = data.to(device)
                out = model(data)
                pred = out.argmax(dim=1)
                val_correct += (pred == data.y).sum().item()
                val_total += data.y.size(0)
        val_acc = val_correct / val_total
        print(f"Epoch {epoch} | Val Acc: {val_acc:.4f}")

# Final Test Evaluation
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for data in test_loader:
        data = data.to(device)
        out = model(data)
        preds = out.argmax(dim=1)
        # Move to the CPU and convert to plain Python ints for scikit-learn.
        y_true.extend(data.y.cpu().tolist())
        y_pred.extend(preds.cpu().tolist())

test_acc = accuracy_score(y_true, y_pred)
print(f"Final Test Accuracy: {test_acc:.4f}")
print("\nTest Classification Report:")
# Pass the label ids explicitly. Otherwise the report infers them from
# y_true/y_pred and raises when a class is absent from both, because the number
# of inferred labels no longer matches len(target_names). zero_division=0 keeps
# the usual 0.0 score for such classes without emitting a warning for each one.
class_ids = list(range(len(superclass_le.classes_)))
print(classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=superclass_le.classes_,
    zero_division=0,
))

In [None]:
# import numpy as np
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torch.utils.data import Dataset, DataLoader
# from torch_geometric.data import Data, Batch
# from torch_geometric.nn import GATConv, GraphNorm, Set2Set
# import wfdb
# import pywt
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# from sklearn.metrics import classification_report
# import ast
# import os
# from torch.optim.lr_scheduler import CosineAnnealingLR

# # Load metadata and diagnostic codes
# df = pd.read_csv('/kaggle/input/ptb-xl-dataset/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1/ptbxl_database.csv')
# scp_statements = pd.read_csv('/kaggle/input/ptb-xl-dataset/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1/scp_statements.csv', index_col=0)

# def extract_dominant_superclass(scp_dict):
#     filtered = {k: v for k, v in scp_dict.items() if k in scp_statements.index and scp_statements.loc[k, 'diagnostic'] == 1}
#     if not filtered:
#         return 'NORM'
#     dominant_code = max(filtered, key=filtered.get)
#     superclass = scp_statements.loc[dominant_code, 'diagnostic_class']
#     return superclass if superclass else 'NORM'

# df['scp_codes'] = df['scp_codes'].apply(ast.literal_eval)
# df['diagnostic_superclass'] = df['scp_codes'].apply(extract_dominant_superclass)
# le = LabelEncoder()
# df['class_id'] = le.fit_transform(df['diagnostic_superclass'])

# # Split data into train, validation, and test sets
# train_idx, test_idx = train_test_split(df.index, test_size=0.2, stratify=df['class_id'], random_state=42)
# train_meta = df.loc[train_idx].reset_index(drop=True)
# test_meta = df.loc[test_idx].reset_index(drop=True)
# train_idx, val_idx = train_test_split(train_meta.index, test_size=0.1, stratify=train_meta['class_id'], random_state=42)
# train_meta_new = train_meta.loc[train_idx].reset_index(drop=True)
# val_meta = train_meta.loc[val_idx].reset_index(drop=True)

# # Dataset class with adaptive denoising and augmentation
# class PTBXLGraphDataset(Dataset):
#     def __init__(self, meta_df, ecg_folder, wavelet='db4', level=4, train=True):
#         self.meta = meta_df
#         self.ecg_folder = ecg_folder
#         self.wavelet = wavelet
#         self.level = level
#         self.train_mode = train
#         self.edge_index = self.build_edge_index()

#     def build_edge_index(self):
#         num_leads = 12
#         edges = [(i, j) for i in range(num_leads) for j in range(num_leads) if i != j]
#         return torch.tensor(edges, dtype=torch.long).t().contiguous()

#     def __len__(self):
#         return len(self.meta)

#     def __getitem__(self, idx):
#         row = self.meta.iloc[idx]
#         rec_path = os.path.join(self.ecg_folder, row['filename_lr'])
#         sig, _ = wfdb.rdsamp(rec_path)
#         sig = sig.T  # Shape: (12, 1000)

#         if self.train_mode:
#             num_drop = np.random.randint(0, 3)
#             drop_leads = np.random.choice(12, size=num_drop, replace=False)
#         else:
#             drop_leads = []

#         node_features = []
#         for i, lead in enumerate(sig):
#             if self.train_mode:
#                 shift = np.random.randint(-50, 51)
#                 lead = np.roll(lead, shift)
#                 scale = np.random.uniform(0.9, 1.1)
#                 lead = lead * scale
#                 noise = np.random.normal(0, 0.01, lead.shape)
#                 lead = lead + noise

#             lead = (lead - np.mean(lead)) / (np.std(lead) + 1e-8)
#             coeffs = pywt.wavedec(lead, self.wavelet, level=self.level)
#             if len(coeffs[-1]) > 0:
#                 sigma = np.median(np.abs(coeffs[-1])) / 0.6745
#                 threshold = sigma * np.sqrt(2 * np.log(len(lead)))
#                 coeffs_thresh = [c if i == 0 else pywt.threshold(c, threshold, mode='soft') for i, c in enumerate(coeffs)]
#                 denoised = pywt.waverec(coeffs_thresh, self.wavelet)
#             else:
#                 denoised = lead

#             denoised = denoised[:1000] if len(denoised) > 1000 else np.pad(denoised, (0, 1000 - len(denoised)), 'constant')
#             node_features.append(np.zeros(1000) if i in drop_leads else denoised)

#         x = torch.tensor(np.array(node_features), dtype=torch.float32)
#         y = torch.tensor(row['class_id'], dtype=torch.long)
#         return Data(x=x, edge_index=self.edge_index, y=y)

# # Residual CNN block
# class ResidualBlock(nn.Module):
#     def __init__(self, in_channels, out_channels, kernel_size):
#         super().__init__()
#         self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, padding=kernel_size//2)
#         self.bn1 = nn.BatchNorm1d(out_channels)
#         self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size, padding=kernel_size//2)
#         self.bn2 = nn.BatchNorm1d(out_channels)
#         self.relu = nn.ReLU()
#         self.residual = nn.Conv1d(in_channels, out_channels, 1) if in_channels != out_channels else nn.Identity()

#     def forward(self, x):
#         identity = self.residual(x)
#         out = self.relu(self.bn1(self.conv1(x)))
#         out = self.bn2(self.conv2(out))
#         return self.relu(out + identity)

# # CNN for feature extraction
# class NodeCNN(nn.Module):
#     def __init__(self, signal_length, out_dim):
#         super().__init__()
#         self.block1 = ResidualBlock(1, 32, 5)
#         self.block2 = ResidualBlock(32, 64, 5)
#         self.block3 = ResidualBlock(64, 128, 3)
#         self.block4 = ResidualBlock(128, 256, 3)
#         self.block5 = ResidualBlock(256, 512, 3)
#         self.pool = nn.AdaptiveAvgPool1d(1)
#         self.fc = nn.Linear(512, out_dim)
#         self.dropout = nn.Dropout(0.5)

#     def forward(self, x):
#         x = self.block1(x)
#         x = self.dropout(x)
#         x = self.block2(x)
#         x = self.dropout(x)
#         x = self.block3(x)
#         x = self.dropout(x)
#         x = self.block4(x)
#         x = self.dropout(x)
#         x = self.block5(x)
#         x = self.dropout(x)
#         x = self.pool(x).squeeze(-1)
#         return self.fc(x)

# # Combined CNN and GNN model
# class CNN_GNN_Model(nn.Module):
#     def __init__(self, signal_length, cnn_out_dim, gnn_hidden, num_classes):
#         super().__init__()
#         self.node_cnn = NodeCNN(signal_length, cnn_out_dim)
#         self.gnn1 = GATConv(cnn_out_dim, gnn_hidden, heads=4, concat=True)
#         self.norm1 = GraphNorm(gnn_hidden * 4)
#         self.gnn2 = GATConv(gnn_hidden * 4, gnn_hidden, heads=1, concat=False)
#         self.norm2 = GraphNorm(gnn_hidden)
#         self.pool = Set2Set(gnn_hidden, processing_steps=3)
#         self.fc = nn.Linear(gnn_hidden * 2, num_classes)
#         self.dropout = nn.Dropout(0.5)

#     def forward(self, data):
#         x, edge_index, batch = data.x, data.edge_index, data.batch
#         x = x.unsqueeze(1)  # (N, 1, 1000)
#         x = self.node_cnn(x)  # (N, cnn_out_dim)
#         x = F.relu(self.norm1(self.gnn1(x, edge_index)))
#         x = self.dropout(x)
#         x = F.relu(self.norm2(self.gnn2(x, edge_index)))
#         x = self.dropout(x)
#         x = self.pool(x, batch)
#         return self.fc(x)

# # Focal Loss for handling class imbalance
# class FocalLoss(nn.Module):
#     def __init__(self, alpha=1.0, gamma=2.0):
#         super().__init__()
#         self.alpha = alpha
#         self.gamma = gamma

#     def forward(self, inputs, targets):
#         ce_loss = F.cross_entropy(inputs, targets, reduction='none')
#         pt = torch.exp(-ce_loss)
#         focal_loss = self.alpha * ((1 - pt) ** self.gamma) * ce_loss
#         return focal_loss.mean()

# # Data preparation
# ecg_folder = '/kaggle/input/ptb-xl-dataset/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1/'
# train_dataset = PTBXLGraphDataset(train_meta_new, ecg_folder, train=True)
# val_dataset = PTBXLGraphDataset(val_meta, ecg_folder, train=False)
# test_dataset = PTBXLGraphDataset(test_meta, ecg_folder, train=False)

# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=Batch.from_data_list)
# val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=Batch.from_data_list)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=Batch.from_data_list)

# # Model setup
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = CNN_GNN_Model(signal_length=1000, cnn_out_dim=128, gnn_hidden=64, num_classes=len(le.classes_)).to(device)
# criterion = FocalLoss()
# optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-3)
# scheduler = CosineAnnealingLR(optimizer, T_max=50)

# # Training and evaluation functions
# def train_epoch():
#     model.train()
#     total_loss = 0
#     correct = 0
#     total = 0
#     for data in train_loader:
#         data = data.to(device)
#         optimizer.zero_grad()
#         out = model(data)
#         loss = criterion(out, data.y)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item() * data.y.size(0)
#         preds = out.argmax(dim=1)
#         correct += (preds == data.y).sum().item()
#         total += data.y.size(0)
#     return total_loss / total, correct / total

# def evaluate(loader):
#     model.eval()
#     total_loss = 0
#     correct = 0
#     total = 0
#     with torch.no_grad():
#         for data in loader:
#             data = data.to(device)
#             out = model(data)
#             loss = criterion(out, data.y)
#             total_loss += loss.item() * data.y.size(0)
#             preds = out.argmax(dim=1)
#             correct += (preds == data.y).sum().item()
#             total += data.y.size(0)
#     return total_loss / total, correct / total

# # Training loop with early stopping
# best_val_acc = 0
# patience = 10
# counter = 0
# for epoch in range(1, 101):
#     train_loss, train_acc = train_epoch()
#     val_loss, val_acc = evaluate(val_loader)
#     print(f"[Epoch {epoch}] Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
#     scheduler.step()
#     if val_acc > best_val_acc:
#         best_val_acc = val_acc
#         torch.save(model.state_dict(), 'best_model.pt')
#         counter = 0
#     else:
#         counter += 1
#     if counter >= patience:
#         print("Early stopping triggered")
#         break

# # Test evaluation
# model.load_state_dict(torch.load('best_model.pt'))
# test_loss, test_acc = evaluate(test_loader)
# print(f"Final Test Accuracy: {test_acc:.4f}, Loss: {test_loss:.4f}")

# y_true, y_pred = [], []
# with torch.no_grad():
#     for data in test_loader:
#         data = data.to(device)
#         out = model(data)
#         preds = out.argmax(dim=1)
#         y_true.extend(data.y.cpu().numpy())
#         y_pred.extend(preds.cpu().numpy())

# report = classification_report(y_true, y_pred, target_names=le.classes_, digits=4)
# print("\nClassification Report:\n", report)