In [1]:
import glob
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl

In [43]:
# Input data
subject_ids = np.loadtxt("selected_samples.txt", dtype=str)
left_hippo_dir = "LeftCSV/"
left_hippo_files = glob.glob(left_hippo_dir+"*")
right_hippo_dir = "RightCSV/"
right_hippo_files = glob.glob(right_hippo_dir+"*")
labels = pd.read_csv("adni.csv",names=['ID','AD']) 

In [22]:
transform = transforms.Compose([
    transforms.Normalize(mean=[0.5], std=[0.5])
])

In [33]:
# Import Dataset
class HippocampusDataset(Dataset):
    def __init__(self, left_hippo_files, right_hippo_files, labels=None):
        self.left_hippo = [pd.read_csv(f, header=None, sep = " ").values for f in left_hippo_files]
        self.right_hippo = [pd.read_csv(f, header=None, sep = " ").values for f in right_hippo_files]
        self.labels = labels['AD']

    def __len__(self):
        return len(self.left_hippo)

    def __getitem__(self, idx):
        left = self.left_hippo[idx]
        right = self.right_hippo[idx]
        left = left.astype(np.float32)
        right = right.astype(np.float32)
        # Stack left and right hippocampus data along the feature dimension
        # Resulting shape: (15000, 14)
        sample = np.concatenate((left, right), axis=1)

        # Transpose to shape: (14, 15000)
        sample = sample.T  # Now shape is (14, 15000)
        
        # Convert to torch tensor
        sample = torch.tensor(sample, dtype=torch.float32)

        # Normalize each channel (feature)
        mean = sample.mean(dim=1, keepdim=True)  # Shape: (14, 1)
        std = sample.std(dim=1, keepdim=True)    # Shape: (14, 1)
        sample = (sample - mean) / std
        

        if self.labels is not None:
            label = self.labels.iloc[idx]
            return sample, label
        else:
            return sample

In [37]:
# Updated Model Class using 1D Convolutions
class HippoCNN(pl.LightningModule):
    def __init__(self):
        super(HippoCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=14, out_channels=32, kernel_size=7, stride=2)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=5, stride=2)
        self.conv3 = nn.Conv1d(64, 128, kernel_size=3, stride=2)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(128 * self.calculate_output_length(), 64)
        self.fc2 = nn.Linear(64, 5)  

        self.criterion = nn.CrossEntropyLoss()

    def calculate_output_length(self):
        length = 15000
        layers = [
            ('conv', 7, 2),
            ('pool', 2, 2),
            ('conv', 5, 2),
            ('pool', 2, 2),
            ('conv', 3, 2),
            ('pool', 2, 2)
        ]
        for layer_type, kernel_size, stride in layers:
            if layer_type == 'conv' or layer_type == 'pool':
                length = ((length + 0 - 1 * (kernel_size - 1) - 1) // stride) + 1
        return length  # Should return 233
    
    def forward(self, x):
        x = F.relu(self.conv1(x))  # Shape: [batch_size, 32, L1]
        x = self.pool(x)           # Shape: [batch_size, 32, L2]
        x = F.relu(self.conv2(x))  # Shape: [batch_size, 64, L3]
        x = self.pool(x)           # Shape: [batch_size, 64, L4]
        x = F.relu(self.conv3(x))  # Shape: [batch_size, 128, L5]
        x = self.pool(x)           # Shape: [batch_size, 128, L6]

        x = x.view(x.size(0), -1)  # Flatten
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x.squeeze()

    def training_step(self, batch, batch_idx):
        data, labels = batch  # labels should be torch.LongTensor with values from 0 to 4
        outputs = self(data)  # outputs shape: [batch_size, 5]
        loss = self.criterion(outputs, labels)
        preds = torch.argmax(outputs, dim=1)
        acc = (preds == labels).float().mean()
        self.log('train_loss', loss)
        self.log('train_acc', acc)
        return loss


    def validation_step(self, batch, batch_idx):
        data, labels = batch
        outputs = self(data)
        loss = self.criterion(outputs, labels)
        preds = torch.argmax(outputs, dim=1)
        acc = (preds == labels).float().mean()
        self.log('val_loss', loss)
        self.log('val_acc', acc)
        

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [44]:
data_df = pd.DataFrame({
    'ID': subject_ids,
    'LeftFile': left_hippo_files,
    'RightFile': right_hippo_files
})

# Merge data and labels on SubjectID
merged_df = pd.merge(data_df, labels, on='ID', how='inner')  # Keep only subjects with labels

# Now extract the filtered lists
filtered_left_files = merged_df['LeftFile'].tolist()
filtered_right_files = merged_df['RightFile'].tolist()

unique_labels = sorted(set(labels['AD']))
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
labels['AD'] = [label_to_index[label] for label in labels['AD']]

# Split into training and validation sets
train_left, test_left, train_right, test_right, train_labels, test_labels = train_test_split(
    filtered_left_files, filtered_right_files, labels, test_size=0.2, random_state=42
)
train_left, val_left, train_right, val_right, train_labels, val_labels = train_test_split(
    train_left, train_right, train_labels, test_size=0.2, random_state=42
)

In [45]:
# Create Datasets and DataLoaders
train_dataset = HippocampusDataset(train_left, train_right, train_labels)
val_dataset = HippocampusDataset(val_left, val_right, val_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Initialize model
model = HippoCNN()

In [13]:
val_dataset = HippocampusDataset(val_left, val_right, val_labels, transform)

In [18]:
val_left[0]

'LeftCSV/013_S_4917__2012-08-31__S165927.csv'

In [46]:
# Trainer
trainer = pl.Trainer(max_epochs=50)

# Training
trainer.fit(model, train_loader, val_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [MIG-444a2664-9fae-59e9-a85f-4fe324bd65a1]

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | conv1     | Conv1d           | 3.2 K  | train
1 | conv2     | Conv1d           | 10.3 K | train
2 | conv3     | Conv1d           | 24.7 K | train
3 | pool      | MaxPool1d        | 0      | train
4 | dropout   | Dropout          | 0      | train
5 | fc1       | Linear           | 1.9 M  | train
6 | fc2       | Linear           | 325    | train
7 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
1.9 M     Trainable params
0         Non-trainable params
1.9 M     Total params
7.789     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode
SLURM auto-requeueing enabled. Setting signal h

                                                                            

/nas/longleaf/home/xiaoqil/anaconda3/envs/cnn/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (19) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0: 100%|██████████| 19/19 [00:01<00:00, 13.13it/s, v_num=14]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/5 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/5 [00:00<?, ?it/s][A
Validation DataLoader 0:  20%|██        | 1/5 [00:00<00:00, 103.22it/s][A
Validation DataLoader 0:  40%|████      | 2/5 [00:00<00:00, 27.18it/s] [A
Validation DataLoader 0:  60%|██████    | 3/5 [00:00<00:00, 21.78it/s][A
Validation DataLoader 0:  80%|████████  | 4/5 [00:00<00:00, 19.86it/s][A
Validation DataLoader 0: 100%|██████████| 5/5 [00:00<00:00, 19.91it/s][A
Epoch 1: 100%|██████████| 19/19 [00:01<00:00, 15.38it/s, v_num=14]    [A
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/5 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/5 [00:00<?, ?it/s][A
Validation DataLoader 0:  20%|██        | 1/5 [00:00<00:00, 102.54it/s][A
Validation DataLoader 0:  40%|████      | 2/5 [00:00<00:00, 27.19it/s] [A
V

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 19/19 [00:01<00:00, 11.33it/s, v_num=14]


In [50]:
def validation_epoch_end(self, outputs):
    preds = []
    targets = []
    for output in outputs:
        preds.append(output['preds'].cpu())
        targets.append(output['labels'].cpu())

    preds = torch.cat(preds)
    targets = torch.cat(targets)

    accuracy = accuracy_score(targets, preds)
    precision = precision_score(targets, preds, average='macro')
    recall = recall_score(targets, preds, average='macro')
    f1 = f1_score(targets, preds, average='macro')

    self.log('val_accuracy', accuracy)
    self.log('val_precision', precision)
    self.log('val_recall', recall)
    self.log('val_f1', f1)
    
def validation_step(self, batch, batch_idx):
    data, labels = batch
    outputs = self(data)
    loss = self.criterion(outputs, labels)
    preds = torch.argmax(outputs, dim=1)
    acc = (preds == labels).float().mean()
    self.log('val_loss', loss)
    self.log('val_acc', acc)
    return {'preds': preds, 'labels': labels}


In [52]:
test_dataset = HippocampusDataset(test_left, test_right, test_labels)
test_loader = DataLoader(test_dataset, batch_size=32)

In [61]:
# Inference on Test Set
model.eval()
test_preds = []
with torch.no_grad():
    for batch in test_loader:
        data, _ = batch  # Unpack the batch; labels are ignored
        outputs = model(data)
        predictions = torch.argmax(outputs, dim=1)
        test_preds.extend(predictions.cpu().numpy())


In [73]:
# Compute Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

labels_list = test_labels['AD']
accuracy = accuracy_score(labels_list, test_preds)
precision = precision_score(labels_list, test_preds, average='weighted')
recall = recall_score(labels_list, test_preds, average='weighted')
f1 = f1_score(labels_list, test_preds, average='weighted')
# roc_auc = roc_auc_score(labels_list, test_preds, multi_class='ovr', average='weighted')

print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test Recall: {recall:.4f}')
print(f'Test F1-Score: {f1:.4f}')
# print(f'Validation ROC-AUC: {roc_auc:.4f}')

Test Accuracy: 0.4000
Test Precision: 0.3758
Test Recall: 0.4000
Test F1-Score: 0.3792
