In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
from collections import OrderedDict
import torch
torch.manual_seed(0)
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from tqdm import tqdm
from abcd.local.paths import output_path
from abcd.data.read_data import get_subjects_events_sf, add_event_vars, add_subject_vars, filter_events, filter_subjects
import abcd.data.VARS as VARS
from abcd.data.define_splits import SITES, save_restore_sex_fmri_splits
from abcd.data.divide_with_splits import divide_events_by_splits
from abcd.data.var_tailoring.normalization import normalize_var
from abcd.data.pytorch.get_dataset import PandasDataset
from abcd.models.classification.FullyConnected import FullyConnected3
from abcd.training.ClassifierTrainer import ClassifierTrainer
from abcd.local.paths import core_path, output_path
from abcd.data.VARS import STRUCT_FILES, STRUCT_FEATURES

In [3]:
# Fetch subjects and events
subjects_df, events_df = get_subjects_events_sf()
# Change to 0 (male) and 1 (female)
events_df.loc[events_df["kbi_sex_assigned_at_birth"] == 1.0, "kbi_sex_assigned_at_birth"] = 0
events_df.loc[events_df["kbi_sex_assigned_at_birth"] == 2.0, "kbi_sex_assigned_at_birth"] = 1
# Leave only the baseline visits
events_df = events_df.loc[(events_df['eventname'] == 'baseline_year_1_arm_1')]
print("Leaving baseline visits, we have {} events".format(len(events_df)))

Leaving baseline visits, we have 9086 events


In [4]:
# Define target and features
target_col = 'kbi_sex_assigned_at_birth'
labels = ["Male", "Female"]
feature_cols = list(VARS.NAMED_CONNECTIONS.keys()) + list(VARS.STRUCT_FEATURES.keys())

# Print distribution of baseline class
for val in set(events_df['kbi_sex_assigned_at_birth']):
    print('{} visits with {} target'.format(len(events_df.loc[events_df["kbi_sex_assigned_at_birth"] == val]), labels[int(val)]))

4704 visits with Male target
4382 visits with Female target


In [5]:
# Normalize features
for var_id in feature_cols:
    events_df = normalize_var(events_df, var_id, var_id)

In [6]:
# Divide events into training, validation and testing
splits = save_restore_sex_fmri_splits(k=5)
ood_site_id = SITES[0]
events_train, events_id_test, events_ood_test = divide_events_by_splits(events_df, splits, ood_site_id)
print("Nr. events train: {}, val: {}, test: {}".format(len(events_train), len(events_id_test), len(events_ood_test)))

Nr. events train: 7064, val: 1738, test: 284


In [7]:
# Define PyTorch datasets and dataloaders
datasets = OrderedDict([('train', PandasDataset(events_train, feature_cols, target_col)),
            ('val', PandasDataset(events_id_test, feature_cols, target_col)),
            ('test', PandasDataset(events_ood_test, feature_cols, target_col))])

In [8]:
# Create dataloaders
batch_size = 64
dataloaders = OrderedDict([(dataset_name, DataLoader(dataset, batch_size=batch_size, shuffle=True))
    for dataset_name, dataset in datasets.items()])

for X, y in dataloaders['train']:
    print(f"Shape of X: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X: torch.Size([64, 177])
Shape of y: torch.Size([64]) torch.int64


In [9]:
# Determine device for training
device = ("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print("Using {} device".format(device))

Using cpu device


In [10]:
# Define model
models_path = os.path.join(output_path, 'ABCD_sex_prediction_sf', 'models')
model = FullyConnected3(save_path=models_path, labels=labels, input_size=len(feature_cols))
model = model.to(device)
print(model)

FullyConnected3(
  (softmax): Softmax(dim=1)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_layers): Sequential(
    (0): Linear(in_features=177, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=2, bias=True)
  )
)


In [11]:
# Define optimizer and trainer
learning_rate = 1e-3
loss_f = nn.CrossEntropyLoss()
trainer_path = os.path.join(output_path, 'ABCD_sex_prediction_sf', 'results')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
trainer = ClassifierTrainer(trainer_path, device, optimizer, loss_f, labels=labels)

In [12]:
# Train model
trainer.train(model, dataloaders['train'], dataloaders, 
              nr_epochs=150, starting_from_epoch=0,
              print_loss_every=10, eval_every=10, export_every=50, verbose=True)

  0%|          | 0/150 [00:00<?, ?it/s]

Epoch 0
train CrossEntropyLoss: 0.693 B-Acc.: 0.518 F1: 0.467
val CrossEntropyLoss: 0.693 B-Acc.: 0.524 F1: 0.470
test CrossEntropyLoss: 0.693 B-Acc.: 0.519 F1: 0.457
Saved PyTorch model state FullyConnected3_epoch0.pth in C:\Users\camgonza\Box\Camila Gonzalez's Files\DATA\ABCD\output\ABCD_sex_prediction_sf\models
Saved trainer state ClassifierTrainer_optimizer_epoch0.pth in C:\Users\camgonza\Box\Camila Gonzalez's Files\DATA\ABCD\output\ABCD_sex_prediction_sf\results\states
Progress stored in C:\Users\camgonza\Box\Camila Gonzalez's Files\DATA\ABCD\output\ABCD_sex_prediction_sf\results


  1%|          | 1/150 [00:03<09:20,  3.76s/it]

Starting epoch 1, loss 0.6646638566309268


  7%|▋         | 10/150 [00:20<05:00,  2.15s/it]

Epoch 10
train CrossEntropyLoss: 0.496 B-Acc.: 0.758 F1: 0.757
val CrossEntropyLoss: 0.521 B-Acc.: 0.748 F1: 0.747
test CrossEntropyLoss: 0.484 B-Acc.: 0.751 F1: 0.749


  7%|▋         | 11/150 [00:24<05:52,  2.54s/it]

Starting epoch 11, loss 0.5059393335570086


 13%|█▎        | 20/150 [00:45<05:19,  2.45s/it]

Epoch 20
train CrossEntropyLoss: 0.503 B-Acc.: 0.749 F1: 0.749
val CrossEntropyLoss: 0.516 B-Acc.: 0.743 F1: 0.742
test CrossEntropyLoss: 0.506 B-Acc.: 0.777 F1: 0.776


 14%|█▍        | 21/150 [00:48<05:42,  2.66s/it]

Starting epoch 21, loss 0.49983624053430986


 20%|██        | 30/150 [01:07<04:24,  2.20s/it]

Epoch 30
train CrossEntropyLoss: 0.486 B-Acc.: 0.764 F1: 0.763
val CrossEntropyLoss: 0.507 B-Acc.: 0.752 F1: 0.751
test CrossEntropyLoss: 0.470 B-Acc.: 0.757 F1: 0.757


 21%|██        | 31/150 [01:10<04:46,  2.40s/it]

Starting epoch 31, loss 0.5080854436298748


 27%|██▋       | 40/150 [01:27<03:34,  1.95s/it]

Epoch 40
train CrossEntropyLoss: 0.486 B-Acc.: 0.765 F1: 0.764
val CrossEntropyLoss: 0.505 B-Acc.: 0.749 F1: 0.748
test CrossEntropyLoss: 0.488 B-Acc.: 0.754 F1: 0.753


 27%|██▋       | 41/150 [01:30<04:05,  2.25s/it]

Starting epoch 41, loss 0.49853463672302867


 33%|███▎      | 50/150 [01:47<03:14,  1.95s/it]

Epoch 50
train CrossEntropyLoss: 0.483 B-Acc.: 0.766 F1: 0.766
val CrossEntropyLoss: 0.503 B-Acc.: 0.752 F1: 0.752
test CrossEntropyLoss: 0.471 B-Acc.: 0.764 F1: 0.764
Saved PyTorch model state FullyConnected3_epoch50.pth in C:\Users\camgonza\Box\Camila Gonzalez's Files\DATA\ABCD\output\ABCD_sex_prediction_sf\models
Saved trainer state ClassifierTrainer_optimizer_epoch50.pth in C:\Users\camgonza\Box\Camila Gonzalez's Files\DATA\ABCD\output\ABCD_sex_prediction_sf\results\states
Progress stored in C:\Users\camgonza\Box\Camila Gonzalez's Files\DATA\ABCD\output\ABCD_sex_prediction_sf\results


 34%|███▍      | 51/150 [01:50<03:53,  2.36s/it]

Starting epoch 51, loss 0.49777019292384656


 40%|████      | 60/150 [02:08<02:56,  1.97s/it]

Epoch 60
train CrossEntropyLoss: 0.488 B-Acc.: 0.767 F1: 0.767
val CrossEntropyLoss: 0.504 B-Acc.: 0.753 F1: 0.753
test CrossEntropyLoss: 0.502 B-Acc.: 0.761 F1: 0.761


 41%|████      | 61/150 [02:11<03:24,  2.29s/it]

Starting epoch 61, loss 0.49162068291827365


 47%|████▋     | 70/150 [02:29<02:38,  1.98s/it]

Epoch 70
train CrossEntropyLoss: 0.484 B-Acc.: 0.767 F1: 0.765
val CrossEntropyLoss: 0.516 B-Acc.: 0.748 F1: 0.747
test CrossEntropyLoss: 0.481 B-Acc.: 0.754 F1: 0.753


 47%|████▋     | 71/150 [02:32<02:54,  2.21s/it]

Starting epoch 71, loss 0.4938839606873624


 53%|█████▎    | 80/150 [02:49<02:14,  1.92s/it]

Epoch 80
train CrossEntropyLoss: 0.505 B-Acc.: 0.754 F1: 0.747
val CrossEntropyLoss: 0.534 B-Acc.: 0.732 F1: 0.724
test CrossEntropyLoss: 0.527 B-Acc.: 0.707 F1: 0.696


 54%|█████▍    | 81/150 [02:52<02:28,  2.15s/it]

Starting epoch 81, loss 0.49141479988355896


 60%|██████    | 90/150 [03:09<01:55,  1.93s/it]

Epoch 90
train CrossEntropyLoss: 0.494 B-Acc.: 0.765 F1: 0.761
val CrossEntropyLoss: 0.536 B-Acc.: 0.745 F1: 0.741
test CrossEntropyLoss: 0.501 B-Acc.: 0.741 F1: 0.737


 61%|██████    | 91/150 [03:12<02:07,  2.15s/it]

Starting epoch 91, loss 0.49945278103287155


 67%|██████▋   | 100/150 [03:29<01:37,  1.95s/it]

Epoch 100
train CrossEntropyLoss: 0.480 B-Acc.: 0.767 F1: 0.767
val CrossEntropyLoss: 0.509 B-Acc.: 0.751 F1: 0.751
test CrossEntropyLoss: 0.488 B-Acc.: 0.774 F1: 0.774
Saved PyTorch model state FullyConnected3_epoch100.pth in C:\Users\camgonza\Box\Camila Gonzalez's Files\DATA\ABCD\output\ABCD_sex_prediction_sf\models
Saved trainer state ClassifierTrainer_optimizer_epoch100.pth in C:\Users\camgonza\Box\Camila Gonzalez's Files\DATA\ABCD\output\ABCD_sex_prediction_sf\results\states
Progress stored in C:\Users\camgonza\Box\Camila Gonzalez's Files\DATA\ABCD\output\ABCD_sex_prediction_sf\results


 67%|██████▋   | 101/150 [03:33<01:52,  2.30s/it]

Starting epoch 101, loss 0.490458007331367


 73%|███████▎  | 110/150 [03:50<01:19,  1.98s/it]

Epoch 110
train CrossEntropyLoss: 0.483 B-Acc.: 0.773 F1: 0.771
val CrossEntropyLoss: 0.511 B-Acc.: 0.751 F1: 0.749
test CrossEntropyLoss: 0.511 B-Acc.: 0.734 F1: 0.730


 74%|███████▍  | 111/150 [03:53<01:28,  2.27s/it]

Starting epoch 111, loss 0.48164466640970727


 80%|████████  | 120/150 [04:10<00:54,  1.83s/it]

Epoch 120
train CrossEntropyLoss: 0.498 B-Acc.: 0.748 F1: 0.747
val CrossEntropyLoss: 0.517 B-Acc.: 0.740 F1: 0.739
test CrossEntropyLoss: 0.502 B-Acc.: 0.759 F1: 0.758


 81%|████████  | 121/150 [04:12<01:00,  2.08s/it]

Starting epoch 121, loss 0.4821877715823887


 87%|████████▋ | 130/150 [04:29<00:36,  1.84s/it]

Epoch 130
train CrossEntropyLoss: 0.472 B-Acc.: 0.774 F1: 0.774
val CrossEntropyLoss: 0.506 B-Acc.: 0.753 F1: 0.753
test CrossEntropyLoss: 0.505 B-Acc.: 0.771 F1: 0.771


 87%|████████▋ | 131/150 [04:32<00:39,  2.10s/it]

Starting epoch 131, loss 0.4797613991273416


 93%|█████████▎| 140/150 [04:48<00:17,  1.80s/it]

Epoch 140
train CrossEntropyLoss: 0.487 B-Acc.: 0.761 F1: 0.756
val CrossEntropyLoss: 0.517 B-Acc.: 0.739 F1: 0.734
test CrossEntropyLoss: 0.528 B-Acc.: 0.727 F1: 0.721


 94%|█████████▍| 141/150 [04:50<00:18,  2.05s/it]

Starting epoch 141, loss 0.47953805246868647


100%|██████████| 150/150 [05:07<00:00,  2.05s/it]



Finished training
Epoch 150
train CrossEntropyLoss: 0.467 B-Acc.: 0.777 F1: 0.776
val CrossEntropyLoss: 0.512 B-Acc.: 0.751 F1: 0.750
test CrossEntropyLoss: 0.521 B-Acc.: 0.730 F1: 0.728
Saved PyTorch model state FullyConnected3_epoch150.pth in C:\Users\camgonza\Box\Camila Gonzalez's Files\DATA\ABCD\output\ABCD_sex_prediction_sf\models
Saved trainer state ClassifierTrainer_optimizer_epoch150.pth in C:\Users\camgonza\Box\Camila Gonzalez's Files\DATA\ABCD\output\ABCD_sex_prediction_sf\results\states
Progress stored in C:\Users\camgonza\Box\Camila Gonzalez's Files\DATA\ABCD\output\ABCD_sex_prediction_sf\results
