In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

# Ordered visit labels recorded for every subject, and the size of the synthetic cohort
time_points = ['T1', 'T2', 'T3']
num_subjects = 100

# Function to categorize blood pressure
def categorize_bp(systolic_bp):
    """Map a systolic blood-pressure reading to an integer class label.

    Args:
        systolic_bp: Systolic reading (any value comparable with numbers).

    Returns:
        0 ('Normal') for readings below 120,
        1 ('Elevated') for readings from 120 up to and including 129,
        2 ('Hypertension') for everything else.
    """
    if systolic_bp < 120:
        return 0  # 'Normal'
    if systolic_bp <= 129:
        # The lower bound (>= 120) is implied: smaller values returned above.
        return 1  # 'Elevated'
    return 2  # 'Hypertension'

# Generate sample data: one row per (subject, time point) with random BP readings
np.random.seed(42)  # For reproducibility
n_rows = num_subjects * len(time_points)
subject_ids = [f'subject_{i:03}' for i in range(1, num_subjects + 1)]
data = {
    # Each subject ID is repeated once per time point ...
    'SubjectID': np.repeat(subject_ids, len(time_points)),
    # ... and the time-point labels cycle so they line up with the repeated IDs.
    'TimePoint': time_points * num_subjects,
    # Systolic is drawn before diastolic; keep this order so the seeded values do not change.
    # randint's upper bound is exclusive: systolic in 110..149, diastolic in 70..89.
    'SystolicBP': np.random.randint(110, 150, n_rows),
    'DiastolicBP': np.random.randint(70, 90, n_rows),
}

# Create DataFrame
df_longitudinal = pd.DataFrame(data)

# Label every row (all time points) with the BP category of its systolic reading
df_longitudinal['BPCategory'] = df_longitudinal['SystolicBP'].map(categorize_bp)

# Optional: uncomment to shuffle the rows and drop 20% of them, to test whether the pipeline is robust to missing rows
#num_rows_to_remove = int(len(df_longitudinal) * 0.2)
#df_longitudinal = df_longitudinal.sample(frac=1, random_state=42).reset_index(drop=True)
#df_longitudinal = df_longitudinal.iloc[num_rows_to_remove:]

# Split by subject (not by row) so all time points of a subject land in the same split.
all_subjects = df_longitudinal['SubjectID'].unique().tolist()
# First 80% of subjects (in order of appearance) train, the rest test. The cut is
# derived from the cohort size so a smaller cohort does not produce an empty test split.
n_train_subjects = len(all_subjects) * 4 // 5
train_subjects, test_subjects = all_subjects[:n_train_subjects], all_subjects[n_train_subjects:]
df_train = df_longitudinal[df_longitudinal['SubjectID'].isin(train_subjects)]
df_test = df_longitudinal[df_longitudinal['SubjectID'].isin(test_subjects)]

# Persist the full synthetic dataset (train and test subjects together) as CSV.
# NOTE(review): this is a machine-specific absolute path; consider moving it to a
# configurable data directory so the notebook can run on other machines.
dataset_path = "C:\\Users\\camgonza\\Box\\Camila Gonzalez's Files\\Data\\NCANDA\\NCANDA_experiments\\long_testground"
dataset_dir = Path(dataset_path)
# to_csv does not create missing folders, so make sure the target directory exists.
dataset_dir.mkdir(parents=True, exist_ok=True)
# Join with Path instead of concatenating "/" onto a backslash-separated string.
df_longitudinal.to_csv(dataset_dir / "df_longitudinal.csv", index=False)


In [3]:
from lln.data.pytorch.get_dataset import LongDataset
from collections import OrderedDict
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from lln.models.longitudinal.Transformer import FeatureTransformer
from lln.training.LongitudinalTrainer import LongitudinalTrainer

input_dim = 2  # Number of input features per time point (systolic and diastolic BP)
dim_model = 64 # Width of the model's internal representation (inputs are projected to this size)
nr_layers = 3  # Number of stacked transformer layers (this is not an LSTM)
num_heads = 4  # Number of attention heads
dim_feedforward = 256  # Feedforward network size
seq_to_seq = True  # NOTE(review): presumably one prediction per time point instead of one per subject - confirm against LongDataset / FeatureTransformer
see_future = True  # NOTE(review): presumably allows attention to later time points (no causal mask) - confirm against FeatureTransformer
output_dim = 3  # Number of output classes (Normal, Elevated, Hypertension)

In [4]:
# Define PyTorch datasets (the dataloaders are built in a later cell)
def _build_long_dataset(frame):
    """Wrap one split's DataFrame in a LongDataset using the shared column configuration."""
    # The list arguments are created anew on every call, so the splits never share them.
    return LongDataset(
        frame,
        feature_cols=['SystolicBP', 'DiastolicBP'],
        target_col='BPCategory',
        seq_to_seq=seq_to_seq,
        id_col='SubjectID',
        seq_col='TimePoint',
        timepoints=['T1', 'T2', 'T3'],
    )

# Insertion order matters: 'Train' first, then 'Test'.
datasets = OrderedDict(
    (split_name, _build_long_dataset(split_frame))
    for split_name, split_frame in (('Train', df_train), ('Test', df_test))
)
# Show which subjects ended up in each split (Train, then Test).
for split_dataset in datasets.values():
    print(split_dataset.subjects)

['subject_001', 'subject_002', 'subject_003', 'subject_004', 'subject_005', 'subject_006', 'subject_007', 'subject_008', 'subject_009', 'subject_010', 'subject_011', 'subject_012', 'subject_013', 'subject_014', 'subject_015', 'subject_016', 'subject_017', 'subject_018', 'subject_019', 'subject_020', 'subject_021', 'subject_022', 'subject_023', 'subject_024', 'subject_025', 'subject_026', 'subject_027', 'subject_028', 'subject_029', 'subject_030', 'subject_031', 'subject_032', 'subject_033', 'subject_034', 'subject_035', 'subject_036', 'subject_037', 'subject_038', 'subject_039', 'subject_040', 'subject_041', 'subject_042', 'subject_043', 'subject_044', 'subject_045', 'subject_046', 'subject_047', 'subject_048', 'subject_049', 'subject_050', 'subject_051', 'subject_052', 'subject_053', 'subject_054', 'subject_055', 'subject_056', 'subject_057', 'subject_058', 'subject_059', 'subject_060', 'subject_061', 'subject_062', 'subject_063', 'subject_064', 'subject_065', 'subject_066', 'subject_

In [5]:
# Create dataloaders
batch_size = 10
# Shuffle only the training split; evaluation splits keep a fixed order so that
# repeated evaluations iterate over the samples identically.
dataloaders = OrderedDict(
    (dataset_name, DataLoader(dataset, batch_size=batch_size, shuffle=(dataset_name == 'Train')))
    for dataset_name, dataset in datasets.items()
)
# Peek at a single training batch to sanity-check tensor shapes and the target dtype.
X, y = next(iter(dataloaders['Train']))
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape} {y.dtype}")

Shape of X: torch.Size([10, 3, 2])
Shape of y: torch.Size([10, 3]) torch.int64


In [7]:
# Define model
# Seed torch's default RNG before building the model so that weight initialisation
# (and later draws from the default generator) is repeatable across runs.
# Seeding NumPy alone does not affect torch.
torch.manual_seed(42)
# NOTE(review): machine-specific absolute path; consider a configurable output directory.
save_path = "C:\\Users\\camgonza\\Box\\Camila Gonzalez's Files\\Data\\NCANDA\\NCANDA_experiments\\long_testground\\models"
model = FeatureTransformer(
    input_dim, dim_model, num_heads, dim_feedforward, nr_layers,
    output_dim=output_dim,
    dropout=0.1,
    seq_to_seq=seq_to_seq,
    see_future=see_future,
    save_path=save_path,
)
# Print the module tree as a quick check of the layer sizes.
print(model)

FeatureTransformer(
  (input_projection): Linear(in_features=2, out_features=64, bias=True)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder_input_projection): Linear(in_features=2, out_features=64, bias=True)
  (transformer_decoder): Transf

In [8]:
# Set up the loss, the optimizer and the trainer
trainer_path = "C:\\Users\\camgonza\\Box\\Camila Gonzalez's Files\\Data\\NCANDA\\NCANDA_experiments\\long_testground\\trainer"
loss_f = nn.CrossEntropyLoss()  # multi-class loss over the three BP categories
optimizer = optim.Adam(model.parameters(), lr=0.001)
trainer = LongitudinalTrainer(
    trainer_path,
    'cpu',  # presumably the device to train on - confirm against LongitudinalTrainer
    optimizer,
    loss_f,
    seq_to_seq=seq_to_seq,
    labels=["Normal", "Elevated", "Hypertension"],  # names for class ids 0, 1, 2
)

In [9]:
# Train model
# NOTE(review): the recorded run of this cell stops at epoch 0 with
# "TypeError: linear(): argument 'input' (position 1) must be Tensor, not NoneType".
# Presumably a tensor the model expects is never passed in - confirm against the
# FeatureTransformer / LongitudinalTrainer code before relying on this cell.
nr_epochs = 100
every_tenth = nr_epochs // 10  # loss printing and evaluation cadence
every_fifth = nr_epochs // 5   # export cadence
trainer.train(
    model,
    dataloaders['Train'],
    dataloaders,
    nr_epochs=nr_epochs,
    starting_from_epoch=0,
    print_loss_every=every_tenth,
    eval_every=every_tenth,
    export_every=every_fifth,
    verbose=True,
)

  0%|          | 0/100 [00:00<?, ?it/s]


TypeError: linear(): argument 'input' (position 1) must be Tensor, not NoneType