# Multimodal

## Setup

### Imports

In [3]:
# Imports
import os
import numpy as np
import pandas as pd
import random
from dotenv import load_dotenv
from tqdm.notebook import tqdm
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from torch.utils.data import Dataset, DataLoader
from src.utils import *
from src.models import *
from src.process_reports import *
from src.train import train_mm, kfold_cv

%load_ext autoreload
%autoreload 2

### Set seed & device

In [4]:
# Seed first so everything downstream (splits, weight init) is repeatable.
# NOTE(review): set_seed arrives via the wildcard project imports; presumably it
# seeds the Python / NumPy / torch RNGs — confirm in src.utils.
set_seed(42)
# Use the GPU when CUDA is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# 'medium' relaxes float32 matmul precision in exchange for speed
# (see torch.set_float32_matmul_precision in the PyTorch docs).
torch.set_float32_matmul_precision('medium')

## Process reports

### Summarize reports

In [None]:
%%script false --no-raise-error
# Disabled cell: `%%script false` hands the cell body to the shell command `false`
# instead of the Python kernel, so nothing below is executed; `--no-raise-error`
# keeps the non-zero exit status from being reported as an error.
# The magic must stay on the first line of the cell; delete it to re-enable the cell.
# Summarize the reports using GPT-3.
summarize_reports()

### Extract text features from reports

In [5]:
# %%script false --no-raise-error
# ^ Uncomment the line above (it must remain the first line of the cell) to skip
#   this step; the recorded run shows a 1090-step progress bar taking ~9 minutes.
# Extract text features from the (summarized) reports.
# NOTE(review): nothing is returned or bound here, so the features are presumably
# persisted by the helper itself — confirm where in src.process_reports.
extract_text_feats()

100%|██████████| 1090/1090 [09:17<00:00,  1.95it/s]


### Annotate subtype & grade from pathology reports

In [None]:
%%script false --no-raise-error
# Disabled cell: `%%script false` sends the body to the shell command `false`, so
# the kernel never runs it; remove the magic line (it must be first) to re-enable.
# Annotate every report in `reports_dir` with an OpenAI model
# (`_zs` presumably stands for zero-shot — confirm against the helper).
lm_name = 'gpt-3.5-turbo'

# Commented-out lines below are the single-report debugging path
# (one sample file -> prompt -> gen_subtype_grade_zs); the live path is the
# directory-wide classify_reports_zs call at the bottom.
# sample_report_path = 'data/reports_distilled/TCGA-WT-AB41.txt'
reports_dir = 'data/reports_distilled'
# create prompt
# prompt = create_zs_prompt(sample_report_path)

# Generation arguments handed to the helper via `args`
# (max_tokens presumably caps the completion length — confirm).
gen_args = {'max_tokens': 200}

# out = gen_subtype_grade_zs(lm_name, prompt, api='openai', args=gen_args)
df_res = classify_reports_zs(lm_name, reports_dir, api='openai', args=gen_args)
# Show the first 10 rows of the result as the cell output.
df_res.head(10)

## Task: Predict target from WSIs & reports

### Load data

In [10]:
# Build the train / val / test dataloaders for the chosen prediction target.
target = 'msi'
data_file = 'data/data_tcga_brca_sg_pca.csv'
bsz = 32  # batch size shared by all three dataloaders
# use_rand_splits=True selects the random split here; the `split` label used for
# checkpoint / log names in the training cell is set separately.
train_loader, val_loader, test_loader = create_dataloaders(
    target,
    data_file,
    use_rand_splits=True,
    bsz=bsz,
)

size of train set: 789, val set: 98, test set: 99


### Train & eval

In [18]:
# %%script false --no-raise-error
# ^ Uncomment the line above (keep it as the first line of the cell) to skip training.

# Model setup. `target` and the dataloaders come from the "Load data" cell.
mode = 'mm'  # input modalities: 'text', 'img', or 'mm'
# NOTE: `split` is only interpolated into the checkpoint / TensorBoard names below;
# the split actually used was fixed by `use_rand_splits` when the dataloaders were built.
split = 'rand'  # dataset split: 'def' or 'rand'
# model architecture: 'Attention1DRegressor' or 'Attention1DClassifier'
model = Attention1DRegressor(target=target, mode=mode)

# Training arguments consumed by train_mm.
args = {
    'num_epochs': 100,
    'ckpt_name': f'ckpt_best_{mode}_{split}_split',
    'resume_ckpt': None,
    'tblog_name': f'best_{mode}_{split}_split',
}

# Fit the model; the returned trainer is reused for test-set evaluation.
model, trainer = train_mm(model, train_loader, val_loader, args)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type            | Params
----------------------------------------------
0 | attention | Sequential      | 262 K 
1 | regressor | Sequential      | 3.1 K 
2 | loss      | MSELoss         | 0     
3 | corr      | PearsonCorrCoef | 0     
4 | r2        | R2Score         | 0     
----------------------------------------------
265 K     Trainable params
0         Non-trainable params
265 K     Total params
1.062     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.003


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.002 >= min_delta = 0.0. New best score: 0.001


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.001


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.001


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 5 records. Best score: 0.001. Signaling Trainer to stop.


training on device: cpu


In [19]:
# Evaluate the trained model on the test set.
# As the last expression of the cell, the value returned by trainer.test
# (a list of metric dicts in the recorded output) is displayed.
trainer.test(model, test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[{'test_corr_epoch': 0.21345384418964386,
  'test_r2_epoch': -0.45560407638549805}]

### K-fold CV

In [5]:
# %%script false --no-raise-error
# ^ Uncomment the line above (keep it as the first line of the cell) to skip k-fold CV.

# Run k-fold cross-validation.
# This cell re-declares its own inputs (data file, target, mode, batch size), so it
# only needs the imports cell and not the single-split cells above.
data_file = 'data/data_tcga_brca_sg_pca.csv'
target = 'msi'
mode = 'mm'
dataset = MMDataset(target, data_file)
bsz = 64

# kfold_cv is given the model *class* plus its constructor kwargs rather than an
# instance (presumably so a fresh model can be built for each fold — confirm in src.train).
model_class = Attention1DRegressor
model_args = {'mode': mode, 'target': target}

# Training arguments consumed by kfold_cv.
train_args = {
    'bsz': bsz,
    'k': 5,
    'num_epochs': 100,
    'patience': 5,
    'save_top_k': 0,
    'tblog_name': f'best_{mode}_kfold',
    'enable_progress_bar': False,
}

res_kfold_cv = kfold_cv(model_class, dataset, model_args, train_args)

# Average each test metric over the per-fold result dicts.
metrics = ['test_corr_epoch', 'test_r2_epoch']
# Round with NumPy exactly as before, then convert to a built-in float so the
# printed dict stays `0.219` instead of `np.float64(0.219)` under NumPy >= 2.
avg_res = {
    metric: float(np.mean([fold_res[metric] for fold_res in res_kfold_cv]).round(3))
    for metric in metrics
}
print(f"avg res over {train_args['k']} folds: {avg_res}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


training fold 1/5


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type            | Params
----------------------------------------------
0 | attention | Sequential      | 262 K 
1 | regressor | Sequential      | 3.1 K 
2 | loss      | MSELoss         | 0     
3 | corr      | PearsonCorrCoef | 0     
4 | r2        | R2Score         | 0     
----------------------------------------------
265 K     Trainable params
0         Non-trainable params
265 K     Total params
1.062     Total estimated model params size (MB)
Metric val_loss improved. New best score: 0.003
Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New b

training on device: cpu


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type            | Params
----------------------------------------------
0 | attention | Sequential      | 262 K 
1 | regressor | Sequential      | 3.1 K 
2 | loss      | MSELoss         | 0     
3 | corr      | PearsonCorrCoef | 0     
4 | r2        | R2Score         | 0     
----------------------------------------------
265 K     Trainable params
0         Non-trainable params
265 K     Total params
1.062     Total estimated model params size (MB)


training fold 2/5


Metric val_loss improved. New best score: 0.011
Metric val_loss improved by 0.009 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.001
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.001
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.001
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.001
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.001
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.001
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.001
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.001
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.001
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.001
Metric val_loss improved by 0.000 >= min

training on device: cpu


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type            | Params
----------------------------------------------
0 | attention | Sequential      | 262 K 
1 | regressor | Sequential      | 3.1 K 
2 | loss      | MSELoss         | 0     
3 | corr      | PearsonCorrCoef | 0     
4 | r2        | R2Score         | 0     
----------------------------------------------
265 K     Trainable params
0         Non-trainable params
265 K     Total params
1.062     Total estimated model params size (MB)


training fold 3/5


Metric val_loss improved. New best score: 0.011
Metric val_loss improved by 0.005 >= min_delta = 0.0. New best score: 0.005
Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.004
Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.003
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.003
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.003
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.003
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.003
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.003
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.003
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.003
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.003
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.003
Metric val_loss improved by 0.000 >= min

training on device: cpu


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type            | Params
----------------------------------------------
0 | attention | Sequential      | 262 K 
1 | regressor | Sequential      | 3.1 K 
2 | loss      | MSELoss         | 0     
3 | corr      | PearsonCorrCoef | 0     
4 | r2        | R2Score         | 0     
----------------------------------------------
265 K     Trainable params
0         Non-trainable params
265 K     Total params
1.062     Total estimated model params size (MB)


training fold 4/5


Metric val_loss improved. New best score: 0.011
Metric val_loss improved by 0.008 >= min_delta = 0.0. New best score: 0.004
Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.003
Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Monitored metric val_loss did not improv

training on device: cpu


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type            | Params
----------------------------------------------
0 | attention | Sequential      | 262 K 
1 | regressor | Sequential      | 3.1 K 
2 | loss      | MSELoss         | 0     
3 | corr      | PearsonCorrCoef | 0     
4 | r2        | R2Score         | 0     
----------------------------------------------
265 K     Trainable params
0         Non-trainable params
265 K     Total params
1.062     Total estimated model params size (MB)


training fold 5/5


Metric val_loss improved. New best score: 0.009
Metric val_loss improved by 0.005 >= min_delta = 0.0. New best score: 0.004
Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.003
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.002
Metric val_loss improved by 0.000 >= min

training on device: cpu


avg res over 5 folds: {'test_corr_epoch': 0.219, 'test_r2_epoch': -0.119}
