### Import libraries and dataset connection

In [1]:
import os, sys
from typing import List
from collections.abc import Callable

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator

In [4]:
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

In [5]:
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = 15, 8.27
import seaborn as sns
import plotly.express as px
import plotly.io as pio
pio.templates.default = 'seaborn'

top_h_legend = dict(orientation='h', yanchor="bottom", y=1.02)

In [6]:
from ipywidgets import interact, interact_manual, FloatSlider

In [41]:
sys.path.append(os.path.abspath('..'))

from src.utils import FILES, DATA_FOLDERS, extract_study_phase, load_dataset, fetch_data, preprocess, \
    create_dataset, Preprocessor, retrieve_mapper, get_closest_blood_results, standardise_column_names

CONNECTION_DIR = '/Users/adhaene/Downloads/'

### Fetch data

In [23]:
suspicious, verbose = .5, 1

In [92]:
# LESIONS
# Fetch
lesions = pd.read_csv(os.path.join(CONNECTION_DIR + DATA_FOLDERS[4], FILES[DATA_FOLDERS[4]]['lesions']))
shape = pd.read_csv(os.path.join(CONNECTION_DIR + DATA_FOLDERS[4], FILES[DATA_FOLDERS[4]]['shape']),
                    index_col=0)
shape = standardise_column_names(shape)

# Merge with radiomics
lesions = lesions.merge(shape, on=['gpcr_id', 'study_name', 'roi_id'], how='inner')

# Filter out benign lesions and non-post-1 studies
lesions = lesions[(lesions.pars_suspicious_prob_petct > suspicious) \
                    & (lesions.study_name.isin(['pre-01', 'post-01']))]

# Filter out single-lesion studies
multiple_lesions = lesions.groupby(['gpcr_id', 'study_name']).size().gt(1)
multiple_lesions = multiple_lesions.index[multiple_lesions.values]
lesions = lesions.set_index(['gpcr_id', 'study_name']).loc[multiple_lesions].reset_index()

# Keep only radiomics features and assigned organ
radiomics_features = [
    'original_shape_elongation', 'original_shape_flatness', 'original_shape_leastaxislength',
    'original_shape_majoraxislength', 'original_shape_maximum2ddiametercolumn',
    'original_shape_maximum2ddiameterrow', 'original_shape_maximum2ddiameterslice',
    'original_shape_maximum3ddiameter', 'original_shape_meshvolume', 'original_shape_minoraxislength',
    'original_shape_sphericity', 'original_shape_surfacearea', 'original_shape_surfacevolumeratio',
    'original_shape_voxelvolume', 'mtv', 'tlg', 'pars_suspicious_prob_petct',
    'suv_skewness', 'suv_entropy', 'suv_kurtosis', 'suv_uniformity', 'suv_energy']

lesions = lesions[['gpcr_id', 'study_name', 'lesion_label_id', *radiomics_features, 'assigned_organ']]

if verbose > 0:
    print(f"Post-1 study lesions extracted for {len(lesions.gpcr_id.unique())} patients")

# LABELS
progression = pd.read_csv(os.path.join(CONNECTION_DIR + DATA_FOLDERS[1],
                                        FILES[DATA_FOLDERS[1]]['progression']))
progression['pseudorecist'] = progression.pseudorecist.eq('NPD').mul(1)

# We need to filter out studies who do not have an associated progression label
# Add label from progression DataFrame
lesions = lesions.merge(progression[progression.study_name == 'post-02'][['gpcr_id', 'pseudorecist']],
                        on=['gpcr_id'], how='inner')
lesions = lesions[lesions.pseudorecist.notna()]
lesions.drop(columns='pseudorecist', inplace=True)

if verbose > 0:
    print(f"Post-2 study labels added for {len(lesions.gpcr_id.unique())} patients")
    
# PATIENT-LEVEL
patients = pd.read_csv(os.path.join(CONNECTION_DIR + DATA_FOLDERS[2], FILES[DATA_FOLDERS[2]]['patients']))
studies = pd.read_csv(os.path.join(CONNECTION_DIR + DATA_FOLDERS[0], FILES[DATA_FOLDERS[0]]['studies']))
# Fix encoding for 90+ patients
patients['age_at_treatment_start_in_years'] = \
    patients.age_at_treatment_start_in_years.apply(lambda a: 90 if a == '90 or older' else int(a))

blood = pd.read_csv(os.path.join(CONNECTION_DIR + DATA_FOLDERS[4], FILES[DATA_FOLDERS[4]]['blood']))
blood.rename(columns={feature: feature.replace('-', '_') for feature in blood.columns}, inplace=True)
# Listify immunotherapy type to create multi-feature encoding
blood['immuno_therapy_type'] = blood.immuno_therapy_type \
    .apply(lambda t: ['ipi', 'nivo'] if t == 'ipinivo' else [t])

# Filter in the patient information that we want access to
patient_features = ['age_at_treatment_start_in_years']
blood_features = ['sex', 'bmi', 'performance_score_ecog', 'ldh_sang_ul', 'neutro_absolus_gl',
                    'eosini_absolus_gl', 'leucocytes_sang_gl', 'NRAS_MUTATION', 'BRAF_MUTATION',
                    'immuno_therapy_type', 'lympho_absolus_gl', 'concomittant_tvec',
                    'prior_targeted_therapy', 'prior_treatment', 'nivo_maintenance']

patients = patients[['gpcr_id', *patient_features]]
blood = blood[['gpcr_id', 'n_days_to_treatment_start', *blood_features]]
blood['study_name'] = 'blood'

radiomics = pd.read_csv(os.path.join(CONNECTION_DIR, DATA_FOLDERS[3],
                                        FILES[DATA_FOLDERS[3]]['radiomics']))

potential_patients = list(set(lesions.gpcr_id) & set(patients.gpcr_id) & set(radiomics.gpcr_id) \
                            & set(blood.gpcr_id) \
                            & set(studies[studies.study_name.isin(['pre-01', 'post-01'])].gpcr_id))

Post-1 study lesions extracted for 106 patients
Post-2 study labels added for 66 patients


In [65]:
patient = potential_patients[0]

available_studies = studies[(studies.study_name.isin(['pre-01', 'post-01'])) \
                            & (studies.gpcr_id == patient)].study_name.unique()

study = available_studies[1]

mapper = retrieve_mapper(pd.concat([blood, studies]), patient,
                         'study_name', 'n_days_to_treatment_start').reset_index(drop=True)

study_date = mapper[mapper.study_name == study].n_days_to_treatment_start.to_numpy()[0]
wanted_blood_results_date = get_closest_blood_results(mapper, study) \
            .n_days_to_treatment_start.to_list()
            
bdf = blood[(blood.gpcr_id == patient) \
        & (blood.n_days_to_treatment_start.isin(wanted_blood_results_idx))].copy()

In [None]:
# Transform all one-hot encoded features into True/False to avoid scaler
for feature in blood_features:
    values = blood[feature].value_counts().keys()
    if len(values) == 2 and all(values == [0, 1]):
        blood[feature] = blood[feature].astype(bool)

In [88]:
merged = pd.concat([blood, studies[['gpcr_id', 'study_name', 'n_days_to_treatment_start']]])
filled = merged[(merged.study_name.isin(['pre-01', 'post-01', 'blood']) & (merged.gpcr_id == patient))] \
    .set_index('n_days_to_treatment_start').sort_index() \
    .interpolate(method='index').bfill().reset_index()
filled[filled.study_name.isin(['pre-01', 'post-01'])]

Unnamed: 0,n_days_to_treatment_start,gpcr_id,sex,bmi,performance_score_ecog,ldh_sang_ul,neutro_absolus_gl,eosini_absolus_gl,leucocytes_sang_gl,NRAS_MUTATION,BRAF_MUTATION,immuno_therapy_type,lympho_absolus_gl,concomittant_tvec,prior_targeted_therapy,prior_treatment,nivo_maintenance,study_name
0,-36.0,34610052,male,19.49,0.0,177.0,6.88,1.11,11.1,y,n,[pembro],2.0,False,False,False,False,pre-01
4,57.0,34610052,male,19.205714,0.952381,195.904762,10.782857,12.121905,24.2,y,n,[pembro],1.240952,False,False,False,False,post-01


In [114]:
merged_studies = pd.concat([blood, studies[['gpcr_id', 'study_name', 'n_days_to_treatment_start']]])

blood_processed = pd.DataFrame()

for patient in potential_patients:
        
    patient_studies = merged_studies[merged_studies.gpcr_id == patient] \
        .set_index('n_days_to_treatment_start').sort_index()
    
    # Linear interpolation for numeric values and backwards fill for the rest
    filled_studies = patient_studies.interpolate(method='index').bfill().reset_index()
        
    blood_processed = pd.concat([
        blood_processed, filled_studies[filled_studies.study_name.isin(['pre-01', 'post-01'])]])

blood_processed.reset_index(inplace=True, drop=True)

# Transform all one-hot encoded features into True/False to avoid scaler
for feature in blood_features:
    values = blood_processed[feature].value_counts().keys()
    if len(values) == 2 and all(values == [0, 1]):
        blood_processed[feature] = blood_processed[feature].astype(bool)

In [115]:
blood_processed

Unnamed: 0,n_days_to_treatment_start,gpcr_id,sex,bmi,performance_score_ecog,ldh_sang_ul,neutro_absolus_gl,eosini_absolus_gl,leucocytes_sang_gl,NRAS_MUTATION,BRAF_MUTATION,immuno_therapy_type,lympho_absolus_gl,concomittant_tvec,prior_targeted_therapy,prior_treatment,nivo_maintenance,study_name
0,-36.0,34610052,male,19.490000,0.000000,177.000000,6.880000,1.110000,11.100000,y,n,[pembro],2.000000,False,False,False,False,pre-01
1,57.0,34610052,male,19.205714,0.952381,195.904762,10.782857,12.121905,24.200000,y,n,[pembro],1.240952,False,False,False,False,post-01
2,3.0,34610062,female,20.520000,0.000000,184.000000,4.380000,0.035000,6.300000,n,y,"[nivo, ipi]",1.475000,False,False,False,False,post-01
3,-50.0,34610064,female,29.210000,0.000000,304.000000,2.830000,0.285000,5.700000,y,n,"[nivo, ipi]",1.935000,False,False,False,True,pre-01
4,84.0,34610064,female,30.430000,1.000000,222.714286,2.663333,1.103333,6.666667,y,n,[nivo],2.023333,False,False,False,True,post-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,105.0,34610044,male,19.530000,0.000000,195.200000,7.327037,1.156296,10.255556,n,n,[ipi],1.055185,False,False,False,False,post-01
99,-1.0,34610045,male,31.440000,0.000000,263.000000,,,,n,y,"[nivo, ipi]",,False,False,False,False,pre-01
100,89.0,34610045,,29.060000,1.000000,408.000000,,,,,,,,True,,True,True,post-01
101,-10.0,34610046,male,28.380000,0.000000,272.000000,3.240000,0.220000,5.400000,n,y,"[nivo, ipi]",1.300000,False,False,False,False,pre-01


In [118]:
patient = 34610045
        
patient_studies = merged_studies[merged_studies.gpcr_id == patient] \
    .set_index('n_days_to_treatment_start').sort_index()

# Linear interpolation for numeric values and backwards fill for the rest
filled_studies = patient_studies.interpolate(method='index').bfill().ffill().reset_index()

filled_studies

Unnamed: 0,n_days_to_treatment_start,gpcr_id,sex,bmi,performance_score_ecog,ldh_sang_ul,neutro_absolus_gl,eosini_absolus_gl,leucocytes_sang_gl,NRAS_MUTATION,BRAF_MUTATION,immuno_therapy_type,lympho_absolus_gl,concomittant_tvec,prior_targeted_therapy,prior_treatment,nivo_maintenance,study_name
0,-49.0,34610045,male,31.44,0.0,263.0,,,,n,y,"[nivo, ipi]",,False,False,False,False,pre-02
1,-1.0,34610045,male,31.44,0.0,263.0,,,,n,y,"[nivo, ipi]",,False,False,False,False,pre-01
2,0.0,34610045,male,31.44,0.0,263.0,,,,n,y,"[nivo, ipi]",,False,False,False,False,blood
3,35.0,34610045,male,28.73,0.0,350.0,,,,n,y,"[nivo, ipi]",,False,False,False,False,blood
4,56.0,34610045,male,29.06,0.0,280.0,,,,n,y,"[nivo, ipi]",,False,False,False,False,blood
5,77.0,34610045,male,29.06,1.0,408.0,,,,n,y,"[nivo, ipi]",,False,False,False,False,blood
6,89.0,34610045,male,29.06,1.0,408.0,,,,n,y,"[nivo, ipi]",,False,False,False,False,post-01
7,348.0,34610045,male,29.06,1.0,408.0,,,,n,y,"[nivo, ipi]",,False,False,False,False,post-04
8,432.0,34610045,male,29.06,1.0,408.0,,,,n,y,"[nivo, ipi]",,False,False,False,False,post-05


In [136]:
delta = 30

patient = 34610001

mapper = studies[studies.gpcr_id == patient].set_index('study_name').n_days_to_treatment_start.to_dict()

if 'pre-01' in mapper.keys() and 'post-01' in mapper.keys():

    pre = mapper['pre-01']
    post = mapper['post-01']
    difference = abs(post - pre)

    pre_blood = blood[(blood.gpcr_id == patient) & (blood.n_days_to_treatment_start.between(pre - delta, pre + delta))]
    post_blood = blood[(blood.gpcr_id == patient) & (blood.n_days_to_treatment_start.between(post - delta, post + delta))]

In [12]:
px.scatter(merged[merged.study_name.isin(['pre-01', 'post-01', 'blood'])], 
           x='n_days_to_treatment_start', y='gpcr_id', color='study_name') \
    .update_layout(legend=top_h_legend, yaxis_title='Patient')