### Import libraries and dataset connection

In [1]:
import os, sys
from typing import List
from collections.abc import Callable

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator

In [4]:
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

In [5]:
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = 15, 8.27
import seaborn as sns
import plotly.express as px
import plotly.io as pio
pio.templates.default = 'seaborn'

top_h_legend = dict(orientation='h', yanchor="bottom", y=1.02)

In [6]:
from ipywidgets import interact, interact_manual, FloatSlider

In [7]:
sys.path.append(os.path.abspath('..'))

from src.utils import FILES, DATA_FOLDERS, extract_study_phase, load_dataset, fetch_data, preprocess, create_dataset, Preprocessor

CONNECTION_DIR = '/Users/adhaene/Downloads/'

### Fetch data

In [136]:
delta = 30

patient = 34610001

mapper = studies[studies.gpcr_id == patient].set_index('study_name').n_days_to_treatment_start.to_dict()

if 'pre-01' in mapper.keys() and 'post-01' in mapper.keys():

    pre = mapper['pre-01']
    post = mapper['post-01']
    difference = abs(post - pre)

    pre_blood = blood[(blood.gpcr_id == patient) & (blood.n_days_to_treatment_start.between(pre - delta, pre + delta))]
    post_blood = blood[(blood.gpcr_id == patient) & (blood.n_days_to_treatment_start.between(post - delta, post + delta))]

In [147]:
blood_filtered = pd.DataFrame()

valid_patients = list(set(list(studies[studies.study_name == 'post-01'].gpcr_id.unique())) \
                      & set(list(blood.gpcr_id.unique())))

for patient in valid_patients:
    
    mapper = retrieve_mapper(pd.concat([blood, studies]), patient, 'study_name', 'n_days_to_treatment_start').reset_index(drop=True)

    # Get the `n_days_to_treatment_start` of blood results to aggregate
    wanted_blood_results_idx = get_closest_blood_results(mapper).n_days_to_treatment_start.to_list()
    bdf = blood[(blood.gpcr_id == patient) & (blood.n_days_to_treatment_start.isin(wanted_blood_results_idx))]
    
    blood_filtered = pd.concat([blood_filtered, bdf])
    
fdf = blood_filtered[['gpcr_id', *blood_features, *patient_features]].groupby('gpcr_id').agg({
    'bmi': np.mean,
    'performance_score_ecog': np.mean,
    'ldh_sang_ul': np.mean,
    'neutro_absolus_gl': np.mean,
    'eosini_absolus_gl': np.mean,
    'leucocytes_sang_gl': np.mean,
    'immuno_therapy_type': np.sum,
    'lympho_absolus_gl': np.mean,
    'sex': pd.Series.mode,
    'NRAS_MUTATION': pd.Series.mode,
    'BRAF_MUTATION': pd.Series.mode,
    'concomittant_tvec': pd.Series.mode,
    'prior_targeted_therapy': pd.Series.mode,
    'prior_treatment': pd.Series.mode,
    'nivo_maintenance': pd.Series.mode
})

fdf.reset_index()

Unnamed: 0,gpcr_id,bmi,performance_score_ecog,ldh_sang_ul,neutro_absolus_gl,eosini_absolus_gl,leucocytes_sang_gl,immuno_therapy_type,lympho_absolus_gl,sex,NRAS_MUTATION,BRAF_MUTATION,concomittant_tvec,prior_targeted_therapy,prior_treatment,nivo_maintenance
0,34610001,27.75,0.5,226.0,8.6600,0.000,13.950,"[ipi, nivo, nivo]",4.500,female,[],n,0,0,0,1
1,34610002,24.62,1.0,,5.5400,0.230,7.800,"[ipi, nivo]",1.480,male,n,n,0,0,0,0
2,34610004,27.92,0.0,,3.1900,0.100,4.900,"[pembro, pembro]",1.230,male,y,n,0,0,0,0
3,34610005,26.00,0.5,225.0,9.6200,0.120,12.100,"[ipi, nivo, nivo]",1.815,female,n,y,0,0,0,1
4,34610006,31.23,0.0,246.5,,,,"[pembro, pembro]",,female,y,n,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,34610145,24.45,1.0,154.0,5.5400,0.000,7.100,"[ipi, nivo]",1.350,female,n,y,0,1,1,0
105,34610146,28.98,0.0,,,,,"[ipi, nivo, nivo]",,female,n,y,0,0,0,1
106,34610148,31.30,0.0,,5.8125,0.165,8.225,"[ipi, nivo, ipi, nivo]",1.580,male,y,n,0,0,0,0
107,34610149,22.66,0.0,175.0,4.0200,0.360,6.000,"[ipi, nivo]",1.020,female,y,n,0,0,0,0


In [146]:
blood_filtered[blood_filtered.gpcr_id == 34610001].NRAS_MUTATION

3    NaN
4    NaN
Name: NRAS_MUTATION, dtype: object

In [122]:
def retrieve_mapper(df: pd.DataFrame, patient: int, index: str, values: str):
    return df[df.gpcr_id == patient][[index, values]].sort_values(by=values)


def get_closest_blood_results(mapper: pd.DataFrame, study: str = 'post-01'):
    # Fetch index of wanted study
    p1i = mapper[mapper.study_name == study].index.to_numpy()[0]
    # Get intersection of closest before and after and check that they exist
    closest_idx = list(set((p1i - 1, p1i + 1)) & set(mapper[mapper.study_name == 'blood'].index))
    # Return closest before and after blood results 
    return mapper[mapper.study_name.isin([study, 'blood'])].loc[closest_idx]


In [12]:
px.scatter(merged[merged.study_name.isin(['pre-01', 'post-01', 'blood'])], 
           x='n_days_to_treatment_start', y='gpcr_id', color='study_name') \
    .update_layout(legend=top_h_legend, yaxis_title='Patient')