In [2]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px

import hvplot.pandas

import holoviews as hv
from holoviews import opts
hv.extension('bokeh')

In [3]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [4]:
# Root directory holding the CSV exports. Set the MELANOMA_DATA_DIR environment
# variable to run the notebook against another location; when it is unset the
# original hard-coded path is used.
DATA_DIR = os.environ.get('MELANOMA_DATA_DIR', '/Volumes/lts4-immuno/data_2021-09-20')

# File names of the individual tables; the loading cells join them onto DATA_DIR.
lesions_file = 'melanoma_lesion-info_organ-overlap_2021-09-17_anonymized_cleaned_all.csv'
lesion_mapping_file = 'melanoma_lesion_mapping_2021-09-20_anonymized.csv'
patients_file = 'melanoma_patient-level_summary_anonymized.csv'
studies_file = 'melanoma_study_level_summary_anonymized.csv'

## Lesions info

In [5]:
# Read the lesion-level table and show its (rows, columns) size.
lesions_path = os.path.join(DATA_DIR, lesions_file)
lesions = pd.read_csv(lesions_path)
lesions.shape

(20892, 17)

In [6]:
# Preview the first five lesion rows.
lesions.head(n=5)

Unnamed: 0,gpcr_id,study_name,roi_id,roi_name,lesion_label_id,pars_bodypart_petct,pars_region_petct,pars_subregion_petct,pars_laterality_petct,pars_classification_petct,vol_ccm,max_suv_val,mean_suv_val,min_suv_val,sd_suv_val,is_malignant,assigned_organ
0,34610002,pre-01,0,Muscles1,1,lower limb,muscles,upper leg group,right,suspicious,1.0,5.58,3.37,2.36,0.92,True,other_lowerlimb
1,34610002,pre-01,1,Muscles2,2,lower limb,muscles,upper leg group,left,suspicious,0.61,5.72,3.5,2.46,0.97,True,other_lowerlimb
2,34610002,pre-01,2,Testis1,3,abdomen,testis,not specified,not specified,benign,50.15,3.79,2.37,1.78,0.41,False,other_abdomen
3,34610002,pre-01,3,Anus1,4,abdomen,anus,not specified,not specified,benign,1.59,2.77,2.23,2.01,0.21,False,other_abdomen
4,34610002,pre-01,4,LymphNodes1,5,abdomen,lymph nodes,inguinal / femoral,left,suspicious,1.83,2.79,1.56,1.18,0.33,True,lymphnode_abdomen


In [19]:
# Number of lesions per classification value.
lesions['pars_classification_petct'].value_counts()

benign        17641
suspicious     3251
Name: pars_classification_petct, dtype: int64

In [27]:
# Count lesions for every (laterality, classification) combination.
laterality = (
    lesions
    .groupby(['pars_laterality_petct', 'pars_classification_petct'])
    .size()
    .reset_index(name='count')
)

laterality.hvplot.heatmap(
    x='pars_laterality_petct',
    y='pars_classification_petct',
    C='count',
    xlabel='Lesion laterality',
    ylabel='Classification',
    title='Laterality',
)

In [32]:
# Count lesions for every (body part, region) combination.
location = (
    lesions
    .groupby(['pars_bodypart_petct', 'pars_region_petct'])
    .size()
    .reset_index(name='count')
)

location_heatmap = location.hvplot.heatmap(
    x='pars_bodypart_petct',
    y='pars_region_petct',
    C='count',
    xlabel='Bodypart',
    ylabel='Region',
    title='Lesion location',
)
location_heatmap.opts(height=600)

In [14]:
@interact(patient=list(lesions.gpcr_id.unique()))
def show_lesions(patient):
    """Print the lesion count per study, body part and region for one patient.

    `patient` is a value of the `gpcr_id` column; the widget offers every
    distinct id found in `lesions`.
    """
    patient_rows = lesions.loc[lesions['gpcr_id'] == patient]
    group_keys = ['study_name', 'pars_bodypart_petct', 'pars_region_petct']
    lesion_counts = patient_rows.groupby(group_keys).size().to_frame('count')
    print(lesion_counts)

interactive(children=(Dropdown(description='patient', options=(34610002, 34610004, 34610005, 34610006, 3461000…

In [7]:
# Lesion volume against mean SUV, one colour per classification.
lesions.hvplot.scatter(
    x='vol_ccm',
    y='mean_suv_val',
    by='pars_classification_petct',
)

In [60]:
# Lesion counts per (assigned organ, malignancy) pair.
organ_count = lesions.groupby(['assigned_organ', 'is_malignant']).size().to_frame('count').reset_index()
# Turn the boolean flag into readable category names for the stacked bars.
organ_count['is_malignant'] = organ_count.is_malignant.apply(lambda b: 'malignant' if b else 'benign')

organs = hv.Dimension('assigned_organ', label='Assigned organ')
count = hv.Dimension('count', label='Count', unit='lesions')
malignant = hv.Dimension('is_malignant', label='Malignant')

# Hand the Dimension objects (not bare column names) to the element; otherwise
# the labels and unit declared above are never used by the plot.
organ_bars = hv.Bars(organ_count, kdims=[organs, malignant], vdims=count)

organ_bars.opts(stacked=True, width=700, xrotation=15).sort(by='count', reverse=True)

In [62]:
def extract_study_phase(n: str) -> int:
    """Convert a study name such as ``'pre-02'`` into a signed scan index.

    Parameters
    ----------
    n : str or int
        Study name of the form ``'<status>-<number>'``. Integers (Python or
        NumPy) are returned unchanged, so applying the conversion to a column
        that has already been converted is a no-op instead of an error.

    Returns
    -------
    int
        ``-number`` when the status is ``'pre'``, ``+number`` for any other
        status.

    Raises
    ------
    ValueError
        If a string does not split into exactly two parts on ``'-'`` or the
        second part is not an integer literal.
    """
    # Already converted (e.g. the calling cell was run twice): pass through.
    if isinstance(n, (int, np.integer)):
        return int(n)

    status, number = n.split('-')
    sign = -1 if status == 'pre' else 1
    return sign * int(number)

In [63]:
# Number of lesions per (patient, study); study names become signed scan indices.
scan_pp = lesions.groupby(['gpcr_id','study_name']).size().to_frame('lesions').reset_index()
scan_pp['study_name'] = scan_pp.study_name.apply(extract_study_phase)
scan = hv.Dimension('study_name', label='Scan w.r.t. treatment')
lesion = hv.Dimension('lesions', label='Lesions')

spp = hv.BoxWhisker(scan_pp, kdims=scan, vdims=lesion).opts(width=700)

# scan_pp has one row per (patient, study), so the group size per scan index is
# the number of patients that have that scan.
patients_ps = scan_pp[['study_name', 'gpcr_id']].groupby('study_name').size().to_frame('patients').reset_index()
# Deliberately not called `patients`: that name is used for the patient-level
# DataFrame, and rebinding it here breaks those cells when run out of order.
patients_dim = hv.Dimension('patients', label='Patients')

pps = hv.Bars(patients_ps, kdims=scan, vdims=patients_dim).opts(width=700)

(pps + spp.sort()).cols(1)

In [49]:
# suv = hv.Dimension('mean_suv_val', label='Average SUV')
# volume = hv.Dimension('vol_ccm', label='Lesion volume', unit='ccm')

# mal_points = hv.Points(lesions[lesions.is_malignant == True], kdims=[suv, volume]).opts(height=500, width=500)
# nmal_points = hv.Points(lesions[lesions.is_malignant == False], kdims=[suv, volume]).opts(height=500, width=500)

# xhist, yhist = (hv.operation.histogram(mal_points, dimension=dim) * hv.operation.histogram(nmal_points, dimension=dim) for dim in [suv, volume])

# composition = (mal_points * nmal_points) << yhist.opts(width=200) << xhist.opts(height=200)

# composition.opts(opts.Histogram(alpha=0.3))

## Lesion mapping

In [9]:
# Read the lesion mapping table.
lesion_mapping_path = os.path.join(DATA_DIR, lesion_mapping_file)
lesion_mapping = pd.read_csv(lesion_mapping_path)

In [10]:
# Preview the first five mapping rows.
lesion_mapping.head(n=5)

Unnamed: 0,gpcr_id,study_name,roi_id,lesion_label_id,lesion_global_id,mapped,reg_center_coord_x,reg_center_coord_y,reg_center_coord_z
0,34610118,pre-02,4,5,1,False,-63.0805,78.7272,313.552
1,34610118,pre-02,18,19,2,False,45.8763,16.466,1085.55
2,34610118,pre-01,0,1,3,False,19.2925,108.388,594.454
3,34610118,pre-01,2,3,4,False,-44.9416,110.341,775.425
4,34610118,pre-01,7,8,5,False,246.549,83.6589,1035.06


## Patients

In [39]:
# Read the patient-level summary table.
patients_path = os.path.join(DATA_DIR, patients_file)
patients = pd.read_csv(patients_path)

In [40]:
# Preview the first five patient rows.
patients.head(n=5)

Unnamed: 0,gpcr_id,age_at_treatment_start_in_years,duration_treatment_in_days,death_event_observed,survival_in_days,n_imgs_before_treatment,n_imgs_during_treatment,n_imgs_after_treatment_end,n_imgs_after_treatment_start
0,34610039,64,63,False,768,1,0,1,1
1,34610116,80,22,False,1312,2,0,4,4
2,34610117,55,0,False,1145,0,0,4,4
3,34610118,68,63,True,639,2,0,6,6
4,34610042,52,60,False,707,0,0,7,7


In [41]:
# (rows, columns) of the patient-level table.
patients.shape

(129, 9)

In [53]:
def _age_to_int(raw_age):
    """Map the '90 or older' label to 90 and cast every other value to int."""
    if raw_age == '90 or older':
        return 90
    return int(raw_age)


# Make the age column numeric so it can be binned in a histogram.
patients['age_at_treatment_start_in_years'] = patients['age_at_treatment_start_in_years'].apply(_age_to_int)

In [57]:
# Distribution of patient age at treatment start.
patients.hvplot.hist(
    'age_at_treatment_start_in_years',
    xlabel='Age at treatment start (years)',
    ylabel='Amount of patients',
)

In [58]:
# Treatment duration, one histogram per value of death_event_observed,
# laid out in a single column.
duration_hists = patients.hvplot.hist(
    'duration_treatment_in_days',
    by='death_event_observed',
    subplots=True,
    width=350,
    legend='top',
    xlabel='Treatment duration (days)',
    ylabel='Amount of patients',
)
duration_hists.cols(1)

In [43]:
# Treatment duration against survival, coloured by death_event_observed.
patients.hvplot.scatter(
    x='duration_treatment_in_days',
    y='survival_in_days',
    by='death_event_observed',
    legend='top',
    xlabel='Treatment duration (days)',
    ylabel='Survival (days)',
)

In [16]:
# The two columns passed to the Kaplan-Meier estimator.
survival_columns = ['death_event_observed', 'survival_in_days']
patients.loc[:, survival_columns]

Unnamed: 0,death_event_observed,survival_in_days
0,False,768
1,False,1312
2,False,1145
3,True,639
4,False,707
...,...,...
124,True,747
125,False,1343
126,False,1619
127,False,1690


In [17]:
from sksurv.nonparametric import kaplan_meier_estimator

# Kaplan-Meier estimate from the per-patient event indicator and survival time.
event_observed = patients['death_event_observed']
survival_days = patients['survival_in_days']
time, s_prob = kaplan_meier_estimator(event_observed, survival_days)

In [24]:
# Stack the estimator outputs as two columns: one row per time point.
km_values = np.array([time, s_prob]).T
km = pd.DataFrame(km_values, columns=['time', 'survival_probability'])

km.hvplot.step(
    x='time',
    y='survival_probability',
    title='Kaplan-Meier survival curve',
    xlabel='Time (days)',
    ylabel='Survival probability',
)

## Studies

In [34]:
# Read the study-level summary table and show its (rows, columns) size.
studies_path = os.path.join(DATA_DIR, studies_file)
studies = pd.read_csv(studies_path)
studies.shape

(472, 22)

In [35]:
# Preview the first five study rows.
studies.head(n=5)

Unnamed: 0,gpcr_id,study_name,is_before_treatment,is_during_treatment,is_after_treatment_end,nth_before_treatment,nth_after_treatment_start,nth_during_treatment,nth_after_treatment_end,n_days_to_treatment_start,...,brain_seg_exists,bones_seg_exists,spleen_seg_exists,aorta_seg_exists,heart_seg_exists,kidney_right_seg_exists,kidney_left_seg_exists,lung_right_seg_exists,lung_left_seg_exists,liver_seg_exists
0,34610001,pre-02,True,False,False,2.0,,,,-43,...,True,True,True,True,True,True,True,True,True,True
1,34610001,pre-01,True,False,False,1.0,,,,-7,...,True,True,True,True,True,True,True,True,True,True
2,34610001,post-01,False,False,True,,1.0,,1.0,87,...,True,True,True,True,True,True,True,True,True,True
3,34610001,post-02,False,False,True,,2.0,,2.0,183,...,True,True,True,True,True,False,False,True,True,True
4,34610001,post-03,False,False,True,,3.0,,3.0,275,...,True,True,True,True,True,True,True,True,True,True


In [38]:
# Average number of study rows per patient id.
studies_per_patient = studies['gpcr_id'].value_counts()
studies_per_patient.mean()

3.746031746031746

In [34]:
# Convert study names ('pre-02', 'post-01', ...) into signed scan indices.
# The column is overwritten in place, so only convert while it still holds the
# raw names; this keeps the cell safe to re-run.
if not pd.api.types.is_integer_dtype(studies['study_name']):
    studies['study_name'] = studies['study_name'].apply(extract_study_phase)

In [29]:
# List the column names of the study-level table.
studies.columns

Index(['gpcr_id', 'study_name', 'is_before_treatment', 'is_during_treatment',
       'is_after_treatment_end', 'nth_before_treatment',
       'nth_after_treatment_start', 'nth_during_treatment',
       'nth_after_treatment_end', 'n_days_to_treatment_start',
       'n_days_to_treatment_end', 'is_malignant', 'brain_seg_exists',
       'bones_seg_exists', 'spleen_seg_exists', 'aorta_seg_exists',
       'heart_seg_exists', 'kidney_right_seg_exists', 'kidney_left_seg_exists',
       'lung_right_seg_exists', 'lung_left_seg_exists', 'liver_seg_exists'],
      dtype='object')

In [59]:
@interact(patient=list(studies.gpcr_id.unique()))
def show_progression(patient=34610001):
    """Print and plot `is_malignant` against `study_name` for one patient.

    `patient` is a value of the `gpcr_id` column; the widget offers every
    distinct id found in `studies`.
    """
    patient_studies = studies.loc[studies['gpcr_id'] == patient]

    print(patient_studies[['study_name', 'is_malignant']])
    sns.lineplot(data=patient_studies, x='study_name', y='is_malignant')

interactive(children=(Dropdown(description='patient', options=(34610001, 34610002, 34610004, 34610005, 3461000…