# Baseline model for progression prediction

## Set-up

### Import librairies

In [37]:
import os, sys

from tqdm.notebook import tqdm
from typing import List, Dict

import pandas as pd
import numpy as np
import networkx as nx

import plotly.express as px
import plotly.io as pio

pio.templates.default = 'seaborn'

In [38]:
sys.path.append(os.path.abspath('..'))
sys.path.append(os.path.abspath('../src'))

### Fetch data

In [39]:
from utils import fetch_data

labels, lesions, patients = fetch_data(verbose=1)

include_radiomics = True

Post-1 study lesions extracted for 88 patients
Post-1 study labels added for 88 patients
The intersection of datasets showed 88 potential datapoints.


### Prepare data

In [40]:
lesions_agg = lesions.groupby('gpcr_id').agg({
    'voxels': np.sum,
    'max_suv_val': np.mean,
    'mean_suv_val': np.mean,
    'min_suv_val': np.mean,
    'sd_suv_val': np.mean,
    'assigned_organ': pd.Series.tolist
}).reset_index()

dataset = lesions_agg.merge(patients, on='gpcr_id', how='inner')
dataset.set_index('gpcr_id', inplace=True)

In [41]:
if not include_radiomics:
    valid_columns = [column for column in dataset.columns if column not in ['voxels', 'max_suv_val', 'mean_suv_val', 'min_suv_val', 'sd_suv_val', 'assigned_organ']]
    dataset = dataset[valid_columns]

In [42]:
from utils import Preprocessor

# Separate features by type
numerical = list(dataset.select_dtypes(np.number).columns)
categorical = list(dataset.select_dtypes([bool, object]).columns)
multivalue = ['immuno_therapy_type']

if include_radiomics:
    multivalue.append('assigned_organ')

# Remove multivalue features from categorical ones
for feature in multivalue:
    categorical.remove(feature)
    
features_range = list(range(len(numerical) + len(categorical) + len(multivalue)))
bp = np.cumsum([len(numerical), len(categorical), len(multivalue)])

# Build PipeLine of ColumnTransformers
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

if include_radiomics:
    ct = Pipeline([
        ('imputers', ColumnTransformer([
            ('median', SimpleImputer(strategy='median'), numerical),
            ('frequent', SimpleImputer(strategy='most_frequent'), categorical)
        ], remainder='passthrough')),
        ('preprocess', ColumnTransformer([
            ('scaler', StandardScaler(), features_range[0:bp[0]]),
            ('one-hot', OneHotEncoder(handle_unknown='ignore'), features_range[bp[0]:bp[1]]),
            ('count-vec1', CountVectorizer(analyzer=set), features_range[bp[1]:bp[2]][0]),
            ('count-vec2', CountVectorizer(analyzer=set), features_range[bp[1]:bp[2]][1])
        ], remainder='passthrough')),
    ])
else:
    ct = Pipeline([
        ('imputers', ColumnTransformer([
            ('median', SimpleImputer(strategy='median'), numerical),
            ('frequent', SimpleImputer(strategy='most_frequent'), categorical)
        ], remainder='passthrough')),
        ('preprocess', ColumnTransformer([
            ('scaler', StandardScaler(), features_range[0:bp[0]]),
            ('one-hot', OneHotEncoder(handle_unknown='ignore'), features_range[bp[0]:bp[1]]),
            ('count-vec1', CountVectorizer(analyzer=set), features_range[bp[1]:bp[2]][0])
        ], remainder='passthrough')),
    ])

ppor = Preprocessor(
    pipe=ct,
    feats_out_fn=lambda ct: ct.named_steps['imputers'].transformers_[0][2] \
        + list(ct.named_steps['preprocess'].transformers_[1][1].get_feature_names_out()) \
        + list(ct.named_steps['preprocess'].transformers_[2][1].get_feature_names_out())
        + list(ct.named_steps['preprocess'].transformers_[3][1].get_feature_names_out()) if include_radiomics else None)


## Testing different classifiers

In [43]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "Logistic Regression"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=7),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=.01, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    LogisticRegression(penalty='l2', solver='liblinear')]

In [44]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_auc_score

storage = []

kfold = KFold(n_splits=5, shuffle=True)

pbar = tqdm(enumerate(kfold.split(labels.index)), total=5)
for fold, (I_train, I_test) in pbar:
    pbar.set_description(f'Fold {fold}')
    
    print(I_train)
    print(I_test)
    
    ppor.fit(dataset.iloc[I_train])

    X_train = ppor.transform(dataset.iloc[I_train]) 
    X_test = ppor.transform(dataset.iloc[I_test]) 

    y_train = labels.iloc[I_train]
    y_test = labels.iloc[I_test]
    
    for name, clf in zip(names, classifiers):

        clf.fit(X_train, y_train)
    
        y_pred = clf.predict(X_test)
        
        # Add testing accuracy
        storage.append(
            dict(model=name,
                 metric='Accuracy - testing',
                 value=accuracy_score(y_test, y_pred))
        )
        
        # Add ROC AUC score
        storage.append(
            dict(model=name,
                 metric='ROC AUC - testing',
                 value=roc_auc_score(y_test, y_pred))
        )
        
        # Add other binary classification metrics
        bin_class_metrics = precision_recall_fscore_support(y_test, y_pred, average='binary')
        for value, metric in zip(list(bin_class_metrics)[:-1], ['precision', 'recall', 'fscore']):
            storage.append(dict(model=name, metric=(metric.capitalize() + ' - testing'), value=value))
        

  0%|          | 0/5 [00:00<?, ?it/s]

[ 0  1  2  4  5  6  8  9 10 11 12 14 15 16 17 18 20 21 22 23 24 25 26 28
 29 30 31 33 34 35 36 37 38 39 42 44 45 46 47 49 50 51 52 53 54 55 56 58
 59 60 61 62 63 64 65 66 67 68 70 71 72 74 77 78 81 83 84 85 86 87]
[ 3  7 13 19 27 32 40 41 43 48 57 69 73 75 76 79 80 82]



X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


Precision and F-score are ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


X does not have valid feature names, but AdaBoostClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


Precision and F-score are ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



[ 0  2  3  5  6  7  9 10 11 12 13 14 15 17 18 19 20 21 22 23 24 25 26 27
 29 30 31 32 33 34 35 36 38 39 40 41 43 44 45 46 47 48 49 50 51 53 54 55
 56 57 58 60 64 65 67 68 69 70 72 73 74 75 76 77 78 79 80 81 82 85]
[ 1  4  8 16 28 37 42 52 59 61 62 63 66 71 83 84 86 87]



X does not have valid feature names, but AdaBoostClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


Precision and F-score are ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



[ 0  1  3  4  5  6  7  8 10 11 13 14 15 16 18 19 20 21 22 23 24 25 26 27
 28 30 32 33 34 35 37 38 39 40 41 42 43 45 47 48 49 50 51 52 54 57 58 59
 61 62 63 65 66 68 69 70 71 73 74 75 76 78 79 80 82 83 84 85 86 87]
[ 2  9 12 17 29 31 36 44 46 53 55 56 60 64 67 72 77 81]



X does not have valid feature names, but AdaBoostClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


Precision and F-score are ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



[ 1  2  3  4  5  7  8  9 12 13 15 16 17 18 19 21 22 23 24 27 28 29 31 32
 33 34 35 36 37 38 39 40 41 42 43 44 45 46 48 52 53 54 55 56 57 59 60 61
 62 63 64 66 67 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 86 87]
[ 0  6 10 11 14 20 25 26 30 47 49 50 51 58 65 68 85]



X does not have valid feature names, but AdaBoostClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


Precision and F-score are ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



[ 0  1  2  3  4  6  7  8  9 10 11 12 13 14 16 17 19 20 25 26 27 28 29 30
 31 32 36 37 40 41 42 43 44 46 47 48 49 50 51 52 53 55 56 57 58 59 60 61
 62 63 64 65 66 67 68 69 71 72 73 75 76 77 79 80 81 82 83 84 85 86 87]
[ 5 15 18 21 22 23 24 33 34 35 38 39 45 54 70 74 78]



X does not have valid feature names, but AdaBoostClassifier was fitted with feature names



In [45]:
top_h_legend = dict(orientation='h', yanchor="bottom", y=1.02)

In [46]:
import plotly.express as px

results = pd.DataFrame(storage)

top_three = ['Logistic Regression', 'Linear SVM', 'Decision Tree']

results = results[results.model.isin(top_three)]

results_agg = results.groupby(['metric', 'model']).agg({'value':['mean', 'std']}).reset_index()
results_agg.columns = list(map('_'.join, results_agg.columns.values))

px.bar(results_agg, #.sort_values(['metric_', 'value_mean'], ascending=[True, False]),
       x='metric_', y='value_mean', color='model_', error_y='value_std', barmode='group') \
       .update_layout(legend=top_h_legend, xaxis_title='Evaluation metric', yaxis_title='Performance')

In [47]:
def get_best_worst_coefs(classifier):
    for feature, coef in list(sorted(zip(ppor.get_feature_names_out(), classifier.coef_.reshape(-1)), key=lambda x: x[1], reverse=True))[:5]:
        print(f'{feature:<35} {coef:,.4f}')
        
    for feature, coef in list(sorted(zip(ppor.get_feature_names_out(), classifier.coef_.reshape(-1)), key=lambda x: x[1], reverse=False))[:5]:
        print(f'{feature:<35} {coef:,.4f}')

In [58]:
list(zip(categorical, range(len(categorical))))

[('sex', 0),
 ('NRAS_MUTATION', 1),
 ('BRAF_MUTATION', 2),
 ('concomittant_tvec', 3),
 ('concomittant_LAG3', 4),
 ('prior_targeted_therapy', 5),
 ('prior_treatment', 6),
 ('nivo_maintenance', 7)]

In [56]:
for classifier in ['Logistic Regression', 'Linear SVM']:
    
    print(f'\n{classifier}:\n====')
    print(get_best_worst_coefs(dict(zip(names, classifiers))[classifier]))


Logistic Regression:
====
lymphnode_lowerlimb                 1.2520
neutro_absolus_gl                   0.5499
lympho_absolus_gl                   0.5328
ldh_sang_ul                         0.5142
leucocytes_sang_gl                  0.4673
sd_suv_val                          -1.4926
liver                               -0.9066
eosini_absolus_gl                   -0.8189
bones_thorax                        -0.6589
x2_y                                -0.5345
None

Linear SVM:
====
lympho_absolus_gl                   0.1821
ldh_sang_ul                         0.1689
x7_True                             0.1424
nivo                                0.1424
leucocytes_sang_gl                  0.1389
sd_suv_val                          -0.2120
eosini_absolus_gl                   -0.1463
x7_False                            -0.1424
max_suv_val                         -0.1275
bones_thorax                        -0.1000
None


In [60]:
dtc = dict(zip(names, classifiers))['Decision Tree']

print('Decision Tree\n====')

for feature, coef in list(sorted(zip(ppor.get_feature_names_out(), dtc.feature_importances_.reshape(-1)), key=lambda x: x[1], reverse=True))[:10]:
    print(f'{feature:<35} {coef:,.4f}')


Decision Tree
====
sd_suv_val                          0.2939
ldh_sang_ul                         0.1234
bmi                                 0.1044
eosini_absolus_gl                   0.0926
age_at_treatment_start_in_years     0.0909
min_suv_val                         0.0750
voxels                              0.0560
neutro_absolus_gl                   0.0453
x6_False                            0.0433
x2_n                                0.0308
