# Imports

In [1]:
import sys
# !{sys.executable} -m pip install -r requirements.txt

In [2]:
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import joblib
import xgboost as xgb
import dalex as dx

# Configurations

In [3]:
params = {
    "im2vec_file": "./siamese_network_results.csv",
    "pyradiomics_file": "./segmentation_results.csv",
    "model_name": 'model_xgb_estimator.json',
    "seed": 5
}

# Get representations

In [4]:
df_embeddings = pd.read_csv(params["im2vec_file"], sep = ',', converters={"class_anchor": lambda x: x.strip("[]' ,").replace("'","").split(", ")})
df_embeddings

Unnamed: 0,Anchor,class_anchor,patient_id,age,projection,0,1,2,3,4,...,517,518,519,520,521,522,523,524,525,526
0,10001.png,[normal],,1.0,,0.114415,-0.000384,-0.045945,-0.689675,0.074424,...,0.000406,0.000394,0.000381,0.000722,0.000436,0.029501,0.000436,0.000414,0.000401,0.000436
1,10002.png,[normal],,4.0,,0.161416,0.117186,0.079933,-0.689472,0.123091,...,0.000394,0.000382,0.000369,0.000723,0.000425,0.014500,0.000423,0.000402,0.000388,0.000425
2,10003.png,[normal],,11.0,,0.440090,-0.071779,-0.005180,-0.585554,0.302895,...,0.000425,0.000414,0.000402,0.000709,0.000453,0.106709,0.000456,0.000432,0.000420,0.000452
3,10004.png,[normal],,0.0,,0.199513,0.236111,0.138952,-0.607385,0.068279,...,0.000412,0.000400,0.000387,0.000727,0.000442,0.032948,0.000442,0.000420,0.000407,0.000442
4,10005.png,[normal],,2.0,,0.129760,0.011995,-0.030717,-0.662310,0.089031,...,0.000409,0.000398,0.000385,0.000713,0.000438,0.045341,0.000440,0.000417,0.000404,0.000438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2153,50496.png,[mass/nodule],,43.0,,-0.054664,0.006316,-0.078433,-0.403447,0.084872,...,0.000406,0.000394,0.000381,0.000739,0.000438,0.016468,0.000437,0.000415,0.000401,0.000438
2154,50497.png,[mass/nodule],,32.0,,-0.063231,-0.169365,-0.024540,-0.379094,0.110462,...,0.000406,0.000394,0.000381,0.000728,0.000437,0.022816,0.000436,0.000414,0.000400,0.000436
2155,50498.png,[mass/nodule],,38.0,,0.420875,0.009254,-0.171295,-0.415959,0.128509,...,0.000416,0.000405,0.000392,0.000724,0.000446,0.045377,0.000447,0.000424,0.000411,0.000446
2156,50499.png,[mass/nodule],,41.0,,0.090890,0.093846,-0.117923,-0.465891,0.090001,...,0.000396,0.000384,0.000371,0.000723,0.000427,0.016110,0.000426,0.000405,0.000391,0.000427


In [5]:
df_pyradiomics = pd.read_csv(params["pyradiomics_file"], sep = ',')
df_pyradiomics

Unnamed: 0,Anchor,diagnostics_Image-original_Mean,diagnostics_Image-original_Minimum,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_shape_Elongation,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,...,diagnostics_Mask-original_CenterOfMassIndex_1,diagnostics_Mask-original_CenterOfMass_0,diagnostics_Mask-original_CenterOfMass_1,diagnostics_Image-original_Size_0,diagnostics_Image-original_Size_1,diagnostics_Image-original_Size_2,diagnostics_Mask-original_Spacing_0,diagnostics_Mask-original_Spacing_1,diagnostics_Mask-original_Size_0,diagnostics_Mask-original_Size_1
0,10001.png,3643.730799,0.0,16383.0,146995,4,0.489928,171.803021,140.9372,124.4544,...,449.692194,95.386521,83.282994,1093,933,1,0.1852,0.1852,1093,933
1,10002.png,5020.519682,0.0,16383.0,371548,4,0.646898,231.432628,184.4808,126.2736,...,473.640316,128.721183,89.802204,1305,1137,1,0.1896,0.1896,1305,1137
2,10003.png,1788.379015,1.0,4082.0,1497820,3,0.720892,313.900836,253.6720,197.4320,...,951.722451,167.579316,140.854923,2278,2061,1,0.1480,0.1480,2278,2061
3,10004.png,5606.361342,0.0,16383.0,136861,5,0.586460,153.657134,138.7148,68.8944,...,365.727687,94.583378,67.732768,1014,801,1,0.1852,0.1852,1014,801
4,10005.png,2806.019062,0.0,16383.0,228130,2,0.523663,207.943838,163.3464,89.0812,...,447.498422,97.495622,82.876708,1057,946,1,0.1852,0.1852,1057,946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2064,50482.png,566.375375,0.0,1023.0,1873074,2,0.762894,328.755683,259.8310,231.9460,...,1086.864444,155.905923,155.421616,2244,2652,1,0.1430,0.1430,2244,2652
2065,50483.png,545.424717,0.0,1023.0,1848472,2,0.677131,349.154369,285.1420,200.0570,...,1103.668330,205.968190,157.824571,2997,2797,1,0.1430,0.1430,2997,2797
2066,50484.png,612.332757,0.0,1023.0,2143555,2,0.618238,406.744610,341.3410,219.5050,...,1217.310802,197.860223,174.075445,2822,2977,1,0.1430,0.1430,2822,2977
2067,50485.png,543.799692,0.0,1023.0,1790211,4,0.648517,339.546090,272.9870,228.6570,...,1094.840549,203.279836,156.562199,3000,2961,1,0.1430,0.1430,3000,2961


# Prepare data for inference

In [6]:
X = df_embeddings.drop(df_embeddings.columns[0], axis=1, inplace =False)
X = X.to_numpy()

mlb = MultiLabelBinarizer(sparse_output=True)

X = df_embeddings.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(df_embeddings['class_anchor']),
                index=df_embeddings.index,
                columns=mlb.classes_).add_prefix('class_'))

X.head()

Unnamed: 0,Anchor,class_anchor,patient_id,age,projection,0,1,2,3,4,...,523,524,525,526,class_airspace_opacification,class_cardiomegaly,class_fluid,class_mass/nodule,class_normal,class_pneumothorax
0,10001.png,[normal],,1.0,,0.114415,-0.000384,-0.045945,-0.689675,0.074424,...,0.000436,0.000414,0.000401,0.000436,0,0,0,0,1,0
1,10002.png,[normal],,4.0,,0.161416,0.117186,0.079933,-0.689472,0.123091,...,0.000423,0.000402,0.000388,0.000425,0,0,0,0,1,0
2,10003.png,[normal],,11.0,,0.44009,-0.071779,-0.00518,-0.585554,0.302895,...,0.000456,0.000432,0.00042,0.000452,0,0,0,0,1,0
3,10004.png,[normal],,0.0,,0.199513,0.236111,0.138952,-0.607385,0.068279,...,0.000442,0.00042,0.000407,0.000442,0,0,0,0,1,0
4,10005.png,[normal],,2.0,,0.12976,0.011995,-0.030717,-0.66231,0.089031,...,0.00044,0.000417,0.000404,0.000438,0,0,0,0,1,0


In [7]:
X.columns = [int(i) if str(i).isnumeric() else i for i in X.columns]

y = pd.DataFrame( X['class_anchor'].apply(lambda x: ','.join(map(str, x))), columns=['class_anchor'])
y['bin'] =  y.rank(method='dense').astype(int) -1
y['Anchor'] = X.loc[:, 'Anchor']

anchor = X.loc[:, 'Anchor']
emb = X.loc[:, 0:max([i for i in X.columns if isinstance(i, int) or i.isnumeric()])]
tabul = X.loc[:, 'age']
X = pd.concat([anchor, emb, tabul], axis=1)

X.head()

Unnamed: 0,Anchor,0,1,2,3,4,5,6,7,8,...,518,519,520,521,522,523,524,525,526,age
0,10001.png,0.114415,-0.000384,-0.045945,-0.689675,0.074424,-0.019164,-0.236524,0.519167,0.035036,...,0.000394,0.000381,0.000722,0.000436,0.029501,0.000436,0.000414,0.000401,0.000436,1.0
1,10002.png,0.161416,0.117186,0.079933,-0.689472,0.123091,0.06416,-0.290021,0.373386,-0.008939,...,0.000382,0.000369,0.000723,0.000425,0.0145,0.000423,0.000402,0.000388,0.000425,4.0
2,10003.png,0.44009,-0.071779,-0.00518,-0.585554,0.302895,0.020642,-0.275552,0.256461,-0.212354,...,0.000414,0.000402,0.000709,0.000453,0.106709,0.000456,0.000432,0.00042,0.000452,11.0
3,10004.png,0.199513,0.236111,0.138952,-0.607385,0.068279,0.09338,-0.076792,0.354622,0.123941,...,0.0004,0.000387,0.000727,0.000442,0.032948,0.000442,0.00042,0.000407,0.000442,0.0
4,10005.png,0.12976,0.011995,-0.030717,-0.66231,0.089031,0.062323,-0.135343,0.454197,-0.03771,...,0.000398,0.000385,0.000713,0.000438,0.045341,0.00044,0.000417,0.000404,0.000438,2.0


In [8]:
df_pyradiomics = pd.read_csv(params["pyradiomics_file"], sep = ',')

removal_list = list(set(X.Anchor.to_list()) - set(df_pyradiomics.Anchor.to_list()))

y = y[~y['Anchor'].isin(removal_list)]
y.drop(['Anchor'], inplace=True, axis=1)

del removal_list

In [9]:
X = pd.merge(X, df_pyradiomics, on="Anchor")

df_pyradiomics.drop(['Anchor'], inplace=True, axis=1)
X.drop(['Anchor'], inplace=True, axis=1)
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,diagnostics_Mask-original_CenterOfMassIndex_1,diagnostics_Mask-original_CenterOfMass_0,diagnostics_Mask-original_CenterOfMass_1,diagnostics_Image-original_Size_0,diagnostics_Image-original_Size_1,diagnostics_Image-original_Size_2,diagnostics_Mask-original_Spacing_0,diagnostics_Mask-original_Spacing_1,diagnostics_Mask-original_Size_0,diagnostics_Mask-original_Size_1
0,0.114415,-0.000384,-0.045945,-0.689675,0.074424,-0.019164,-0.236524,0.519167,0.035036,0.513395,...,449.692194,95.386521,83.282994,1093,933,1,0.1852,0.1852,1093,933
1,0.161416,0.117186,0.079933,-0.689472,0.123091,0.06416,-0.290021,0.373386,-0.008939,0.487495,...,473.640316,128.721183,89.802204,1305,1137,1,0.1896,0.1896,1305,1137
2,0.44009,-0.071779,-0.00518,-0.585554,0.302895,0.020642,-0.275552,0.256461,-0.212354,0.396841,...,951.722451,167.579316,140.854923,2278,2061,1,0.148,0.148,2278,2061
3,0.199513,0.236111,0.138952,-0.607385,0.068279,0.09338,-0.076792,0.354622,0.123941,0.568928,...,365.727687,94.583378,67.732768,1014,801,1,0.1852,0.1852,1014,801
4,0.12976,0.011995,-0.030717,-0.66231,0.089031,0.062323,-0.135343,0.454197,-0.03771,0.467546,...,447.498422,97.495622,82.876708,1057,946,1,0.1852,0.1852,1057,946


In [10]:
_, X_test, _, y_test = train_test_split(X, y, test_size=0.20, random_state=params["seed"])

scaler = MinMaxScaler()
X_test = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns)
X_test



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,diagnostics_Mask-original_CenterOfMassIndex_1,diagnostics_Mask-original_CenterOfMass_0,diagnostics_Mask-original_CenterOfMass_1,diagnostics_Image-original_Size_0,diagnostics_Image-original_Size_1,diagnostics_Image-original_Size_2,diagnostics_Mask-original_Spacing_0,diagnostics_Mask-original_Spacing_1,diagnostics_Mask-original_Size_0,diagnostics_Mask-original_Size_1
0,0.148033,0.984783,0.567092,0.691476,0.421484,0.694835,0.820932,1.000000,0.828816,0.691887,...,0.428471,0.605514,0.438796,0.468832,0.534834,0.0,0.509435,0.509435,0.468832,0.534834
1,0.417454,0.386013,0.622815,0.150327,0.606634,0.686947,0.496071,0.493719,0.485600,0.706690,...,0.695283,0.566992,0.434059,0.710351,1.000000,0.0,0.000000,0.000000,0.710351,1.000000
2,0.474705,0.217836,0.586546,0.483994,0.623095,0.379727,0.353990,0.676442,0.422375,0.325324,...,0.397067,0.951584,0.403700,0.589736,0.676301,0.0,0.509435,0.509435,0.589736,0.676301
3,0.606160,0.353712,0.557863,0.282206,0.637548,0.438712,0.609628,0.528418,0.368582,0.483791,...,0.315760,0.818340,0.295612,0.639606,0.714938,0.0,0.456369,0.456369,0.639606,0.714938
4,0.776237,0.121006,0.104796,0.321854,1.000000,0.278855,0.685602,0.811834,0.885033,0.745389,...,0.352518,0.763636,0.324136,0.648884,0.576209,0.0,0.424529,0.424529,0.648884,0.576209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409,0.141892,0.589641,0.615451,0.492879,0.424269,0.532218,0.232864,0.596349,0.648681,0.637762,...,0.391353,0.785959,0.397315,0.588286,0.619106,0.0,0.509435,0.509435,0.588286,0.619106
410,0.316088,0.615059,0.408907,0.595588,0.425406,0.481281,0.435999,0.528989,0.323581,0.335794,...,0.377789,0.674332,0.382157,0.554074,0.677213,0.0,0.509435,0.509435,0.554074,0.677213
411,0.665596,0.882603,0.698767,0.370100,0.655503,0.524283,0.720303,0.475892,0.624037,0.814357,...,0.490416,0.852852,0.508022,0.587707,0.670824,0.0,0.509435,0.509435,0.587707,0.670824
412,0.368302,0.620091,0.313862,0.207153,0.489669,0.690175,0.498964,0.656001,0.394609,0.695701,...,0.252374,0.000000,0.099620,0.121195,0.389413,0.0,0.000000,0.000000,0.121195,0.389413


## Load model

In [11]:
# model = joblib.load("model_xgb_estimator.json")

model = xgb.XGBClassifier()
model.load_model(params['model_name'])



## Explain model

In [12]:
exp = dx.Explainer(model, X_test, y_test[['bin']])
exp

Preparation of a new explainer is initiated

  -> data              : 414 rows 654 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 414 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x7ff4a07a61f0> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.000355, mean = 0.196, max = 0.978
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.658, mean = 2.47, max = 5.0
  -> model_info        : package xgboost

A new explainer has been created!


<dalex._explainer.object.Explainer at 0x7ff49ab97ca0>