In [240]:
# -*- coding: utf-8 -*-

"""
Created May 7, 2022
"""

import glob
import os
import pickle

import numpy as np
import pandas as pd
from IPython.display import display_html
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

import utils


BASED_STRUCTURED_DIR = "/dartfs/rc/nosnapshots/H/HillB-scratch/OAI/structured_data"
PREDICTIONS_CSV = "/dartfs/rc/nosnapshots/H/HillB-scratch/OAI/model_predictions/predictions_2022_12_06_08_09_19_298378.csv"
DICOM_METADATA_PATH = "./data/dicom_metadata_df.pkl"
VARIABLES_OF_INTEREST = [
    "(0008, 0070) Manufacturer",
    "(0008, 1090) Manufacturer's Model Name",
    "(0012, 0030) Clinical Trial Site ID",
    "(0018, 1000) Device Serial Number",
    "Site ID - Model",
    "hospital_site",
]
CLINICAL_WAVES_TO_FOLLOWUP = {
    "00": "Screening Visit",
    "01": "12 month Annual Visit",
    "03": "24 month Annual Visit",
    "05": "36 month Annual Visit",
    "06": "48 month Annual Visit",
    "07": "60 month Annual Visit",
    "08": "72 month Annual Visit",
    "09": "84 month Annual Visit",
    "10": "96 month Annual Visit",
    "11": "108 month Annual Visit",
}


## Structured Data

Here we combine the structured data from multiple timepoints.

In [241]:
original_dataframes: dict[str, pd.DataFrame] = {}

clinical_data_files = glob.glob(os.path.join(BASED_STRUCTURED_DIR, "AllClinical*.txt"))
xray_files = glob.glob(os.path.join(BASED_STRUCTURED_DIR, "XRay*.txt"))
enrollee_files = glob.glob(os.path.join(BASED_STRUCTURED_DIR, "Enrollees*.txt"))

combined_files = sorted(clinical_data_files + xray_files)

for filename in combined_files:
    name = os.path.basename(filename).replace(".txt", "").lower()

    df = utils.read_data_frame(os.path.join(BASED_STRUCTURED_DIR, filename))

    if df is None:
        continue

    original_dataframes[name] = df

if not len(enrollee_files):
    raise ValueError("missing Enrollee.txt from structured data")

df_site = utils.read_data_frame(enrollee_files[0])[["id", "v00site"]].astype("string")


In [242]:
dfs = {}

for side in ["left", "right"]:
    subset_cols = (
        ["id", "kooskpl", "womkpl"] if side == "left" else ["id", "kooskpr", "womkpr"]
    )

    dfs[side] = utils.concatenate_from_timepoints(
        original_dataframes,
        dataset_substring="allclinical",
        subset=subset_cols,
    )

    dfs[side] = dfs[side].merge(df_site, on="id", how="inner")
    dfs[side].rename(columns={"v00site": "hospital_site"}, inplace=True)

    utils.validate_column(
        pd.Series(dfs[side].columns),
        subset_cols + ["visit", "hospital_site"],
    )

    dfs[side].columns = [
        "id",
        "koos_pain_subscore",
        "womac_pain_subscore",
        "visit",
        "hospital_site",
    ]  # type:ignore
    dfs[side]["side"] = side

all_knee_pain_scores = pd.concat([val for val in dfs.values()])
all_knee_pain_scores["visit"] = all_knee_pain_scores["visit"].apply(
    lambda x: CLINICAL_WAVES_TO_FOLLOWUP.get(x, pd.NA)
)
all_knee_pain_scores


Unnamed: 0,id,koos_pain_subscore,womac_pain_subscore,visit,hospital_site,side
0,9000099,100.0,0.0,Screening Visit,B,left
1,9000099,88.9,0.0,12 month Annual Visit,B,left
2,9000099,80.6,4.0,24 month Annual Visit,B,left
3,9000099,97.2,0.0,,B,left
4,9000099,83.3,2.0,36 month Annual Visit,B,left
...,...,...,...,...,...,...
48737,9999878,94.4,0.0,60 month Annual Visit,C,right
48738,9999878,97.2,0.0,72 month Annual Visit,C,right
48739,9999878,97.2,0.0,84 month Annual Visit,C,right
48740,9999878,84.4,1.0,96 month Annual Visit,C,right


In [244]:
with open(DICOM_METADATA_PATH, "rb") as f:
    dicom_metadata = pickle.load(f)

dicom_metadata["join_id"] = dicom_metadata.index.values.astype("str")
dicom_metadata["dicom_join_id"] = dicom_metadata["(0010, 0020) Patient ID"].astype(
    "str"
)
dicom_metadata["Site ID - Model"] = dicom_metadata[
    "(0012, 0030) Clinical Trial Site ID"
].str.cat(dicom_metadata["(0008, 1090) Manufacturer's Model Name"])

dicom_metadata[
    ["join_id", "dicom_join_id"]
    + [v for v in VARIABLES_OF_INTEREST if v != "hospital_site"]
]


Unnamed: 0,join_id,dicom_join_id,"(0008, 0070) Manufacturer","(0008, 1090) Manufacturer's Model Name","(0012, 0030) Clinical Trial Site ID","(0018, 1000) Device Serial Number",Site ID - Model
00700804,00700804,9659701,AGFA,ADC_51xx,58,1134,58ADC_51xx
04168501,04168501,9485404,"""GE Healthcare""","""Definium 5000""",46,,"46""Definium 5000"""
02127103,02127103,9596662,"FUJI PHOTO FILM Co., ltd.",,23,,
03558001,03558001,9018489,Swissray,ddR Formula System,34,S402607,34ddR Formula System
01597503,01597503,9953183,LS100,Lumisys,10,,10Lumisys
...,...,...,...,...,...,...,...
03844601,03844601,9257071,FUJIFILM Corporation,,58,,
00507901,00507901,9101951,Agfa-Gevaert AG,ADC_5146,46,2205,46ADC_5146
02256603,02256603,9297051,,,11,,
03625601,03625601,9836547,FUJIFILM Corporation,,10,,


## Merging Dicom Metadata

We can join the OAI structured data with the dicom metadata by joining on the patient ID.  Our variables of interest for grouping koos predictions are:

- by manufacturer
- by model
- by serial number
- by site-id
- by site-id / model

In [245]:
df_combined = dicom_metadata.merge(
    all_knee_pain_scores,
    left_on="dicom_join_id",
    right_on="id",
    how="left",
).dropna(subset=["id", "koos_pain_subscore"])

dicom_timepoint = "(0012, 0051) Clinical Trial Time Point Description"

df_combined["bin_koos"] = df_combined["koos_pain_subscore"].apply(utils.binarize_koos)
df_combined["koos_pain_subscore"].astype("string")
df_combined[dicom_timepoint] = df_combined[dicom_timepoint].astype("string")
df_combined = df_combined.dropna(subset=["bin_koos"])
df_combined = df_combined.loc[df_combined["visit"] == df_combined[dicom_timepoint]]
df_combined


Unnamed: 0,"(0008, 0005) Specific Character Set","(0008, 0008) Image Type","(0008, 0012) Instance Creation Date","(0008, 0013) Instance Creation Time","(0008, 0016) SOP Class UID","(0008, 0018) SOP Instance UID","(0008, 0020) Study Date","(0008, 0022) Acquisition Date","(0008, 0030) Study Time","(0008, 0032) Acquisition Time",...,join_id,dicom_join_id,Site ID - Model,id,koos_pain_subscore,womac_pain_subscore,visit,hospital_site,side,bin_koos
1,ISO_IR 100,"[DERIVED, PRIMARY]",20050513,091344,1.2.840.10008.5.1.4.1.1.1,1.3.6.1.4.1.21767.172.16.8.165.1158825987.343....,20050513,20050513,092624,092624,...,00700804,9659701,58ADC_51xx,9659701,80.6,2.0,12 month Annual Visit,D,left,1.0
11,ISO_IR 100,"[DERIVED, PRIMARY]",20050513,091344,1.2.840.10008.5.1.4.1.1.1,1.3.6.1.4.1.21767.172.16.8.165.1158825987.343....,20050513,20050513,092624,092624,...,00700804,9659701,58ADC_51xx,9659701,100.0,0.0,12 month Annual Visit,D,right,0.0
28,ISO_IR 100,"[ORIGINAL, PRIMARY]",,,1.2.840.10008.5.1.4.1.1.1.1,1.3.6.1.4.1.21767.127.0.0.1.1377304274.0.0.1.1,20130806,20130806,142822,143521,...,04168501,9485404,"46""Definium 5000""",9485404,94.4,1.0,96 month Annual Visit,C,left,0.0
38,ISO_IR 100,"[ORIGINAL, PRIMARY]",,,1.2.840.10008.5.1.4.1.1.1.1,1.3.6.1.4.1.21767.127.0.0.1.1377304274.0.0.1.1,20130806,20130806,142822,143521,...,04168501,9485404,"46""Definium 5000""",9485404,94.4,1.0,96 month Annual Visit,C,right,0.0
42,,"[DERIVED, PRIMARY, POST_PROCESSED, , , , , , 1...",,,1.2.840.10008.5.1.4.1.1.1,1.3.6.1.4.1.21767.172.16.8.165.1183056852.849....,20070611,20070611,112952.000,113258.531,...,02127103,9596662,,9596662,90.6,1.0,24 month Annual Visit,E,left,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539975,,,,,1.2.840.10008.5.1.4.1.1.7,1.3.6.1.4.1.21767.172.16.10.155.1188580343.595...,20070822,20070822,000000,000000.0,...,02256603,9297051,,9297051,86.1,1.0,36 month Annual Visit,B,right,1.0
539988,,DERIVED,,,1.2.840.10008.5.1.4.1.1.1,1.3.6.1.4.1.21767.127.0.0.1.1300739971.0.0.1.1,20110308,20110308,091253,091432.500,...,03625601,9836547,,9836547,75.0,2.0,72 month Annual Visit,B,left,1.0
539998,,DERIVED,,,1.2.840.10008.5.1.4.1.1.1,1.3.6.1.4.1.21767.127.0.0.1.1300739971.0.0.1.1,20110308,20110308,091253,091432.500,...,03625601,9836547,,9836547,80.6,2.0,72 month Annual Visit,B,right,1.0
540002,,"[ORIGINAL, SECONDARY, RADIOGRAPH]",20050929,233900,1.2.840.10008.5.1.4.1.1.1,1.3.6.1.4.1.21767.172.16.8.165.1143501729.639....,20040615,20040615,000000,000000.0,...,00096003,9444196,22Lumisys,9444196,44.4,10.0,Screening Visit,E,left,1.0


We can aggregate koos scores by `Manufacterer model`.

## Koos Aggregation

Aggregation of `binarized koos` allows us to see the percentage of severe knee pain x-rays for each manufacturer.

In [247]:
koos_variables = ["koos_pain_subscore", "bin_koos"]

html_raw_tables = []
for target_variable in VARIABLES_OF_INTEREST:
    df_agg = (
        df_combined[koos_variables + [target_variable]]
        .replace("", np.nan)
        .dropna(subset=[target_variable])
        .groupby(target_variable)
        .agg(["mean", "median", "count"])
    )

    html_raw_tables.append(
        df_agg.style.set_table_attributes("style='display:inline'")
        .set_caption(target_variable)
        ._repr_html_()
    )

display_html("".join(html_raw_tables), raw=True)


Unnamed: 0_level_0,koos_pain_subscore,koos_pain_subscore,koos_pain_subscore,bin_koos,bin_koos,bin_koos
Unnamed: 0_level_1,mean,median,count,mean,median,count
"(0008, 0070) Manufacturer",Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
"""GE Healthcare""",88.483013,94.4,7759,0.336126,0.0,7759
AGFA,87.368271,94.4,9455,0.364252,0.0,9455
Agfa-Gevaert AG,87.975816,94.4,7869,0.360147,0.0,7869
"FUJI PHOTO FILM Co., ltd.",81.182679,88.9,2396,0.481219,0.0,2396
FUJIFILM Corporation,88.121656,94.4,7402,0.341124,0.0,7402
GE Healthcare,89.328483,97.2,811,0.314427,0.0,811
LS100,85.277453,91.7,4271,0.401077,0.0,4271
Philips Medical Systems,83.130526,91.15,190,0.447368,0.0,190
SIEMENS,81.545692,88.9,650,0.467692,0.0,650
Swissray,82.619614,88.9,8713,0.46379,0.0,8713

Unnamed: 0_level_0,koos_pain_subscore,koos_pain_subscore,koos_pain_subscore,bin_koos,bin_koos,bin_koos
Unnamed: 0_level_1,mean,median,count,mean,median,count
"(0008, 1090) Manufacturer's Model Name",Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
"""Definium 5000""",88.325206,94.4,6927,0.34084,0.0,6927
"""Thunder Platform""",89.796875,97.2,832,0.296875,0.0,832
ADC_5146,87.975816,94.4,7869,0.360147,0.0,7869
ADC_51xx,87.368271,94.4,9455,0.364252,0.0,9455
DigitalDiagnost,82.902151,90.6,186,0.451613,0.0,186
Discovery XR656,89.328483,97.2,811,0.314427,0.0,811
FLUOROSPOT_COMPACT,83.3,83.3,2,0.5,0.5,2
Lumisys,85.432595,91.7,3930,0.400763,0.0,3930
SIEMENS FD-X,81.540278,88.9,648,0.467593,0.0,648
ddR Combi System,77.6,83.3,8,0.5,0.5,8

Unnamed: 0_level_0,koos_pain_subscore,koos_pain_subscore,koos_pain_subscore,bin_koos,bin_koos,bin_koos
Unnamed: 0_level_1,mean,median,count,mean,median,count
"(0012, 0030) Clinical Trial Site ID",Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,97.05,97.05,2,0.0,0.0,2
5,88.9,88.9,2,0.5,0.5,2
10,90.197833,97.2,2492,0.274077,0.0,2492
11,86.789731,94.4,8949,0.370544,0.0,8949
22,84.840659,91.15,182,0.428571,0.0,182
23,80.585016,87.5,4418,0.491852,0.0,4418
33,72.2,72.2,2,1.0,1.0,2
34,83.183036,90.6,560,0.442857,0.0,560
35,82.775842,88.9,7248,0.462748,0.0,7248
38,95.85,95.85,2,0.0,0.0,2

Unnamed: 0_level_0,koos_pain_subscore,koos_pain_subscore,koos_pain_subscore,bin_koos,bin_koos,bin_koos
Unnamed: 0_level_1,mean,median,count,mean,median,count
"(0018, 1000) Device Serial Number",Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
08.02.366,100.0,100.0,2,0.0,0.0,2
08.02.399,87.5,87.5,2,0.5,0.5,2
1003,83.3,83.3,2,0.5,0.5,2
1018,88.120218,94.4,2666,0.349587,0.0,2666
1134,86.551434,91.7,5162,0.38396,0.0,5162
1522,88.990047,94.4,643,0.339036,0.0,643
1844,79.73,86.1,40,0.6,1.0,40
1845,81.659375,88.9,608,0.458882,0.0,608
2205,87.98644,94.4,7795,0.359589,0.0,7795
3677,86.856757,93.8,74,0.418919,0.0,74

Unnamed: 0_level_0,koos_pain_subscore,koos_pain_subscore,koos_pain_subscore,bin_koos,bin_koos,bin_koos
Unnamed: 0_level_1,mean,median,count,mean,median,count
Site ID - Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
5ddR Modulaire System,88.9,88.9,2,0.5,0.5,2
10,93.942424,100.0,66,0.151515,0.0,66
10Lumisys,91.20728,97.2,522,0.239464,0.0,522
11,85.047445,93.8,137,0.386861,0.0,137
11Lumisys,85.02262,91.7,3046,0.41694,0.0,3046
22,84.844304,90.6,79,0.43038,0.0,79
22Lumisys,83.7375,90.3,32,0.375,0.0,32
22ddR Formula System,84.22,87.5,40,0.5,0.5,40
23,79.675937,86.1,1521,0.518738,1.0,1521
23DigitalDiagnost,78.848276,86.1,58,0.534483,1.0,58

Unnamed: 0_level_0,koos_pain_subscore,koos_pain_subscore,koos_pain_subscore,bin_koos,bin_koos,bin_koos
Unnamed: 0_level_1,mean,median,count,mean,median,count
hospital_site,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,82.850781,88.9,8068,0.460833,0.0,8068
B,87.533456,94.4,11451,0.349402,0.0,11451
C,88.224542,94.4,15618,0.348252,0.0,15618
D,87.642052,94.4,13248,0.358318,0.0,13248
E,80.763158,88.9,4598,0.488691,0.0,4598


## Model predictions

These predictions are generated by the `koos` deep learning model.  They are written to the path indicated by `PREDICTIONS_CSV`.  Xrays were scored using a corresponding slurm model creating a `koos` numeric prediction and a binned prediction mapped to either `0` or `1`.

In [248]:
df_predictions = pd.read_csv(
    PREDICTIONS_CSV,
    names=["batch", "koos_prediction", "patient_id", "bin_koos_prediction"],
    header=None,
    skiprows=1,
)

df_predictions["join_id"] = df_predictions["patient_id"].map("{:08}".format)
df_predictions


Unnamed: 0,batch,koos_prediction,patient_id,bin_koos_prediction,join_id
0,0,97.930750,1652703,0.0,01652703
1,1,80.273544,2002706,1.0,02002706
2,2,91.560130,1623104,0.0,01623104
3,3,83.731750,4181102,1.0,04181102
4,4,86.088936,2300601,1.0,02300601
...,...,...,...,...,...
26515,3,82.593605,2063801,1.0,02063801
26516,4,98.483200,2338901,0.0,02338901
26517,5,94.040940,1371103,0.0,01371103
26518,6,82.667530,1632904,1.0,01632904


## Merging Predictions

Model predictions are merged with dicom metadata by joining again on patient ID.  This allows us to have both the actual and predicted koos value for a given xray.  The sides for each xray are included in the analysis.

In [249]:
koos_variables = ["koos_pain_subscore", "bin_koos"]

df_combined_predictions = df_combined.merge(
    df_predictions, left_on="join_id", right_on="join_id", how="inner"
).dropna(subset=["koos_prediction"] + VARIABLES_OF_INTEREST)

df_combined_predictions[
    ["join_id", "visit", "side"] + koos_variables + VARIABLES_OF_INTEREST
]


Unnamed: 0,join_id,visit,side,koos_pain_subscore,bin_koos,"(0008, 0070) Manufacturer","(0008, 1090) Manufacturer's Model Name","(0012, 0030) Clinical Trial Site ID","(0018, 1000) Device Serial Number",Site ID - Model,hospital_site
0,00700804,12 month Annual Visit,left,80.6,1.0,AGFA,ADC_51xx,58,1134,58ADC_51xx,D
1,00700804,12 month Annual Visit,right,100.0,0.0,AGFA,ADC_51xx,58,1134,58ADC_51xx,D
6,03558001,72 month Annual Visit,left,100.0,0.0,Swissray,ddR Formula System,34,S402607,34ddR Formula System,A
7,03558001,72 month Annual Visit,right,100.0,0.0,Swissray,ddR Formula System,34,S402607,34ddR Formula System,A
16,02102701,36 month Annual Visit,left,88.9,0.0,AGFA,ADC_51xx,58,1522,58ADC_51xx,D
...,...,...,...,...,...,...,...,...,...,...,...
52962,02143601,36 month Annual Visit,right,97.2,0.0,AGFA,ADC_51xx,58,1134,58ADC_51xx,D
52971,00333904,Screening Visit,left,100.0,0.0,AGFA,ADC_51xx,58,1134,58ADC_51xx,D
52972,00333904,Screening Visit,right,61.1,1.0,AGFA,ADC_51xx,58,1134,58ADC_51xx,D
52975,00507901,Screening Visit,left,100.0,0.0,Agfa-Gevaert AG,ADC_5146,46,2205,46ADC_5146,C


In [250]:
def prediction_performance(ytrue, ypred):
    stats = {}

    try:
        cm = confusion_matrix(ytrue, ypred)

        spec_sum = cm[1, 0] + cm[1, 1]
        sen_sum = cm[0, 0] + cm[0, 1]
        prec_sum = cm[0, 0] + cm[1, 0]

        if spec_sum == 0 or sen_sum == 0 or prec_sum == 0:
            raise Exception

        if not len(ypred) or len(cm) <= 1:
            raise Exception

        stats["Accuracy"] = (cm[0, 0] + cm[1, 1]) / cm.sum()
        stats["Precision"] = cm[0, 0] / (cm[0, 0] + cm[1, 0])
        stats["Specificity"] = cm[1, 1] / (spec_sum)
        stats["Sensitivity"] = cm[0, 0] / (sen_sum)
        stats["ER"] = (cm[0, 1] + cm[1, 0]) / cm.sum()
        stats["FPR"] = 1 - stats["Specificity"]
        stats["TP Count"] = cm[0, 0]
        stats["TN Count"] = cm[1, 1]
    except:
        stats["Accuracy"] = np.nan
        stats["Precision"] = np.nan
        stats["Specificity"] = np.nan
        stats["Sensitivity"] = np.nan
        stats["ER"] = np.nan
        stats["FPR"] = np.nan
        stats["TP Count"] = np.nan
        stats["TN Count"] = np.nan

    return stats


global_ytrue = df_combined_predictions["bin_koos"]
global_ypred = df_combined_predictions["bin_koos_prediction"]
global_cm = confusion_matrix(global_ytrue, global_ypred)

prediction_performance(global_ytrue, global_ypred)


{'Accuracy': 0.5068156930861307,
 'Precision': 0.5996302382908792,
 'Specificity': 0.4437785388127854,
 'Sensitivity': 0.5483063051781354,
 'ER': 0.4931843069138693,
 'FPR': 0.5562214611872146,
 'TP Count': 8757,
 'TN Count': 4665}

## Global Performance

The global performance of prediction gives an accuracy of 0.507 with a precision of 0.600.  Sensitivity is 0.548 and specificity is 0.444.

## Koos Accuracy by Manufacturer

Grouping the predictions by manufacturer shows varying levels of accuracy.  The most accurately predicted pain scores occured when the manufacturer model was the `SIEMENS`.  The least accurate occurred with `Agfa-Gevaert AG`.

## Accuracy Discrepancies

Many of the xray manufacturers show widely varying sensitivities and specificities.  Aside from the large drop in accuracy with `Agfa-Gevaert AG`, there is a large difference between sensitivity even at similar counts.  These differences also hold for model names and site id.

In [251]:
def grouping_func(row):
    ytrue = row["bin_koos"]
    ypred = row["bin_koos_prediction"]
    d = {"Count": len(ypred)}
    d.update(prediction_performance(ytrue, ypred))

    return pd.Series(d)


html_raw_tables = []
for grouping_var in VARIABLES_OF_INTEREST:
    df_combined_predictions[grouping_var] = df_combined_predictions[
        grouping_var
    ].astype("string")
    df_agg = (
        df_combined_predictions[df_combined_predictions[grouping_var] != ""]
        .groupby([grouping_var])
        .apply(grouping_func)
        .sort_values(by=["Count"], ascending=False)
    )

    html_raw_tables.append(
        df_agg.style.set_table_attributes("style='display:inline'")
        .set_caption(grouping_var)
        ._repr_html_()
    )

display_html("".join(html_raw_tables), raw=True)


Unnamed: 0_level_0,Count,Accuracy,Precision,Specificity,Sensitivity,ER,FPR,TP Count,TN Count
"(0008, 0070) Manufacturer",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AGFA,9343.0,0.551857,0.647763,0.373187,0.653085,0.448143,0.626813,3895.0,1261.0
Swissray,8421.0,0.525828,0.542532,0.290644,0.729874,0.474172,0.709356,3291.0,1137.0
Agfa-Gevaert AG,7867.0,0.427482,0.637922,0.753884,0.243893,0.572518,0.246116,1228.0,2135.0
SIEMENS,650.0,0.563077,0.564583,0.3125,0.783237,0.436923,0.6875,271.0,95.0
Philips Medical Systems,190.0,0.552632,0.586207,0.435294,0.647619,0.447368,0.564706,68.0,37.0
GE Healthcare,12.0,,,,,,,,

Unnamed: 0_level_0,Count,Accuracy,Precision,Specificity,Sensitivity,ER,FPR,TP Count,TN Count
"(0008, 1090) Manufacturer's Model Name",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ADC_51xx,9343.0,0.551857,0.647763,0.373187,0.653085,0.448143,0.626813,3895.0,1261.0
ADC_5146,7867.0,0.427482,0.637922,0.753884,0.243893,0.572518,0.246116,1228.0,2135.0
ddR Modulaire System,5100.0,0.523725,0.536206,0.252926,0.762925,0.476275,0.747074,2066.0,605.0
ddR Formula System,3261.0,0.529592,0.555658,0.352624,0.677746,0.470408,0.647376,1203.0,524.0
SIEMENS FD-X,648.0,0.563272,0.564854,0.313531,0.782609,0.436728,0.686469,270.0,95.0
DigitalDiagnost,186.0,0.548387,0.580357,0.440476,0.637255,0.451613,0.559524,65.0,37.0
ddR Multi System,52.0,0.538462,0.47619,0.266667,0.909091,0.461538,0.733333,20.0,8.0
Discovery XR656,12.0,,,,,,,,
ddR Combi System,8.0,0.25,0.333333,0.0,0.5,0.75,1.0,2.0,0.0
digital DIAGNOST,4.0,0.75,0.75,0.0,1.0,0.25,1.0,3.0,0.0

Unnamed: 0_level_0,Count,Accuracy,Precision,Specificity,Sensitivity,ER,FPR,TP Count,TN Count
"(0012, 0030) Clinical Trial Site ID",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
58,9351.0,0.551706,0.647997,0.373002,0.652771,0.448294,0.626998,3899.0,1260.0
46,7859.0,0.427535,0.637922,0.753623,0.244135,0.572465,0.246377,1228.0,2132.0
35,7194.0,0.528774,0.545489,0.287173,0.736869,0.471226,0.712827,2848.0,956.0
23,1467.0,0.534424,0.541459,0.345221,0.707572,0.465576,0.654779,542.0,242.0
34,556.0,0.535971,0.561275,0.278226,0.743506,0.464029,0.721774,229.0,69.0
22,40.0,0.25,0.291667,0.15,0.35,0.75,0.85,7.0,3.0
40,6.0,,,,,,,,
5,2.0,0.5,0.5,0.0,1.0,0.5,1.0,1.0,0.0
33,2.0,,,,,,,,
38,2.0,,,,,,,,

Unnamed: 0_level_0,Count,Accuracy,Precision,Specificity,Sensitivity,ER,FPR,TP Count,TN Count
"(0018, 1000) Device Serial Number",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2205,7793.0,0.42628,0.636602,0.752588,0.243189,0.57372,0.247412,1214.0,2108.0
1134,5056.0,0.547073,0.631546,0.385016,0.646458,0.452927,0.614984,2026.0,740.0
S402607,2769.0,0.524016,0.552471,0.361331,0.660252,0.475984,0.638669,995.0,456.0
1018,2666.0,0.550638,0.657277,0.373391,0.645905,0.449362,0.626609,1120.0,348.0
5094,944.0,0.598517,0.706154,0.356902,0.709428,0.401483,0.643098,459.0,106.0
1522,643.0,0.527216,0.641686,0.298165,0.644706,0.472784,0.701835,274.0,65.0
1845,608.0,0.570724,0.576233,0.322581,0.781155,0.429276,0.677419,257.0,90.0
963334016841,186.0,0.548387,0.580357,0.440476,0.637255,0.451613,0.559524,65.0,37.0
3677,74.0,0.554054,0.777778,0.870968,0.325581,0.445946,0.129032,14.0,27.0
1844,40.0,0.45,0.40625,0.208333,0.8125,0.55,0.791667,13.0,5.0

Unnamed: 0_level_0,Count,Accuracy,Precision,Specificity,Sensitivity,ER,FPR,TP Count,TN Count
Site ID - Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
58ADC_51xx,9339.0,0.551986,0.647763,0.373002,0.653414,0.448014,0.626998,3895.0,1260.0
46ADC_5146,7859.0,0.427535,0.637922,0.753623,0.244135,0.572465,0.246377,1228.0,2132.0
35ddR Modulaire System,4718.0,0.522043,0.532978,0.253142,0.762651,0.477957,0.746858,1899.0,564.0
35ddR Formula System,2294.0,0.540105,0.570687,0.351616,0.69128,0.459895,0.648384,880.0,359.0
23ddR Formula System,757.0,0.513871,0.52381,0.371585,0.647059,0.486129,0.628415,253.0,136.0
23SIEMENS FD-X,648.0,0.563272,0.564854,0.313531,0.782609,0.436728,0.686469,270.0,95.0
34ddR Modulaire System,374.0,0.545455,0.578014,0.25625,0.761682,0.454545,0.74375,163.0,41.0
34ddR Formula System,170.0,0.523529,0.543103,0.329114,0.692308,0.476471,0.670886,63.0,26.0
35DigitalDiagnost,126.0,0.579365,0.635135,0.490566,0.643836,0.420635,0.509434,47.0,26.0
23DigitalDiagnost,58.0,0.465517,0.444444,0.354839,0.592593,0.534483,0.645161,16.0,11.0

Unnamed: 0_level_0,Count,Accuracy,Precision,Specificity,Sensitivity,ER,FPR,TP Count,TN Count
hospital_site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
D,9347.0,0.551621,0.64788,0.372816,0.652764,0.448379,0.627184,3897.0,1259.0
C,7863.0,0.427445,0.637922,0.75371,0.24399,0.572555,0.24629,1228.0,2133.0
A,7764.0,0.529238,0.546567,0.28683,0.737081,0.470762,0.71317,3081.0,1028.0
E,1509.0,0.527502,0.536514,0.339806,0.699239,0.472498,0.660194,551.0,245.0


## Feature Selection Analysis

In [252]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, SelectKBest


TEST_VARIABLES = [
    "(0008, 0080) Institution Name",
    "(0008, 0008) Image Type",
    "(0008, 1030) Study Description",
    "(0018, 1000) Device Serial Number",
    "(0018, 1020) Software Versions",
    "(0018, 5101) View Position",
    *VARIABLES_OF_INTEREST,
]


def feature_select(dataset, yvar="bin_koos", xvars=[]):
    if not len(xvars):
        raise ValueError("xvars cannot be empty")

    le = LabelEncoder()
    oe = OrdinalEncoder()

    X = dataset[xvars].astype(str)
    y = dataset[yvar].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=0
    )

    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)

    oe.fit(X_train)
    X_train_enc = oe.transform(X_train)
    X_test_enc = oe.transform(X_test)

    fs = SelectKBest(score_func=chi2, k="all")
    fs.fit(X_train_enc, y_train_enc)

    cols = fs.get_support(indices=True)
    feature_names = X.iloc[:, cols].columns

    X_train_fs = fs.transform(X_train_enc)
    X_test_fs = fs.transform(X_test_enc)

    for i in range(len(fs.scores_)):
        print("Feature %s: %f" % (feature_names[i], fs.scores_[i]))


feature_select(df_combined_predictions, yvar="bin_koos", xvars=TEST_VARIABLES)


Feature (0008, 0080) Institution Name: 20.746257
Feature (0008, 0008) Image Type: 525.462432
Feature (0008, 1030) Study Description: 67.581534
Feature (0018, 1000) Device Serial Number: 34.454259
Feature (0018, 1020) Software Versions: 17.500046
Feature (0018, 5101) View Position: 0.422160
Feature (0008, 0070) Manufacturer: 391.664286
Feature (0008, 1090) Manufacturer's Model Name: 643.504002
Feature (0012, 0030) Clinical Trial Site ID: 161.297454
Feature (0018, 1000) Device Serial Number: 34.454259
Feature Site ID - Model: 197.672877
Feature hospital_site: 30.200640


In [253]:
pd.set_option("display.max_rows", 75)

target_variable = "hospital_site"
model_name = "(0008, 1090) Manufacturer's Model Name"

df_combined.groupby([target_variable, model_name])["id"].agg(["count"])


Unnamed: 0_level_0,Unnamed: 1_level_0,count
hospital_site,"(0008, 1090) Manufacturer's Model Name",Unnamed: 2_level_1
A,,0
A,"""Definium 5000""",4
A,"""Thunder Platform""",2
A,ADC_5146,4
A,ADC_51xx,8
A,DigitalDiagnost,128
A,Discovery XR656,0
A,FLUOROSPOT_COMPACT,0
A,Lumisys,2
A,SIEMENS FD-X,0
