In [65]:
# -*- coding: utf-8 -*-

"""
Created May 7, 2022
"""

import glob
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score

import utils


# Directory searched (via glob) for the "AllClinical*.txt" and "XRay*.txt"
# structured-data files.
BASED_STRUCTURED_DIR = "/dartfs/rc/nosnapshots/H/HillB-scratch/OAI/structured_data"
# CSV of model predictions; read with pd.read_csv in the "Model predictions" section.
PREDICTIONS_CSV = "/dartfs/rc/nosnapshots/H/HillB-scratch/OAI/model_predictions/predictions_2022_12_06_08_09_19_298378.csv"
# Pickled DICOM metadata, loaded with pickle.load; the path is relative to the
# current working directory.
DICOM_METADATA_PATH = "./data/dicom_metadata_df.pkl"


## Structured Data

Here we combine the structured data from multiple timepoints.

In [17]:
# Read every clinical and X-ray file into a dict keyed by the lower-cased base
# file name with ".txt" removed.
original_dataframes: dict[str, pd.DataFrame] = {}

clinical_data_files = glob.glob(os.path.join(BASED_STRUCTURED_DIR, "AllClinical*.txt"))
xray_files = glob.glob(os.path.join(BASED_STRUCTURED_DIR, "XRay*.txt"))

# Sorted so the files are always processed in the same order.
combined_files = sorted(clinical_data_files + xray_files)

for filename in combined_files:
    name = os.path.basename(filename).replace(".txt", "").lower()

    # glob already returns paths that include BASED_STRUCTURED_DIR, so the path
    # is used as-is. Joining it onto the directory a second time would duplicate
    # the prefix whenever the directory is given as a relative path.
    df = utils.read_data_frame(filename)

    # Files for which the reader yields None are skipped.
    if df is None:
        continue

    original_dataframes[name] = df


In [18]:
# Build one pain-score frame per knee side, then stack both sides into
# `all_knee_pain_scores`.
dfs: dict[str, pd.DataFrame] = {}

for side in ("left", "right"):
    # The source columns differ only in their trailing letter: "l" for the
    # left knee, "r" for the right knee.
    if side == "left":
        subset_cols = ["id", "kooskpl", "womkpl"]
    else:
        subset_cols = ["id", "kooskpr", "womkpr"]

    side_scores = utils.concatenate_from_timepoints(
        original_dataframes,
        dataset_substring="allclinical",
        subset=subset_cols,
    )
    dfs[side] = side_scores

    # Validate against the requested columns plus "visit" before the positional
    # rename below (presumably a presence/order check; confirm in utils).
    utils.validate_column(pd.Series(side_scores.columns), [*subset_cols, "visit"])

    # Side-neutral names so the left and right frames can be stacked.
    side_scores.columns = [
        "id",
        "koos_pain_subscore",
        "womac_pain_subscore",
        "visit",
    ]  # type:ignore
    side_scores["side"] = side

all_knee_pain_scores = pd.concat(list(dfs.values()))


In [19]:
# Load the pickled DICOM metadata.
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# it on a file from a trusted source.
with open(DICOM_METADATA_PATH, "rb") as metadata_file:
    dicom_metadata = pickle.load(metadata_file)

# Copy the index into a string column; "join_id" is the key later used to merge
# in the model predictions.
index_as_text = dicom_metadata.index.values.astype("str")
dicom_metadata["join_id"] = index_as_text


## Merging Dicom Metadata

We can join the OAI structured data with the dicom metadata by joining on the patient ID.

In [35]:
# Columns kept after joining the pain scores onto the DICOM metadata.
SUBSET_COLUMNS = [
    "id",
    "join_id",
    "koos_pain_subscore",
    "side",
    "visit",
    "(0008, 1090) Manufacturer's Model Name",
    "(0018, 1000) Device Serial Number",
    "(0010, 0020) Patient ID",
]

# Left-join the pain scores onto the DICOM metadata by patient ID, then drop
# metadata rows that found no matching score.
df_combined = utils.subset_dataframe(
    dicom_metadata.merge(
        all_knee_pain_scores,
        left_on="(0010, 0020) Patient ID",
        right_on="id",
        how="left",
    ).dropna(subset=["id", "koos_pain_subscore"]),
    SUBSET_COLUMNS,
)

# Binary pain label derived from the numeric score. "koos_pain_subscore" itself
# is left numeric because the next cell takes its mean and median.
df_combined["bin_koos"] = df_combined["koos_pain_subscore"].apply(utils.binarize_koos)

# Rows for which no binary label could be produced are discarded.
df_combined = df_combined.dropna(subset=["bin_koos"])
df_combined


Unnamed: 0,id,join_id,koos_pain_subscore,side,visit,"(0008, 1090) Manufacturer's Model Name","(0018, 1000) Device Serial Number","(0010, 0020) Patient ID",bin_koos
0,9659701,00700804,97.2,left,00,ADC_51xx,1134,9659701,0.0
1,9659701,00700804,80.6,left,01,ADC_51xx,1134,9659701,1.0
2,9659701,00700804,91.7,left,03,ADC_51xx,1134,9659701,0.0
3,9659701,00700804,100.0,left,05,ADC_51xx,1134,9659701,0.0
4,9659701,00700804,100.0,left,06,ADC_51xx,1134,9659701,0.0
...,...,...,...,...,...,...,...,...,...
540016,9444196,00096003,72.2,right,06,Lumisys,,9444196,1.0
540017,9444196,00096003,61.1,right,07,Lumisys,,9444196,1.0
540018,9444196,00096003,83.3,right,08,Lumisys,,9444196,1.0
540020,9444196,00096003,65.6,right,10,Lumisys,,9444196,1.0


We can aggregate KOOS scores by manufacturer model (`Manufacturer's Model Name`).

In [36]:
# Identifier and bookkeeping columns that must not be aggregated; what remains
# is the model name (the group key) plus the two KOOS columns.
columns_to_drop = [
    "id",
    "join_id",
    "side",
    "visit",
    "(0010, 0020) Patient ID",
    "(0018, 1000) Device Serial Number",
]

# Mean and median of the remaining columns per manufacturer model.
koos_by_model = (
    df_combined.drop(columns=columns_to_drop)
    .groupby(["(0008, 1090) Manufacturer's Model Name"])
    .agg(["mean", "median"])
)
koos_by_model


Unnamed: 0_level_0,koos_pain_subscore,koos_pain_subscore,bin_koos,bin_koos
Unnamed: 0_level_1,mean,median,mean,median
"(0008, 1090) Manufacturer's Model Name",Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
,82.28561,88.9,0.458663,0.0
"""Definium 5000""",88.460731,94.4,0.340028,0.0
"""Thunder Platform""",89.342762,94.4,0.31013,0.0
ADC_5146,87.432215,94.4,0.368998,0.0
ADC_51xx,87.205553,94.4,0.372047,0.0
DigitalDiagnost,82.227218,88.9,0.455643,0.0
Discovery XR656,89.247124,97.2,0.31324,0.0
FLUOROSPOT_COMPACT,85.96,94.4,0.45,0.0
Lumisys,86.171019,92.9,0.385325,0.0
SIEMENS FD-X,81.272059,88.9,0.471184,0.0


## Model predictions

These predictions are generated by the `koos` deep learning model and are written to the path indicated by `PREDICTIONS_CSV`. X-rays were scored using a corresponding slurm model, producing a numeric `koos` prediction and a binned prediction mapped to either `0` or `1`.

In [31]:
# The first line of the file is skipped (presumably its own header row) and
# these column names are applied instead.
prediction_columns = ["batch", "koos_prediction", "patient_id", "bin_koos_prediction"]

df_predictions = pd.read_csv(
    PREDICTIONS_CSV,
    header=None,
    skiprows=1,
    names=prediction_columns,
)

# Zero-pad the patient ID to eight characters to build the "join_id" merge key.
zero_pad_to_eight = "{:08}".format
df_predictions["join_id"] = df_predictions["patient_id"].map(zero_pad_to_eight)
df_predictions


Unnamed: 0,batch,koos_prediction,patient_id,bin_koos_prediction,join_id
0,0,97.930750,1652703,0.0,01652703
1,1,80.273544,2002706,1.0,02002706
2,2,91.560130,1623104,0.0,01623104
3,3,83.731750,4181102,1.0,04181102
4,4,86.088936,2300601,1.0,02300601
...,...,...,...,...,...
26515,3,82.593605,2063801,1.0,02063801
26516,4,98.483200,2338901,0.0,02338901
26517,5,94.040940,1371103,0.0,01371103
26518,6,82.667530,1632904,1.0,01632904


## Merging Predictions

Model predictions are merged with the combined DICOM metadata and pain scores by joining on `join_id`, which for the predictions is the patient ID zero-padded to eight digits. This gives us both the actual and the predicted KOOS value for a given x-ray.

In [32]:
# Columns kept once predictions are attached to the scored metadata.
SUBSET_COLUMNS = [
    "join_id",
    "koos_prediction",
    "koos_pain_subscore",
    "bin_koos_prediction",
    "bin_koos",
    "(0008, 1090) Manufacturer's Model Name",
    "(0018, 1000) Device Serial Number",
]

# Inner join: only rows with the same "join_id" on both sides survive.
df_scored_with_predictions = df_combined.merge(df_predictions, on="join_id", how="inner")

# Drop rows that lack a prediction or a manufacturer model name.
df_scored_with_predictions = df_scored_with_predictions.dropna(
    subset=["koos_prediction", "(0008, 1090) Manufacturer's Model Name"]
)

# Keep a single row per join_id.
# NOTE(review): df_combined carries one row per side/visit for each join_id, so
# keep="last" retains whichever score row happens to come last; confirm that
# this is the intended ground-truth score for the x-ray.
df_combined_predictions = utils.subset_dataframe(
    df_scored_with_predictions,
    SUBSET_COLUMNS,
).drop_duplicates(subset=["join_id"], keep="last")

df_combined_predictions


Unnamed: 0,join_id,koos_prediction,koos_pain_subscore,bin_koos_prediction,bin_koos,"(0008, 1090) Manufacturer's Model Name","(0018, 1000) Device Serial Number"
17,00700804,89.882675,100.0,0.0,0.0,ADC_51xx,1134
35,04168501,88.671280,88.9,0.0,0.0,"""Definium 5000""",
65,03558001,86.851900,100.0,0.0,0.0,ddR Formula System,S402607
81,01597503,91.696610,100.0,0.0,0.0,Lumisys,
99,01896403,94.442770,100.0,0.0,0.0,Lumisys,
...,...,...,...,...,...,...,...
450681,01051303,95.334170,100.0,0.0,0.0,Lumisys,
450699,03514501,87.965840,100.0,0.0,0.0,"""Definium 5000""",
450717,00333904,92.455330,94.4,0.0,0.0,ADC_51xx,1134
450739,00507901,83.471210,100.0,1.0,0.0,ADC_5146,2205


## Koos Accuracy by Manufacturer

Grouping the predictions by manufacturer model shows varying levels of accuracy. The most accurately predicted pain scores occurred when the manufacturer model was the `ddR Multi System`. The least accurate occurred with `Discovery XR656`.

In [69]:
def grouping_func(row):
    """Summarise binary KOOS prediction quality for one group.

    Despite the parameter name, ``row`` is the sub-frame that
    ``groupby(...).apply`` passes for each group, not a single row.

    Args:
        row: Mapping-like object with a ``"bin_koos"`` entry (true labels)
            and a ``"bin_koos_prediction"`` entry (predicted labels).

    Returns:
        pd.Series with three entries, in this order:
            ``Accuracy``    - fraction of predictions equal to the true label.
            ``Specificity`` - recall of the ``0`` class; reported as ``0``
                              when it is undefined (``zero_division = 0``).
            ``Count``       - number of predictions in the group.
        ``Accuracy`` and ``Specificity`` are NaN for an empty group.
    """
    actual = row["bin_koos"]
    predicted = row["bin_koos_prediction"]
    n_predictions = len(predicted)

    if n_predictions:
        accuracy = accuracy_score(actual, predicted)
        # Recall with pos_label=0 is the true-negative rate, i.e. specificity.
        specificity = recall_score(actual, predicted, pos_label=0, zero_division = 0)
    else:
        # Nothing to score: report the metrics as missing.
        accuracy = np.nan
        specificity = np.nan

    return pd.Series(
        {"Accuracy": accuracy, "Specificity": specificity, "Count": n_predictions}
    )

grouping_var = "(0008, 1090) Manufacturer's Model Name"

# Cast the group key to pandas' string dtype so that blank labels can be
# filtered out with a plain comparison.
df_combined_predictions[grouping_var] = df_combined_predictions[grouping_var].astype("string")

# Score each manufacturer model, ignoring rows whose label is an empty string.
has_label = df_combined_predictions[grouping_var] != ""
(
    df_combined_predictions[has_label]
    .groupby([grouping_var])
    .apply(grouping_func)
)

Unnamed: 0_level_0,Accuracy,Specificity,Count
"(0008, 1090) Manufacturer's Model Name",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"""Definium 5000""",0.543228,0.707728,3470.0
"""Thunder Platform""",0.519231,0.607407,416.0
ADC_5146,0.446927,0.239811,3938.0
ADC_51xx,0.541323,0.65914,4731.0
DigitalDiagnost,0.612903,0.685185,93.0
Discovery XR656,0.431034,0.218254,406.0
FLUOROSPOT_COMPACT,1.0,1.0,1.0
Lumisys,0.565107,0.960609,1966.0
SIEMENS FD-X,0.564815,0.784884,324.0
ddR Combi System,0.0,0.0,4.0


## Koos Accuracy by Serial Number

Grouping the predictions by serial number also shows varying levels of accuracy. The most accurately predicted pain scores occurred for device serial number `963334016841`.

In [70]:
grouping_var = "(0018, 1000) Device Serial Number"

# Cast the group key to pandas' string dtype so that blank serial numbers can
# be filtered out with a plain comparison.
df_combined_predictions[grouping_var] = df_combined_predictions[grouping_var].astype("string")

# Score each device serial number, ignoring rows whose serial is an empty string.
has_serial = df_combined_predictions[grouping_var] != ""
(
    df_combined_predictions[has_serial]
    .groupby([grouping_var])
    .apply(grouping_func)
)

Unnamed: 0_level_0,Accuracy,Specificity,Count
"(0018, 1000) Device Serial Number",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
08.02.366,1.0,1.0,1.0
08.02.399,0.0,0.0,1.0
1003,1.0,1.0,1.0
1018,0.56072,0.66065,1334.0
1134,0.530596,0.647217,2582.0
1522,0.518634,0.656085,322.0
1844,0.35,0.714286,20.0
1845,0.578947,0.787879,304.0
2205,0.446809,0.239601,3901.0
3677,0.459459,0.26087,37.0
