In [58]:
# -*- coding: utf-8 -*-

"""
Created January 29, 2023
"""

import os
import pickle

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


DATA_DIR = "/dartfs-hpc/rc/home/f/f005gzf/projects/xray_fingerprints/analysis/pixel_patterns"
BOX_CSV = "/dartfs-hpc/rc/home/f/f005gzf/projects/xray_fingerprints/analysis/pixel_patterns/image_box_agg.csv"
IMAGE_CSV = "/dartfs-hpc/rc/home/f/f005gzf/projects/xray_fingerprints/analysis/pixel_patterns/image_box_agg.csv"
DICOM_METADATA_PATH = "/dartfs-hpc/rc/home/f/f005gzf/projects/xray_fingerprints/analysis/results_bias/data/dicom_metadata_df.pkl"
VARIABLES_OF_INTEREST = [
    "(0008, 0070) Manufacturer",
    "(0008, 1090) Manufacturer's Model Name",
    "(0012, 0030) Clinical Trial Site ID",
    "(0018, 1000) Device Serial Number",
    "Site ID - Model",
    "hospital_site",
]


In [3]:
boxes = pd.read_csv(os.path.join(DATA_DIR, "boxes.csv"))
image_boxes = pd.read_csv(os.path.join(DATA_DIR, "image_box_agg.csv"))


In [29]:
df_boxes = boxes.merge(image_boxes, on='image', how="inner") \
    .drop(columns=['path_x', 'path_y', 'Unnamed: 0_x', 'Unnamed: 0_y'])

df_boxes["join_id"] = df_boxes["image"].astype("int64")

df_boxes


Unnamed: 0,image,percent_total_area,horizontal_side,vertical_side,area_sum,area_sum_without_edges,n_boxes,join_id
0,3769001,1.008949,edge,edge,1.008949,0.000000,1,3769001
1,615703,1.008949,edge,edge,1.008949,0.000000,1,615703
2,3791802,1.008949,edge,edge,1.008949,0.000000,1,3791802
3,1638304,1.008949,edge,edge,1.008949,0.000000,1,1638304
4,2612401,1.008949,edge,edge,1.008949,0.000000,2,2612401
...,...,...,...,...,...,...,...,...
44263,3515401,0.001016,indeterminate,indeterminate,0.001016,0.001016,1,3515401
44264,3815302,0.001016,indeterminate,bottom,0.001016,0.001016,1,3815302
44265,3972401,0.001016,left,indeterminate,0.001016,0.001016,1,3972401
44266,3223704,0.001016,right,top,0.001016,0.001016,1,3223704


In [103]:
with open(DICOM_METADATA_PATH, "rb") as f:
    dicom_metadata = pickle.load(f)


def merge_metadata(target_variable):
    df_dicom_metadata = dicom_metadata.copy()
    df_dicom_metadata["join_id"] = dicom_metadata.index.values.astype("int64")
    df_dicom_metadata = df_dicom_metadata[['join_id', target_variable]]

    df = df_boxes.merge(df_dicom_metadata, on='join_id', how='inner') \
        .drop(columns=['join_id', 'image'])

    df = df.dropna(subset=[target_variable])

    return df


In [137]:
def is_categorical(c, test_type=np.float64):
    try:
        return not (np.issubdtype(c, np.float64) or np.issubdtype(c, np.int64))
    except:
        return True


def train_model(target_variable):
    df = merge_metadata(target_variable)
    y = df[target_variable]
    xt = df.drop([target_variable], axis=1)

    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )

    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    categorical_features = xt.columns[
        xt.dtypes.apply(lambda c: is_categorical(c))
    ]
    numeric_features = xt.columns[
        xt.dtypes.apply(lambda c: not is_categorical(c))
    ]
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )

    return preprocessor.fit(xt).transform(xt), xt, y


def get_model_accuracy(target_variable):
    model_xt, _, y = train_model(target_variable)

    X_train, X_test, y_train, y_test = train_test_split(
        model_xt, y, test_size=0.25, random_state=33)

    clf = SGDClassifier()

    clf.fit(X_train, y_train)

    y_train_pred = clf.predict(X_train)

    accuracy = metrics.accuracy_score(y_train, y_train_pred)

    return accuracy


In [141]:
for var in VARIABLES_OF_INTEREST:
    print(
        f"Variable {var} with a model accuracy of {get_model_accuracy(VARIABLES_OF_INTEREST[0])}")


Variable (0008, 0070) Manufacturer with a model accuracy of 0.3916028255894936
Variable (0008, 1090) Manufacturer's Model Name with a model accuracy of 0.42141743773422213
Variable (0012, 0030) Clinical Trial Site ID with a model accuracy of 0.4217822438894969
Variable (0018, 1000) Device Serial Number with a model accuracy of 0.409445163002023
Variable Site ID - Model with a model accuracy of 0.39080688488707593
Variable hospital_site with a model accuracy of 0.435180579046861
