In [None]:
# -*- coding: utf-8 -*-

"""
Created January 29, 2023
"""

import os
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


DATA_DIR = (
    "/dartfs-hpc/rc/home/f/f005gzf/projects/xray_fingerprints/analysis/pixel_patterns"
)
BOX_CSV = "/dartfs-hpc/rc/home/f/f005gzf/projects/xray_fingerprints/analysis/pixel_patterns/image_box_agg.csv"
IMAGE_CSV = "/dartfs-hpc/rc/home/f/f005gzf/projects/xray_fingerprints/analysis/pixel_patterns/image_box_agg.csv"
DICOM_METADATA_PATH = "/dartfs-hpc/rc/home/f/f005gzf/projects/xray_fingerprints/analysis/results_bias/data/dicom_metadata_df.pkl"
VARIABLES_OF_INTEREST = [
    "(0008, 0070) Manufacturer",
    "(0008, 1090) Manufacturer's Model Name",
    "(0012, 0030) Clinical Trial Site ID",
    "(0018, 1000) Device Serial Number",
    "Site ID - Model",
    "hospital_site",
]


In [193]:
boxes = pd.read_csv(os.path.join(DATA_DIR, "boxes.csv"))
image_boxes = pd.read_csv(os.path.join(DATA_DIR, "image_box_agg.csv"))

boxes


Unnamed: 0.1,Unnamed: 0,path,image,percent_total_area,horizontal_side,vertical_side
0,31145,/dartfs/rc/nosnapshots/H/HillB-scratch/OAI/pro...,3769001,1.008949,edge,edge
1,35907,/dartfs/rc/nosnapshots/H/HillB-scratch/OAI/pro...,615703,1.008949,edge,edge
2,24056,/dartfs/rc/nosnapshots/H/HillB-scratch/OAI/pro...,3791802,1.008949,edge,edge
3,5949,/dartfs/rc/nosnapshots/H/HillB-scratch/OAI/pro...,1638304,1.008949,edge,edge
4,5895,/dartfs/rc/nosnapshots/H/HillB-scratch/OAI/pro...,2612401,1.008949,edge,edge
...,...,...,...,...,...,...
44263,4631,/dartfs/rc/nosnapshots/H/HillB-scratch/OAI/pro...,1881003,0.001016,indeterminate,indeterminate
44264,2885,/dartfs/rc/nosnapshots/H/HillB-scratch/OAI/pro...,3348301,0.001016,indeterminate,indeterminate
44265,5887,/dartfs/rc/nosnapshots/H/HillB-scratch/OAI/pro...,3914102,0.001016,left,bottom
44266,24978,/dartfs/rc/nosnapshots/H/HillB-scratch/OAI/pro...,2603001,0.001016,left,indeterminate


In [99]:
df_boxes = boxes.join(image_boxes, how="inner", on="image", rsuffix="_right").drop(
    columns=["path", "path_right", "Unnamed: 0", "Unnamed: 0_right", "image_right"]
)

df_boxes["join_id"] = (
    df_boxes["image"].astype(str).str.pad(width=8, side="left", fillchar="0")
)

df_boxes.head()


Unnamed: 0,image,percent_total_area,horizontal_side,vertical_side,area_sum,area_sum_without_edges,n_boxes,join_id
28132,25505,0.00861,indeterminate,top,0.0,0.0,0,25505
28904,13404,0.007773,left,bottom,0.0,0.0,0,13404
32596,13404,0.004484,right,bottom,0.0,0.0,0,13404
38806,13404,0.001814,left,bottom,0.0,0.0,0,13404
31476,12304,0.005301,left,bottom,0.292949,0.001475,3,12304


In [196]:
with open(DICOM_METADATA_PATH, "rb") as f:
    dicom_metadata = pickle.load(f)

dicom_metadata["join_id"] = dicom_metadata.index.values.astype("str")
dicom_metadata = dicom_metadata[["join_id", VARIABLES_OF_INTEREST[0]]]

df = df_boxes.join(dicom_metadata, how="inner", on="join_id", rsuffix="_right").drop(
    columns=["join_id", "join_id_right", "image"]
)

dicom_metadata


Unnamed: 0,join_id,"(0008, 0070) Manufacturer"
00700804,00700804,AGFA
04168501,04168501,"""GE Healthcare"""
02127103,02127103,"FUJI PHOTO FILM Co., ltd."
03558001,03558001,Swissray
01597503,01597503,LS100
...,...,...
03844601,03844601,FUJIFILM Corporation
00507901,00507901,Agfa-Gevaert AG
02256603,02256603,
03625601,03625601,FUJIFILM Corporation


In [170]:
xt = df.drop([VARIABLES_OF_INTEREST[0]], axis=1)
y = df[VARIABLES_OF_INTEREST[0]]


In [172]:
def is_categorical(c, test_type=np.float64):
    try:
        return not (np.issubdtype(c, np.float64) or np.issubdtype(c, np.int64))
    except:
        return True


numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

categorical_features = xt.columns[xt.dtypes.apply(lambda c: is_categorical(c))]
numeric_features = xt.columns[xt.dtypes.apply(lambda c: not is_categorical(c))]
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

model_xt = preprocessor.fit(xt).transform(xt)


In [187]:
# Train the classifier using fit() function
# linear_clf.fit(model_xt, y)

# Print the learned coeficients
# print ("\nThe coefficients of the linear boundary are:", linear_clf.coef_)
# print ("\nThe point of intersection of the line are:",linear_clf.intercept_)

X_train, X_test, y_train, y_test = train_test_split(
    model_xt, y, test_size=0.25, random_state=33
)

clf = SGDClassifier()
# fit (train) the classifier
clf.fit(X_train, y_train)

df.shape


(50, 7)