# Build label dataframe

## Imports

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path

import OAI_Utilities as utils # ln -s ../../OAI/notebooks/OAI_Utilities.py

## Constants

In [2]:
# Answer columns pulled from the clinical table (their values are frequency answers, "Never" .. "Every day").
food_var = ['FFQ18', 'FFQ40', 'FFQ69', 'FFQ70', 'FFQ52', 'FFQ37']

# DICOM header fields kept from the image metadata table.
VARIABLES_OF_INTEREST = [
    "(0008, 0070) Manufacturer",
    "(0008, 1090) Manufacturer's Model Name",
    "(0012, 0030) Clinical Trial Site ID",
    "(0018, 1000) Device Serial Number",
]

# Directory holding the OAI parquet extracts, resolved under the current user's home directory.
OAI_DATA_PATH = Path.home().joinpath('code/OAI/notebooks/data/')

# Short alias for pandas' index-slicing helper.
idxSlc = pd.IndexSlice

## Read in data

In [16]:
# Enrollee table: supplies RACE and SITE (SEX is read from it later as well).
enrollees_df = utils.read_parquet(OAI_DATA_PATH.joinpath('enrollees_values.parquet'))
# Clinical table: supplies the food_var answer columns.
allclinical_df = utils.read_parquet(OAI_DATA_PATH.joinpath('allclinical_values.parquet'))
# X-ray records for the bilateral PA fixed-flexion knee exams (ID, Barcode, Date are used below).
xr_df = utils.read_parquet(OAI_DATA_PATH.joinpath('xray_bilat_pa_fixed_flex_knee_values.parquet'))
# DICOM header values per image: manufacturer, model, serial number, clinical trial site ID.
metadata_df = utils.read_parquet(OAI_DATA_PATH.joinpath('dicom_metadata_df.parquet'))

# Explore

In [22]:
# How large would the training set be if restricted to white men enrolled at site C?
at_site_c = enrollees_df['SITE'] == 'C'
is_male = enrollees_df['SEX'] == '1: Male'
is_white = enrollees_df['RACE'] == '1: White or Caucasian'
enrollees_df[at_site_c & is_male & is_white].shape

(567, 8)

# Create dataframes for model training

## All images
These labels are used to train models that directly identify SITE, SEX, RACE, MFG, MODEL, SERIAL, and YEAR from each image.

In [4]:
# --- Food answers -----------------------------------------------------------
# Answer codes that carry no usable information; they are turned into NaN.
no_answer = ['.E: Non-Exposed Control', '.R: Refused', '.A: Not Expected', '.M: Missing']
# np.nan is the portable spelling: the np.NaN alias was removed in NumPy 2.0.
replacements = {ans: np.nan for ans in no_answer}

# Keep only the food columns and drop the 'Visit' level from the index.
food_answers_df = allclinical_df[food_var].copy(deep=True)
food_answers_df = food_answers_df.reset_index('Visit', drop=True)
# One replace covers every food column. (Previously this identical whole-frame
# replace was repeated once per column by a loop that never used its loop variable.)
food_answers_df[food_var] = food_answers_df[food_var].replace(replacements)
# Drop rows without any usable answer.
food_answers_df = food_answers_df.dropna(how='all')
for col in food_var:
    print('Patients w/ {} var {:,}'.format(col, food_answers_df[col].notna().sum()))

# --- Race answers -----------------------------------------------------------
no_answer = ['.R: Refused', '.D: Don t Know/Unknown/Uncertain']
replacements = {ans: np.nan for ans in no_answer}
# Build a new frame rather than assigning into enrollees_df: on a re-run the
# name refers to a filtered slice, and writing a column into it would trigger
# pandas' SettingWithCopyWarning.
enrollees_df = enrollees_df.assign(RACE=enrollees_df['RACE'].replace(replacements))
enrollees_df = enrollees_df[enrollees_df['RACE'].notna()]
print('Patients w/ race answer {:,}'.format(len(enrollees_df)))

# --- Overlap ----------------------------------------------------------------
# Report, per food variable, how many index entries have both that answer and a race answer.
for col in food_var:
    cohort = set(food_answers_df[food_answers_df[col].notna()].index) & set(enrollees_df.index)
    print('Patients with both race and {} answers {:,}'.format(col, len(cohort)))

# Restricting both tables to a cohort is currently disabled. NOTE: at this point
# `cohort` only reflects the LAST food variable in the loop above.
#enrollees_df = enrollees_df.loc[list(cohort)]
#food_answers_df = food_answers_df.loc[list(cohort)]

Patients w/ FFQ18 var 4,654
Patients w/ FFQ40 var 4,631
Patients w/ FFQ69 var 4,665
Patients w/ FFQ70 var 4,661
Patients w/ FFQ52 var 4,659
Patients w/ FFQ37 var 4,634
Patients w/ race answer 4,791
Patients with both race and FFQ18 answers 4,650
Patients with both race and FFQ40 answers 4,627
Patients with both race and FFQ69 answers 4,661
Patients with both race and FFQ70 answers 4,657
Patients with both race and FFQ52 answers 4,655
Patients with both race and FFQ37 answers 4,630


In [5]:
# Acquisition year of each x-ray, stored as text.
xr_df['YEAR'] = xr_df['Date'].dt.year.astype(str)

# Start from the x-ray records, indexed by numeric ID so the ID-indexed tables can be joined on.
xray_records = xr_df[['ID', 'Barcode', 'YEAR']].copy(deep=True)
xray_records['ID'] = pd.to_numeric(xray_records['ID'], downcast='unsigned')
barcode_site_id_df = xray_records.set_index(['ID'])
print('X-rays we have records for {:,}'.format(len(barcode_site_id_df)))

# Target layout, indexed by barcode:
#   ID, YEAR, food_var..., SITE, RACE, SEX, then the DICOM fields in VARIABLES_OF_INTEREST.
# (Joining allclinical_df['AGE'] as well is currently disabled.)
barcode_site_id_df = (
    barcode_site_id_df
    .join(food_answers_df[food_var], how='left')                # food answers; x-rays without them are kept
    .join(enrollees_df[['SITE', 'RACE', 'SEX']], how='inner')   # clinical site, race, sex; unmatched x-rays are dropped
    .reset_index('ID')
    .set_index('Barcode')                                       # switch to indexing by barcode
    .join(metadata_df[VARIABLES_OF_INTEREST])                   # manufacturer, model, x-ray site ID, serial number
)

# Sanity check: the joins should not increase the number of entries.
print('X-rays in set {:,}'.format(len(barcode_site_id_df)))
print('# of patients {:,}'.format(barcode_site_id_df['ID'].unique().size))

X-rays we have records for 26,520
X-rays in set 26,495
# of patients 4,789


In [6]:
# Swap the raw DICOM tag headers for short label names.
short_names = {
    '(0008, 0070) Manufacturer': 'MFG',
    "(0008, 1090) Manufacturer's Model Name": 'MODEL',
    '(0012, 0030) Clinical Trial Site ID': 'XRAY SITE',
    '(0018, 1000) Device Serial Number': 'SERIAL',
}
barcode_site_id_df = barcode_site_id_df.rename(columns=short_names)

In [7]:
# Clean up labels
# Strings that stand for "no value" once a column has been cast to str:
# '' (blank field), 'nan' (a float NaN) and 'None' (a Python None).
# Every column now scrubs all three; previously each column handled only a
# subset, so a leftover 'nan'/'None' string could survive as a real category.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
missing_labels = {'': np.nan, 'nan': np.nan, 'None': np.nan}

# Per-column spellings that should collapse onto one canonical label.
label_aliases = {
    'MFG': {
        'Agfa-Gevaert AG': 'Agfa-Gevaert',
        'AGFA': 'Agfa-Gevaert',
        'FUJI PHOTO FILM Co., ltd.': 'FUJI',
        'FUJIFILM Corporation': 'FUJI',
        '"GE Healthcare"': 'GE Healthcare',
    },
    'MODEL': {
        '"Definium 5000"': 'Definium 5000',
        '"Thunder Platform"': 'Thunder Platform',
    },
    'XRAY SITE': {},
    'SERIAL': {},
}


def _clean_label_column(values, aliases):
    """Return `values` as a categorical Series of cleaned string labels.

    The values are cast to str, the missing-value strings in `missing_labels`
    become NaN, the spellings in `aliases` are mapped to their canonical
    label, and the result is converted to the 'category' dtype.

    Parameters
    ----------
    values : pandas.Series
        Raw label column.
    aliases : dict
        Mapping of alternative spelling -> canonical label (may be empty).
    """
    return values.astype(str).replace({**missing_labels, **aliases}).astype('category')


for var, aliases in label_aliases.items():
    barcode_site_id_df[var] = _clean_label_column(barcode_site_id_df[var], aliases)

In [8]:
# Drop categories that no longer occur in any row of each categorical column.
categorical_columns = barcode_site_id_df.select_dtypes(include=['category']).columns
for name in categorical_columns:
    trimmed = barcode_site_id_df[name].cat.remove_unused_categories()
    barcode_site_id_df[name] = trimmed

In [9]:
# Simplify class values: replace the verbose answer text with compact codes.

# Frequency answer -> its leading numeric code.
frequency_codes = {
    '1: Never': 1,
    '2: A few times per year': 2,
    '3: Once per month': 3,
    '4: 2-3 times per month': 4,
    '5: Once per week': 5,
    '6: Twice per week': 6,
    '7: 3-4 times per week': 7,
    '8: 5-6 times per week': 8,
    '9: Every day': 9,
}
for food_col in food_var:
    coded = barcode_site_id_df[food_col].cat.rename_categories(frequency_codes)
    # Go through UInt8 before re-categorising; otherwise the categories are int64.
    barcode_site_id_df[food_col] = coded.astype('UInt8').astype('category')

# Race answer -> single-letter code.
race_codes = {
    '0: Other Non-white': 'O',
    '1: White or Caucasian': 'W',
    '2: Black or African American': 'B',
    '3: Asian': 'A',
}
barcode_site_id_df['RACE'] = barcode_site_id_df['RACE'].cat.rename_categories(race_codes)

# Sex answer -> single-letter code.
sex_codes = {'1: Male': 'M', '2: Female': 'F'}
barcode_site_id_df['SEX'] = barcode_site_id_df['SEX'].cat.rename_categories(sex_codes)

In [10]:
# Spot-check the cleaned labels for one x-ray, looked up by its barcode (the frame's index).
barcode_site_id_df.loc['03841001']

ID                 9480338
YEAR                  2012
FFQ18                    3
FFQ40                    1
FFQ69                    1
FFQ70                    3
FFQ52                    6
FFQ37                    2
SITE                     C
RACE                     W
SEX                      F
MFG          GE Healthcare
MODEL        Definium 5000
XRAY SITE               46
SERIAL                 NaN
Name: 03841001, dtype: object

# Write to Parquet

In [11]:
# Persist the full label table (every x-ray in the set) to parquet, relative to the working directory.
utils.write_parquet(barcode_site_id_df, Path('data/xray_shortcutting_labels.parquet'))

Column FFQ18 is marked as categorical but will be stored as UInt8.
Column FFQ40 is marked as categorical but will be stored as UInt8.
Column FFQ69 is marked as categorical but will be stored as UInt8.
Column FFQ70 is marked as categorical but will be stored as UInt8.
Column FFQ52 is marked as categorical but will be stored as UInt8.
Column FFQ37 is marked as categorical but will be stored as UInt8.


In [12]:
# A dataset for local CNN tests: only the x-rays whose barcode starts with '000'.
is_test_barcode = barcode_site_id_df.index.str.startswith('000')
test_labels_path = Path('data/xray_shortcutting_labels_test.parquet')
utils.write_parquet(barcode_site_id_df[is_test_barcode], test_labels_path)

Column FFQ18 is marked as categorical but will be stored as UInt8.
Column FFQ40 is marked as categorical but will be stored as UInt8.
Column FFQ69 is marked as categorical but will be stored as UInt8.
Column FFQ70 is marked as categorical but will be stored as UInt8.
Column FFQ52 is marked as categorical but will be stored as UInt8.
Column FFQ37 is marked as categorical but will be stored as UInt8.


In [13]:
# Show the dtype of every column in the label table.
barcode_site_id_df.dtypes

ID             uint64
YEAR           object
FFQ18        category
FFQ40        category
FFQ69        category
FFQ70        category
FFQ52        category
FFQ37        category
SITE         category
RACE         category
SEX          category
MFG          category
MODEL        category
XRAY SITE    category
SERIAL       category
dtype: object

In [4]:
# Reload the label table from the parquet file written earlier in this notebook.
barcode_site_id_df = utils.read_parquet(Path('data/xray_shortcutting_labels.parquet'))

In [10]:
# Number of rows with a non-missing MFG label (value_counts leaves out NaN by default).
barcode_site_id_df['MFG'].value_counts().sum()

26495

In [24]:
# X-rays per clinical SITE among the rows that have an FFQ18 answer.
has_ffq18 = barcode_site_id_df['FFQ18'].notna()
barcode_site_id_df.loc[has_ffq18, 'SITE'].value_counts()

C    7564
D    6280
B    5718
A    4000
E    2181
Name: SITE, dtype: int64