In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import seaborn as sns
import pickle
import math

# Notebook-wide pandas display settings: no row/column truncation, wide text cells.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('max_colwidth', 700),
):
    pd.set_option(option_name, option_value)

The data sampling for additional SOD labeling will be done according to the analysis results of Gelderman_SOD_cohort_analysis.ipynb.

# Import master dataset and process it

In [38]:
# Load the master dataset from its pickle file.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
master_data = pd.read_pickle('../data/master_dataset.pkl')
print(master_data.shape[0])
display(master_data.head())

1132970


Unnamed: 0,new_id,old_id,img_path,img,correct_img_date,date_placed_ARF,PMI_days,year,sex,ancestry,est_stature_cm,est_weight_lb,est_stature_in,age_at_death,true_BP,pred_BP,pred_BP_conf,true_SOD_G,BP_of_true_SOD_G,pred_SOD_G,pred_SOD_G_conf,true_SOD_M,BP_of_true_SOD_M
0,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,00000121.01.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,stake,100.0,,,,,,
1,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,00000121.02.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,85.51,,,,,,
2,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,00000121.03.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,90.52,,,,,,
3,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,00000121.04.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,99.5,,,,,,
4,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,00000121.05.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,43.47,,,,,,


In [None]:
# donor_date is the part of the img filename before the first '.'
img_name_parts = master_data['img'].str.split('.', expand=True)
master_data['donor_date'] = img_name_parts[0]
display(master_data.head(3))

In [None]:
# final_BP takes true_BP where it is present and pred_BP everywhere else
has_true_bp = master_data['true_BP'].notnull()
master_data['final_BP'] = np.where(has_true_bp, master_data['true_BP'], master_data['pred_BP'])
display(master_data.head(3))
master_data.shape[0]

In [None]:
# Keep a row when it has a true_BP, or when it has none but pred_BP_conf >= 80
# (rows with no true_BP and a pred_BP_conf that is below 80 or missing are dropped).
has_label = master_data['true_BP'].notnull()
confident_pred = master_data['pred_BP_conf'] >= 80.0
master_data_filtered = master_data.loc[has_label | (~has_label & confident_pred)].copy()
display(master_data_filtered.head(3))
master_data_filtered.shape[0]

In [None]:
# Keep only the body parts used below: head, torso/backside, arm/legs
BP_ls = ['head', 'torso', 'backside', 'arm', 'legs']
in_target_bp = master_data_filtered['final_BP'].isin(BP_ls)
master_data_filtered = master_data_filtered.loc[in_target_bp].copy()
print(master_data_filtered.shape[0])
print(master_data_filtered['final_BP'].unique())
display(master_data_filtered.head())

In [None]:
# BMI distribution (CDC BMI categories: underweight, healthy, overweight, obese)
# https://www.cdc.gov/healthyweight/assessing/index.html#:~:text=If%20your%20BMI%20is%20less,falls%20within%20the%20obese%20range.
def bmi_imperial(weight_lb, height_in):
    """Return the BMI for a weight in pounds and a height in inches.

    Computes 703 * weight_lb / height_in**2, rounded to one decimal.
    A height of exactly 0 returns NaN instead of dividing by zero.
    """
    if height_in == 0:
        return np.nan
    return round(703*(weight_lb/(height_in**2)), 1)

def bmi_status(bmi):
    """Map a BMI value to its CDC adult weight-status category.

    Categories (CDC): below 18.5 -> 'underweight', 18.5 to <25 -> 'healthy',
    25 to <30 -> 'overweight', 30 and above -> 'obese'.

    For BMI values rounded to one decimal (as produced by bmi_imperial) the
    result is the same as with the former 24.9 / 29.9 cut-offs; unrounded
    values such as 24.95 are now classified by the CDC boundaries.

    Parameters
    ----------
    bmi : float or None
        Body mass index; NaN or None means "unknown".

    Returns
    -------
    str or float
        The category name, or NaN when ``bmi`` is missing.
    """
    # Missing values are handled first: every comparison with NaN is False,
    # and comparing None with a number would raise TypeError.
    if pd.isna(bmi):
        return float('nan')
    if bmi < 18.5:
        return 'underweight'
    if bmi < 25:
        return 'healthy'
    if bmi < 30:
        return 'overweight'
    return 'obese'

# Add the BMI column (from est_weight_lb / est_stature_in) and its BMI_status category
def _row_bmi(row):
    """BMI of one dataframe row."""
    return bmi_imperial(row['est_weight_lb'], row['est_stature_in'])

master_data_filtered['BMI'] = master_data_filtered.apply(_row_bmi, axis=1)
master_data_filtered['BMI_status'] = master_data_filtered['BMI'].map(bmi_status)
display(master_data_filtered.head())

In [None]:
# One dataframe per body region: head, torso (torso + backside), limbs (arm + legs)
final_bp = master_data_filtered['final_BP']
head_df = master_data_filtered.loc[final_bp == 'head'].copy()
torso_df = master_data_filtered.loc[final_bp.isin(['torso', 'backside'])].copy()
limbs_df = master_data_filtered.loc[final_bp.isin(['arm', 'legs'])].copy()
print(head_df.shape, torso_df.shape, limbs_df.shape)

In [None]:
# Number of distinct img values in each body-part dataframe; a count equal to the
# dataframe's row count means it holds no duplicate images.
for bodypart_df in (head_df, torso_df, limbs_df):
    print(bodypart_df['img'].nunique())

In [None]:
# Align the body-part dataframes on donor_date. Both joins are inner joins, so only
# donor_dates present in head_df, torso_df and limbs_df remain.
torso_cols = torso_df[['donor_date', 'img','true_SOD_G']]
head_torso_aligned = head_df.merge(torso_cols, how='inner', on='donor_date',
                                   suffixes=('_head', '_torso'))
print(head_torso_aligned.shape)
#display(head_torso_aligned.head())

limbs_cols = limbs_df[['donor_date', 'img','true_SOD_G']]
head_torso_limbs_aligned = head_torso_aligned.merge(limbs_cols, how='inner', on='donor_date')
print(head_torso_limbs_aligned.shape)

In [None]:
# The limbs columns were merged in without a suffix; give them explicit *_limbs names
head_torso_limbs_aligned = head_torso_limbs_aligned.rename(
    columns={"img": "img_limbs", "true_SOD_G": "true_SOD_G_limbs"})

In [None]:
# Preview the three image columns that make up each aligned row
head_torso_limbs_aligned[['img_head', 'img_torso', 'img_limbs']].head()

In [None]:
# Summary statistics of the aligned dataframe
head_torso_limbs_aligned.describe()

In [None]:
# Remove the two source dataframes from the namespace; the cells above that read
# master_data / master_data_filtered cannot be re-run after this without reloading.
del master_data, master_data_filtered

# Sample data for additional SOD labeling
1. With conditions based on analysis results in Gelderman_SOD_cohort_analysis.ipynb.
2. Without conditions to get more SOD labeled image triplets.

In [None]:
# filter such that 0<=PMI_days<=365
# between() is inclusive on both ends, so the lower bound stated above is now
# enforced as well; rows with a missing PMI_days are dropped, as before.
img_trips = head_torso_limbs_aligned[head_torso_limbs_aligned.PMI_days.between(0, 365.0)].copy()
len(img_trips)

In [None]:
# Column dtypes and non-null counts, to see which columns have missing values
img_trips.info()

In [None]:
# Drop rows with a missing value in any of these donor attributes
required_cols = ['sex', 'est_weight_lb', 'est_stature_in', 'age_at_death']
img_trips = img_trips.dropna(subset=required_cols)
img_trips.shape

## 1. Sampling with conditions

In [None]:
# Keep rows whose BMI_status is 'obese', 'healthy' or 'underweight', i.e. exclude 'overweight'.
# NOTE(review): this comment previously listed "overweight, healthy, or underweight (less overweight)",
# which does not match the values filtered on below — confirm which set was intended.
img_trips2 = img_trips[img_trips.BMI_status.isin([ 'obese',
                                                    'healthy', 
                                                    'underweight'])].copy()
print(len(img_trips2))
print(img_trips2.BMI_status.unique())

#### Sample for specific PMI bins
This is based on the histogram "pred_PMI_days (Gelderman SOD Cohort) w/ customized bins" in Gelderman_SOD_cohort_analysis.ipynb.

In [None]:
# 0-10
bin_0_10 = img_trips2[(img_trips2.PMI_days >= 0)&(img_trips2.PMI_days <= 10)].drop_duplicates('donor_date')\
                    .groupby('PMI_days').sample(1, random_state=1).copy()
print(bin_0_10.shape)
bin_0_10

In [None]:
img_trips2[(img_trips2.PMI_days > 85)&(img_trips2.PMI_days <= 95)].donor_date.unique()

In [None]:
# 85-95
bin_85_95 = img_trips2[(img_trips2.PMI_days > 85)&(img_trips2.PMI_days <= 95)].\
                    drop_duplicates('donor_date').sample(5, random_state=1).copy()
print(bin_85_95.shape)
bin_85_95

In [None]:
img_trips2[(img_trips2.PMI_days > 95)&(img_trips2.PMI_days <= 106)].donor_date.unique()

In [None]:
# 95-106
bin_95_106 = img_trips2[(img_trips2.PMI_days > 95)&(img_trips2.PMI_days <= 106)].\
                    drop_duplicates('donor_date').sample(5, random_state=1).copy()
print(bin_95_106.shape)
bin_95_106

In [None]:
img_trips2[(img_trips2.PMI_days > 131) & (img_trips2.PMI_days <= 145)].donor_date.unique()

In [None]:
# 131-145
bin_131_145 = img_trips2[(img_trips2.PMI_days > 131) & (img_trips2.PMI_days <= 145)].\
                    drop_duplicates('donor_date').sample(3, random_state=1).copy()
print(bin_131_145.shape)
bin_131_145

In [None]:
img_trips2[(img_trips2.PMI_days > 145) & (img_trips2.PMI_days <= 160)].donor_date.unique()

In [None]:
# 145-160
bin_145_160 = img_trips2[(img_trips2.PMI_days > 145) & (img_trips2.PMI_days <= 160)].\
                    drop_duplicates('donor_date').sample(5, random_state=1).copy()
print(bin_145_160.shape)
bin_145_160

In [None]:
img_trips2[(img_trips2.PMI_days > 160) & (img_trips2.PMI_days <= 176)].donor_date.unique()

In [None]:
# 160-176
bin_160_176 = img_trips2[(img_trips2.PMI_days > 160) & (img_trips2.PMI_days <= 176)].\
                drop_duplicates('donor_date').sample(2, random_state=1).copy()
bin_160_176

In [None]:
img_trips2[(img_trips2.PMI_days > 176) & (img_trips2.PMI_days <= 193)].donor_date.unique()

In [None]:
# 176-193
bin_176_193 = img_trips2[(img_trips2.PMI_days > 176) & (img_trips2.PMI_days <= 193)].\
                drop_duplicates('donor_date').sample(3, random_state=1).copy()
bin_176_193

In [None]:
img_trips2[(img_trips2.PMI_days > 193) & (img_trips2.PMI_days <= 211)].donor_date.unique()

In [None]:
# 193-211
bin_193_211 = img_trips2[(img_trips2.PMI_days > 193) & (img_trips2.PMI_days <= 211)].\
                drop_duplicates('donor_date').sample(3, random_state=1).copy()
bin_193_211

In [None]:
img_trips2[(img_trips2.PMI_days > 211) & (img_trips2.PMI_days <= 230)].donor_date.unique()

In [None]:
# 211-230,
bin_211_230 = img_trips2[(img_trips2.PMI_days > 211) & (img_trips2.PMI_days <= 230)].\
                drop_duplicates('donor_date').sample(5, random_state=1).copy()
bin_211_230

In [None]:
img_trips2[(img_trips2.PMI_days > 230) & (img_trips2.PMI_days <= 250)].donor_date.unique()

In [None]:
# 230-250
bin_230_250 = img_trips2[(img_trips2.PMI_days > 230) & (img_trips2.PMI_days <= 250)].\
                drop_duplicates('donor_date').sample(3, random_state=1).copy()
bin_230_250

In [None]:
img_trips2[(img_trips2.PMI_days > 250) & (img_trips2.PMI_days <= 271)].donor_date.unique()

In [None]:
# 250-271
bin_250_271 = img_trips2[(img_trips2.PMI_days > 250) & (img_trips2.PMI_days <= 271)].\
                drop_duplicates('donor_date').sample(5, random_state=1).copy()
bin_250_271

In [None]:
img_trips2[(img_trips2.PMI_days > 271) & (img_trips2.PMI_days <= 293)].donor_date.unique()

In [None]:
# 271-293
bin_271_293 = img_trips2[(img_trips2.PMI_days > 271) & (img_trips2.PMI_days <= 293)].\
                drop_duplicates('donor_date').sample(5, random_state=1).copy()
bin_271_293

In [None]:
img_trips2[(img_trips2.PMI_days > 293) & (img_trips2.PMI_days <= 316)].donor_date.unique()

In [None]:
# 293-316
bin_293_316 = img_trips2[(img_trips2.PMI_days > 293) & (img_trips2.PMI_days <= 316)].\
                drop_duplicates('donor_date').sample(4, random_state=1).copy()
bin_293_316

In [None]:
img_trips2[(img_trips2.PMI_days > 316) & (img_trips2.PMI_days <= 340)].donor_date.unique()

In [None]:
# 316-340
bin_316_340 = img_trips2[(img_trips2.PMI_days > 316) & (img_trips2.PMI_days <= 340)]\
                .drop_duplicates('donor_date').sample(4, random_state=1).copy()
bin_316_340

In [None]:
img_trips2[(img_trips2.PMI_days > 340) & (img_trips2.PMI_days <= 365)].donor_date.unique()

In [None]:
# 340-365
bin_340_365 = img_trips2[(img_trips2.PMI_days > 340) & (img_trips2.PMI_days <= 365)]\
                .drop_duplicates('donor_date').sample(5, random_state=1).copy()
bin_340_365

In [None]:
# 30 rows with distinct donor_date among donors with age_at_death <= 50
is_age_le_50 = img_trips2['age_at_death'] <= 50.0
under50 = img_trips2.loc[is_age_le_50].drop_duplicates('donor_date').sample(30, random_state=1).copy()

In [None]:
# 20 rows with distinct donor_date among donors whose BMI_status is 'underweight'
is_underweight = img_trips2['BMI_status'] == 'underweight'
underweight = img_trips2.loc[is_underweight].drop_duplicates('donor_date').sample(20, random_state=1).copy()

In [None]:
# Every conditional sample: the PMI-bin samples followed by the age and BMI samples
df_ls = [
    bin_0_10,
    bin_85_95, bin_95_106,
    bin_131_145, bin_145_160, bin_160_176, bin_176_193,
    bin_193_211, bin_211_230, bin_230_250, bin_250_271,
    bin_271_293, bin_293_316, bin_316_340, bin_340_365,
    under50,
    underweight,
]
len(df_ls)

In [None]:
# Stack all sampled dataframes row-wise into one dataframe
new_samples = pd.concat(objs=df_ls, axis='index')
new_samples.shape

In [None]:
# Remove rows that are exact duplicates (the same row drawn by more than one sample)
new_samples = new_samples.drop_duplicates()
new_samples.shape[0]

In [None]:
# Number of distinct donor_date values in new_samples; if it equals the row count
# there are no duplicate donor_dates.
new_samples.donor_date.nunique()

In [None]:
# Spot check: show the sampled rows whose new_id is 'ff9'
new_samples[new_samples.new_id == 'ff9']

In [None]:
# save as CSV file
#new_samples.to_csv('../data/Gelderman_SOD_cohort/new_samples.csv', index_label=False)

In [None]:
# Summary statistics of the conditional sample
new_samples.describe()

#### Add to existing Gelderman SOD cohort and analyze histograms

In [None]:
# Load the existing cohort's image triplets from CSV and show the dataframe's shape
g_data = pd.read_csv('../data/Gelderman_SOD_cohort/unique_img_triplets.csv')
g_data.shape

In [None]:
# Existing cohort plus the new samples, restricted to the columns analysed below
cohort_cols = ['new_id', 'donor_date', 'age_at_death', 'PMI_days', 'est_weight_lb', 'est_stature_in']
existing_part = g_data[cohort_cols]
new_part = new_samples[cohort_cols]
g_data_plus = pd.concat([existing_part, new_part], axis=0)
print(g_data_plus.shape)
display(g_data_plus.head())
print(g_data_plus.donor_date.nunique())

In [None]:
# PMI_days histogram of the combined data with pandas' default binning
g_data_plus.hist(column='PMI_days')
plt.title('PMI_days (Gelderman SOD Cohort)')
plt.show()

In [None]:
# Bin edges (in days) for the PMI_days histograms; the trailing comments give the
# width of the bins on each row. The 56 -> 61 step (width 5) is the only place
# where the width decreases.
custom_bins = (
    list(range(11))                      # 0..10, width 1
    + [12, 14, 16, 18, 20]               # width 2
    + [23, 26, 29, 32]                   # width 3
    + [36, 40]                           # width 4
    + [45, 50]                           # width 5
    + [56, 61, 68, 76, 85]               # widths 6, 5, 7, 8, 9
    + [95, 106, 118, 131, 145]           # widths 10 to 14
    + [160, 176, 193, 211, 230]          # widths 15 to 19
    + [250, 271, 293, 316, 340, 365]     # widths 20 to 25
)

In [None]:
# PMI_days histogram of the combined data, using the custom_bins edges.
# NOTE(review): the title says "pred_PMI_days" while the plotted column is PMI_days — confirm the label.
g_data_plus.hist(column='PMI_days', bins=custom_bins)
plt.title('pred_PMI_days (Gelderman SOD Cohort) w/ customized bins')
plt.show()

In [None]:
# age_at_death histogram of the combined data, split into 3 equal-width bins
g_data_plus.hist(column='age_at_death', bins=3)
plt.title('age_at_death (Gelderman SOD Cohort)')
plt.show()

In [None]:
# Add the BMI column (from est_weight_lb / est_stature_in) and its BMI_status category
def _cohort_row_bmi(row):
    """BMI of one dataframe row."""
    return bmi_imperial(row['est_weight_lb'], row['est_stature_in'])

g_data_plus['BMI'] = g_data_plus.apply(_cohort_row_bmi, axis=1)
g_data_plus['BMI_status'] = g_data_plus['BMI'].map(bmi_status)
display(g_data_plus.head())

In [None]:
# Bar chart of the number of rows per BMI_status category
g_data_plus['BMI_status'].value_counts().plot(kind='bar')
plt.title('BMI_status (Gelderman SOD Cohort)')
plt.show()

In [None]:
# Number of distinct new_id values in the combined data
g_data_plus.new_id.nunique()

In [None]:
# Column dtypes and non-null counts of the combined data
g_data_plus.info()

In [None]:
# Summary statistics of the combined data
g_data_plus.describe()

#### Prepare file for ICPUTRD labeling

In [None]:
# all_new collects the samples to send for labeling (currently only new_samples)
frames_to_label = [new_samples]
all_new = pd.concat(frames_to_label, axis=0)
print(all_new.shape[0])
display(all_new.head())

In [None]:
# Turn the wide table (img_head / img_torso / img_limbs) into a long table with
# one (img, BP) row per image, then drop images that occur more than once.
bp_frames = {}
for bp_label, img_col in (('head', 'img_head'), ('torso', 'img_torso'), ('limbs', 'img_limbs')):
    bp_frame = all_new[[img_col]].rename(columns={img_col: 'img'}).assign(BP=bp_label)
    display(bp_frame.head(2))
    bp_frames[bp_label] = bp_frame
new_head, new_torso, new_limbs = bp_frames['head'], bp_frames['torso'], bp_frames['limbs']

# all_new is rebound to the long table here
all_new = pd.concat([new_head, new_torso, new_limbs], axis=0)
print(all_new.shape[0])

all_new_no_dups = all_new.drop_duplicates('img').copy()
print(all_new_no_dups.shape[0])
display(all_new_no_dups.head())

In [None]:
# img_path = <image root>/<first 3 characters of img>/<img>
img_root = '/da1_data/icputrd/arf/mean.js/public/anau_img3/'
img_names = all_new_no_dups['img']
all_new_no_dups['img_path'] = img_root + img_names.str[:3] + '/' + img_names.astype(str)
display(all_new_no_dups.head())
all_new_no_dups.shape[0]

In [None]:
# save to CSV file which will be used to populate ICPUTRd for labeling
# Write the (img_path, BP) rows without header or index column
labeling_csv = '/home/anau/SOD_labeling/for_PMI_estimation/additional_labeling_samples.csv'
labeling_rows = all_new_no_dups.loc[:, ['img_path', 'BP']]
labeling_rows.to_csv(labeling_csv, header=False, index=False)

# Other for now

## 2. Sampling without any conditions

In [None]:
# Sizes of the full triplet pool and of the conditional sample taken from it
print(img_trips.shape)
print(new_samples.shape)

In [None]:
# remove image triplets already sampled in 1.
# Rows are dropped by index label, so only the exact rows present in new_samples are removed.
# NOTE(review): other rows sharing a sampled donor_date appear to remain in
# img_trips_unused and could be drawn again below — confirm this is intended.
img_trips_unused = img_trips.drop(new_samples.index, axis=0).copy()
img_trips_unused.shape

In [None]:
# sample per custom_bins
def sample_per_bin(df, bin_edges, n_per_bin=8, random_state=1):
    """Sample up to ``n_per_bin`` rows with distinct donor_date from each PMI_days bin.

    Bins are the consecutive pairs of ``bin_edges`` and use the convention of
    numpy/matplotlib histograms: every bin is [lower, upper), except the last
    one, which also contains its upper edge. A PMI_days value equal to an inner
    edge therefore belongs to exactly one bin, so the same donor_date cannot be
    drawn from two neighbouring bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns PMI_days and donor_date.
    bin_edges : sequence of numbers
        Increasing bin edges; fewer than two edges yields no bins.
    n_per_bin : int
        Target number of rows per bin. Bins holding fewer distinct
        donor_dates contribute all of them; empty bins are skipped.
    random_state : int
        Seed passed to DataFrame.sample.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of all bins; an empty dataframe if nothing was sampled.
    """
    sampled_frames = []
    last_bin = len(bin_edges) - 2
    for bin_no, (lower, upper) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
        if bin_no == last_bin:
            in_bin = (df.PMI_days >= lower) & (df.PMI_days <= upper)
        else:
            in_bin = (df.PMI_days >= lower) & (df.PMI_days < upper)
        # one row per donor_date, so the sample consists of distinct donor_dates
        candidates = df[in_bin].drop_duplicates('donor_date')
        if candidates.empty:
            continue
        n_rows = min(n_per_bin, len(candidates))
        sampled_frames.append(candidates.sample(n_rows, random_state=random_state))
    if not sampled_frames:
        return pd.DataFrame()
    # concatenate once instead of growing a dataframe inside the loop
    return pd.concat(sampled_frames, axis=0)


new_samples_no_condition = sample_per_bin(img_trips_unused, custom_bins)

In [None]:
# PMI_days histogram of the unconditional sample, using the custom_bins edges.
# NOTE(review): the title is the same as the cohort histogram's and says "pred_PMI_days",
# while the data plotted here is new_samples_no_condition.PMI_days — confirm the label.
new_samples_no_condition.hist(column='PMI_days', bins=custom_bins)
plt.title('pred_PMI_days (Gelderman SOD Cohort) w/ customized bins')
plt.show()

In [None]:
# Summary statistics of the unconditional sample
new_samples_no_condition.describe()

In [None]:
# check that samples from new_samples were not re-sampled in new_samples_no_condition
# An empty result means no row index label is shared between the two samples;
# the comparison is on index labels only, not on donor_date or image names.
new_samples_no_condition[new_samples_no_condition.index.isin(new_samples.index)]