In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

# Notebook-wide pandas display settings: no row/column truncation, and up to
# 700 characters shown per cell.
for _option, _value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('max_colwidth', 700),
):
    pd.set_option(_option, _value)

This notebook creates a kind of "master" dataset containing all available image attributes potentially required for downstream tasks, such as PMI estimation. Available image attributes:
- anatomic data (i.e., true or predicted bodypart)
- donor demographic data
- decay data (i.e., true or predicted SOD)

Since a valid PMI needs to exist, the dataset cohort will consist of donors that have a valid PMI. This data can be found here: '../data/img_PMIs_no_negs.csv'. This will be the "base" dataset from which the "master" dataset will be created.

Outputs:
- base_df7: final "master" dataset containing all image attributes. This dataset can be found at ../data/master_dataset.pkl and ../data/master_dataset.csv.

# Import base dataset
The base dataset consists of all images with a valid PMI. 

In [2]:
# Load the base cohort (images with a valid PMI); only the columns needed
# downstream are read from the CSV.
base_df = pd.read_csv(
    '../data/img_PMIs_no_negs.csv',
    usecols=['new_id', 'old_id', 'new_path', 'new_img', 'old_date',
             'date_placed_ARF', 'img_pmi_days'],
)
# Rename to the column names used by the rest of the notebook.
base_df = base_df.rename(columns={"new_img": "img", "new_path": "img_path",
                                  "old_date": "correct_img_date",
                                  "img_pmi_days": "PMI_days"})
# Pin the column order explicitly (usecols does not control ordering).
base_df = base_df.loc[:, ['new_id', 'old_id', 'img_path', 'img', 'correct_img_date',
                          'date_placed_ARF', 'PMI_days']].copy()
display(base_df.head())
# info() prints its own report and returns None; calling it bare avoids the
# stray "None" that display(base_df.info()) renders.
base_df.info()
print(base_df.shape)

Unnamed: 0,new_id,old_id,img_path,img,correct_img_date,date_placed_ARF,PMI_days
0,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,00000121.01.JPG,2018-01-21,2018-01-21,0.0
1,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,00000121.02.JPG,2018-01-21,2018-01-21,0.0
2,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,00000121.03.JPG,2018-01-21,2018-01-21,0.0
3,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,00000121.04.JPG,2018-01-21,2018-01-21,0.0
4,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,00000121.05.JPG,2018-01-21,2018-01-21,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132970 entries, 0 to 1132969
Data columns (total 7 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   new_id            1132970 non-null  object 
 1   old_id            1132970 non-null  object 
 2   img_path          1132970 non-null  object 
 3   img               1132970 non-null  object 
 4   correct_img_date  1132970 non-null  object 
 5   date_placed_ARF   1132970 non-null  object 
 6   PMI_days          1132970 non-null  float64
dtypes: float64(1), object(6)
memory usage: 60.5+ MB


None

(1132970, 7)


In [3]:
# make sure images are unique: the later joins are keyed on 'img', so a repeated
# image name must stop the notebook here instead of passing unnoticed
assert base_df.img.is_unique, "duplicate image names found in base_df"
base_df.img.nunique()

1132970

# Add demographic data

In [4]:
# import previously processed demography data
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
demogr_df = pd.read_pickle('/da1_data/icputrd/arf/mean.js/public/anau_demography/demography_processed.pkl')
display(demogr_df.head())
# info() prints its own report and returns None; calling it bare avoids the
# stray "None" that display(demogr_df.info()) renders.
demogr_df.info()
print(demogr_df.shape)

Unnamed: 0,old_id,year,sex,ancestry,est_stature_cm,est_weight_lb,date_placed_ARF,est_stature_in,age_at_death
1025,UT01-12D,2012,Male,,175.26,220.0,2012-01-09,69.000037,74.0
440,UT01-13D,2013,Female,,160.02,,2013-01-23,63.000034,70.0
894,UT01-14D,2014,Female,,162.56,225.0,2014-01-10,64.000035,75.0
543,UT01-15D,2015,Male,,180.34,226.0,NaT,71.000038,67.0
2087,UT01-16D,2016,Male,,190.5,220.0,2016-01-08,75.000041,63.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1058 entries, 1025 to 1933
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   old_id           1058 non-null   object        
 1   year             1058 non-null   int64         
 2   sex              1054 non-null   object        
 3   ancestry         317 non-null    object        
 4   est_stature_cm   708 non-null    float64       
 5   est_weight_lb    968 non-null    float64       
 6   date_placed_ARF  820 non-null    datetime64[ns]
 7   est_stature_in   1010 non-null   float64       
 8   age_at_death     1050 non-null   float64       
dtypes: datetime64[ns](1), float64(4), int64(1), object(3)
memory usage: 82.7+ KB


None

(1058, 9)


In [5]:
# remove date_placed_ARF from the demography frame; base_df already has a
# column with that name
demogr_df = demogr_df.drop(columns=['date_placed_ARF'])

In [6]:
# make sure old_id are unique: a repeated old_id would duplicate image rows in
# the left join that follows, so fail here rather than inspect the count by eye
assert demogr_df.old_id.is_unique, "duplicate old_id values found in demogr_df"
demogr_df.old_id.nunique()

1058

In [7]:
# left join base_df with demogr_df: each image row receives the demography
# columns of its old_id (rows without a match keep NaN in those columns)
base_df2 = pd.merge(base_df, demogr_df, how='left', on='old_id')
# a left join must not multiply image rows; this trips if a matching old_id
# appears more than once in demogr_df
assert len(base_df2) == len(base_df), "demography join changed the number of image rows"
display(base_df2.head())
# info() prints its own report and returns None, so it is not wrapped in display()
base_df2.info()
print(base_df2.shape)
del demogr_df

Unnamed: 0,new_id,old_id,img_path,img,correct_img_date,date_placed_ARF,PMI_days,year,sex,ancestry,est_stature_cm,est_weight_lb,est_stature_in,age_at_death
0,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,00000121.01.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0
1,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,00000121.02.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0
2,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,00000121.03.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0
3,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,00000121.04.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0
4,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,00000121.05.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1132970 entries, 0 to 1132969
Data columns (total 14 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   new_id            1132970 non-null  object 
 1   old_id            1132970 non-null  object 
 2   img_path          1132970 non-null  object 
 3   img               1132970 non-null  object 
 4   correct_img_date  1132970 non-null  object 
 5   date_placed_ARF   1132970 non-null  object 
 6   PMI_days          1132970 non-null  float64
 7   year              1125446 non-null  float64
 8   sex               1125106 non-null  object 
 9   ancestry          222775 non-null   object 
 10  est_stature_cm    878776 non-null   float64
 11  est_weight_lb     1052284 non-null  float64
 12  est_stature_in    1086628 non-null  float64
 13  age_at_death      1125128 non-null  float64
dtypes: float64(6), object(8)
memory usage: 129.7+ MB


None

(1132970, 14)


# Add anatomic data

### True anatomic data

In [8]:
# import true anatomic data (clusters_w_stakes MongoDB collection)
# the file has no header row: column 0 is the image path, column 1 the body-part label
true_anatomic_df = pd.read_csv(
    '/da1_data/icputrd/decaying_human_body_part_classifier/from_anau/data/all_labeled_img.txt',
    header=None, names=['img_path', 'true_BP'],
)
display(true_anatomic_df.head())
# info() prints its own report and returns None, so it is not wrapped in display()
true_anatomic_df.info()
print(true_anatomic_df.shape)
# number of distinct image paths; equals the row count when no path repeats
print(true_anatomic_df.img_path.nunique())

Unnamed: 0,img_path,true_BP
0,/anau_img3/00b/00b00312.22.JPG,arm
1,/anau_img3/00b/00b00310.26.JPG,foot
2,/anau_img3/00b/00b00311.19.JPG,arm
3,/anau_img3/00b/00b00320.23.JPG,hips
4,/anau_img3/00b/00b00312.27.JPG,foot


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76948 entries, 0 to 76947
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   img_path  76948 non-null  object
 1   true_BP   76948 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


None

(76948, 2)
76948


In [9]:
# derive the image file name from img_path: splitting on '/' puts the file name
# in field 3 for paths shaped like '/anau_img3/<dir>/<file>'
true_anatomic_df['img'] = true_anatomic_df['img_path'].str.split('/', expand=True).loc[:, 3]
display(true_anatomic_df.head())

Unnamed: 0,img_path,true_BP,img
0,/anau_img3/00b/00b00312.22.JPG,arm,00b00312.22.JPG
1,/anau_img3/00b/00b00310.26.JPG,foot,00b00310.26.JPG
2,/anau_img3/00b/00b00311.19.JPG,arm,00b00311.19.JPG
3,/anau_img3/00b/00b00320.23.JPG,hips,00b00320.23.JPG
4,/anau_img3/00b/00b00312.27.JPG,foot,00b00312.27.JPG


In [10]:
# left join base_df2 with true_anatomic_df: adds true_BP for images that were
# manually labeled (all other rows keep NaN)
base_df3 = pd.merge(base_df2, true_anatomic_df[['img', 'true_BP']], how='left', on='img')
# a left join must not multiply image rows; this trips if a matching img
# appears more than once in true_anatomic_df
assert len(base_df3) == len(base_df2), "true-anatomic join changed the number of image rows"
display(base_df3.head())
# info() prints its own report and returns None, so it is not wrapped in display()
base_df3.info()
print(base_df3.shape)
del true_anatomic_df

Unnamed: 0,new_id,old_id,img_path,img,correct_img_date,date_placed_ARF,PMI_days,year,sex,ancestry,est_stature_cm,est_weight_lb,est_stature_in,age_at_death,true_BP
0,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,00000121.01.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,
1,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,00000121.02.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,
2,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,00000121.03.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,
3,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,00000121.04.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,
4,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,00000121.05.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1132970 entries, 0 to 1132969
Data columns (total 15 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   new_id            1132970 non-null  object 
 1   old_id            1132970 non-null  object 
 2   img_path          1132970 non-null  object 
 3   img               1132970 non-null  object 
 4   correct_img_date  1132970 non-null  object 
 5   date_placed_ARF   1132970 non-null  object 
 6   PMI_days          1132970 non-null  float64
 7   year              1125446 non-null  float64
 8   sex               1125106 non-null  object 
 9   ancestry          222775 non-null   object 
 10  est_stature_cm    878776 non-null   float64
 11  est_weight_lb     1052284 non-null  float64
 12  est_stature_in    1086628 non-null  float64
 13  age_at_death      1125128 non-null  float64
 14  true_BP           45930 non-null    object 
dtypes: float64(6), object(9)
memory usage: 138.3+ MB


None

(1132970, 15)


### Predicted anatomic data

In [11]:
# import predicted anatomic data
# the file is ':'-delimited with no header row; the confidence column is given
# its final name directly instead of being renamed afterwards
pred_anatomic_df = pd.read_csv(
    '/da1_data/icputrd/decaying_human_body_part_classifier/from_anau/ex1_preds',
    delimiter=':', header=None, names=['img_path', 'pred_BP', 'pred_BP_conf'],
)
display(pred_anatomic_df.head())
# info() prints its own report and returns None, so it is not wrapped in display()
pred_anatomic_df.info()
print(pred_anatomic_df.shape)
# number of distinct image paths; equals the row count when no path repeats
print(pred_anatomic_df.img_path.nunique())

Unnamed: 0,img_path,pred_BP,pred_BP_conf
0,/da1_data/icputrd/arf/mean.js/public/anau_img3/88a/88a00106.10.JPG,legs,99.99
1,/da1_data/icputrd/arf/mean.js/public/anau_img3/88a/88a00106.01.JPG,stake,99.99
2,/da1_data/icputrd/arf/mean.js/public/anau_img3/88a/88a00106.02.JPG,stake,99.99
3,/da1_data/icputrd/arf/mean.js/public/anau_img3/88a/88a00106.03.JPG,fullbody,99.63
4,/da1_data/icputrd/arf/mean.js/public/anau_img3/88a/88a00106.04.JPG,torso,100.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1376574 entries, 0 to 1376573
Data columns (total 3 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   img_path      1376574 non-null  object 
 1   pred_BP       1376574 non-null  object 
 2   pred_BP_conf  1376574 non-null  float64
dtypes: float64(1), object(2)
memory usage: 31.5+ MB


None

(1376574, 3)
1376574


In [12]:
# derive the image file name from img_path: for the absolute paths in this file
# the file name is field 8 after splitting on '/'
pred_anatomic_df['img'] = pred_anatomic_df['img_path'].str.split('/', expand=True).loc[:, 8]
display(pred_anatomic_df.head())

Unnamed: 0,img_path,pred_BP,pred_BP_conf,img
0,/da1_data/icputrd/arf/mean.js/public/anau_img3/88a/88a00106.10.JPG,legs,99.99,88a00106.10.JPG
1,/da1_data/icputrd/arf/mean.js/public/anau_img3/88a/88a00106.01.JPG,stake,99.99,88a00106.01.JPG
2,/da1_data/icputrd/arf/mean.js/public/anau_img3/88a/88a00106.02.JPG,stake,99.99,88a00106.02.JPG
3,/da1_data/icputrd/arf/mean.js/public/anau_img3/88a/88a00106.03.JPG,fullbody,99.63,88a00106.03.JPG
4,/da1_data/icputrd/arf/mean.js/public/anau_img3/88a/88a00106.04.JPG,torso,100.0,88a00106.04.JPG


In [13]:
# left join base_df3 with pred_anatomic_df: adds the predicted body part and its
# confidence for every image that has a prediction
base_df4 = pd.merge(base_df3, pred_anatomic_df[['img', 'pred_BP', 'pred_BP_conf']], how='left', on='img')
# a left join must not multiply image rows; this trips if a matching img
# appears more than once in pred_anatomic_df
assert len(base_df4) == len(base_df3), "predicted-anatomic join changed the number of image rows"
display(base_df4.head())
# info() prints its own report and returns None, so it is not wrapped in display()
base_df4.info()
print(base_df4.shape)
del pred_anatomic_df

Unnamed: 0,new_id,old_id,img_path,img,correct_img_date,date_placed_ARF,PMI_days,year,sex,ancestry,est_stature_cm,est_weight_lb,est_stature_in,age_at_death,true_BP,pred_BP,pred_BP_conf
0,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,00000121.01.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,stake,100.0
1,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,00000121.02.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,85.51
2,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,00000121.03.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,90.52
3,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,00000121.04.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,99.5
4,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,00000121.05.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,43.47


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1132970 entries, 0 to 1132969
Data columns (total 17 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   new_id            1132970 non-null  object 
 1   old_id            1132970 non-null  object 
 2   img_path          1132970 non-null  object 
 3   img               1132970 non-null  object 
 4   correct_img_date  1132970 non-null  object 
 5   date_placed_ARF   1132970 non-null  object 
 6   PMI_days          1132970 non-null  float64
 7   year              1125446 non-null  float64
 8   sex               1125106 non-null  object 
 9   ancestry          222775 non-null   object 
 10  est_stature_cm    878776 non-null   float64
 11  est_weight_lb     1052284 non-null  float64
 12  est_stature_in    1086628 non-null  float64
 13  age_at_death      1125128 non-null  float64
 14  true_BP           45930 non-null    object 
 15  pred_BP           1132692 non-null  object 
 16  

None

(1132970, 17)


# Add SOD data
This can be from the Gelderman or Megyesi scoring methods.

### True SOD Gelderman data
Data that was labeled using Gelderman's SOD scoring method.

In [14]:
# import true SOD Gelderman data
# the three label files share one layout; only their first three columns are
# read and the frames are stacked (each file's own row index is kept)
files_ls = ['head_labeled', 'torso_labeled', 'limbs_labeled']
true_sod_df = pd.concat(
    pd.read_csv('/home/anau/SOD_labeling/Gelderman/from_preds/' + f, usecols=[0, 1, 2],
                names=['img_path', 'true_SOD_G', 'BP_of_true_SOD_G'])
    for f in files_ls
)

display(true_sod_df.head())
# info() prints its own report and returns None, so it is not wrapped in display()
true_sod_df.info()
print(true_sod_df.shape)
# number of distinct image paths; equals the row count when no path repeats
print(true_sod_df.img_path.nunique())

Unnamed: 0,img_path,true_SOD_G,BP_of_true_SOD_G
0,/anau_img3/abd/abd00928.20.icon.JPG,h-4,head
1,/anau_img3/abd/abd00928.07.icon.JPG,h-4,head
2,/anau_img3/abd/abd00928.08.icon.JPG,h-4,head
3,/anau_img3/dcd/dcd10203.30.icon.JPG,h-4,head
4,/anau_img3/dcd/dcd10203.31.icon.JPG,h-4,head


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6055 entries, 0 to 2031
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   img_path          6055 non-null   object
 1   true_SOD_G        6055 non-null   object
 2   BP_of_true_SOD_G  6055 non-null   object
dtypes: object(3)
memory usage: 189.2+ KB


None

(6055, 3)
6055


In [15]:
# create image column from img_path: the file name is field 3 after splitting on '/'
true_sod_df['img'] = true_sod_df.img_path.str.split('/', expand=True)[3]
# strip the literal '.icon' marker so names match base_df.img; regex=False is
# explicit because under regex matching '.' would match any character, and the
# default for this parameter differs between pandas versions
true_sod_df['img'] = true_sod_df['img'].str.replace('.icon', '', regex=False)
display(true_sod_df.head())

  true_sod_df['img'] = true_sod_df['img'].str.replace('.icon','')


Unnamed: 0,img_path,true_SOD_G,BP_of_true_SOD_G,img
0,/anau_img3/abd/abd00928.20.icon.JPG,h-4,head,abd00928.20.JPG
1,/anau_img3/abd/abd00928.07.icon.JPG,h-4,head,abd00928.07.JPG
2,/anau_img3/abd/abd00928.08.icon.JPG,h-4,head,abd00928.08.JPG
3,/anau_img3/dcd/dcd10203.30.icon.JPG,h-4,head,dcd10203.30.JPG
4,/anau_img3/dcd/dcd10203.31.icon.JPG,h-4,head,dcd10203.31.JPG


In [16]:
# left join base_df4 with true_sod_df: adds the Gelderman SOD label and the body
# part it was scored on for images that were labeled
base_df5 = pd.merge(base_df4, true_sod_df[['img', 'true_SOD_G', 'BP_of_true_SOD_G']], how='left', on='img')
# a left join must not multiply image rows; this trips if a matching img
# appears more than once in true_sod_df
assert len(base_df5) == len(base_df4), "true-SOD (Gelderman) join changed the number of image rows"
display(base_df5.head())
# info() prints its own report and returns None, so it is not wrapped in display()
base_df5.info()
print(base_df5.shape)
del true_sod_df

Unnamed: 0,new_id,old_id,img_path,img,correct_img_date,date_placed_ARF,PMI_days,year,sex,ancestry,est_stature_cm,est_weight_lb,est_stature_in,age_at_death,true_BP,pred_BP,pred_BP_conf,true_SOD_G,BP_of_true_SOD_G
0,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,00000121.01.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,stake,100.0,,
1,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,00000121.02.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,85.51,,
2,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,00000121.03.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,90.52,,
3,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,00000121.04.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,99.5,,
4,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,00000121.05.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,43.47,,


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1132970 entries, 0 to 1132969
Data columns (total 19 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   new_id            1132970 non-null  object 
 1   old_id            1132970 non-null  object 
 2   img_path          1132970 non-null  object 
 3   img               1132970 non-null  object 
 4   correct_img_date  1132970 non-null  object 
 5   date_placed_ARF   1132970 non-null  object 
 6   PMI_days          1132970 non-null  float64
 7   year              1125446 non-null  float64
 8   sex               1125106 non-null  object 
 9   ancestry          222775 non-null   object 
 10  est_stature_cm    878776 non-null   float64
 11  est_weight_lb     1052284 non-null  float64
 12  est_stature_in    1086628 non-null  float64
 13  age_at_death      1125128 non-null  float64
 14  true_BP           45930 non-null    object 
 15  pred_BP           1132692 non-null  object 
 16  

None

(1132970, 19)


### Predicted SOD Gelderman data
Data that was labeled by a model trained on Gelderman-scored SOD data.

In [17]:
# import predicted SOD data
# stack the three prediction files (each file's own row index is kept)
files_ls = ['head_clusters_preds', 'torso_clusters_preds', 'limbs_clusters_preds']
pred_sod_df = pd.concat(
    pd.read_csv('/home/anau/SOD_labeling/Gelderman/from_preds/predict/' + f) for f in files_ls
)
# 'pred' holds the stringified per-class scores and 'k=1' the predicted class;
# both get the names used in the master dataset
pred_sod_df = pred_sod_df.rename(columns={"img": "img_path", "pred": "pred_SOD_G_conf",
                                          "k=1": "pred_SOD_G"})
pred_sod_df = pred_sod_df.loc[:, ['img_path', 'pred_SOD_G', 'pred_SOD_G_conf']].copy()
display(pred_sod_df.head())
# info() prints its own report and returns None, so it is not wrapped in display()
pred_sod_df.info()
print(pred_sod_df.shape)
# number of distinct image paths; equals the row count when no path repeats
print(pred_sod_df.img_path.nunique())

Unnamed: 0,img_path,pred_SOD_G,pred_SOD_G_conf
0,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00323.18.JPG,3,"[1.8596528830738812e-11, 1.0770167136797681e-05, 0.9999768733978271, 1.2416985555319116e-05, 6.144854625134144e-10, 1.945529071178953e-09]"
1,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00324.19.JPG,3,"[2.7908717004088146e-12, 9.229962074641662e-07, 0.999977707862854, 2.1307549104676582e-05, 7.363529785919809e-10, 3.004331505795932e-10]"
2,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00326.19.JPG,3,"[1.9816505381076155e-10, 2.2630556486546993e-06, 0.9999904632568359, 7.294603165064473e-06, 1.6762526922065035e-09, 6.09018738217948e-11]"
3,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00327.19.JPG,3,"[4.763862568313293e-11, 6.425679657695582e-06, 0.9992665648460388, 0.0007268352201208472, 1.2655030445785087e-07, 1.2454775877301927e-08]"
4,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00329.23.JPG,3,"[1.8943072110921833e-10, 1.5520903616561554e-05, 0.9999841451644897, 3.515644380058802e-07, 1.3943256271797821e-10, 6.341189379144296e-11]"


<class 'pandas.core.frame.DataFrame'>
Int64Index: 32378 entries, 0 to 18265
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   img_path         32378 non-null  object
 1   pred_SOD_G       32378 non-null  int64 
 2   pred_SOD_G_conf  32378 non-null  object
dtypes: int64(1), object(2)
memory usage: 1011.8+ KB


None

(32378, 3)
32378


In [18]:
# derive the image file name from img_path: for the absolute paths in this file
# the file name is field 8 after splitting on '/'
pred_sod_df['img'] = pred_sod_df['img_path'].str.split('/', expand=True).loc[:, 8]
display(pred_sod_df.head())

Unnamed: 0,img_path,pred_SOD_G,pred_SOD_G_conf,img
0,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00323.18.JPG,3,"[1.8596528830738812e-11, 1.0770167136797681e-05, 0.9999768733978271, 1.2416985555319116e-05, 6.144854625134144e-10, 1.945529071178953e-09]",00b00323.18.JPG
1,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00324.19.JPG,3,"[2.7908717004088146e-12, 9.229962074641662e-07, 0.999977707862854, 2.1307549104676582e-05, 7.363529785919809e-10, 3.004331505795932e-10]",00b00324.19.JPG
2,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00326.19.JPG,3,"[1.9816505381076155e-10, 2.2630556486546993e-06, 0.9999904632568359, 7.294603165064473e-06, 1.6762526922065035e-09, 6.09018738217948e-11]",00b00326.19.JPG
3,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00327.19.JPG,3,"[4.763862568313293e-11, 6.425679657695582e-06, 0.9992665648460388, 0.0007268352201208472, 1.2655030445785087e-07, 1.2454775877301927e-08]",00b00327.19.JPG
4,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00329.23.JPG,3,"[1.8943072110921833e-10, 1.5520903616561554e-05, 0.9999841451644897, 3.515644380058802e-07, 1.3943256271797821e-10, 6.341189379144296e-11]",00b00329.23.JPG


In [19]:
# extract highest confidence from pred_SOD_G_conf and save as new column
def modify_dtype(row):
    """Return the largest value in ``row`` after converting each item to float.

    ``row`` is an iterable of values accepted by ``float()`` (for example the
    string tokens of a probability list). An empty ``row`` raises ValueError.
    """
    return max([float(token) for token in row])

# turn each stringified score list "[a, b, ...]" into its string tokens, then
# keep only the highest score per row
conf_tokens = pred_sod_df['pred_SOD_G_conf'].apply(lambda raw: raw.strip('[]').split(','))
pred_sod_df['pred_SOD_G_conf_new'] = conf_tokens.apply(modify_dtype)
display(pred_sod_df.head(10))
# replace the raw score-list column with the single max score and fix column order
pred_sod_df = (
    pred_sod_df
    .drop(columns=['pred_SOD_G_conf'])
    .rename(columns={"pred_SOD_G_conf_new": "pred_SOD_G_conf"})
    .loc[:, ['img_path', 'img', 'pred_SOD_G', 'pred_SOD_G_conf']]
    .copy()
)
display(pred_sod_df.head(10))

Unnamed: 0,img_path,pred_SOD_G,pred_SOD_G_conf,img,pred_SOD_G_conf_new
0,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00323.18.JPG,3,"[1.8596528830738812e-11, 1.0770167136797681e-05, 0.9999768733978271, 1.2416985555319116e-05, 6.144854625134144e-10, 1.945529071178953e-09]",00b00323.18.JPG,0.999977
1,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00324.19.JPG,3,"[2.7908717004088146e-12, 9.229962074641662e-07, 0.999977707862854, 2.1307549104676582e-05, 7.363529785919809e-10, 3.004331505795932e-10]",00b00324.19.JPG,0.999978
2,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00326.19.JPG,3,"[1.9816505381076155e-10, 2.2630556486546993e-06, 0.9999904632568359, 7.294603165064473e-06, 1.6762526922065035e-09, 6.09018738217948e-11]",00b00326.19.JPG,0.99999
3,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00327.19.JPG,3,"[4.763862568313293e-11, 6.425679657695582e-06, 0.9992665648460388, 0.0007268352201208472, 1.2655030445785087e-07, 1.2454775877301927e-08]",00b00327.19.JPG,0.999267
4,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00329.23.JPG,3,"[1.8943072110921833e-10, 1.5520903616561554e-05, 0.9999841451644897, 3.515644380058802e-07, 1.3943256271797821e-10, 6.341189379144296e-11]",00b00329.23.JPG,0.999984
5,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00331.25.JPG,3,"[7.235614885026109e-12, 1.937119691319822e-07, 0.9999927282333374, 7.00877399140154e-06, 8.420728558888868e-10, 2.1879263678581395e-10]",00b00331.25.JPG,0.999993
6,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00330.11.JPG,3,"[4.102733430766392e-11, 6.931198015536211e-08, 0.9999673366546631, 3.255257252021693e-05, 2.6792886131943305e-08, 4.1194665734156644e-11]",00b00330.11.JPG,0.999967
7,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00401.25.JPG,3,"[2.908651985933375e-09, 1.8822316860678256e-06, 0.9998575448989868, 0.0001399808970745653, 5.377323191169125e-07, 2.710314994658347e-08]",00b00401.25.JPG,0.999858
8,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00331.12.JPG,3,"[5.96862337332027e-09, 6.183307505125413e-06, 0.9991486072540283, 0.0008440593373961747, 6.347546559481998e-07, 4.337266261700279e-07]",00b00331.12.JPG,0.999149
9,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00425.22.JPG,3,"[6.315889322650037e-07, 2.1791613562527345e-06, 0.9972829818725586, 0.00023548606259282678, 0.0024752113968133926, 3.446732534939656e-06]",00b00425.22.JPG,0.997283


Unnamed: 0,img_path,img,pred_SOD_G,pred_SOD_G_conf
0,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00323.18.JPG,00b00323.18.JPG,3,0.999977
1,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00324.19.JPG,00b00324.19.JPG,3,0.999978
2,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00326.19.JPG,00b00326.19.JPG,3,0.99999
3,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00327.19.JPG,00b00327.19.JPG,3,0.999267
4,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00329.23.JPG,00b00329.23.JPG,3,0.999984
5,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00331.25.JPG,00b00331.25.JPG,3,0.999993
6,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00330.11.JPG,00b00330.11.JPG,3,0.999967
7,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00401.25.JPG,00b00401.25.JPG,3,0.999858
8,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00331.12.JPG,00b00331.12.JPG,3,0.999149
9,/da1_data/icputrd/arf/mean.js/public/sara_img/00b/00b00425.22.JPG,00b00425.22.JPG,3,0.997283


In [20]:
# left join base_df5 with pred_sod_df: adds the predicted Gelderman SOD class and
# its confidence for images that have a prediction
base_df6 = pd.merge(base_df5, pred_sod_df[['img', 'pred_SOD_G', 'pred_SOD_G_conf']], how='left', on='img')
# a left join must not multiply image rows; this trips if a matching img
# appears more than once in pred_sod_df
assert len(base_df6) == len(base_df5), "predicted-SOD join changed the number of image rows"
display(base_df6.head())
# info() prints its own report and returns None, so it is not wrapped in display()
base_df6.info()
print(base_df6.shape)
del pred_sod_df

Unnamed: 0,new_id,old_id,img_path,img,correct_img_date,date_placed_ARF,PMI_days,year,sex,ancestry,est_stature_cm,est_weight_lb,est_stature_in,age_at_death,true_BP,pred_BP,pred_BP_conf,true_SOD_G,BP_of_true_SOD_G,pred_SOD_G,pred_SOD_G_conf
0,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,00000121.01.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,stake,100.0,,,,
1,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,00000121.02.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,85.51,,,,
2,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,00000121.03.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,90.52,,,,
3,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,00000121.04.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,99.5,,,,
4,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,00000121.05.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,43.47,,,,


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1132970 entries, 0 to 1132969
Data columns (total 21 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   new_id            1132970 non-null  object 
 1   old_id            1132970 non-null  object 
 2   img_path          1132970 non-null  object 
 3   img               1132970 non-null  object 
 4   correct_img_date  1132970 non-null  object 
 5   date_placed_ARF   1132970 non-null  object 
 6   PMI_days          1132970 non-null  float64
 7   year              1125446 non-null  float64
 8   sex               1125106 non-null  object 
 9   ancestry          222775 non-null   object 
 10  est_stature_cm    878776 non-null   float64
 11  est_weight_lb     1052284 non-null  float64
 12  est_stature_in    1086628 non-null  float64
 13  age_at_death      1125128 non-null  float64
 14  true_BP           45930 non-null    object 
 15  pred_BP           1132692 non-null  object 
 16  

None

(1132970, 21)


### True SOD Megyesi data
Data that was labeled using Megyesi's SOD scoring method.

In [21]:
# Megyesi head labels: read the first two columns of the file and tag every row
# with the body part 'head'
true_sod_head_df = pd.read_csv(
    '/home/anau/SOD_labeling/Megyesi/head/experiment_5/head_labeled_merged',
    names=['img_path', 'true_SOD_M'],
    usecols=[0, 1],
)
true_sod_head_df['BP_of_true_SOD_M'] = 'head'
display(true_sod_head_df.head())
print(true_sod_head_df.shape)

Unnamed: 0,img_path,true_SOD_M,BP_of_true_SOD_M
0,/anau_img3/4cd/4cd00612.21.icon.JPG,1,head
1,/anau_img3/b4b/b4b10107.16.icon.JPG,1,head
2,/anau_img3/38e/38e10122.04.icon.JPG,1,head
3,/anau_img3/0e4/0e401201.29.icon.JPG,1,head
4,/anau_img3/0e4/0e401201.28.icon.JPG,1,head


(4220, 3)


In [22]:
# Megyesi torso labels: the first three columns are image path, SOD score and
# the body part that was scored
true_sod_torso_df = pd.read_csv(
    '/home/anau/SOD_labeling/Megyesi/torso/exp2/torso_labeled',
    names=['img_path', 'true_SOD_M', 'BP_of_true_SOD_M'],
    usecols=[0, 1, 2],
)
display(true_sod_torso_df.head())
print(true_sod_torso_df.shape)

Unnamed: 0,img_path,true_SOD_M,BP_of_true_SOD_M
0,/sara_img/34d/34d00228.16.icon.JPG,2,backside
1,/sara_img/db7/db700607.11.icon.JPG,3,torso
2,/sara_img/0db/0db00530.07.icon.JPG,2,backside
3,/sara_img/a18/a1800325.07.icon.JPG,2,backside
4,/sara_img/2c3/2c310131.18.icon.JPG,3,backside


(1979, 3)


In [23]:
# Megyesi limbs labels: the first three columns are image path, SOD score and
# the body part that was scored
true_sod_limbs_df = pd.read_csv(
    '/home/anau/SOD_labeling/Megyesi/limbs/exp1/limbs_labeled',
    names=['img_path', 'true_SOD_M', 'BP_of_true_SOD_M'],
    usecols=[0, 1, 2],
)
display(true_sod_limbs_df.head())
print(true_sod_limbs_df.shape)

Unnamed: 0,img_path,true_SOD_M,BP_of_true_SOD_M
0,/sara_img/65b/65b01001.35.icon.JPG,1,arm
1,/sara_img/35a/35a00330.22.icon.JPG,2,arm
2,/sara_img/3de/3de10120.10.icon.JPG,2,legs
3,/sara_img/243/24300801.50.icon.JPG,2,legs
4,/sara_img/9f4/9f400511.19.icon.JPG,4,legs


(2152, 3)


In [24]:
# concat true_sod_head_df, true_sod_torso_df, true_sod_limbs_df
# rows are stacked and each frame keeps its own index, so index values can repeat
true_sod_df = pd.concat([true_sod_head_df, true_sod_torso_df, true_sod_limbs_df], axis=0)

display(true_sod_df.head())
# info() prints its own report and returns None, so it is not wrapped in display()
true_sod_df.info()
print(true_sod_df.shape)
# fewer distinct paths than rows means some images were labeled more than once
print(true_sod_df.img_path.nunique())

Unnamed: 0,img_path,true_SOD_M,BP_of_true_SOD_M
0,/anau_img3/4cd/4cd00612.21.icon.JPG,1,head
1,/anau_img3/b4b/b4b10107.16.icon.JPG,1,head
2,/anau_img3/38e/38e10122.04.icon.JPG,1,head
3,/anau_img3/0e4/0e401201.29.icon.JPG,1,head
4,/anau_img3/0e4/0e401201.28.icon.JPG,1,head


<class 'pandas.core.frame.DataFrame'>
Int64Index: 8351 entries, 0 to 2151
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   img_path          8351 non-null   object
 1   true_SOD_M        8351 non-null   int64 
 2   BP_of_true_SOD_M  8351 non-null   object
dtypes: int64(1), object(2)
memory usage: 261.0+ KB


None

(8351, 3)
8346


In [25]:
# List every row whose img_path occurs more than once, sorted so that the
# copies of the same image appear next to each other.
display(
    true_sod_df
    .loc[true_sod_df['img_path'].duplicated(keep=False)]
    .sort_values(by='img_path')
)

Unnamed: 0,img_path,true_SOD_M,BP_of_true_SOD_M
533,/anau_img3/a49/a4901205.27.icon.JPG,2,head
2982,/anau_img3/a49/a4901205.27.icon.JPG,2,head
807,/anau_img3/c00/c0000913.04.icon.JPG,3,head
3245,/anau_img3/c00/c0000913.04.icon.JPG,4,head
1054,/anau_img3/fc0/fc010405.36.icon.JPG,2,head
3857,/anau_img3/fc0/fc010405.36.icon.JPG,2,head
1259,/anau_img3/fc0/fc010405.50.icon.JPG,2,head
3859,/anau_img3/fc0/fc010405.50.icon.JPG,2,head
771,/anau_img3/ff1/ff101022.07.icon.JPG,2,head
3907,/anau_img3/ff1/ff101022.07.icon.JPG,2,head


In [26]:
# show all rows that share an img_path with another row (original row order)
display(true_sod_df.loc[true_sod_df['img_path'].duplicated(keep=False)])

# keep=False removes every copy of a duplicated image, not just the repeats
true_sod_df.drop_duplicates(subset=['img_path'], keep=False, inplace=True)
print(true_sod_df.shape)

Unnamed: 0,img_path,true_SOD_M,BP_of_true_SOD_M
533,/anau_img3/a49/a4901205.27.icon.JPG,2,head
771,/anau_img3/ff1/ff101022.07.icon.JPG,2,head
807,/anau_img3/c00/c0000913.04.icon.JPG,3,head
1054,/anau_img3/fc0/fc010405.36.icon.JPG,2,head
1259,/anau_img3/fc0/fc010405.50.icon.JPG,2,head
2982,/anau_img3/a49/a4901205.27.icon.JPG,2,head
3245,/anau_img3/c00/c0000913.04.icon.JPG,4,head
3857,/anau_img3/fc0/fc010405.36.icon.JPG,2,head
3859,/anau_img3/fc0/fc010405.50.icon.JPG,2,head
3907,/anau_img3/ff1/ff101022.07.icon.JPG,2,head


(8341, 3)


In [27]:
# distinct body-part values attached to the Megyesi SOD labels
true_sod_df['BP_of_true_SOD_M'].unique()

array(['head', 'backside', 'torso', 'arm', 'legs'], dtype=object)

In [28]:
# Create the image column from the img_path column.
# img_path values look like '/anau_img3/4cd/4cd00612.21.icon.JPG': splitting on
# '/' puts the file name at position 3 (position 0 is the empty string before
# the leading slash).
# regex=False removes the literal '.icon' marker; in regex mode the '.' would
# match any character, and relying on the default triggers a FutureWarning.
true_sod_df['img'] = (
    true_sod_df['img_path']
    .str.split('/', expand=True)[3]
    .str.replace('.icon', '', regex=False)
)
display(true_sod_df.head())

  true_sod_df['img'] = true_sod_df['img'].str.replace('.icon','')


Unnamed: 0,img_path,true_SOD_M,BP_of_true_SOD_M,img
0,/anau_img3/4cd/4cd00612.21.icon.JPG,1,head,4cd00612.21.JPG
1,/anau_img3/b4b/b4b10107.16.icon.JPG,1,head,b4b10107.16.JPG
2,/anau_img3/38e/38e10122.04.icon.JPG,1,head,38e10122.04.JPG
3,/anau_img3/0e4/0e401201.29.icon.JPG,1,head,0e401201.29.JPG
4,/anau_img3/0e4/0e401201.28.icon.JPG,1,head,0e401201.28.JPG


In [29]:
# number of distinct image file names in the label table
true_sod_df['img'].nunique()

8341

In [30]:
# Left join base_df6 with true_sod_df on the image file name, so every base row
# is kept and the Megyesi SOD label columns are NaN where no label exists.
# validate='many_to_one' makes the merge raise if 'img' is not unique in the
# label table, instead of silently multiplying rows of base_df6.
base_df7 = pd.merge(
    base_df6,
    true_sod_df[['img', 'true_SOD_M', 'BP_of_true_SOD_M']],
    how='left',
    on='img',
    validate='many_to_one',
)
display(base_df7.head())
display(base_df7.info())
print(base_df7.shape)
# the label table is not used after the merge
del true_sod_df

Unnamed: 0,new_id,old_id,img_path,img,correct_img_date,date_placed_ARF,PMI_days,year,sex,ancestry,est_stature_cm,est_weight_lb,est_stature_in,age_at_death,true_BP,pred_BP,pred_BP_conf,true_SOD_G,BP_of_true_SOD_G,pred_SOD_G,pred_SOD_G_conf,true_SOD_M,BP_of_true_SOD_M
0,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,00000121.01.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,stake,100.0,,,,,,
1,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,00000121.02.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,85.51,,,,,,
2,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,00000121.03.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,90.52,,,,,,
3,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,00000121.04.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,99.5,,,,,,
4,0,UT01-18D,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,00000121.05.JPG,2018-01-21,2018-01-21,0.0,2018.0,Male,,177.8,160.0,70.000038,64.0,,fullbody,43.47,,,,,,


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1132970 entries, 0 to 1132969
Data columns (total 23 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   new_id            1132970 non-null  object 
 1   old_id            1132970 non-null  object 
 2   img_path          1132970 non-null  object 
 3   img               1132970 non-null  object 
 4   correct_img_date  1132970 non-null  object 
 5   date_placed_ARF   1132970 non-null  object 
 6   PMI_days          1132970 non-null  float64
 7   year              1125446 non-null  float64
 8   sex               1125106 non-null  object 
 9   ancestry          222775 non-null   object 
 10  est_stature_cm    878776 non-null   float64
 11  est_weight_lb     1052284 non-null  float64
 12  est_stature_in    1086628 non-null  float64
 13  age_at_death      1125128 non-null  float64
 14  true_BP           45930 non-null    object 
 15  pred_BP           1132692 non-null  object 
 16  

None

(1132970, 23)


### Predicted SOD Megyesi data
No predicted Megyesi SOD data exists, because the Megyesi SOD models have not been used to generate predictions for other images.

In [31]:
# free up memory: drop the names of the intermediate frames, since only
# base_df7 is used from here on
del base_df
del base_df2
del base_df3
del base_df4
del base_df5
del base_df6

# Processing of base_df7 into master_df

In [32]:
# column dtypes and non-null counts of the merged frame before any conversion
base_df7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1132970 entries, 0 to 1132969
Data columns (total 23 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   new_id            1132970 non-null  object 
 1   old_id            1132970 non-null  object 
 2   img_path          1132970 non-null  object 
 3   img               1132970 non-null  object 
 4   correct_img_date  1132970 non-null  object 
 5   date_placed_ARF   1132970 non-null  object 
 6   PMI_days          1132970 non-null  float64
 7   year              1125446 non-null  float64
 8   sex               1125106 non-null  object 
 9   ancestry          222775 non-null   object 
 10  est_stature_cm    878776 non-null   float64
 11  est_weight_lb     1052284 non-null  float64
 12  est_stature_in    1086628 non-null  float64
 13  age_at_death      1125128 non-null  float64
 14  true_BP           45930 non-null    object 
 15  pred_BP           1132692 non-null  object 
 16  

In [33]:
# summary statistics of the numeric columns (count, mean, std, quartiles, min/max)
base_df7.describe()

Unnamed: 0,PMI_days,year,est_stature_cm,est_weight_lb,est_stature_in,age_at_death,pred_BP_conf,pred_SOD_G,pred_SOD_G_conf,true_SOD_M
count,1132970.0,1125446.0,878776.0,1052284.0,1086628.0,1125128.0,1132692.0,20873.0,20873.0,6115.0
mean,53.95654,2015.312,168.865678,185.116,65.79434,65.96982,90.13993,3.448953,0.912515,2.644481
std,78.0539,2.217571,20.280189,61.20444,10.32446,15.10841,16.11813,1.114769,0.140383,0.972058
min,0.0,2012.0,0.0,75.0,0.0,18.0,15.91,1.0,0.302266,1.0
25%,16.0,2014.0,162.56,141.0,64.00003,57.0,87.05,3.0,0.883303,2.0
50%,36.0,2015.0,170.18,175.0,67.00004,67.0,99.12,3.0,0.990769,2.0
75%,67.0,2017.0,177.8,212.0,70.00004,76.0,99.96,4.0,0.999857,4.0
max,2359.0,2022.0,195.58,516.0,80.0,98.0,100.0,6.0,1.0,4.0


In [34]:
# analyze BP columns: print the distinct values of each body-part column
for bp_column in ('true_BP', 'pred_BP', 'BP_of_true_SOD_G', 'BP_of_true_SOD_M'):
    print(base_df7[bp_column].unique())

[nan 'fullbody' 'backside' 'arm' 'foot' 'hips' 'head' 'legs' 'hand' 'back'
 'other' 'plastic' 'torso' 'stake' 'knee' 'shade']
['stake' 'fullbody' 'foot' 'plastic' 'torso' 'legs' 'head' 'arm' 'hand'
 'backside' nan]
[nan 'torso' 'legs' 'head' 'arm']
[nan 'head' 'backside' 'legs' 'arm' 'torso']


In [35]:
# analyze SOD Gelderman columns: distinct values of the true and predicted labels
for sod_column in ('true_SOD_G', 'pred_SOD_G'):
    print(base_df7[sod_column].unique())

[nan 't-3' 'l-3' 'h-3' 'l-4' 't-2' 't-5' 'h-5' 't-6' 'h-6' 't-4' 'h-4'
 'h-2' 'l-2' 't-1' 'h-1' 'l-6' 'l-5' 'l-1']
[nan  2.  3.  4.  6.  5.  1.]


In [36]:
# Convert SOD Gelderman columns to numeric.
# true_SOD_G holds strings such as 't-3' or 'h-5'; keep only the digits and
# store them as float so missing labels stay NaN.
# The pattern is a raw string: '\d' in a plain string literal is an invalid
# escape sequence and raises a warning on recent Python versions.
base_df7['true_SOD_G'] = (
    base_df7['true_SOD_G']
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)
print(base_df7['true_SOD_G'].unique())
base_df7['pred_SOD_G'] = base_df7['pred_SOD_G'].astype(float)
print(base_df7['pred_SOD_G'].unique())
# info() prints its report itself and returns None, so it is not wrapped in print()
base_df7.info()

[nan  3.  4.  2.  5.  6.  1.]
[nan  2.  3.  4.  6.  5.  1.]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1132970 entries, 0 to 1132969
Data columns (total 23 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   new_id            1132970 non-null  object 
 1   old_id            1132970 non-null  object 
 2   img_path          1132970 non-null  object 
 3   img               1132970 non-null  object 
 4   correct_img_date  1132970 non-null  object 
 5   date_placed_ARF   1132970 non-null  object 
 6   PMI_days          1132970 non-null  float64
 7   year              1125446 non-null  float64
 8   sex               1125106 non-null  object 
 9   ancestry          222775 non-null   object 
 10  est_stature_cm    878776 non-null   float64
 11  est_weight_lb     1052284 non-null  float64
 12  est_stature_in    1086628 non-null  float64
 13  age_at_death      1125128 non-null  float64
 14  true_BP           45930 non-null    ob

In [37]:
# analyze SOD Megyesi columns: distinct values of the true labels
print(base_df7['true_SOD_M'].unique())

[nan  2.  3.  1.  4.]


In [38]:
# final (rows, columns) of base_df7
base_df7.shape

(1132970, 23)

In [39]:
# Save the processed base_df7 (the final "master" dataset) as both CSV and pickle.
# to_csv is called without index=False, so the row index is written as the
# first, unnamed column of the CSV file.
base_df7.to_csv('../data/master_dataset.csv')
base_df7.to_pickle('../data/master_dataset.pkl')