In [12]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import pickle

# Notebook-wide pandas display settings: show every row and every column, and
# let cell values run to 700 characters before being truncated.
# Full option names are used on every line; pandas warns that abbreviated names
# (e.g. 'max_colwidth') may stop resolving if similarly named options are added.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 700)

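This notebook creates temperature and humidity features: for each image date, it computes the mean and standard deviation of daily temperature and relative humidity over several look-back windows (1–3, 4–7, 8–21, 22–56, 57–154 and 155–365 days before the image date).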

In [13]:
# import subset master dataset data
# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a trusted source.
data_df = pd.read_pickle('../data/Gelderman_SOD_cohort/for_pmi_estimation.pkl')
print(data_df.shape)
display(data_df.head())
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() would render a stray "None" after the report.
data_df.info()

(256, 27)


Unnamed: 0,new_id,donor_date,correct_img_date,date_placed_ARF,PMI_days,age_at_death,est_weight_lb,est_stature_in,img_head,true_SOD_G_head,img_torso,true_SOD_G_torso,img_limbs,true_SOD_G_limbs,sex_male,month,season_of_recovery,fall,spring,summer,ADD_thres0,ADD_thres5,ADD_thres10,ADD_thres15,ADD_thres20,ADD_thres25,ADD_thres30
0,004,00400124,2018-01-24,2018-01-09,15.0,77.0,180.0,64.0,00400124.12.JPG,3.0,00400124.07.JPG,3.0,00400124.10.JPG,3.0,0,1,winter,0,0,0,73.881146,65.115868,32.350972,0.0,0.0,0.0,0.0
1,00b,00b00525,2016-05-25,2016-03-07,79.0,38.0,516.0,73.000039,00b00525.08.JPG,5.0,00b00525.04.JPG,5.0,00b00525.27.JPG,3.0,1,5,spring,0,1,0,1326.323441,1326.323441,1279.608857,983.358857,391.756402,0.0,0.0
2,00b,00b00818,2016-08-18,2016-03-07,164.0,38.0,516.0,73.000039,00b00818.07.JPG,6.0,00b00818.05.JPG,6.0,00b00818.11.JPG,3.0,1,8,summer,0,0,1,3552.068684,3552.068684,3505.3541,3209.1041,2617.501645,1785.331086,30.05625
3,00d,00d10116,2019-01-16,2018-11-27,50.0,73.0,235.0,73.000039,00d10116.07.JPG,5.0,00d10116.04.JPG,4.0,00d10116.06.JPG,4.0,1,1,winter,0,0,0,317.835928,268.610789,126.581344,30.420863,0.0,0.0,0.0
4,011,01101210,2018-12-10,2018-11-29,11.0,90.0,170.0,69.000037,01101210.06.JPG,2.0,01101210.03.JPG,2.0,01101210.08.JPG,2.0,0,12,winter,0,0,0,69.692188,54.013368,39.090451,15.414583,0.0,0.0,0.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 256 entries, 0 to 255
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   new_id              256 non-null    object        
 1   donor_date          256 non-null    object        
 2   correct_img_date    256 non-null    datetime64[ns]
 3   date_placed_ARF     256 non-null    datetime64[ns]
 4   PMI_days            256 non-null    float64       
 5   age_at_death        256 non-null    float64       
 6   est_weight_lb       256 non-null    float64       
 7   est_stature_in      256 non-null    float64       
 8   img_head            256 non-null    object        
 9   true_SOD_G_head     256 non-null    float64       
 10  img_torso           256 non-null    object        
 11  true_SOD_G_torso    256 non-null    float64       
 12  img_limbs           256 non-null    object        
 13  true_SOD_G_limbs    256 non-null    float64       

None

In [14]:
# import LCD weather data
# The file holds one row per date (daily averages, per the file name), even
# though the value columns keep their "Hourly..." names.
# NOTE(review): absolute path - confirm it resolves on the machine running this.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a trusted source.
weather_df = pd.read_pickle('/data/anau/temp_humidity_data/data/LCD/lcd_daily_avg.pkl')
print(weather_df.shape)
display(weather_df.head())
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() would render a stray "None" after the report.
weather_df.info()

(4473, 3)


Unnamed: 0,date,HourlyDryBulbTemperature,HourlyRelativeHumidity
0,2011-01-01,11.472727,91.80303
1,2011-01-02,3.828571,64.628571
2,2011-01-03,-0.06875,57.8125
3,2011-01-04,3.084375,60.15625
4,2011-01-05,1.008333,83.216667


<class 'pandas.core.frame.DataFrame'>
Int64Index: 4473 entries, 0 to 4472
Data columns (total 3 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      4473 non-null   datetime64[ns]
 1   HourlyDryBulbTemperature  4473 non-null   float64       
 2   HourlyRelativeHumidity    4473 non-null   float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 139.8 KB


None

In [15]:
# Look-back windows as (nearest, farthest) day offsets before the image date;
# both bounds are inclusive. Defined once here rather than on every row.
time_windows = [(1,3), (4,7), (8,21), (22,56), (57,154), (155, 365)]


def _weather_window_features(img_date, weather_df, time_windows):
    """Summarise temperature and humidity over look-back windows before a date.

    Parameters
    ----------
    img_date : datetime-like
        Reference date; every window is counted back from it.
    weather_df : pandas.DataFrame
        Must provide a 'date' column plus 'HourlyDryBulbTemperature' and
        'HourlyRelativeHumidity'.
    time_windows : iterable of (int, int)
        (nearest, farthest) offsets in days; both ends are included.

    Returns
    -------
    dict
        Keys of the form '<temp|hum>_<nearest>_<farthest>_<mean|std>', each
        mapped to that statistic rounded to 3 decimals. Keys are inserted
        window by window, in the order temp mean, temp std, hum mean, hum std.
    """
    # (feature-name prefix, weather column) pairs, in output order
    measures = (('temp', 'HourlyDryBulbTemperature'), ('hum', 'HourlyRelativeHumidity'))
    columns = [column for _, column in measures]

    features = {}
    for nearest, farthest in time_windows:
        # the window runs from `oldest` up to `newest`, both before img_date
        newest = img_date - datetime.timedelta(days=nearest)
        oldest = img_date - datetime.timedelta(days=farthest)

        # get weather data (temp and humidity) for time window
        in_window = (weather_df['date'] >= oldest) & (weather_df['date'] <= newest)

        # aggregate weather data
        stats = weather_df.loc[in_window, columns].agg(['mean', 'std'])

        label = str(nearest) + '_' + str(farthest)
        for prefix, column in measures:
            for stat in ('mean', 'std'):
                features[prefix + '_' + label + '_' + stat] = round(stats.loc[stat, column], 3)

    return features


subset_dict = data_df.to_dict('records')
# for each instance, append the window features to its record in place
for i, row in enumerate(subset_dict):
    print('iter:', i, 'ID:', row['new_id'])
    row.update(_weather_window_features(row['correct_img_date'], weather_df, time_windows))
    print()

# convert the list of records back to a dataframe
data_w_weather_df = pd.DataFrame(subset_dict)

iter: 0 ID: 004

iter: 1 ID: 00b

iter: 2 ID: 00b

iter: 3 ID: 00d

iter: 4 ID: 011

iter: 5 ID: 01c

iter: 6 ID: 01e

iter: 7 ID: 02a

iter: 8 ID: 02d

iter: 9 ID: 02d

iter: 10 ID: 044

iter: 11 ID: 044

iter: 12 ID: 059

iter: 13 ID: 059

iter: 14 ID: 076

iter: 15 ID: 07b

iter: 16 ID: 07b

iter: 17 ID: 07b

iter: 18 ID: 07c

iter: 19 ID: 07c

iter: 20 ID: 082

iter: 21 ID: 090

iter: 22 ID: 091

iter: 23 ID: 095

iter: 24 ID: 097

iter: 25 ID: 097

iter: 26 ID: 09b

iter: 27 ID: 0a5

iter: 28 ID: 0ae

iter: 29 ID: 0b2

iter: 30 ID: 0b9

iter: 31 ID: 0b9

iter: 32 ID: 0c2

iter: 33 ID: 0e8

iter: 34 ID: 0f2

iter: 35 ID: 0f2

iter: 36 ID: 12a

iter: 37 ID: 12b

iter: 38 ID: 12b

iter: 39 ID: 139

iter: 40 ID: 13c

iter: 41 ID: 153

iter: 42 ID: 169

iter: 43 ID: 16c

iter: 44 ID: 18c

iter: 45 ID: 197

iter: 46 ID: 1bc

iter: 47 ID: 1db

iter: 48 ID: 1e3

iter: 49 ID: 1f7

iter: 50 ID: 215

iter: 51 ID: 215

iter: 52 ID: 24a

iter: 53 ID: 24a

iter: 54 ID: 270

iter: 55 ID: 28e

it

In [16]:
# Preview the first rows of the table that now carries the weather feature columns.
data_w_weather_df.head()

Unnamed: 0,new_id,donor_date,correct_img_date,date_placed_ARF,PMI_days,age_at_death,est_weight_lb,est_stature_in,img_head,true_SOD_G_head,img_torso,true_SOD_G_torso,img_limbs,true_SOD_G_limbs,sex_male,month,season_of_recovery,fall,spring,summer,ADD_thres0,ADD_thres5,ADD_thres10,ADD_thres15,ADD_thres20,ADD_thres25,ADD_thres30,temp_1_3_mean,temp_1_3_std,hum_1_3_mean,hum_1_3_std,temp_4_7_mean,temp_4_7_std,hum_4_7_mean,hum_4_7_std,temp_8_21_mean,temp_8_21_std,hum_8_21_mean,hum_8_21_std,temp_22_56_mean,temp_22_56_std,hum_22_56_mean,hum_22_56_std,temp_57_154_mean,temp_57_154_std,hum_57_154_mean,hum_57_154_std,temp_155_365_mean,temp_155_365_std,hum_155_365_mean,hum_155_365_std
0,004,00400124,2018-01-24,2018-01-09,15.0,77.0,180.0,64.0,00400124.12.JPG,3.0,00400124.07.JPG,3.0,00400124.10.JPG,3.0,0,1,winter,0,0,0,73.881146,65.115868,32.350972,0.0,0.0,0.0,0.0,9.315,0.888,71.621,12.679,-3.108,7.418,62.885,6.736,-0.699,6.638,74.493,16.49,4.205,5.889,78.049,14.677,16.426,6.113,76.979,10.014,18.181,7.197,67.221,13.361
1,00b,00b00525,2016-05-25,2016-03-07,79.0,38.0,516.0,73.000039,00b00525.08.JPG,5.0,00b00525.04.JPG,5.0,00b00525.27.JPG,3.0,1,5,spring,0,1,0,1326.323441,1326.323441,1279.608857,983.358857,391.756402,0.0,0.0,19.485,0.935,57.198,2.203,18.725,0.697,76.769,8.882,17.617,4.23,64.476,8.287,16.59,4.368,58.561,15.086,7.484,7.232,69.266,15.024,19.483,6.77,74.838,10.009
2,00b,00b00818,2016-08-18,2016-03-07,164.0,38.0,516.0,73.000039,00b00818.07.JPG,6.0,00b00818.05.JPG,6.0,00b00818.11.JPG,3.0,1,8,summer,0,0,1,3552.068684,3552.068684,3505.3541,3209.1041,2617.501645,1785.331086,30.05625,27.174,1.174,71.959,6.483,28.471,0.317,63.152,0.949,26.955,1.031,72.737,6.045,26.864,1.68,67.719,7.432,18.606,5.25,63.088,13.413,12.155,8.104,73.635,12.741
3,00d,00d10116,2019-01-16,2018-11-27,50.0,73.0,235.0,73.000039,00d10116.07.JPG,5.0,00d10116.04.JPG,4.0,00d10116.06.JPG,4.0,1,1,winter,0,0,0,317.835928,268.610789,126.581344,30.420863,0.0,0.0,0.0,3.511,1.705,89.241,11.812,2.666,2.899,57.956,9.74,10.313,2.75,84.482,10.465,5.37,3.923,79.734,12.914,18.516,7.254,81.323,8.885,16.984,8.703,73.738,13.603
4,011,01101210,2018-12-10,2018-11-29,11.0,90.0,170.0,69.000037,01101210.06.JPG,2.0,01101210.03.JPG,2.0,01101210.08.JPG,2.0,0,12,winter,0,0,0,69.692188,54.013368,39.090451,15.414583,0.0,0.0,0.0,3.825,1.623,80.427,17.534,3.942,4.081,64.24,4.184,6.799,5.078,79.896,12.073,10.311,4.085,80.913,11.039,24.042,2.56,80.822,7.902,12.899,9.807,73.032,14.609


In [17]:
# Print the column listing with non-null counts and dtypes for the augmented table.
data_w_weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 51 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   new_id              256 non-null    object        
 1   donor_date          256 non-null    object        
 2   correct_img_date    256 non-null    datetime64[ns]
 3   date_placed_ARF     256 non-null    datetime64[ns]
 4   PMI_days            256 non-null    float64       
 5   age_at_death        256 non-null    float64       
 6   est_weight_lb       256 non-null    float64       
 7   est_stature_in      256 non-null    float64       
 8   img_head            256 non-null    object        
 9   true_SOD_G_head     256 non-null    float64       
 10  img_torso           256 non-null    object        
 11  true_SOD_G_torso    256 non-null    float64       
 12  img_limbs           256 non-null    object        
 13  true_SOD_G_limbs    256 non-null    float64       

In [18]:
# Summary statistics for the numeric columns, including the new temp_*/hum_* features.
data_w_weather_df.describe()

Unnamed: 0,PMI_days,age_at_death,est_weight_lb,est_stature_in,true_SOD_G_head,true_SOD_G_torso,true_SOD_G_limbs,sex_male,fall,spring,summer,ADD_thres0,ADD_thres5,ADD_thres10,ADD_thres15,ADD_thres20,ADD_thres25,ADD_thres30,temp_1_3_mean,temp_1_3_std,hum_1_3_mean,hum_1_3_std,temp_4_7_mean,temp_4_7_std,hum_4_7_mean,hum_4_7_std,temp_8_21_mean,temp_8_21_std,hum_8_21_mean,hum_8_21_std,temp_22_56_mean,temp_22_56_std,hum_22_56_mean,hum_22_56_std,temp_57_154_mean,temp_57_154_std,hum_57_154_mean,hum_57_154_std,temp_155_365_mean,temp_155_365_std,hum_155_365_mean,hum_155_365_std
count,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0,256.0
mean,78.871094,63.972656,188.765625,66.945343,3.761719,3.484375,3.472656,0.484375,0.21875,0.324219,0.277344,1150.905508,1123.009439,1039.212466,869.310861,635.165801,265.462488,3.098058,16.418324,1.983098,68.507812,7.601488,16.181133,2.305602,68.755551,8.63077,15.900449,3.352543,69.357027,11.147977,15.093629,3.987012,70.026723,11.897113,13.840684,5.411148,69.84084,12.668477,16.187664,7.22248,71.141344,12.237195
std,87.438959,16.293245,69.435298,3.884366,1.148682,1.058926,1.028627,0.500735,0.414208,0.468999,0.448565,1460.947874,1443.446826,1386.95252,1240.046806,1019.806109,575.830494,13.369336,8.100735,1.426231,11.154685,5.321425,8.07937,1.654926,9.926778,5.331941,7.902068,1.635904,7.773733,4.128593,7.938396,1.613071,5.963854,3.315223,6.975205,1.539247,4.801758,2.512202,4.091097,1.51193,3.593349,1.566577
min,0.0,26.0,93.0,59.000032,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.735417,0.0,0.0,0.0,0.0,0.0,0.0,-7.121,0.124,38.323,0.297,-5.316,0.208,46.812,0.593,-1.662,0.898,49.971,2.979,0.534,1.167,56.909,5.716,3.417,1.657,60.149,7.029,8.33,4.08,63.527,9.65
25%,15.0,53.75,135.0,64.000035,3.0,3.0,3.0,0.0,0.0,0.0,0.0,224.541773,207.786198,144.396119,80.000955,0.0,0.0,0.0,10.769,0.88775,61.67975,3.36825,10.02675,1.009,61.607,4.341,8.761,1.92,62.63725,7.85875,7.70575,2.63125,65.7535,9.1165,7.736,4.60225,66.451,10.84775,12.50275,5.98875,68.867,10.974
50%,47.5,63.0,180.0,67.000036,4.0,3.0,3.0,0.0,0.0,0.0,0.0,565.709945,527.853068,463.586597,325.678423,125.220799,0.0,0.0,17.8495,1.6235,68.8765,6.564,17.94,1.8585,69.6425,7.613,16.2875,3.128,69.9675,10.7905,15.4085,4.068,70.6635,11.68,13.4215,5.7125,69.9915,12.965,16.7675,7.3575,71.4115,12.165
75%,106.75,76.0,233.25,70.0,5.0,4.0,4.0,1.0,0.0,1.0,1.0,1247.915797,1214.307633,1202.876067,1013.331716,801.204321,236.971042,0.0,23.28825,2.74475,77.2795,10.4585,23.233,3.01625,76.562,12.354,23.1695,4.47475,75.99225,14.41225,22.9505,5.0575,74.28175,14.74625,20.726,6.5375,72.741,14.7875,20.01425,8.56325,74.13225,13.431
max,353.0,96.0,516.0,77.000042,6.0,6.0,6.0,1.0,1.0,1.0,1.0,5889.655055,5797.420345,5540.646377,4779.720588,3934.570925,2371.437128,126.045139,29.529,7.186,93.632,27.439,29.107,9.172,88.449,28.526,27.644,9.247,87.105,21.573,27.46,8.074,82.669,20.187,26.391,8.775,81.323,18.008,22.728,10.024,80.873,16.365


In [19]:
# write to pickle
# The write below is intentionally left commented out, so running the notebook
# does not create or overwrite the output file; uncomment it to persist
# data_w_weather_df.
#data_w_weather_df.to_pickle('../data/Gelderman_SOD_cohort/for_pmi_estimation2.pkl')