In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', 700)

### Key findings

Aim of this notebook was to further analyze and clean the PMI data generated by generate_pmi.ipynb: img_pmi.csv.
The PMI data was also joined with the SOD-labeled heads data to analyze the PMI per SOD. 
TODO: Need to come up with method/heuristics to deal with PMI outliers/noise per SOD. E.g., PMI=371 when in SOD 1, which is unlikely to happen. This will require domain expertise. 

Outputs:
- df_pmi2 (./data/img_PMIs_no_negs.csv): images with non-negative PMIs. Note, if a donor had at least one image with a negative PMI, all of its images were excluded, meaning it will be excluded from any downstream tasks.  
- df_merged (./data/img_PMIs_to_SOD.csv): df_pmi2 mapped with SOD data.

# Import PMI data

In [13]:
# import pmi data 
df_pmi = pd.read_csv('./data/img_PMIs.csv')
display(df_pmi.head())
print(df_pmi.shape)

Unnamed: 0,new_id,new_img,old_path,new_path,old_id,old_img,old_month,old_day,old_yr,old_date,date_placed_ARF,img_pmi_days
0,0,00000121.01.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (1).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,UT01-18D,UT01-18D_01_21_2018 (1).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
1,0,00000121.02.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (2).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,UT01-18D,UT01-18D_01_21_2018 (2).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
2,0,00000121.03.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (3).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,UT01-18D,UT01-18D_01_21_2018 (3).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
3,0,00000121.04.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (4).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,UT01-18D,UT01-18D_01_21_2018 (4).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
4,0,00000121.05.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (5).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,UT01-18D,UT01-18D_01_21_2018 (5).JPG,1,21,2018,2018-01-21,2018-01-21,0.0


(1154836, 12)


In [14]:
display(df_pmi.describe())
display(df_pmi.info())

Unnamed: 0,old_month,old_day,old_yr,img_pmi_days
count,1154836.0,1154836.0,1154836.0,1154836.0
mean,6.071983,15.59713,2015.564,55.15684
std,3.288228,8.783255,2.242833,81.17176
min,1.0,1.0,2011.0,-637.0
25%,3.0,8.0,2014.0,16.0
50%,6.0,15.0,2015.0,36.0
75%,9.0,23.0,2017.0,67.0
max,12.0,31.0,2022.0,2359.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1154836 entries, 0 to 1154835
Data columns (total 12 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   new_id           1154836 non-null  object 
 1   new_img          1154836 non-null  object 
 2   old_path         1154836 non-null  object 
 3   new_path         1154836 non-null  object 
 4   old_id           1154836 non-null  object 
 5   old_img          1154836 non-null  object 
 6   old_month        1154836 non-null  int64  
 7   old_day          1154836 non-null  int64  
 8   old_yr           1154836 non-null  int64  
 9   old_date         1154836 non-null  object 
 10  date_placed_ARF  1154836 non-null  object 
 11  img_pmi_days     1154836 non-null  float64
dtypes: float64(1), int64(3), object(8)
memory usage: 105.7+ MB


None

## Negative PMI analysis and removal
Negative PMIs cannot occurr. After consulting with the rest of the team, we decided to exclude these donors from the PMI estimation study.

In [15]:
# PMI cannot be negative, so analyze 
print(df_pmi[df_pmi.img_pmi_days < 0].shape)
print(df_pmi[df_pmi.img_pmi_days < 0].new_id.unique().shape)
print(df_pmi[df_pmi.img_pmi_days < 0].new_id.unique())
print(df_pmi[df_pmi.img_pmi_days < 0].old_id.unique())
print(df_pmi[df_pmi.img_pmi_days < 0].img_pmi_days.unique())

(910, 12)
(25,)
['009' '022' '02f' '034' '038' '053' '054' '073' '074' '09f' '0ab' '0e0'
 '0e4' '109' '10a' '11a' '121' '13f' '16b' '20f' '4c2' '844' '8ea' 'c79'
 'd83']
['UT101-18D' 'UT24-18D' 'UT34-18D' 'UT40-18D' 'UT43-18D' 'UT70-18D'
 'UT71-18D' 'UT95-18D' 'UT96-18D' 'UT36-19D' 'UT48-19D' 'UT98-19D'
 'UT92-16D' 'UT60-20D' 'UT61-20D' 'UT79-20D' 'UT101-21D' 'UT31-21D'
 'UT97-17D' 'UT86-16D' 'UT33-17D' 'UT51-17D' 'UT49-17D' 'UT24-17D'
 'UT78-17D']
[ -88.   -6.   -5.   -4.   -2.   -1. -637.  -24.  -42.  -34. -149. -326.
 -176. -288.  -76. -218. -109.  -12.  -11.  -10.   -9.   -8.   -7.   -3.
  -90.  -84.  -77.  -60.  -26.]


In [16]:
# drop imgs of donors that have at least one negative PMI
drop_ls = ['009','022', '02f', '034', '038', '053', '054', '073', '074', '09f', '0ab', '0e0', \
 '0e4', '109', '10a', '11a', '121', '13f', '16b', '20f', '4c2', '844', '8ea', 'c79', \
 'd83']
df_pmi2 = df_pmi[~df_pmi['new_id'].isin(drop_ls)].copy()
display(df_pmi2.head())
print('No. imgs dropped:', df_pmi.shape[0] - df_pmi2.shape[0])

Unnamed: 0,new_id,new_img,old_path,new_path,old_id,old_img,old_month,old_day,old_yr,old_date,date_placed_ARF,img_pmi_days
0,0,00000121.01.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (1).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,UT01-18D,UT01-18D_01_21_2018 (1).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
1,0,00000121.02.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (2).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,UT01-18D,UT01-18D_01_21_2018 (2).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
2,0,00000121.03.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (3).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,UT01-18D,UT01-18D_01_21_2018 (3).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
3,0,00000121.04.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (4).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,UT01-18D,UT01-18D_01_21_2018 (4).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
4,0,00000121.05.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (5).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,UT01-18D,UT01-18D_01_21_2018 (5).JPG,1,21,2018,2018-01-21,2018-01-21,0.0


No. imgs dropped: 24965


In [17]:
display(df_pmi2.describe())

Unnamed: 0,old_month,old_day,old_yr,img_pmi_days
count,1129871.0,1129871.0,1129871.0,1129871.0
mean,6.069671,15.60275,2015.514,55.28659
std,3.280162,8.787504,2.233141,80.94713
min,1.0,1.0,2011.0,0.0
25%,3.0,8.0,2014.0,16.0
50%,5.0,15.0,2015.0,36.0
75%,9.0,23.0,2017.0,67.0
max,12.0,31.0,2022.0,2359.0


In [18]:
# save as csv
df_pmi2.to_csv('./data/img_PMIs_no_negs.csv', index=False)

# Analyze PMI per SOD

In [20]:
# import SOD-labeled data
df_SOD = pd.read_csv('/home/anau/SOD_labeling/head/experiment_5/head_labeled_merged', 
                        header=None, delimiter=',', usecols=[0,1])
df_SOD.columns = ['path', 'label']
display(df_SOD.head())
print(df_SOD.shape)

Unnamed: 0,path,label
0,/anau_img3/4cd/4cd00612.21.icon.JPG,1
1,/anau_img3/b4b/b4b10107.16.icon.JPG,1
2,/anau_img3/38e/38e10122.04.icon.JPG,1
3,/anau_img3/0e4/0e401201.29.icon.JPG,1
4,/anau_img3/0e4/0e401201.28.icon.JPG,1


(4220, 2)


In [21]:
# process path column
df_SOD['path'] = df_SOD['path'].str.replace('.icon','')
df_SOD['path'] = df_SOD['path'].str.replace('/anau_img3/','/da1_data/icputrd/arf/mean.js/public/anau_img3/')
df_SOD['img'] = df_SOD['path'].str.split('/').str[-1]
display(df_SOD.head())
print(df_SOD.shape)

  df_SOD['path'] = df_SOD['path'].str.replace('.icon','')


Unnamed: 0,path,label,img
0,/da1_data/icputrd/arf/mean.js/public/anau_img3/4cd/4cd00612.21.JPG,1,4cd00612.21.JPG
1,/da1_data/icputrd/arf/mean.js/public/anau_img3/b4b/b4b10107.16.JPG,1,b4b10107.16.JPG
2,/da1_data/icputrd/arf/mean.js/public/anau_img3/38e/38e10122.04.JPG,1,38e10122.04.JPG
3,/da1_data/icputrd/arf/mean.js/public/anau_img3/0e4/0e401201.29.JPG,1,0e401201.29.JPG
4,/da1_data/icputrd/arf/mean.js/public/anau_img3/0e4/0e401201.28.JPG,1,0e401201.28.JPG


(4220, 3)


In [22]:
# join SOD-labeled data with PMI data
df_merged = pd.merge(df_SOD, df_pmi2, left_on="path", right_on="new_path",
                    how='inner')
df_merged.sort_values(by='path', inplace=True)
display(df_merged.head())
print(df_merged.shape, df_SOD.shape)

Unnamed: 0,path,label,img,new_id,new_img,old_path,new_path,old_id,old_img,old_month,old_day,old_yr,old_date,date_placed_ARF,img_pmi_days
1039,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000122.08.JPG,2,00000122.08.JPG,0,00000122.08.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_22_2018 (8).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000122.08.JPG,UT01-18D,UT01-18D_01_22_2018 (8).JPG,1,22,2018,2018-01-22,2018-01-21,1.0
1040,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000129.14.JPG,2,00000129.14.JPG,0,00000129.14.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_29_2018 (14).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000129.14.JPG,UT01-18D,UT01-18D_01_29_2018 (14).JPG,1,29,2018,2018-01-29,2018-01-21,8.0
1041,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000213.21.JPG,2,00000213.21.JPG,0,00000213.21.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_02_13_2018 (21).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000213.21.JPG,UT01-18D,UT01-18D_02_13_2018 (21).JPG,2,13,2018,2018-02-13,2018-01-21,23.0
1042,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000219.07.JPG,3,00000219.07.JPG,0,00000219.07.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_02_19_2018 (7).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000219.07.JPG,UT01-18D,UT01-18D_02_19_2018 (7).JPG,2,19,2018,2018-02-19,2018-01-21,29.0
1043,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000222.08.JPG,3,00000222.08.JPG,0,00000222.08.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_02_22_2018 (8).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000222.08.JPG,UT01-18D,UT01-18D_02_22_2018 (8).JPG,2,22,2018,2018-02-22,2018-01-21,32.0


(3417, 15) (4220, 3)


In [23]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3417 entries, 1039 to 3416
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   path             3417 non-null   object 
 1   label            3417 non-null   int64  
 2   img              3417 non-null   object 
 3   new_id           3417 non-null   object 
 4   new_img          3417 non-null   object 
 5   old_path         3417 non-null   object 
 6   new_path         3417 non-null   object 
 7   old_id           3417 non-null   object 
 8   old_img          3417 non-null   object 
 9   old_month        3417 non-null   int64  
 10  old_day          3417 non-null   int64  
 11  old_yr           3417 non-null   int64  
 12  old_date         3417 non-null   object 
 13  date_placed_ARF  3417 non-null   object 
 14  img_pmi_days     3417 non-null   float64
dtypes: float64(1), int64(4), object(10)
memory usage: 427.1+ KB


In [11]:
df_merged.to_csv('./data/img_PMIs_to_SOD.csv', index=False)

In [15]:
# convert dtype of date_placed_ARF
df_merged['date_placed_ARF'] = pd.to_datetime(df_merged['date_placed_ARF'], format='%Y-%m-%d', errors='coerce')
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3417 entries, 1039 to 3416
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   path             3417 non-null   object        
 1   label            3417 non-null   int64         
 2   img              3417 non-null   object        
 3   new_id           3417 non-null   object        
 4   new_img          3417 non-null   object        
 5   old_path         3417 non-null   object        
 6   new_path         3417 non-null   object        
 7   old_id           3417 non-null   object        
 8   old_img          3417 non-null   object        
 9   old_month        3417 non-null   int64         
 10  old_day          3417 non-null   int64         
 11  old_yr           3417 non-null   int64         
 12  old_date         3417 non-null   object        
 13  date_placed_ARF  3417 non-null   datetime64[ns]
 14  img_pmi_days     3417 non-null   floa

In [16]:
df_merged.describe()

Unnamed: 0,label,old_month,old_day,old_yr,img_pmi_days
count,3417.0,3417.0,3417.0,3417.0,3417.0
mean,2.544337,6.214223,15.441323,2015.399181,40.611062
std,1.009256,3.3269,8.772003,2.22488,54.210218
min,1.0,1.0,1.0,2011.0,0.0
25%,2.0,3.0,8.0,2014.0,10.0
50%,2.0,6.0,15.0,2015.0,27.0
75%,3.0,9.0,23.0,2017.0,54.0
max,4.0,12.0,31.0,2022.0,802.0


In [24]:
# split data by SOD
df_pmi_one = df_merged[df_merged.label == 1]
print(df_pmi_one.shape)
df_pmi_two = df_merged[df_merged.label == 2]
print(df_pmi_two.shape)
df_pmi_three = df_merged[df_merged.label == 3]
print(df_pmi_three.shape)
df_pmi_four = df_merged[df_merged.label == 4]
print(df_pmi_four.shape)

(518, 15)
(1319, 15)
(782, 15)
(798, 15)


In [25]:
df_pmi_one[df_pmi_one.img_pmi_days == 379]

Unnamed: 0,path,label,img,new_id,new_img,old_path,new_path,old_id,old_img,old_month,old_day,old_yr,old_date,date_placed_ARF,img_pmi_days
2161,/da1_data/icputrd/arf/mean.js/public/anau_img3/84b/84b10120.17.JPG,1,84b10120.17.JPG,84b,84b10120.17.JPG,/da1_data/icputrd/arf/mean.js/public/2014/UT96-14D/Daily Photos/UT96-14D_01_20_2015 (17).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/84b/84b10120.17.JPG,UT96-14D,UT96-14D_01_20_2015 (17).JPG,1,20,2015,2015-01-20,2014-01-06,379.0


### SOD 1

In [18]:
df_pmi_one.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 518 entries, 339 to 3410
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   path             518 non-null    object        
 1   label            518 non-null    int64         
 2   img              518 non-null    object        
 3   new_id           518 non-null    object        
 4   new_img          518 non-null    object        
 5   old_path         518 non-null    object        
 6   new_path         518 non-null    object        
 7   old_id           518 non-null    object        
 8   old_img          518 non-null    object        
 9   old_month        518 non-null    int64         
 10  old_day          518 non-null    int64         
 11  old_yr           518 non-null    int64         
 12  old_date         518 non-null    object        
 13  date_placed_ARF  518 non-null    datetime64[ns]
 14  img_pmi_days     518 non-null    float6

In [19]:
display(df_pmi_one.describe(datetime_is_numeric=True))

Unnamed: 0,label,old_month,old_day,old_yr,date_placed_ARF,img_pmi_days
count,518.0,518.0,518.0,518.0,518,518.0
mean,1.0,5.696911,14.227799,2015.816602,2016-03-20 13:48:25.019304960,9.53668
min,1.0,1.0,1.0,2011.0,2011-12-26 00:00:00,0.0
25%,1.0,1.0,7.0,2014.0,2014-01-07 00:00:00,0.0
50%,1.0,6.0,13.0,2016.0,2016-01-11 00:00:00,1.0
75%,1.0,9.0,21.0,2018.0,2018-01-29 00:00:00,3.0
max,1.0,12.0,31.0,2021.0,2021-11-17 00:00:00,379.0
std,0.0,3.966351,8.201656,2.458781,,48.396669


In [20]:
# analyze 4th quartile, meaning pmi > 3
display(df_pmi_one[df_pmi_one.img_pmi_days > 3].sort_values('path').describe(datetime_is_numeric=True))
display(df_pmi_one[df_pmi_one.img_pmi_days > 3].sort_values('path').sort_values('path').shape)
display(df_pmi_one[df_pmi_one.img_pmi_days > 3]['date_placed_ARF'].dt.month.value_counts())
# placement mostly during winter where decomp is slower, some during fall and spring, and one during summer

Unnamed: 0,label,old_month,old_day,old_yr,date_placed_ARF,img_pmi_days
count,121.0,121.0,121.0,121.0,121,121.0
mean,1.0,4.033058,13.066116,2014.818182,2014-12-30 19:14:22.809917440,39.099174
min,1.0,1.0,1.0,2011.0,2011-12-26 00:00:00,4.0
25%,1.0,1.0,6.0,2013.0,2013-03-09 00:00:00,6.0
50%,1.0,2.0,13.0,2014.0,2014-02-21 00:00:00,11.0
75%,1.0,3.0,19.0,2016.0,2016-02-10 00:00:00,18.0
max,1.0,12.0,31.0,2021.0,2021-11-09 00:00:00,379.0
std,0.0,4.057778,7.919107,2.028957,,94.548789


(121, 15)

1     53
12    22
2     21
3     11
11    10
10     3
7      1
Name: date_placed_ARF, dtype: int64

### SOD 2

In [22]:
display(df_pmi_two.describe(datetime_is_numeric=True))

Unnamed: 0,label,old_month,old_day,old_yr,date_placed_ARF,img_pmi_days
count,1319.0,1319.0,1319.0,1319.0,1319,1319.0
mean,2.0,5.312358,15.939348,2015.349507,2015-08-22 12:12:33.297952768,39.721001
min,2.0,1.0,1.0,2012.0,2011-12-26 00:00:00,0.0
25%,2.0,3.0,8.5,2014.0,2013-12-19 00:00:00,10.0
50%,2.0,4.0,16.0,2015.0,2015-09-28 00:00:00,26.0
75%,2.0,9.0,24.0,2016.0,2016-10-27 00:00:00,56.0
max,2.0,12.0,31.0,2022.0,2022-05-06 00:00:00,449.0
std,0.0,3.310363,8.838165,2.223116,,48.813893


In [23]:
# when pmi == 0 but we are in SOD 2
display(df_pmi_two[df_pmi_two.img_pmi_days == 0].sort_values('path').sort_values('path'))
display(df_pmi_two[df_pmi_two.img_pmi_days == 0]['date_placed_ARF'].dt.month.value_counts())

# not many with pmi=0. Double check SOD labeling. 

Unnamed: 0,path,label,img,new_id,new_img,old_path,new_path,old_id,old_img,old_month,old_day,old_yr,old_date,date_placed_ARF,img_pmi_days
1067,/da1_data/icputrd/arf/mean.js/public/anau_img3/00f/00f10920.10.JPG,2,00f10920.10.JPG,00f,00f10920.10.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT105-18D/Daily Photos/UT105-18D_09_20_2019 (10).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/00f/00f10920.10.JPG,UT105-18D,UT105-18D_09_20_2019 (10).JPG,9,20,2019,2019-09-20,2019-09-20,0.0
743,/da1_data/icputrd/arf/mean.js/public/anau_img3/045/04500611.10.JPG,2,04500611.10.JPG,045,04500611.10.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT54-18D/Daily Photos/UT54-18D_06_11_2018 (10).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/045/04500611.10.JPG,UT54-18D,UT54-18D_06_11_2018 (10).JPG,6,11,2018,2018-06-11,2018-06-11,0.0
1192,/da1_data/icputrd/arf/mean.js/public/anau_img3/05d/05d01001.21.JPG,2,05d01001.21.JPG,05d,05d01001.21.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT76-18D/Daily Photos/UT76-18D_10_01_2018 (21).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/05d/05d01001.21.JPG,UT76-18D,UT76-18D_10_01_2018 (21).JPG,10,1,2018,2018-10-01,2018-10-01,0.0
1518,/da1_data/icputrd/arf/mean.js/public/anau_img3/12b/12b00604.07.JPG,2,12b00604.07.JPG,12b,12b00604.07.JPG,/da1_data/icputrd/arf/mean.js/public/2021/UT15-21D/Daily Photos/UT15-21D_06_04_2021 (7).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/12b/12b00604.07.JPG,UT15-21D,UT15-21D_06_04_2021 (7).JPG,6,4,2021,2021-06-04,2021-06-04,0.0
1697,/da1_data/icputrd/arf/mean.js/public/anau_img3/2b7/2b701014.21.JPG,2,2b701014.21.JPG,2b7,2b701014.21.JPG,/da1_data/icputrd/arf/mean.js/public/2016/UT73-16D/Daily Photos/UT73-16D_10_14_2016 (21).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/2b7/2b701014.21.JPG,UT73-16D,UT73-16D_10_14_2016 (21).JPG,10,14,2016,2016-10-14,2016-10-14,0.0
2095,/da1_data/icputrd/arf/mean.js/public/anau_img3/7d0/7d000927.62.JPG,2,7d000927.62.JPG,7d0,7d000927.62.JPG,/da1_data/icputrd/arf/mean.js/public/2013/UT58-13D/Daily Photos/UT58-13D_09_27_2013 (62).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/7d0/7d000927.62.JPG,UT58-13D,UT58-13D_09_27_2013 (62).JPG,9,27,2013,2013-09-27,2013-09-27,0.0
2234,/da1_data/icputrd/arf/mean.js/public/anau_img3/93c/93c00113.19.JPG,2,93c00113.19.JPG,93c,93c00113.19.JPG,/da1_data/icputrd/arf/mean.js/public/2012/UT05-12D/Daily Photos/UT05-12D_01_13_2012 (19).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/93c/93c00113.19.JPG,UT05-12D,UT05-12D_01_13_2012 (19).JPG,1,13,2012,2012-01-13,2012-01-13,0.0
2289,/da1_data/icputrd/arf/mean.js/public/anau_img3/9bf/9bf01221.04.JPG,2,9bf01221.04.JPG,9bf,9bf01221.04.JPG,/da1_data/icputrd/arf/mean.js/public/2012/UT90-12D/Daily Photos/UT90-12D_12_21_2012 (4).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/9bf/9bf01221.04.JPG,UT90-12D,UT90-12D_12_21_2012 (4).JPG,12,21,2012,2012-12-21,2012-12-21,0.0
2293,/da1_data/icputrd/arf/mean.js/public/anau_img3/9c5/9c501003.22.JPG,2,9c501003.22.JPG,9c5,9c501003.22.JPG,/da1_data/icputrd/arf/mean.js/public/2014/UT82-14D/Daily Photos/UT82-14D_10_03_2014 (22).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/9c5/9c501003.22.JPG,UT82-14D,UT82-14D_10_03_2014 (22).JPG,10,3,2014,2014-10-03,2014-10-03,0.0
959,/da1_data/icputrd/arf/mean.js/public/anau_img3/9d3/9d300707.67.JPG,2,9d300707.67.JPG,9d3,9d300707.67.JPG,/da1_data/icputrd/arf/mean.js/public/2016/UT44-16D/Daily Photos/UT44-16D_07_07_2016 (67.JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/9d3/9d300707.67.JPG,UT44-16D,UT44-16D_07_07_2016 (67.JPG,7,7,2016,2016-07-07,2016-07-07,0.0


10    4
6     3
9     2
5     2
1     1
12    1
7     1
3     1
8     1
Name: date_placed_ARF, dtype: int64

In [24]:
# analyze 4th quartile, meaning pmi > 56
display(df_pmi_two[df_pmi_two.img_pmi_days > 56].sort_values('path').sort_values('path').shape)
display(df_pmi_two[df_pmi_two.img_pmi_days > 56]['date_placed_ARF'].dt.month.value_counts())

# Majority with placement in winter months. 

(328, 15)

1     182
11     55
2      36
12     31
3      11
10      9
4       3
7       1
Name: date_placed_ARF, dtype: int64

### SOD 3

In [25]:
display(df_pmi_three.describe())

Unnamed: 0,label,old_month,old_day,old_yr,img_pmi_days
count,782.0,782.0,782.0,782.0,782.0
mean,3.0,7.2289,15.849105,2015.507673,48.73913
std,0.0,3.000219,8.994677,2.007494,47.668628
min,3.0,1.0,1.0,2012.0,4.0
25%,3.0,5.0,7.25,2014.0,21.0
50%,3.0,7.0,16.0,2016.0,35.0
75%,3.0,10.0,23.75,2016.0,58.0
max,3.0,12.0,31.0,2022.0,443.0


In [28]:
# analyze 4th quartile, meaning pmi > 58
display(df_pmi_three[df_pmi_three.img_pmi_days > 58].sort_values('path').sort_values('path'))
display(df_pmi_three[df_pmi_three.img_pmi_days > 58]['date_placed_ARF'].dt.month.value_counts())

# Majority with placement in winter months. 

Unnamed: 0,path,label,img,new_id,new_img,old_path,new_path,old_id,old_img,old_month,old_day,old_yr,old_date,date_placed_ARF,img_pmi_days
1064,/da1_data/icputrd/arf/mean.js/public/anau_img3/00e/00e10305.19.JPG,3,00e10305.19.JPG,00e,00e10305.19.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT104-18D/Daily Photos/UT104-18D_03_05_2019 (19).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/00e/00e10305.19.JPG,UT104-18D,UT104-18D_03_05_2019 (19).JPG,3,5,2019,2019-03-05,2018-12-20,75.0
1065,/da1_data/icputrd/arf/mean.js/public/anau_img3/00e/00e10313.21.JPG,3,00e10313.21.JPG,00e,00e10313.21.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT104-18D/Daily Photos/UT104-18D_03_13_2019 (21).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/00e/00e10313.21.JPG,UT104-18D,UT104-18D_03_13_2019 (21).JPG,3,13,2019,2019-03-13,2018-12-20,83.0
492,/da1_data/icputrd/arf/mean.js/public/anau_img3/00e/00e10315.21.JPG,3,00e10315.21.JPG,00e,00e10315.21.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT104-18D/Daily Photos/UT104-18D_03_15_2019 (21).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/00e/00e10315.21.JPG,UT104-18D,UT104-18D_03_15_2019 (21).JPG,3,15,2019,2019-03-15,2018-12-20,85.0
598,/da1_data/icputrd/arf/mean.js/public/anau_img3/013/01310320.08.JPG,3,01310320.08.JPG,013,01310320.08.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT108-18D/Daily Photos/UT108-18D_03_20_2019 (8).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/013/01310320.08.JPG,UT108-18D,UT108-18D_03_20_2019 (8).JPG,3,20,2019,2019-03-20,2018-12-13,97.0
1113,/da1_data/icputrd/arf/mean.js/public/anau_img3/01b/01b00425.11.JPG,3,01b00425.11.JPG,01b,01b00425.11.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT17-18D/Daily Photos/UT17-18D_04_25_2018 (11).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/01b/01b00425.11.JPG,UT17-18D,UT17-18D_04_25_2018 (11).JPG,4,25,2018,2018-04-25,2018-01-26,89.0
787,/da1_data/icputrd/arf/mean.js/public/anau_img3/01b/01b00426.13.JPG,3,01b00426.13.JPG,01b,01b00426.13.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT17-18D/Daily Photos/UT17-18D_04_26_2018 (13).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/01b/01b00426.13.JPG,UT17-18D,UT17-18D_04_26_2018 (13).JPG,4,26,2018,2018-04-26,2018-01-26,90.0
1132,/da1_data/icputrd/arf/mean.js/public/anau_img3/024/02400507.07.JPG,3,02400507.07.JPG,024,02400507.07.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT25-18D/Daily Photos/UT25-18D_05_07_2018 (7).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/024/02400507.07.JPG,UT25-18D,UT25-18D_05_07_2018 (7).JPG,5,7,2018,2018-05-07,2018-03-02,66.0
1164,/da1_data/icputrd/arf/mean.js/public/anau_img3/040/04010128.03.JPG,3,04010128.03.JPG,040,04010128.03.JPG,/da1_data/icputrd/arf/mean.js/public/2016/UT42-16D/Daily Photos/UT42-16D_01_28_2017 (3).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/040/04010128.03.JPG,UT42-16D,UT42-16D_01_28_2017 (3).JPG,1,28,2017,2017-01-28,2016-07-07,205.0
1178,/da1_data/icputrd/arf/mean.js/public/anau_img3/055/05501213.20.JPG,3,05501213.20.JPG,055,05501213.20.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT72-18D/Daily Photos/UT72-18D_12_13_2018 (20).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/055/05501213.20.JPG,UT72-18D,UT72-18D_12_13_2018 (20).JPG,12,13,2018,2018-12-13,2018-10-12,62.0
1264,/da1_data/icputrd/arf/mean.js/public/anau_img3/078/07800409.08.JPG,3,07800409.08.JPG,078,07800409.08.JPG,/da1_data/icputrd/arf/mean.js/public/2019/UT02-19D/Daily Photos/UT02-19D_04_09_2019 (8).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/078/07800409.08.JPG,UT02-19D,UT02-19D_04_09_2019 (8).JPG,4,9,2019,2019-04-09,2019-01-14,85.0


1     41
2     40
10    22
11    19
12    14
3     14
7     14
5     12
9      8
4      5
8      3
6      1
Name: date_placed_ARF, dtype: int64

### SOD 4

In [27]:
display(df_pmi_four.describe())

Unnamed: 0,label,old_month,old_day,old_yr,img_pmi_days
count,798.0,798.0,798.0,798.0,798.0
mean,4.0,7.046366,15.006266,2015.10401,54.288221
std,0.0,2.658592,8.717436,2.22633,63.467947
min,4.0,1.0,1.0,2011.0,5.0
25%,4.0,5.0,7.0,2013.0,22.0
50%,4.0,7.0,14.0,2014.0,37.0
75%,4.0,9.0,22.0,2017.0,67.75
max,4.0,12.0,31.0,2022.0,802.0


In [127]:
# analyze 4th quartile, meaning pmi > 67
display(df_pmi_four[df_pmi_four.img_pmi_days > 67].sort_values('path').sort_values('path').shape)
display(df_pmi_four[df_pmi_four.img_pmi_days > 67]['date_placed_ARF'].dt.month.value_counts())

# Placement more uniformally distributed  

(200, 6)

1     32
3     24
2     24
4     21
7     20
12    19
8     15
6     15
11    13
10     8
5      5
9      4
Name: date_placed_ARF, dtype: int64