In [2]:
import pandas as pd
import numpy as np

# Notebook-wide pandas display settings: no truncation of rows or columns,
# and cells wide enough to show the long image paths in full.
for _option, _value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('max_colwidth', 700),
):
    pd.set_option(_option, _value)

This notebook calculates the PMI (in days) for each image. img_PMI = img_date - date_placed_ARF
* date_placed_ARF - the date a donor was placed at the ARF to start decomposing. Will be used as the time of death when calculating the PMI of an image. Can be found in the demography data. 
* img_date - the date an image was taken. Since it was discovered that this date was incorrectly mapped during old-to-new ID mapping, it is extracted from the old image file name instead (e.g., UT01-18D_01_21_2018 (1).JPG → 2018-01-21). 

**Note**: Column names prefixed with 'old' represent data before old-to-new ID mapping, and names prefixed with 'new' after.

**Metadata**:
- Total # of donors: 841
    - \# of images: 1,376,741
- Total # of donors with a valid PMI: 689 (-152 donors)
    - \# of images: 1,129,871 (-246,870 images: 221,905 from donors without a date placed at the ARF and 24,965 from donors with a negative PMI)
    - Valid PMI means that the donor has a date placed at the ARF and none of its images has a negative PMI
        - 127 donors do not have a date placed at the ARF, meaning no PMI
        - 25 donors have a negative PMI
    - Saved as ../data/img_PMIs_no_negs.csv

**Notebook outputs**:
- mapping_metad_df3 (../data/img_PMIs.csv): images with a PMI, including outliers/noise (e.g., negative pmi).
- df_pmi2 (../data/img_PMIs_no_negs.csv): images with non-negative PMIs. Note, if a donor had at least one image with a negative PMI, all of its images were excluded, meaning it will be excluded from any downstream tasks.  

# Import old-to-new mapping metadata

In [24]:
# Load the old-to-new image mapping (one row per image). The file has no header
# row, so the column names are supplied here.
# dtype=str: new_id values are 3-character codes such as '000' or '009'. Reading
# every column as text guarantees that all-digit IDs are never inferred as
# integers (which would drop the leading zeros).
mapping_metad_df = pd.read_csv(
    '/da1_data/icputrd/arf/mean.js/public/anau_scripts/out/mapping_metadata.txt',
    header=None,
    names=['new_id', 'new_img', 'old_path'],
    dtype=str,
)
display(mapping_metad_df.head())
print(mapping_metad_df.shape)
# info() prints its own report and returns None, so it is not wrapped in print()
mapping_metad_df.info()

Unnamed: 0,new_id,new_img,old_path
0,0,00000121.01.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (1).JPG
1,0,00000121.02.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (2).JPG
2,0,00000121.03.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (3).JPG
3,0,00000121.04.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (4).JPG
4,0,00000121.05.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (5).JPG


(1376741, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1376741 entries, 0 to 1376740
Data columns (total 3 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   new_id    1376741 non-null  object
 1   new_img   1376741 non-null  object
 2   old_path  1376741 non-null  object
dtypes: object(3)
memory usage: 31.5+ MB
None


In [25]:
# Build the location of each renamed image: <image root>/<new_id>/<new_img>
new_img_root = '/da1_data/icputrd/arf/mean.js/public/anau_img3/'
relative_new_path = mapping_metad_df['new_id'].str.cat(mapping_metad_df['new_img'], sep='/')
mapping_metad_df['new_path'] = new_img_root + relative_new_path
mapping_metad_df.head()

Unnamed: 0,new_id,new_img,old_path,new_path
0,0,00000121.01.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (1).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG
1,0,00000121.02.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (2).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG
2,0,00000121.03.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (3).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG
3,0,00000121.04.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (4).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG
4,0,00000121.05.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (5).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG


In [26]:
# Process old_path into the columns needed to derive the correct image date.
# Layout of old_path, as seen in the rows displayed above:
#   /da1_data/.../public/<year>/<old_id>/Daily Photos/<old_id>_<MM>_<DD>_<YYYY> (<n>).JPG
# Each split is computed once and reused, instead of re-splitting every row
# for each derived column.
old_path_parts = mapping_metad_df['old_path'].str.split('/', expand=True)
mapping_metad_df['old_id'] = old_path_parts[7]   # donor folder
mapping_metad_df['old_img'] = old_path_parts[9]  # original image file name

old_img_parts = mapping_metad_df['old_img'].str.split('_', expand=True)
mapping_metad_df['old_month'] = old_img_parts[1]
mapping_metad_df['old_day'] = old_img_parts[2]
# the last token looks like '2018 (1).JPG'; only its first four characters are the year
mapping_metad_df['old_yr'] = old_img_parts[3].str[:4]

# the wide intermediate frames are no longer needed; release their memory
del old_path_parts, old_img_parts

display(mapping_metad_df.head())
print(mapping_metad_df.shape)
# info() prints its own report and returns None, so it is not wrapped in print()
mapping_metad_df.info()

Unnamed: 0,new_id,new_img,old_path,new_path,old_id,old_img,old_month,old_day,old_yr
0,0,00000121.01.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (1).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,UT01-18D,UT01-18D_01_21_2018 (1).JPG,1,21,2018
1,0,00000121.02.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (2).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,UT01-18D,UT01-18D_01_21_2018 (2).JPG,1,21,2018
2,0,00000121.03.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (3).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,UT01-18D,UT01-18D_01_21_2018 (3).JPG,1,21,2018
3,0,00000121.04.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (4).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,UT01-18D,UT01-18D_01_21_2018 (4).JPG,1,21,2018
4,0,00000121.05.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (5).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,UT01-18D,UT01-18D_01_21_2018 (5).JPG,1,21,2018


(1376741, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1376741 entries, 0 to 1376740
Data columns (total 9 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   new_id     1376741 non-null  object
 1   new_img    1376741 non-null  object
 2   old_path   1376741 non-null  object
 3   new_path   1376741 non-null  object
 4   old_id     1376741 non-null  object
 5   old_img    1376741 non-null  object
 6   old_month  1376741 non-null  object
 7   old_day    1376741 non-null  object
 8   old_yr     1376741 non-null  object
dtypes: object(9)
memory usage: 94.5+ MB
None


In [27]:
# how many distinct donors (old IDs) appear in the mapping
distinct_old_ids = mapping_metad_df['old_id'].unique()
print(distinct_old_ids.shape)

(841,)


In [28]:
# sanity check: list the distinct month tokens parsed from the file names
old_month_values = mapping_metad_df['old_month'].sort_values().unique()
print(old_month_values)
print(len(old_month_values))

['01' '02' '03' '04' '05' '06' '07' '08' '09' '10' '11' '12']
12


In [29]:
# sanity check: list the distinct day tokens parsed from the file names
old_day_values = mapping_metad_df['old_day'].sort_values().unique()
print(old_day_values)
print(len(old_day_values))

['01' '02' '03' '04' '05' '06' '07' '08' '09' '10' '11' '12' '13' '14'
 '15' '16' '17' '18' '19' '20' '21' '22' '23' '24' '25' '26' '27' '28'
 '29' '30' '31']
31


In [30]:
# sanity check: list the distinct year tokens parsed from the file names
old_yr_values = mapping_metad_df['old_yr'].sort_values().unique()
print(old_yr_values)
print(len(old_yr_values))

['2011' '2012' '2013' '2014' '2015' '2016' '2017' '2018' '2019' '2020'
 '2021' '2022']
12


In [31]:
# Record of a manual cleanup of mapping_metadata.txt; this cell executes nothing.
# To fix the unwanted values, the following commands were executed in
# /da1_data/icputrd/arf/mean.js/public/anau_scripts/out:
# NOTE(review): every command uses the same -ibackup suffix, so each run presumably
# overwrites mapping_metadata.txtbackup with the output of the previous step.
# Confirm that a copy of the untouched original file is kept elsewhere.

#sed -ibackup 's/_22\ (/_2022 (/g' mapping_metadata.txt
#sed -ibackup 's/_20\ (/_2020 (/g' mapping_metadata.txt
#sed -ibackup 's/__/_/g' mapping_metadata.txt
#sed -ibackup 's/\ 08_/08_(/g' mapping_metadata.txt
#sed -ibackup 's/_(16_/_16_/g' mapping_metadata.txt
#sed -ibackup 's/_5_/_05_/g' mapping_metadata.txt

In [32]:
# Assemble the corrected image date as a 'YYYY-MM-DD' string from its three parts
month_and_day = [mapping_metad_df['old_month'], mapping_metad_df['old_day']]
mapping_metad_df['old_date'] = mapping_metad_df['old_yr'].str.cat(month_and_day, sep='-')
display(mapping_metad_df.head())
mapping_metad_df.info()

Unnamed: 0,new_id,new_img,old_path,new_path,old_id,old_img,old_month,old_day,old_yr,old_date
0,0,00000121.01.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (1).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,UT01-18D,UT01-18D_01_21_2018 (1).JPG,1,21,2018,2018-01-21
1,0,00000121.02.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (2).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,UT01-18D,UT01-18D_01_21_2018 (2).JPG,1,21,2018,2018-01-21
2,0,00000121.03.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (3).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,UT01-18D,UT01-18D_01_21_2018 (3).JPG,1,21,2018,2018-01-21
3,0,00000121.04.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (4).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,UT01-18D,UT01-18D_01_21_2018 (4).JPG,1,21,2018,2018-01-21
4,0,00000121.05.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (5).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,UT01-18D,UT01-18D_01_21_2018 (5).JPG,1,21,2018,2018-01-21


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1376741 entries, 0 to 1376740
Data columns (total 10 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   new_id     1376741 non-null  object
 1   new_img    1376741 non-null  object
 2   old_path   1376741 non-null  object
 3   new_path   1376741 non-null  object
 4   old_id     1376741 non-null  object
 5   old_img    1376741 non-null  object
 6   old_month  1376741 non-null  object
 7   old_day    1376741 non-null  object
 8   old_yr     1376741 non-null  object
 9   old_date   1376741 non-null  object
dtypes: object(10)
memory usage: 105.0+ MB


# Import date_placed_ARF (PMI) data

In [33]:
# Load the demography data, keeping only the donor ID and the date the donor
# was placed at the ARF. usecols avoids parsing the columns that are not used;
# the trailing selection fixes the column order.
date_placed_cols = ['UTID', 'date_placed_ARF']
date_placed_df = pd.read_csv(
    '/da1_data/icputrd/arf/mean.js/public/anau_demography/demography_processed.csv',
    usecols=date_placed_cols,
)[date_placed_cols]
display(date_placed_df.head())
print(date_placed_df.shape)
# info() prints its own report and returns None, so it is not wrapped in print()
date_placed_df.info()

Unnamed: 0,UTID,date_placed_ARF
0,UT01-00D,
1,UT01-01D,
2,UT01-02D,
3,UT01-03D,
4,UT01-04D,


(2443, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2443 entries, 0 to 2442
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   UTID             2443 non-null   object
 1   date_placed_ARF  829 non-null    object
dtypes: object(2)
memory usage: 38.3+ KB
None


# Join mapping_metad_df with date_placed_df

In [34]:
# Attach each donor's date_placed_ARF to its images. how='left' keeps every
# image row; donors with no match in the demography data get NaN.
mapping_metad_df2 = pd.merge(
    mapping_metad_df, date_placed_df, left_on="old_id", right_on="UTID", how='left'
).drop(columns=['UTID'])  # right-hand join key is not needed after the merge
# A UTID listed more than once in the demography data would silently duplicate
# image rows, so verify that the join kept exactly one row per image.
assert len(mapping_metad_df2) == len(mapping_metad_df), 'merge changed the number of image rows'
display(mapping_metad_df2.head())
print(mapping_metad_df2.shape)
# info() prints its own report and returns None, so it is not wrapped in print()
mapping_metad_df2.info()

Unnamed: 0,new_id,new_img,old_path,new_path,old_id,old_img,old_month,old_day,old_yr,old_date,date_placed_ARF
0,0,00000121.01.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (1).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,UT01-18D,UT01-18D_01_21_2018 (1).JPG,1,21,2018,2018-01-21,2018-01-21
1,0,00000121.02.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (2).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,UT01-18D,UT01-18D_01_21_2018 (2).JPG,1,21,2018,2018-01-21,2018-01-21
2,0,00000121.03.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (3).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,UT01-18D,UT01-18D_01_21_2018 (3).JPG,1,21,2018,2018-01-21,2018-01-21
3,0,00000121.04.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (4).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,UT01-18D,UT01-18D_01_21_2018 (4).JPG,1,21,2018,2018-01-21,2018-01-21
4,0,00000121.05.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (5).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,UT01-18D,UT01-18D_01_21_2018 (5).JPG,1,21,2018,2018-01-21,2018-01-21


(1376741, 11)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1376741 entries, 0 to 1376740
Data columns (total 11 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   new_id           1376741 non-null  object
 1   new_img          1376741 non-null  object
 2   old_path         1376741 non-null  object
 3   new_path         1376741 non-null  object
 4   old_id           1376741 non-null  object
 5   old_img          1376741 non-null  object
 6   old_month        1376741 non-null  object
 7   old_day          1376741 non-null  object
 8   old_yr           1376741 non-null  object
 9   old_date         1376741 non-null  object
 10  date_placed_ARF  1154836 non-null  object
dtypes: object(11)
memory usage: 126.0+ MB
None


In [35]:
# keep only the images whose donor has a date placed at the ARF (no date -> no PMI)
has_date_placed = mapping_metad_df2['date_placed_ARF'].notnull()
mapping_metad_df3 = mapping_metad_df2[has_date_placed].copy()
display(mapping_metad_df3.head())
# number of image rows removed, followed by the shape of what is left
n_rows_removed = mapping_metad_df2.shape[0] - mapping_metad_df3.shape[0]
print(n_rows_removed)
print(mapping_metad_df3.shape)

Unnamed: 0,new_id,new_img,old_path,new_path,old_id,old_img,old_month,old_day,old_yr,old_date,date_placed_ARF
0,0,00000121.01.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (1).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,UT01-18D,UT01-18D_01_21_2018 (1).JPG,1,21,2018,2018-01-21,2018-01-21
1,0,00000121.02.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (2).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,UT01-18D,UT01-18D_01_21_2018 (2).JPG,1,21,2018,2018-01-21,2018-01-21
2,0,00000121.03.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (3).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,UT01-18D,UT01-18D_01_21_2018 (3).JPG,1,21,2018,2018-01-21,2018-01-21
3,0,00000121.04.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (4).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,UT01-18D,UT01-18D_01_21_2018 (4).JPG,1,21,2018,2018-01-21,2018-01-21
4,0,00000121.05.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (5).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,UT01-18D,UT01-18D_01_21_2018 (5).JPG,1,21,2018,2018-01-21,2018-01-21


221905
(1154836, 11)


In [44]:
# count the distinct donors (by old and new ID) that have no date placed at the ARF,
# i.e. donors for which no PMI can be computed
missing_date_placed = mapping_metad_df2['date_placed_ARF'].isnull()
mapping_metad_df2.loc[missing_date_placed, ['old_id', 'new_id']].nunique()

old_id    127
new_id    127
dtype: int64

In [36]:
# distinct donors (new IDs) remaining in mapping_metad_df3
remaining_new_ids = mapping_metad_df3['new_id'].unique()
print(remaining_new_ids.shape)

(714,)


In [14]:
# info() prints its own report and returns None; wrapping it in display()
# only added a stray "None" to the output
mapping_metad_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1154836 entries, 0 to 1376740
Data columns (total 11 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   new_id           1154836 non-null  object
 1   new_img          1154836 non-null  object
 2   old_path         1154836 non-null  object
 3   new_path         1154836 non-null  object
 4   old_id           1154836 non-null  object
 5   old_img          1154836 non-null  object
 6   old_month        1154836 non-null  object
 7   old_day          1154836 non-null  object
 8   old_yr           1154836 non-null  object
 9   old_date         1154836 non-null  object
 10  date_placed_ARF  1154836 non-null  object
dtypes: object(11)
memory usage: 105.7+ MB


None

In [48]:
# convert dtypes: both columns hold 'YYYY-MM-DD' strings
date_cols = ['old_date', 'date_placed_ARF']
for date_col in date_cols:
    mapping_metad_df3[date_col] = pd.to_datetime(mapping_metad_df3[date_col], format='%Y-%m-%d', errors='coerce')

# errors='coerce' turns any value that does not parse into NaT instead of raising.
# Rows with a missing date_placed_ARF were already removed, so a NaT here means a
# malformed date. It would yield a NaN PMI that the later "PMI < 0" filter cannot
# catch, so fail loudly instead of letting it through.
n_unparsed = mapping_metad_df3[date_cols].isna().sum()
assert not n_unparsed.any(), f'dates that failed to parse (now NaT):\n{n_unparsed}'
mapping_metad_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1154836 entries, 0 to 1376740
Data columns (total 11 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   new_id           1154836 non-null  object        
 1   new_img          1154836 non-null  object        
 2   old_path         1154836 non-null  object        
 3   new_path         1154836 non-null  object        
 4   old_id           1154836 non-null  object        
 5   old_img          1154836 non-null  object        
 6   old_month        1154836 non-null  object        
 7   old_day          1154836 non-null  object        
 8   old_yr           1154836 non-null  object        
 9   old_date         1154836 non-null  datetime64[ns]
 10  date_placed_ARF  1154836 non-null  datetime64[ns]
dtypes: datetime64[ns](2), object(9)
memory usage: 105.7+ MB


In [49]:
# calculate pmi per image: image date minus the date the donor was placed at the ARF
pmi_timedelta = mapping_metad_df3['old_date'] - mapping_metad_df3['date_placed_ARF']
# dividing by a one-day timedelta converts the difference to a float number of days
mapping_metad_df3['img_pmi_days'] = pmi_timedelta / np.timedelta64(1, 'D')
display(mapping_metad_df3.head())
# info() prints its own report and returns None; wrapping it in display()
# only added a stray "None" to the output
mapping_metad_df3.info()

Unnamed: 0,new_id,new_img,old_path,new_path,old_id,old_img,old_month,old_day,old_yr,old_date,date_placed_ARF,img_pmi_days
0,0,00000121.01.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (1).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,UT01-18D,UT01-18D_01_21_2018 (1).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
1,0,00000121.02.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (2).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,UT01-18D,UT01-18D_01_21_2018 (2).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
2,0,00000121.03.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (3).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,UT01-18D,UT01-18D_01_21_2018 (3).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
3,0,00000121.04.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (4).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,UT01-18D,UT01-18D_01_21_2018 (4).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
4,0,00000121.05.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (5).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,UT01-18D,UT01-18D_01_21_2018 (5).JPG,1,21,2018,2018-01-21,2018-01-21,0.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1154836 entries, 0 to 1376740
Data columns (total 12 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   new_id           1154836 non-null  object        
 1   new_img          1154836 non-null  object        
 2   old_path         1154836 non-null  object        
 3   new_path         1154836 non-null  object        
 4   old_id           1154836 non-null  object        
 5   old_img          1154836 non-null  object        
 6   old_month        1154836 non-null  object        
 7   old_day          1154836 non-null  object        
 8   old_yr           1154836 non-null  object        
 9   old_date         1154836 non-null  datetime64[ns]
 10  date_placed_ARF  1154836 non-null  datetime64[ns]
 11  img_pmi_days     1154836 non-null  float64       
dtypes: datetime64[ns](2), float64(1), object(9)
memory usage: 114.5+ MB


None

In [51]:
# spot-check a few random rows; the fixed seed makes a re-run show the same rows
mapping_metad_df3.sample(5, random_state=0)

Unnamed: 0,new_id,new_img,old_path,new_path,old_id,old_img,old_month,old_day,old_yr,old_date,date_placed_ARF,img_pmi_days
1136523,d60,d6000608.16.JPG,/da1_data/icputrd/arf/mean.js/public/2014/UT17-14D/Daily Photos/UT17-14D_06_08_2014 (16).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/d60/d6000608.16.JPG,UT17-14D,UT17-14D_06_08_2014 (16).JPG,6,8,2014,2014-06-08,2014-03-10,90.0
1209461,e20,e2000930.31.JPG,/da1_data/icputrd/arf/mean.js/public/2015/UT54-15D/Daily Photos/UT54-15D_09_30_2015 (31).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/e20/e2000930.31.JPG,UT54-15D,UT54-15D_09_30_2015 (31).JPG,9,30,2015,2015-09-30,2015-09-22,8.0
1266569,ecd,ecd00407.42.JPG,/da1_data/icputrd/arf/mean.js/public/2014/UT03-14D/Daily Photos/UT03-14D_04_07_2014 (42).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/ecd/ecd00407.42.JPG,UT03-14D,UT03-14D_04_07_2014 (42).JPG,4,7,2014,2014-04-07,2014-01-27,70.0
789306,8dd,8dd00320.51.JPG,/da1_data/icputrd/arf/mean.js/public/2016/UT02-16D/Daily Photos/UT02-16D_03_20_2016 (51).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/8dd/8dd00320.51.JPG,UT02-16D,UT02-16D_03_20_2016 (51).JPG,3,20,2016,2016-03-20,2016-01-08,72.0
109920,06a,06a01106.40.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT88-18D/Daily Photos/UT88-18D_11_06_2018 (40).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/06a/06a01106.40.JPG,UT88-18D,UT88-18D_11_06_2018 (40).JPG,11,6,2018,2018-11-06,2018-10-08,29.0


In [52]:
# Summary statistics. Check the range of img_pmi_days: a negative value means
# the image date is earlier than date_placed_ARF.
mapping_metad_df3.describe()

Unnamed: 0,img_pmi_days
count,1154836.0
mean,55.15684
std,81.17176
min,-637.0
25%,16.0
50%,36.0
75%,67.0
max,2359.0


In [54]:
# write to csv: every image in mapping_metad_df3 with its PMI, with no filtering
# on the PMI value; index=False leaves the row index out of the file
mapping_metad_df3.to_csv('../data/img_PMIs.csv', index=False)

# Negative PMI analysis and removal
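Negative PMIs cannot occur: an image cannot be taken before the donor is placed at the ARF, so a negative value points to an error in the recorded dates. After consulting with the rest of the team, we decided to exclude these donors from the PMI estimation study.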

In [11]:
# Re-load the per-image PMIs written by the to_csv cell above.
# The ID columns are read as text: new_id values such as '009' consist only of
# digits, and if pandas inferred them as integers the leading zeros would be
# lost and the ID-based filtering below would no longer match them.
df_pmi = pd.read_csv('../data/img_PMIs.csv', dtype={'new_id': str, 'old_id': str})
print(df_pmi.shape)
display(df_pmi.head())

(1154836, 12)


Unnamed: 0,new_id,new_img,old_path,new_path,old_id,old_img,old_month,old_day,old_yr,old_date,date_placed_ARF,img_pmi_days
0,0,00000121.01.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (1).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,UT01-18D,UT01-18D_01_21_2018 (1).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
1,0,00000121.02.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (2).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,UT01-18D,UT01-18D_01_21_2018 (2).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
2,0,00000121.03.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (3).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,UT01-18D,UT01-18D_01_21_2018 (3).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
3,0,00000121.04.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (4).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,UT01-18D,UT01-18D_01_21_2018 (4).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
4,0,00000121.05.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (5).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,UT01-18D,UT01-18D_01_21_2018 (5).JPG,1,21,2018,2018-01-21,2018-01-21,0.0


In [55]:
# distinct donors (new IDs) present in the re-loaded PMI table
pmi_new_ids = df_pmi['new_id'].unique()
print(pmi_new_ids.shape)

(714,)


In [56]:
# PMI cannot be negative, so inspect the rows where it is:
# row count, affected donors (new and old IDs) and the distinct negative values
negative_pmi_rows = df_pmi[df_pmi['img_pmi_days'] < 0]
print(negative_pmi_rows.shape)
negative_new_ids = negative_pmi_rows['new_id'].unique()
print(negative_new_ids.shape)
print(negative_new_ids)
print(negative_pmi_rows['old_id'].unique())
print(negative_pmi_rows['img_pmi_days'].unique())

(910, 12)
(25,)
['009' '022' '02f' '034' '038' '053' '054' '073' '074' '09f' '0ab' '0e0'
 '0e4' '109' '10a' '11a' '121' '13f' '16b' '20f' '4c2' '844' '8ea' 'c79'
 'd83']
['UT101-18D' 'UT24-18D' 'UT34-18D' 'UT40-18D' 'UT43-18D' 'UT70-18D'
 'UT71-18D' 'UT95-18D' 'UT96-18D' 'UT36-19D' 'UT48-19D' 'UT98-19D'
 'UT92-16D' 'UT60-20D' 'UT61-20D' 'UT79-20D' 'UT101-21D' 'UT31-21D'
 'UT97-17D' 'UT86-16D' 'UT33-17D' 'UT51-17D' 'UT49-17D' 'UT24-17D'
 'UT78-17D']
[ -88.   -6.   -5.   -4.   -2.   -1. -637.  -24.  -42.  -34. -149. -326.
 -176. -288.  -76. -218. -109.  -12.  -11.  -10.   -9.   -8.   -7.   -3.
  -90.  -84.  -77.  -60.  -26.]


In [57]:
# drop imgs of donors that have at least one negative PMI
# The donor list is derived from the data rather than copied by hand from the
# output of the analysis cell, so it stays correct if img_PMIs.csv is regenerated.
drop_ls = df_pmi.loc[df_pmi['img_pmi_days'] < 0, 'new_id'].unique().tolist()
df_pmi2 = df_pmi[~df_pmi['new_id'].isin(drop_ls)].copy()
# every donor with a negative PMI was removed as a whole, so none may remain
assert not (df_pmi2['img_pmi_days'] < 0).any(), 'negative PMIs remain after dropping donors'
display(df_pmi2.head())
print('No. imgs dropped:', df_pmi.shape[0] - df_pmi2.shape[0])
display(df_pmi2.describe())

Unnamed: 0,new_id,new_img,old_path,new_path,old_id,old_img,old_month,old_day,old_yr,old_date,date_placed_ARF,img_pmi_days
0,0,00000121.01.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (1).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.01.JPG,UT01-18D,UT01-18D_01_21_2018 (1).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
1,0,00000121.02.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (2).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.02.JPG,UT01-18D,UT01-18D_01_21_2018 (2).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
2,0,00000121.03.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (3).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.03.JPG,UT01-18D,UT01-18D_01_21_2018 (3).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
3,0,00000121.04.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (4).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.04.JPG,UT01-18D,UT01-18D_01_21_2018 (4).JPG,1,21,2018,2018-01-21,2018-01-21,0.0
4,0,00000121.05.JPG,/da1_data/icputrd/arf/mean.js/public/2018/UT01-18D/Daily Photos/UT01-18D_01_21_2018 (5).JPG,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000121.05.JPG,UT01-18D,UT01-18D_01_21_2018 (5).JPG,1,21,2018,2018-01-21,2018-01-21,0.0


No. imgs dropped: 24965


Unnamed: 0,old_month,old_day,old_yr,img_pmi_days
count,1129871.0,1129871.0,1129871.0,1129871.0
mean,6.069671,15.60275,2015.514,55.28659
std,3.280162,8.787504,2.233141,80.94713
min,1.0,1.0,2011.0,0.0
25%,3.0,8.0,2014.0,16.0
50%,5.0,15.0,2015.0,36.0
75%,9.0,23.0,2017.0,67.0
max,12.0,31.0,2022.0,2359.0


In [58]:
# distinct donors (new IDs) left once the negative-PMI donors are removed
valid_pmi_new_ids = df_pmi2['new_id'].unique()
print(valid_pmi_new_ids.shape)

(689,)


In [59]:
# save as csv: images of the donors that have no negative PMI;
# index=False leaves the row index out of the file
df_pmi2.to_csv('../data/img_PMIs_no_negs.csv', index=False)