In [4]:
import pandas as pd
from datetime import timedelta

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', 700)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

# Import & process SOD-labeled data
This data will be used to label the following unlabeled data:
1. Pre-labeled as head
2. Predicted as head (will require evaluation to ensure data only consists of heads)

In [67]:
df_SOD_labeled = pd.read_csv('../data/original/stages.csv.20221114_correct.processed', header=None, delimiter='/',
                    usecols=[8])
display(df_SOD_labeled.head())

Unnamed: 0,8
0,"00000122.08.JPG,0"
1,"00000129.14.JPG,0"
2,"00000213.21.JPG,1"
3,"00000219.07.JPG,1"
4,"00000222.08.JPG,2"


In [68]:
# remove any spaces 
df_SOD_labeled[8] = df_SOD_labeled[8].str.replace(' ','')

In [69]:
# split into file and label columns
df_SOD_labeled[['file','label']] = df_SOD_labeled[8].str.split(',', expand=True)
display(df_SOD_labeled.head())

Unnamed: 0,8,file,label
0,"00000122.08.JPG,0",00000122.08.JPG,0
1,"00000129.14.JPG,0",00000129.14.JPG,0
2,"00000213.21.JPG,1",00000213.21.JPG,1
3,"00000219.07.JPG,1",00000219.07.JPG,1
4,"00000222.08.JPG,2",00000222.08.JPG,2


In [70]:
# remove duplicates
print(df_SOD_labeled.shape)
df_SOD_labeled.drop_duplicates(subset='file', keep="first", inplace=True)
print(df_SOD_labeled.shape)

(4731, 3)
(4706, 3)


In [71]:
# split file column into multiple columns
df_SOD_labeled[['id_date', 'count','ext']] = df_SOD_labeled['file'].str.split('.', expand=True)
display(df_SOD_labeled.head())

Unnamed: 0,8,file,label,id_date,count,ext
0,"00000122.08.JPG,0",00000122.08.JPG,0,122,8,JPG
1,"00000129.14.JPG,0",00000129.14.JPG,0,129,14,JPG
2,"00000213.21.JPG,1",00000213.21.JPG,1,213,21,JPG
3,"00000219.07.JPG,1",00000219.07.JPG,1,219,7,JPG
4,"00000222.08.JPG,2",00000222.08.JPG,2,222,8,JPG


In [72]:
# check all values in id_date column are 8 characters long
df_SOD_labeled["id_date"].str.len().unique()

array([8])

In [73]:
# split id_date column into multiple columns
df_SOD_labeled['id'] = df_SOD_labeled['id_date'].str[:3]
df_SOD_labeled['yrs_in_fac'] = df_SOD_labeled['id_date'].str[3]
df_SOD_labeled['month_day'] = df_SOD_labeled['id_date'].str[4:]
df_SOD_labeled.drop([8,'ext'], axis=1, inplace=True)
display(df_SOD_labeled.head())
print(df_SOD_labeled.shape)

Unnamed: 0,file,label,id_date,count,id,yrs_in_fac,month_day
0,00000122.08.JPG,0,122,8,0,0,122
1,00000129.14.JPG,0,129,14,0,0,129
2,00000213.21.JPG,1,213,21,0,0,213
3,00000219.07.JPG,1,219,7,0,0,219
4,00000222.08.JPG,2,222,8,0,0,222


(4706, 7)


In [74]:
# get unique values of yrs_in_fac column
df_SOD_labeled.yrs_in_fac.unique()

array(['0', '1', '2'], dtype=object)

In [75]:
df_SOD_labeled[df_SOD_labeled.yrs_in_fac == '2']

Unnamed: 0,file,label,id_date,count,id,yrs_in_fac,month_day
299,07320814.08.JPG,3,07320814,8,073,2,814
2119,84a20422.04.JPG,3,84a20422,4,84a,2,422
4413,ff120811.01.JPG,3,ff120811,1,ff1,2,811
4414,ff121013.05.JPG,3,ff121013,5,ff1,2,1013


In [76]:
# create 'year' column from 'yrs_in_fac' column and then a 'date' column
def assign_year(row):
    if row['yrs_in_fac'] == '0':
        val = '12'
    elif row['yrs_in_fac']  == '1':
        val = '13'
    elif row['yrs_in_fac']  == '2':
        val = '14'
    return val

df_SOD_labeled['year'] = df_SOD_labeled.apply(assign_year, axis=1)  # create year column 
df_SOD_labeled['date'] = df_SOD_labeled['year']+df_SOD_labeled['month_day']  # create date column

# convert 'date' to dtype datetime
df_SOD_labeled['date'] = pd.to_datetime(df_SOD_labeled['date'], format='%y%m%d', errors='coerce')
display(df_SOD_labeled[df_SOD_labeled['date'].isnull()])
display(df_SOD_labeled.head())
print(df_SOD_labeled.shape)

df_SOD_labeled = df_SOD_labeled[df_SOD_labeled.date.notnull()].copy()
print(df_SOD_labeled.shape)

Unnamed: 0,file,label,id_date,count,id,yrs_in_fac,month_day,year,date
4027,fc010229.10.JPG,0,fc010229,10,fc0,1,229,13,NaT
4028,fc010229.11.JPG,0,fc010229,11,fc0,1,229,13,NaT
4029,fc010229.12.JPG,0,fc010229,12,fc0,1,229,13,NaT
4030,fc010229.13.JPG,0,fc010229,13,fc0,1,229,13,NaT
4031,fc010229.34.JPG,0,fc010229,34,fc0,1,229,13,NaT
4032,fc010229.35.JPG,0,fc010229,35,fc0,1,229,13,NaT


Unnamed: 0,file,label,id_date,count,id,yrs_in_fac,month_day,year,date
0,00000122.08.JPG,0,122,8,0,0,122,12,2012-01-22
1,00000129.14.JPG,0,129,14,0,0,129,12,2012-01-29
2,00000213.21.JPG,1,213,21,0,0,213,12,2012-02-13
3,00000219.07.JPG,1,219,7,0,0,219,12,2012-02-19
4,00000222.08.JPG,2,222,8,0,0,222,12,2012-02-22


(4706, 9)
(4700, 9)


In [77]:
df_SOD_labeled2 = df_SOD_labeled[['file','label', 'id','date']].copy()
display(df_SOD_labeled2.head())
print(df_SOD_labeled2.shape)

Unnamed: 0,file,label,id,date
0,00000122.08.JPG,0,0,2012-01-22
1,00000129.14.JPG,0,0,2012-01-29
2,00000213.21.JPG,1,0,2012-02-13
3,00000219.07.JPG,1,0,2012-02-19
4,00000222.08.JPG,2,0,2012-02-22


(4700, 4)


# Import & process pre-labeled head data
Need SOD label

In [78]:
df_head_labeled = pd.read_csv('/da1_data/icputrd/decaying_human_body_part_classifier/from_anau/ex1_preds_labeled_head', 
                                header=None, delimiter=':', 
                                   usecols=[0,2])
df_head_labeled.columns =['path', 'confidence']

df_head_labeled['path'] = df_head_labeled['path'].str.replace(' ','')
df_head_labeled['file'] = df_head_labeled['path'].str.split('/').str[-1]
df_head_labeled[['id_date', 'count','ext']] = df_head_labeled['file'].str.split('.', expand=True)
df_head_labeled['id'] = df_head_labeled['id_date'].str[:3]
df_head_labeled['yrs_in_fac'] = df_head_labeled['id_date'].str[3]
df_head_labeled['month_day'] = df_head_labeled['id_date'].str[4:]
df_head_labeled.drop(['path','ext'], axis=1, inplace=True)
display(df_head_labeled.head())
print(df_head_labeled.shape)
print(df_head_labeled.info())
display(df_head_labeled.describe())

Unnamed: 0,confidence,file,id_date,count,id,yrs_in_fac,month_day
0,99.99,1bc10209.36.JPG,1bc10209,36,1bc,1,209
1,100.0,1bc10212.30.JPG,1bc10212,30,1bc,1,212
2,100.0,1bc10213.30.JPG,1bc10213,30,1bc,1,213
3,99.99,1bc10214.32.JPG,1bc10214,32,1bc,1,214
4,100.0,1bc10216.31.JPG,1bc10216,31,1bc,1,216


(12629, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12629 entries, 0 to 12628
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   confidence  12629 non-null  float64
 1   file        12629 non-null  object 
 2   id_date     12629 non-null  object 
 3   count       12629 non-null  object 
 4   id          12629 non-null  object 
 5   yrs_in_fac  12629 non-null  object 
 6   month_day   12629 non-null  object 
dtypes: float64(1), object(6)
memory usage: 690.8+ KB
None


Unnamed: 0,confidence
count,12629.0
mean,98.857434
std,5.784056
min,29.45
25%,99.96
50%,100.0
75%,100.0
max,100.0


In [82]:
df_head_labeled['year'] = df_head_labeled.apply(assign_year, axis=1)  # create year column 
df_head_labeled['date'] = df_head_labeled['year']+df_head_labeled['month_day']  # create date column

# convert 'date' to dtype datetime
df_head_labeled['date'] = pd.to_datetime(df_head_labeled['date'], format='%y%m%d', errors='coerce')
display(df_head_labeled[df_head_labeled['date'].isnull()].shape)
display(df_head_labeled.head())
print(df_head_labeled.shape)

df_head_labeled = df_head_labeled[df_head_labeled.date.notnull()].copy()
print(df_head_labeled.shape)

(48, 9)

Unnamed: 0,confidence,file,id_date,count,id,yrs_in_fac,month_day,year,date
0,99.99,1bc10209.36.JPG,1bc10209,36,1bc,1,209,13,2013-02-09
1,100.0,1bc10212.30.JPG,1bc10212,30,1bc,1,212,13,2013-02-12
2,100.0,1bc10213.30.JPG,1bc10213,30,1bc,1,213,13,2013-02-13
3,99.99,1bc10214.32.JPG,1bc10214,32,1bc,1,214,13,2013-02-14
4,100.0,1bc10216.31.JPG,1bc10216,31,1bc,1,216,13,2013-02-16


(12629, 9)
(12581, 9)


In [83]:
df_head_labeled2 = df_head_labeled[['file', 'id', 'date']].copy()
display(df_head_labeled2.head())
print(df_head_labeled2.shape)

Unnamed: 0,file,id,date
0,1bc10209.36.JPG,1bc,2013-02-09
1,1bc10212.30.JPG,1bc,2013-02-12
2,1bc10213.30.JPG,1bc,2013-02-13
3,1bc10214.32.JPG,1bc,2013-02-14
4,1bc10216.31.JPG,1bc,2013-02-16


(12581, 3)


In [84]:
def label_propagation(minus_days, plus_plays):
    # create an Empty DataFrame object
    df_new = pd.DataFrame()
    # for each img 
    for row in df_SOD_labeled2.itertuples():
        # create date range
        date_minus = row[4] - timedelta(days=minus_days)
        date_plus = row[4] + timedelta(days=plus_plays)
        #print(date_minus, row[4], date_plus)
        # get all img between date range for current donor and append to new df
        #print(row[3])
        df = df_head_labeled2[df_head_labeled2['id'] == row[3]]
        if df.shape[0] == 0:
            continue
        #print(row)
        #print(date_minus, row[4], date_plus)
        #display(df)
        df_between = df.loc[df['date'].between(date_minus, date_plus)]
        if df_between.shape[0] != 0:
            #print(row)
            df_between.loc[:,'label'] = row[2] # propagate the label
            #display(df_between.head())
            # remove current img to avoid duplicates
            #print(df_between.shape)
            df_between = df_between[df_between.file != row[1]]
            #print(df_between.shape)
            df_new = pd.concat([df_new, df_between])
    
    return df_new

In [85]:
df_new = label_propagation(2,2)
print(df_new.shape)
display(df_new.head())

(9948, 4)


Unnamed: 0,file,id,date,label
1048,02d00704.36.JPG,02d,2012-07-04,0
1049,02d00704.48.JPG,02d,2012-07-04,0
1050,02d00705.41.JPG,02d,2012-07-05,0
1051,02d00705.48.JPG,02d,2012-07-05,0
1052,02d00705.49.JPG,02d,2012-07-05,0


In [86]:
# merge the propagated data with the original data
df_final = pd.concat([df_SOD_labeled[['id', 'file', 'label']], df_new[['id', 'file', 'label']]])
print(df_final.shape)
display(df_final.head())

(14648, 3)


Unnamed: 0,id,file,label
0,0,00000122.08.JPG,0
1,0,00000129.14.JPG,0
2,0,00000213.21.JPG,1
3,0,00000219.07.JPG,1
4,0,00000222.08.JPG,2


In [87]:
# drop duplicate rows 
df_final.drop_duplicates(inplace=True)
print(df_final.shape)

(11829, 3)


In [88]:
# create image path column before saving as csv
df_final['path'] = '/da1_data/icputrd/arf/mean.js/public/anau_img3/' + df_final['id'] + '/' + df_final['file']
print(df_final.shape)
display(df_final.head())

(11829, 4)


Unnamed: 0,id,file,label,path
0,0,00000122.08.JPG,0,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000122.08.JPG
1,0,00000129.14.JPG,0,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000129.14.JPG
2,0,00000213.21.JPG,1,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000213.21.JPG
3,0,00000219.07.JPG,1,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000219.07.JPG
4,0,00000222.08.JPG,2,/da1_data/icputrd/arf/mean.js/public/anau_img3/000/00000222.08.JPG


In [94]:
df_final[df_final.file.duplicated()].head(50)

Unnamed: 0,id,file,label,path
1050,02d,02d00705.41.JPG,1,/da1_data/icputrd/arf/mean.js/public/anau_img3/02d/02d00705.41.JPG
1051,02d,02d00705.48.JPG,1,/da1_data/icputrd/arf/mean.js/public/anau_img3/02d/02d00705.48.JPG
1052,02d,02d00705.49.JPG,1,/da1_data/icputrd/arf/mean.js/public/anau_img3/02d/02d00705.49.JPG
1053,02d,02d00706.46.JPG,1,/da1_data/icputrd/arf/mean.js/public/anau_img3/02d/02d00706.46.JPG
1054,02d,02d00706.52.JPG,1,/da1_data/icputrd/arf/mean.js/public/anau_img3/02d/02d00706.52.JPG
1080,059,05910201.34.JPG,3,/da1_data/icputrd/arf/mean.js/public/anau_img3/059/05910201.34.JPG
1209,0a8,0a801127.45.JPG,3,/da1_data/icputrd/arf/mean.js/public/anau_img3/0a8/0a801127.45.JPG
1210,0a8,0a801128.31.JPG,3,/da1_data/icputrd/arf/mean.js/public/anau_img3/0a8/0a801128.31.JPG
1211,0a8,0a801128.32.JPG,3,/da1_data/icputrd/arf/mean.js/public/anau_img3/0a8/0a801128.32.JPG
1212,0a8,0a801129.46.JPG,3,/da1_data/icputrd/arf/mean.js/public/anau_img3/0a8/0a801129.46.JPG


In [89]:
# save image path and label as csv ready for 03_train_val_test_split.py
df_final[['path', 'label']].to_csv('../data/propagated_2_2/stages.csv.20221114_correct.processed.propagated_2_2', header=None,
                                 index=False)