In [1]:
import pandas as pd
from datetime import timedelta

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', 700)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

# Import & process SOD-labeled data
This data will be used to label the following unlabeled data:
1. Pre-labeled as head
2. Predicted as head (will require evaluation to ensure data only consists of heads)

In [2]:
df_SOD_labeled = pd.read_csv('../data/4_classes/stages.csv.20221114_correct.4_classes', header=None, delimiter='/',
                    usecols=[8])
display(df_SOD_labeled.head())
print(df_SOD_labeled.shape)

Unnamed: 0,8
0,"00000122.08.JPG,0"
1,"00000129.14.JPG,0"
2,"00000213.21.JPG,1"
3,"00000219.07.JPG,1"
4,"00000222.08.JPG,2"


(4731, 1)


In [3]:
# remove any spaces 
df_SOD_labeled[8] = df_SOD_labeled[8].str.replace(' ','')

In [4]:
# split into file and label columns
df_SOD_labeled[['file','label']] = df_SOD_labeled[8].str.split(',', expand=True)
display(df_SOD_labeled.head())

Unnamed: 0,8,file,label
0,"00000122.08.JPG,0",00000122.08.JPG,0
1,"00000129.14.JPG,0",00000129.14.JPG,0
2,"00000213.21.JPG,1",00000213.21.JPG,1
3,"00000219.07.JPG,1",00000219.07.JPG,1
4,"00000222.08.JPG,2",00000222.08.JPG,2


In [5]:
# remove duplicates
print(df_SOD_labeled.shape)
df_SOD_labeled.drop_duplicates(subset='file', keep="first", inplace=True)
print(df_SOD_labeled.shape)

(4731, 3)
(4706, 3)


In [6]:
# split file column into multiple columns
df_SOD_labeled[['id_date', 'count','ext']] = df_SOD_labeled['file'].str.split('.', expand=True)
display(df_SOD_labeled.head())

Unnamed: 0,8,file,label,id_date,count,ext
0,"00000122.08.JPG,0",00000122.08.JPG,0,122,8,JPG
1,"00000129.14.JPG,0",00000129.14.JPG,0,129,14,JPG
2,"00000213.21.JPG,1",00000213.21.JPG,1,213,21,JPG
3,"00000219.07.JPG,1",00000219.07.JPG,1,219,7,JPG
4,"00000222.08.JPG,2",00000222.08.JPG,2,222,8,JPG


In [7]:
# check all values in id_date column are 8 characters long
df_SOD_labeled["id_date"].str.len().unique()

array([8])

In [8]:
# split id_date column into multiple columns
df_SOD_labeled['id'] = df_SOD_labeled['id_date'].str[:3]
df_SOD_labeled['yrs_in_fac'] = df_SOD_labeled['id_date'].str[3]
df_SOD_labeled['month_day'] = df_SOD_labeled['id_date'].str[4:]
df_SOD_labeled.drop([8,'ext'], axis=1, inplace=True)
display(df_SOD_labeled.head())
print(df_SOD_labeled.shape)

Unnamed: 0,file,label,id_date,count,id,yrs_in_fac,month_day
0,00000122.08.JPG,0,122,8,0,0,122
1,00000129.14.JPG,0,129,14,0,0,129
2,00000213.21.JPG,1,213,21,0,0,213
3,00000219.07.JPG,1,219,7,0,0,219
4,00000222.08.JPG,2,222,8,0,0,222


(4706, 7)


In [9]:
# get unique values of yrs_in_fac column
df_SOD_labeled.yrs_in_fac.unique()

array(['0', '1', '2'], dtype=object)

In [10]:
df_SOD_labeled[df_SOD_labeled.yrs_in_fac == '2']

Unnamed: 0,file,label,id_date,count,id,yrs_in_fac,month_day
299,07320814.08.JPG,3,07320814,8,073,2,814
2119,84a20422.04.JPG,3,84a20422,4,84a,2,422
4413,ff120811.01.JPG,3,ff120811,1,ff1,2,811
4414,ff121013.05.JPG,3,ff121013,5,ff1,2,1013


In [11]:
# create 'year' column from 'yrs_in_fac' column and then a 'date' column
def assign_year(row):
    if row['yrs_in_fac'] == '0':
        val = '11'
    elif row['yrs_in_fac']  == '1':
        val = '12'
    elif row['yrs_in_fac']  == '2':
        val = '13'
    return val

df_SOD_labeled['year'] = df_SOD_labeled.apply(assign_year, axis=1)  # create year column 
df_SOD_labeled['date'] = df_SOD_labeled['year']+df_SOD_labeled['month_day']  # create date column

# convert 'date' to dtype datetime
df_SOD_labeled['date'] = pd.to_datetime(df_SOD_labeled['date'], format='%y%m%d', errors='coerce')
display(df_SOD_labeled[df_SOD_labeled['date'].isnull()]) # lear year dates may become NaT
display(df_SOD_labeled.head())
print(df_SOD_labeled.shape)

# remove rows where date column is NaT
df_SOD_labeled = df_SOD_labeled[df_SOD_labeled.date.notnull()].copy()
print(df_SOD_labeled.shape)

Unnamed: 0,file,label,id_date,count,id,yrs_in_fac,month_day,year,date
1444,3ea00229.28.JPG,1,3ea00229,28,3ea,0,229,11,NaT
2984,bf200229.11.JPG,0,bf200229,11,bf2,0,229,11,NaT


Unnamed: 0,file,label,id_date,count,id,yrs_in_fac,month_day,year,date
0,00000122.08.JPG,0,122,8,0,0,122,11,2011-01-22
1,00000129.14.JPG,0,129,14,0,0,129,11,2011-01-29
2,00000213.21.JPG,1,213,21,0,0,213,11,2011-02-13
3,00000219.07.JPG,1,219,7,0,0,219,11,2011-02-19
4,00000222.08.JPG,2,222,8,0,0,222,11,2011-02-22


(4706, 9)
(4704, 9)


In [12]:
df_SOD_labeled2 = df_SOD_labeled[['file','label', 'id','date']].copy()
display(df_SOD_labeled2.head())
print(df_SOD_labeled2.shape)

Unnamed: 0,file,label,id,date
0,00000122.08.JPG,0,0,2011-01-22
1,00000129.14.JPG,0,0,2011-01-29
2,00000213.21.JPG,1,0,2011-02-13
3,00000219.07.JPG,1,0,2011-02-19
4,00000222.08.JPG,2,0,2011-02-22


(4704, 4)


# Import & process pre-labeled head data
Need SOD label

In [13]:
# import images that were predicted to be heads
'''
df_head_labeled = pd.read_csv('/da1_data/icputrd/decaying_human_body_part_classifier/from_anau/ex1_preds_labeled_head', 
                                header=None, delimiter=':', 
                                   usecols=[0,2])
'''

# import head images (these are from the clusters collection in MongoDB)
df_head_labeled = pd.read_csv('../data/clusters.csv.head', 
                                header=None, delimiter='/')

display(df_head_labeled.head())

Unnamed: 0,0,1,2,3
0,,sara_img,00b,"00b00323.18.icon.JPG,head"
1,,sara_img,00b,"00b00324.19.icon.JPG,head"
2,,sara_img,00b,"00b00326.19.icon.JPG,head"
3,,sara_img,00b,"00b00327.19.icon.JPG,head"
4,,sara_img,00b,"00b00329.23.icon.JPG,head"


In [14]:
# remove any spaces 
df_head_labeled[3] = df_head_labeled[3].str.replace(' ','')

In [15]:
# remove 'icon'
df_head_labeled[3] = df_head_labeled[3].str.replace('icon.','')
display(df_head_labeled.head())

Unnamed: 0,0,1,2,3
0,,sara_img,00b,"00b00323.18.JPG,head"
1,,sara_img,00b,"00b00324.19.JPG,head"
2,,sara_img,00b,"00b00326.19.JPG,head"
3,,sara_img,00b,"00b00327.19.JPG,head"
4,,sara_img,00b,"00b00329.23.JPG,head"


In [16]:
# split into file and label columns
df_head_labeled[['file','label']] = df_head_labeled[3].str.split(',', expand=True)
display(df_head_labeled.head())

Unnamed: 0,0,1,2,3,file,label
0,,sara_img,00b,"00b00323.18.JPG,head",00b00323.18.JPG,head
1,,sara_img,00b,"00b00324.19.JPG,head",00b00324.19.JPG,head
2,,sara_img,00b,"00b00326.19.JPG,head",00b00326.19.JPG,head
3,,sara_img,00b,"00b00327.19.JPG,head",00b00327.19.JPG,head
4,,sara_img,00b,"00b00329.23.JPG,head",00b00329.23.JPG,head


In [17]:
# remove duplicates
print(df_head_labeled.shape)
df_head_labeled.drop_duplicates(subset='file', keep="first", inplace=True)
print(df_head_labeled.shape)

(13266, 6)
(13266, 6)


In [18]:
# split file column into multiple columns
df_head_labeled[['id_date', 'count','ext']] = df_head_labeled['file'].str.split('.', expand=True)
display(df_head_labeled.head())

Unnamed: 0,0,1,2,3,file,label,id_date,count,ext
0,,sara_img,00b,"00b00323.18.JPG,head",00b00323.18.JPG,head,00b00323,18,JPG
1,,sara_img,00b,"00b00324.19.JPG,head",00b00324.19.JPG,head,00b00324,19,JPG
2,,sara_img,00b,"00b00326.19.JPG,head",00b00326.19.JPG,head,00b00326,19,JPG
3,,sara_img,00b,"00b00327.19.JPG,head",00b00327.19.JPG,head,00b00327,19,JPG
4,,sara_img,00b,"00b00329.23.JPG,head",00b00329.23.JPG,head,00b00329,23,JPG


In [19]:
# check all values in id_date column are 8 characters long
df_head_labeled["id_date"].str.len().unique()

array([8])

In [20]:
# split id_date column into multiple columns
df_head_labeled['id'] = df_head_labeled['id_date'].str[:3]
df_head_labeled['yrs_in_fac'] = df_head_labeled['id_date'].str[3]
df_head_labeled['month_day'] = df_head_labeled['id_date'].str[4:]
display(df_head_labeled.head())
print(df_head_labeled.shape)

Unnamed: 0,0,1,2,3,file,label,id_date,count,ext,id,yrs_in_fac,month_day
0,,sara_img,00b,"00b00323.18.JPG,head",00b00323.18.JPG,head,00b00323,18,JPG,00b,0,323
1,,sara_img,00b,"00b00324.19.JPG,head",00b00324.19.JPG,head,00b00324,19,JPG,00b,0,324
2,,sara_img,00b,"00b00326.19.JPG,head",00b00326.19.JPG,head,00b00326,19,JPG,00b,0,326
3,,sara_img,00b,"00b00327.19.JPG,head",00b00327.19.JPG,head,00b00327,19,JPG,00b,0,327
4,,sara_img,00b,"00b00329.23.JPG,head",00b00329.23.JPG,head,00b00329,23,JPG,00b,0,329


(13266, 12)


In [21]:
# get unique values of yrs_in_fac column
df_head_labeled.yrs_in_fac.unique()

array(['0', '1'], dtype=object)

In [22]:
df_head_labeled['year'] = df_head_labeled.apply(assign_year, axis=1)  # create year column 
df_head_labeled['date'] = df_head_labeled['year']+df_head_labeled['month_day']  # create date column

# convert 'date' to dtype datetime
df_head_labeled['date'] = pd.to_datetime(df_head_labeled['date'], format='%y%m%d', errors='coerce')
display(df_head_labeled[df_head_labeled['date'].isnull()].shape) # lear year dates may become NaT
display(df_head_labeled.head())
print(df_head_labeled.shape)

# remove rows where date column is NaT
df_head_labeled = df_head_labeled[df_head_labeled.date.notnull()].copy()
print(df_head_labeled.shape)

(10, 14)

Unnamed: 0,0,1,2,3,file,label,id_date,count,ext,id,yrs_in_fac,month_day,year,date
0,,sara_img,00b,"00b00323.18.JPG,head",00b00323.18.JPG,head,00b00323,18,JPG,00b,0,323,11,2011-03-23
1,,sara_img,00b,"00b00324.19.JPG,head",00b00324.19.JPG,head,00b00324,19,JPG,00b,0,324,11,2011-03-24
2,,sara_img,00b,"00b00326.19.JPG,head",00b00326.19.JPG,head,00b00326,19,JPG,00b,0,326,11,2011-03-26
3,,sara_img,00b,"00b00327.19.JPG,head",00b00327.19.JPG,head,00b00327,19,JPG,00b,0,327,11,2011-03-27
4,,sara_img,00b,"00b00329.23.JPG,head",00b00329.23.JPG,head,00b00329,23,JPG,00b,0,329,11,2011-03-29


(13266, 14)
(13256, 14)


In [23]:
df_head_labeled2 = df_head_labeled[['file', 'id', 'date']].copy()
display(df_head_labeled2.head())
print(df_head_labeled2.shape)

Unnamed: 0,file,id,date
0,00b00323.18.JPG,00b,2011-03-23
1,00b00324.19.JPG,00b,2011-03-24
2,00b00326.19.JPG,00b,2011-03-26
3,00b00327.19.JPG,00b,2011-03-27
4,00b00329.23.JPG,00b,2011-03-29


(13256, 3)


In [24]:
def label_propagation(minus_days, plus_plays):
    # create an Empty DataFrame object
    df_new = pd.DataFrame()
    # for each img 
    for row in df_SOD_labeled2.itertuples():
        # create date range
        date_minus = row[4] - timedelta(days=minus_days)
        date_plus = row[4] + timedelta(days=plus_plays)
        #print(date_minus, row[4], date_plus)
        # get all img between date range for current donor and append to new df
        #print(row[3])
        df = df_head_labeled2[df_head_labeled2['id'] == row[3]]
        if df.shape[0] == 0:
            continue
        #print(row)
        #print(date_minus, row[4], date_plus)
        #display(df)
        df_between = df.loc[df['date'].between(date_minus, date_plus)]
        if df_between.shape[0] != 0:
            #print(row)
            df_between.loc[:,'label'] = row[2] # propagate the label
            #display(df_between.head())
            # remove current img to avoid duplicates
            #print(df_between.shape)
            df_between = df_between[df_between.file != row[1]]
            #print(df_between.shape)
            df_new = pd.concat([df_new, df_between])
    
    return df_new

In [25]:
df_new = label_propagation(1,1)
print(df_new.shape)
display(df_new.head())

(6042, 4)


Unnamed: 0,file,id,date,label
24,02d00706.52.JPG,02d,2011-07-06,0
26,02d00705.49.JPG,02d,2011-07-05,0
27,02d00705.48.JPG,02d,2011-07-05,0
28,02d00706.46.JPG,02d,2011-07-06,0
29,02d00705.41.JPG,02d,2011-07-05,0


In [26]:
display(df_new.head(100))

Unnamed: 0,file,id,date,label
24,02d00706.52.JPG,02d,2011-07-06,0
26,02d00705.49.JPG,02d,2011-07-05,0
27,02d00705.48.JPG,02d,2011-07-05,0
28,02d00706.46.JPG,02d,2011-07-06,0
29,02d00705.41.JPG,02d,2011-07-05,0
24,02d00706.52.JPG,02d,2011-07-06,1
28,02d00706.46.JPG,02d,2011-07-06,1
37,02d00720.41.JPG,02d,2011-07-20,3
38,04000708.17.JPG,040,2011-07-08,0
39,04000709.04.JPG,040,2011-07-09,0


In [27]:
# merge the propagated data with the original data
df_final = pd.concat([df_SOD_labeled[['id', 'file', 'label']], df_new[['id', 'file', 'label']]])
print(df_final.shape)
display(df_final.head())

(10746, 3)


Unnamed: 0,id,file,label
0,0,00000122.08.JPG,0
1,0,00000129.14.JPG,0
2,0,00000213.21.JPG,1
3,0,00000219.07.JPG,1
4,0,00000222.08.JPG,2


In [28]:
# drop duplicate rows 
df_final.drop_duplicates(inplace=True)
print(df_final.shape)

(9666, 3)


In [29]:
# create image path column before saving as csv
df_final['path'] = '/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/' + df_final['id'] + '/' + df_final['file']
print(df_final.shape)
display(df_final.head())

(9666, 4)


Unnamed: 0,id,file,label,path
0,0,00000122.08.JPG,0,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/000/00000122.08.JPG
1,0,00000129.14.JPG,0,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/000/00000129.14.JPG
2,0,00000213.21.JPG,1,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/000/00000213.21.JPG
3,0,00000219.07.JPG,1,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/000/00000219.07.JPG
4,0,00000222.08.JPG,2,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/000/00000222.08.JPG


In [30]:
# save image path and label as csv ready for 03_train_val_test_split.py
df_final[['path', 'label']].to_csv('../data/4_classes/propagated_1_1/stages.csv.20221114_correct.4_classes.prop_1_1', header=None,
                                 index=False)

In [31]:
df_final[df_final.id == '000']

Unnamed: 0,id,file,label,path
0,0,00000122.08.JPG,0,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/000/00000122.08.JPG
1,0,00000129.14.JPG,0,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/000/00000129.14.JPG
2,0,00000213.21.JPG,1,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/000/00000213.21.JPG
3,0,00000219.07.JPG,1,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/000/00000219.07.JPG
4,0,00000222.08.JPG,2,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/000/00000222.08.JPG
5,0,00000223.07.JPG,2,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/000/00000223.07.JPG


In [32]:
df_final.sample(4)

Unnamed: 0,id,file,label,path
5472,65b,65b01003.37.JPG,0,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/65b/65b01003.37.JPG
1919,710,71010217.35.JPG,0,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/710/71010217.35.JPG
3397,dea,dea00416.34.JPG,1,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/dea/dea00416.34.JPG
4630,4e4,4e410216.25.JPG,1,/da1_data/icputrd/arf/mean.js/public/anau_img3_no_stakes/4e4/4e410216.25.JPG
