# Dependencies 

In [15]:
from sys import path
path.append('../../src/')

import pandas as pd
from itertools import combinations
from glob import glob
from os.path import basename, splitext
from pathlib import Path
from shutil import copy2
from sklearn.utils.extmath import weighted_mode
from data_structures import Eye
from iso_standard import PhotographicRequirements
from mrk_file import MRKFile

# Widen the notebook's DataFrame rendering: show up to 100 columns and up to
# 200 characters per cell before pandas truncates the output.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 200)

In [34]:
# Destination folder for the image files copied in the "Copy Images" section.
FOLDER_IMAGES = '../../data/pybossa/images/'
# Root folder for the generated ground truth; each run writes its .mrk files
# into a sub-folder named after the user ids it combines.
FOLDER_GROUND_TRUTH = '../../data/pybossa/ground_truth/'

# Expected number of distinct images in the votes CSV (asserted before copying).
N_IMAGES = 5763

In [3]:
# Names of the requirement columns of the votes CSV.
# The order is significant: `dataframe_to_mrk` unpacks these columns
# positionally into `PhotographicRequirements`.
REQ_COLS = [
    'blurred', 'l_away', 'ink_mark', 'skin_tone', 'light', 'washed_out',
    'pixelation', 'hair_eyes', 'eyes_closed', 'background', 'rotation',
    'reflection', 'red_eyes', 'sh_head', 'sh_face', 'dark_glasses',
    'flash_lenses', 'frames_heavy', 'frame_eyes', 'hat', 'veil', 'mouth',
    'close',
]

# Helper Functions

In [4]:
def voting_func(col, weights):
    """Return the weighted mode of ``col``.

    Parameters
    ----------
    col : array-like
        The votes to aggregate.
    weights : array-like
        The weight given to each vote in ``col``.

    Returns
    -------
    The first element of the ``(mode, score)`` pair returned by
    ``sklearn.utils.extmath.weighted_mode``; the score is discarded.
    """
    mode, _score = weighted_mode(col, weights)
    return mode

def weighted_vote_by_requirement(df_votes):
    """Reduce a set of votes to one weighted vote per requirement column.

    ``df_votes`` must contain every column listed in ``REQ_COLS`` plus a
    ``weight`` column. Each requirement column is aggregated independently
    with ``voting_func``, using the rows' ``weight`` values as vote weights.
    """
    requirement_votes = df_votes[REQ_COLS]
    vote_weights = df_votes.weight
    return requirement_votes.apply(voting_func, axis=0, weights=vote_weights)

def majority_voting_by_image(df, list_users):
    """Build the requirement labels of every image from the selected users' votes.

    Parameters
    ----------
    df : DataFrame
        Votes with at least ``user_id``, ``img_path`` and the ``REQ_COLS``
        columns. When more than one user is selected, a ``weight`` column is
        also required (it is read by ``weighted_vote_by_requirement``).
    list_users : sequence
        Ids of the users whose votes are taken into account.

    Returns
    -------
    DataFrame
        ``img_path`` plus the ``REQ_COLS`` columns cast to ``int``.
    """
    votes = df[df.user_id.isin(list_users)].copy()

    if len(list_users) <= 1:
        # At most one annotator: nothing to aggregate, keep the votes as given.
        votes = votes[['img_path'] + REQ_COLS]
    else:
        # Several annotators: weighted vote per image, then turn the
        # `img_path` index level back into a regular column.
        votes = (
            votes
            .groupby('img_path')
            .apply(weighted_vote_by_requirement)
            .reset_index(level='img_path')
        )

    votes[REQ_COLS] = votes[REQ_COLS].astype(int)
    return votes

In [5]:
def dataframe_to_mrk(df, output_folder):
    """Save one ``.mrk`` file per row of ``df`` under ``output_folder``.

    Every row must provide ``img_path`` and the ``REQ_COLS`` values. The
    requirement values are passed positionally (in ``REQ_COLS`` order) to
    ``PhotographicRequirements``, and the same ``Eye(1, 1, 1, 1)`` instance is
    used for both eye arguments of ``MRKFile``. Each file is named after the
    base name of ``img_path`` with its extension replaced by ``.mrk``.
    """
    eye = Eye(1, 1, 1, 1)

    for _, row in df.iterrows():
        # File name of the image without directory and extension.
        img_path, _ext = splitext(basename(row.img_path))

        requirements = PhotographicRequirements(*row[REQ_COLS])
        mrk_file = MRKFile(eye, eye, requirements)
        mrk_file.save(f'{output_folder}/{img_path}.mrk')

# Ground Truth Creation 

In [6]:
# Load the annotators' votes and map the value -1 to 0 in every requirement
# column; the other columns are left untouched.
df = pd.read_csv('../../data/pybossa/volunt_and_experts_r1_extra_corr.csv')
df[REQ_COLS] = df[REQ_COLS].replace(-1, 0)

print(df.shape)
df.head()

(28815, 36)


Unnamed: 0,img_id,img_origin,img_path,user_id,task_id,created,finish_time,user_ip,link,timeout,project_id,id,answers,blurred,l_away,ink_mark,skin_tone,light,washed_out,pixelation,hair_eyes,eyes_closed,background,rotation,reflection,red_eyes,sh_head,sh_face,dark_glasses,flash_lenses,frames_heavy,frame_eyes,hat,veil,mouth,close
0,0,train,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/train/AR_FDB_m-001-10.png,1,6186,2019-11-08T12:18:51.372001,2019-11-08 12:34:13.807661,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/21973'/>,,5,21973,,1,0,1,0,1,1,1,1,1,1,1,1,0,1,0,0,0,1,1,1,1,1,0
1,0,train,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/train/AR_FDB_m-001-10.png,2,6186,2019-11-10T22:02:04.816771,2019-11-10 22:05:42.163341,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/22010'/>,,5,22010,,1,0,1,0,1,1,1,0,0,1,1,0,0,1,1,0,0,1,0,1,1,1,1
2,0,train,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/train/AR_FDB_m-001-10.png,3,381,2019-09-26T17:15:36.049989,2019-09-26 17:45:12.705487,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/348'/>,,4,348,,1,0,1,0,1,1,1,1,0,1,1,0,0,1,1,0,0,1,1,1,1,1,1
3,0,train,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/train/AR_FDB_m-001-10.png,5,381,2019-10-02T00:27:36.373347,2019-10-02 00:30:46.428682,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/2457'/>,,4,2457,,1,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,1,1,1,1
4,0,train,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/train/AR_FDB_m-001-10.png,6,381,2019-09-30T12:18:22.166826,2019-09-30 12:39:22.014212,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/1595'/>,,4,1595,,1,0,1,1,1,1,1,1,0,1,1,0,0,1,1,0,0,1,0,1,1,1,1


### Equal-weight ground truth

In [7]:
# Every annotator's vote counts the same in this section.
df['weight'] = 1

list_users = [1, 2, 3, 5, 6]

# One ground truth per single annotator and per group of 3 and of 5
# annotators (presumably odd sizes are chosen so a 0/1 vote cannot tie).
comb_1 = list(combinations(list_users, 1))
comb_3 = list(combinations(list_users, 3))
comb_5 = list(combinations(list_users, 5))

list_combinations = comb_1 + comb_3 + comb_5

for comb_users in list_combinations:
    # Output folder is named after the combined user ids, e.g. ".../1_2_3".
    gt_folder = FOLDER_GROUND_TRUTH + '_'.join(map(str, comb_users))
    # parents=True also creates FOLDER_GROUND_TRUTH itself when it is missing,
    # instead of failing with FileNotFoundError.
    Path(gt_folder).mkdir(parents=True, exist_ok=True)
    print(gt_folder)

    df_gt = majority_voting_by_image(df, comb_users)
    dataframe_to_mrk(df_gt, gt_folder)

../data/pybossa/ground_truth/1
../data/pybossa/ground_truth/2
../data/pybossa/ground_truth/3
../data/pybossa/ground_truth/5
../data/pybossa/ground_truth/6
../data/pybossa/ground_truth/1_2_3
../data/pybossa/ground_truth/1_2_5
../data/pybossa/ground_truth/1_2_6
../data/pybossa/ground_truth/1_3_5
../data/pybossa/ground_truth/1_3_6
../data/pybossa/ground_truth/1_5_6
../data/pybossa/ground_truth/2_3_5
../data/pybossa/ground_truth/2_3_6
../data/pybossa/ground_truth/2_5_6
../data/pybossa/ground_truth/3_5_6
../data/pybossa/ground_truth/1_2_3_5_6


### Higher weight for experts

In [8]:
# Votes from users 1 and 2 (the experts of this section) weigh 2, every other
# vote weighs 1. The column is reset first so this cell gives the same result
# whether or not an earlier cell already created or modified `weight`.
df['weight'] = 1
df.loc[df.user_id.isin([1, 2]), 'weight'] = 2
df.head()

Unnamed: 0,img_id,img_origin,img_path,user_id,task_id,created,finish_time,user_ip,link,timeout,project_id,id,answers,blurred,l_away,ink_mark,skin_tone,light,washed_out,pixelation,hair_eyes,eyes_closed,background,rotation,reflection,red_eyes,sh_head,sh_face,dark_glasses,flash_lenses,frames_heavy,frame_eyes,hat,veil,mouth,close,weight
0,0,train,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/train/AR_FDB_m-001-10.png,1,6186,2019-11-08T12:18:51.372001,2019-11-08 12:34:13.807661,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/21973'/>,,5,21973,,1,0,1,0,1,1,1,1,1,1,1,1,0,1,0,0,0,1,1,1,1,1,0,2
1,0,train,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/train/AR_FDB_m-001-10.png,2,6186,2019-11-10T22:02:04.816771,2019-11-10 22:05:42.163341,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/22010'/>,,5,22010,,1,0,1,0,1,1,1,0,0,1,1,0,0,1,1,0,0,1,0,1,1,1,1,2
2,0,train,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/train/AR_FDB_m-001-10.png,3,381,2019-09-26T17:15:36.049989,2019-09-26 17:45:12.705487,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/348'/>,,4,348,,1,0,1,0,1,1,1,1,0,1,1,0,0,1,1,0,0,1,1,1,1,1,1,1
3,0,train,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/train/AR_FDB_m-001-10.png,5,381,2019-10-02T00:27:36.373347,2019-10-02 00:30:46.428682,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/2457'/>,,4,2457,,1,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,1,1,1,1,1
4,0,train,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/train/AR_FDB_m-001-10.png,6,381,2019-09-30T12:18:22.166826,2019-09-30 12:39:22.014212,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/1595'/>,,4,1595,,1,0,1,1,1,1,1,1,0,1,1,0,0,1,1,0,0,1,0,1,1,1,1,1


In [9]:
list_users = [1, 2, 3, 5, 6]

# The '_weight_2' suffix keeps this output apart from the folder that is named
# after the user ids alone.
gt_folder = FOLDER_GROUND_TRUTH + '_'.join(map(str, list_users)) + '_weight_2'
# parents=True also creates FOLDER_GROUND_TRUTH itself when it is missing,
# instead of failing with FileNotFoundError.
Path(gt_folder).mkdir(parents=True, exist_ok=True)
print(gt_folder)

# Uses whatever is currently stored in df['weight'] as the vote weights.
df_gt = majority_voting_by_image(df, list_users)
dataframe_to_mrk(df_gt, gt_folder)

../data/pybossa/ground_truth/1_2_3_5_6_weight_2


# Copy Images 

In [25]:
# Preview the first rows whose `img_origin` contains 'val'.
df.loc[df['img_origin'].str.contains('val')].head()

Unnamed: 0,img_id,img_origin,img_path,user_id,task_id,created,finish_time,user_ip,link,timeout,project_id,id,answers,blurred,l_away,ink_mark,skin_tone,light,washed_out,pixelation,hair_eyes,eyes_closed,background,rotation,reflection,red_eyes,sh_head,sh_face,dark_glasses,flash_lenses,frames_heavy,frame_eyes,hat,veil,mouth,close,weight
26055,5211,validation,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/validation/AR_m-002-14_C40.png,1,11397,2020-02-01T17:55:14.494797,2020-02-01 17:56:40.923293,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/33115'/>,,5,33115,,0,1,1,0,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2
26056,5211,validation,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/validation/AR_m-002-14_C40.png,2,11397,2020-01-05T12:51:23.454905,2020-01-05 13:01:42.597451,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/30749'/>,,5,30749,,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2
26057,5211,validation,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/validation/AR_m-002-14_C40.png,3,5594,2019-10-19T18:25:56.903633,2019-10-19 18:28:46.970622,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/13494'/>,,4,13494,,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1
26058,5211,validation,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/validation/AR_m-002-14_C40.png,5,5594,2019-11-01T03:26:17.421801,2019-11-01 03:26:58.581959,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/21415'/>,,4,21415,,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1
26059,5211,validation,http://pesquisa.eastus.cloudapp.azure.com:8081/icao_dataset/validation/AR_m-002-14_C40.png,6,5594,2019-10-26T18:08:45.786672,2019-10-26 18:09:44.752367,,<link rel='self' title='taskrun' href='http://pesquisa2.eastus.cloudapp.azure.com/api/taskrun/18519'/>,,4,18519,,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [40]:
def get_local_image_path(row, relative_path='../../data/'):
    """Return the local path of the image referenced by a vote row.

    The file name is the last component of ``row.img_path``. Rows whose
    ``img_origin`` equals ``'train'`` resolve to ``train/images`` below
    ``relative_path``; every other origin resolves to ``val/FVC/images``.
    The result is a POSIX-style string.
    """
    if row.img_origin == 'train':
        folder = 'train'
    else:
        folder = 'val/FVC'

    img_path = basename(row.img_path)
    return Path(f'{relative_path}{folder}/images/{img_path}').as_posix()

# Local source path of every distinct image referenced by the votes.
list_images = (df
    .drop_duplicates('img_path')
    .apply(get_local_image_path, axis=1)
    .values
)
assert len(list_images) == N_IMAGES, (
    f'expected {N_IMAGES} distinct images, found {len(list_images)}'
)

# copy2 does not create missing directories, so make sure the target exists.
Path(FOLDER_IMAGES).mkdir(parents=True, exist_ok=True)

for src_path in list_images:
    # Images are copied flat into FOLDER_IMAGES, keeping only their file name.
    dst_path = Path(FOLDER_IMAGES, basename(src_path)).as_posix()
    copy2(src_path, dst_path)