In [1]:
import os
import re
import itertools
import pandas as pd
from sklearn.metrics import cohen_kappa_score as cks
from tabulate import tabulate

### Building the dataframe with the results

In [2]:
datapath = '../data/img_labeling_2nd_round/'

In [3]:
label_files = sorted(os.listdir(datapath))
label_files = [f for f in label_files if 'task_labeling' in f]
for f in label_files:
    print(f)

1st_task_labeling_Amelie_2020-11-04-07-41-41.csv
1st_task_labeling_Gerda_2020-10-23-12-56-45.csv
1st_task_labeling_Marcos_2020-11-09-04-46-17.csv
1st_task_labeling_Renato_2020-11-06.csv
1st_task_labeling_Yalemisew_2020-11-02-01-40-27.csv
2nd_task_labeling_Amelie_2020-11-04-09-14-46.csv
2nd_task_labeling_Gerda_2020-10-29-10-18-19.csv
2nd_task_labeling_Renato_2020-11-09-10-31-33.csv
2nd_task_labeling_Yalemisew_2020-11-02-01-40-27.csv
3rd_task_labeling_Amelie_2020-11-04-08-14-15.csv
3rd_task_labeling_Gerda_2020-10-29-10-37-58.csv
3rd_task_labeling_Renato_2020-11-09-11-51-56.csv
3rd_task_labeling_Yalemisew_2020-11-02-01-40-27.csv


In [4]:
list_dfs = []
for label_file in label_files:
    task = int(re.findall(r'[1-3]{1}', label_file)[0])
    user = re.findall(r'Amelie|Gerda|Renato|Yalemisew|Marcos', label_file)[0]
    date = re.findall(r'[0-9]{2}.[0-9]{2}', label_file)[0]
    date = f'{date}.2020'
    
    df = pd.read_csv(f'{datapath}{label_file}', names=['image name', 'class','w','h'])
    df['task'] = task
    df['user'] = user
    df['date'] = date
    df['date'] = pd.to_datetime(df['date'])
    df.drop(['w','h'], axis=1, inplace=True)
    df.drop_duplicates(inplace=True)
    df = df[['task', 'user', 'date', 'image name', 'class']]
    print(f'Task: {task}, User: {user}, Images Classified: {len(df)}')
    list_dfs.append(df)

Task: 1, User: Amelie, Images Classified: 392
Task: 1, User: Gerda, Images Classified: 392
Task: 1, User: Marcos, Images Classified: 392
Task: 1, User: Renato, Images Classified: 392
Task: 1, User: Yalemisew, Images Classified: 392
Task: 2, User: Amelie, Images Classified: 392
Task: 2, User: Gerda, Images Classified: 392
Task: 2, User: Renato, Images Classified: 392
Task: 2, User: Yalemisew, Images Classified: 364
Task: 3, User: Amelie, Images Classified: 392
Task: 3, User: Gerda, Images Classified: 466
Task: 3, User: Renato, Images Classified: 392
Task: 3, User: Yalemisew, Images Classified: 391


In [5]:
df_labeling = pd.concat(list_dfs)
#df_labeling['image name'].replace({'08806-~1.JPE': '08806-cgi_bin_gw_chameleon_lng_en_host_localhost_9901_DEFAULT_search_KEYWORD_function_CARDSCR_u1_12101_t1_004573881_1.jpeg', 
#                                   '204804~1.JPE': '2048047-Athena_Plus_ProvidedCHO_Institutul_Na_ional_al_Patrimoniului_1C82EC077B694E00B35B9B8044A16EAB_1.jpeg',
#                                   '204804~2.JPE':'2048047-Athena_Plus_ProvidedCHO_Institutul_Na_ional_al_Patrimoniului_7000A7800599458180CD0055DB99645F_1.jpeg',
#                                  }, inplace=True)

df_labeling['id_image'] = pd.Categorical(df_labeling['image name']).codes
df_labeling = df_labeling[['task', 'user', 'date', 'image name', 'id_image', 'class']]

In [6]:
df_labeling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5141 entries, 0 to 390
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   task        5141 non-null   int64         
 1   user        5141 non-null   object        
 2   date        5141 non-null   datetime64[ns]
 3   image name  5141 non-null   object        
 4   id_image    5141 non-null   int16         
 5   class       5141 non-null   object        
dtypes: datetime64[ns](1), int16(1), int64(1), object(3)
memory usage: 251.0+ KB


In [7]:
df_labeling.head()

Unnamed: 0,task,user,date,image name,id_image,class
0,1,Amelie,2020-11-20,203-resource_document_zuiderzeemuseum_B001601_...,99,[Non-Fruits]
1,1,Amelie,2020-11-20,285-gam19649_1.jpeg,282,[Fruits]
2,1,Amelie,2020-11-20,07101-O_389_1.jpeg,6,[Fruits]
3,1,Amelie,2020-11-20,07101-O_927_1.jpeg,12,[Fruits]
4,1,Amelie,2020-11-20,07101-O_957_1.jpeg,13,[Fruits]


#### Checking the numbers of images classified after fixing names:

In [8]:
for task in df_labeling.task.unique():
    for user in df_labeling.user.unique():
        size = len(df_labeling.loc[(df_labeling.user == user) & (df_labeling.task == task), 'id_image'].unique())
        print(f'Task: {task}, User: {user}, Images Classified: {size}')
    print()

Task: 1, User: Amelie, Images Classified: 392
Task: 1, User: Gerda, Images Classified: 392
Task: 1, User: Marcos, Images Classified: 392
Task: 1, User: Renato, Images Classified: 392
Task: 1, User: Yalemisew, Images Classified: 392

Task: 2, User: Amelie, Images Classified: 392
Task: 2, User: Gerda, Images Classified: 392
Task: 2, User: Marcos, Images Classified: 0
Task: 2, User: Renato, Images Classified: 392
Task: 2, User: Yalemisew, Images Classified: 364

Task: 3, User: Amelie, Images Classified: 392
Task: 3, User: Gerda, Images Classified: 442
Task: 3, User: Marcos, Images Classified: 0
Task: 3, User: Renato, Images Classified: 392
Task: 3, User: Yalemisew, Images Classified: 391



#### Checking which images were not classified:

In [9]:
images = set(df_labeling.id_image.unique())
dfs = []
for task in df_labeling.task.unique():
    for user in df_labeling.user.unique():
        id_list = []
        ids = images.difference(set(df_labeling.loc[(df_labeling.user == user) & (df_labeling.task == task), 'id_image']))
        if len(ids) > 0:
            #print(f'Missing files for task: {task}, by user: {user}')
            for _id in ids:
                try:
                    id_list.append(df_labeling.loc[(df_labeling.task == task) & (df_labeling.id_image == _id), 'image name'].values[0])
                    #print(df_labeling.loc[(df_labeling.task == task) & (df_labeling.id_image == _id), 'image name'].values[0])
                except:
                    pass
                    #print('--->', df_labeling.loc[(df_labeling.task == task) & (df_labeling.id_image == _id), 'image name'].values)
            #print()
        dfm = pd.DataFrame({'images':id_list})
        dfm['task'] = task
        dfm['user'] = user
        dfs.append(dfm)
df_missing = pd.concat(dfs)
df_missing.to_excel(f'{datapath}/missing.xlsx')
df_missing.head()       

Unnamed: 0,images,task,user
0,07101-O_2174_1.jpeg,2,Marcos
1,07101-O_2444_1.jpeg,2,Marcos
2,07101-O_2524_1.jpeg,2,Marcos
3,07101-O_2605_1.jpeg,2,Marcos
4,07101-O_2660_1.jpeg,2,Marcos


#### Checking duplicates

In [10]:
duplicates = df_labeling.loc[df_labeling.duplicated(['task', 'user', 'image name', 'id_image'], keep=False), ['task', 'user', 'image name', 'id_image', 'class']]
duplicates.sort_values(by=['user', 'image name', 'task'], inplace=True)
duplicates.to_excel(f'{datapath}/duplicates.xlsx')
duplicates.head(30)

Unnamed: 0,task,user,image name,id_image,class
99,3,Gerda,2048128-250999_1.jpeg,110,[Appealing]
432,3,Gerda,2048128-250999_1.jpeg,110,[Non-appealing]
102,3,Gerda,2048128-253059_1.jpeg,113,[Appealing]
435,3,Gerda,2048128-253059_1.jpeg,113,[Non-appealing]
106,3,Gerda,2048128-255215_1.jpeg,117,[Appealing]
439,3,Gerda,2048128-255215_1.jpeg,117,[Non-appealing]
109,3,Gerda,2048128-255721_1.jpeg,120,[Appealing]
442,3,Gerda,2048128-255721_1.jpeg,120,[Non-appealing]
119,3,Gerda,2048128-257379_1.jpeg,130,[Non-appealing]
452,3,Gerda,2048128-257379_1.jpeg,130,[Appealing]


### Updating the decisions on the Duplicates  

In [11]:
#decisions = pd.read_csv(f'{datapath}/duplicates_decision_GK_YA.csv')
#decisions.dropna(inplace=True)
#decisions.drop(labels=['decision'], inplace=True, axis=1)
#decisions.head()

In [12]:
#for idx in decisions.index:
#    task = decisions.loc[idx,'task'] 
#    user = decisions.loc[idx,'user']
#    id_image = decisions.loc[idx,'id_image']
#    decision = decisions.loc[idx,'class']
#    df_labeling.loc[(df_labeling.user == user) & (df_labeling.task == task) & (df_labeling.id_image == id_image), 'class'] = decision

### Analysing the [inter-annotator agreement](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html) on the results

In [13]:
l1 = df_labeling.user.unique()
iter_users = list(itertools.product(l1,l1))
df_iaa = pd.DataFrame(index=l1, columns=l1)

In [14]:
for task in df_labeling.task.unique():
    for user1,user2 in iter_users:
        classesA = df_labeling.loc[(df_labeling.user == user1) & (df_labeling.task == task),['id_image', 'class']]
        classesA.sort_values(by=['id_image'], inplace=True)

        classesB = df_labeling.loc[(df_labeling.user == user2) & (df_labeling.task == task),['id_image', 'class']]
        classesB.sort_values(by=['id_image'], inplace=True)
        
        classesAB = pd.merge(classesA, classesB, on=['id_image'])
        classesAB.drop_duplicates(subset='id_image', keep = 'first', inplace=True) 
        classesAB.drop('id_image', axis=1, inplace=True)
        classesAB.dropna(inplace=True)

        agreement = cks(classesAB['class_x'], classesAB['class_y'])
        df_iaa.loc[user1,user2] = f'{agreement:.3f}/({len(classesAB)})'  
        df_iaa.index.name = f'Task_{task}'
    print(tabulate(df_iaa, headers='keys', tablefmt='psql'))
    print()

+-----------+-------------+-------------+-------------+-------------+-------------+
| Task_1    | Amelie      | Gerda       | Marcos      | Renato      | Yalemisew   |
|-----------+-------------+-------------+-------------+-------------+-------------|
| Amelie    | 1.000/(392) | 0.928/(392) | 0.898/(392) | 0.908/(392) | 0.898/(392) |
| Gerda     | 0.928/(392) | 1.000/(392) | 0.882/(392) | 0.907/(392) | 0.861/(392) |
| Marcos    | 0.898/(392) | 0.882/(392) | 1.000/(392) | 0.913/(392) | 0.928/(392) |
| Renato    | 0.908/(392) | 0.907/(392) | 0.913/(392) | 1.000/(392) | 0.903/(392) |
| Yalemisew | 0.898/(392) | 0.861/(392) | 0.928/(392) | 0.903/(392) | 1.000/(392) |
+-----------+-------------+-------------+-------------+-------------+-------------+



  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


+-----------+-------------+-------------+----------+-------------+-------------+
| Task_2    | Amelie      | Gerda       | Marcos   | Renato      | Yalemisew   |
|-----------+-------------+-------------+----------+-------------+-------------|
| Amelie    | 1.000/(392) | 0.168/(392) | nan/(0)  | 0.255/(392) | 0.170/(364) |
| Gerda     | 0.168/(392) | 1.000/(392) | nan/(0)  | 0.095/(392) | 0.411/(364) |
| Marcos    | nan/(0)     | nan/(0)     | nan/(0)  | nan/(0)     | nan/(0)     |
| Renato    | 0.255/(392) | 0.095/(392) | nan/(0)  | 1.000/(392) | 0.070/(364) |
| Yalemisew | 0.170/(364) | 0.411/(364) | nan/(0)  | 0.070/(364) | 1.000/(364) |
+-----------+-------------+-------------+----------+-------------+-------------+

+-----------+-------------+-------------+----------+-------------+-------------+
| Task_3    | Amelie      | Gerda       | Marcos   | Renato      | Yalemisew   |
|-----------+-------------+-------------+----------+-------------+-------------|
| Amelie    | 1.000/(392) |