In [1]:
import os
import re
import itertools
import pandas as pd
from sklearn.metrics import cohen_kappa_score as cks
from tabulate import tabulate

### Building the dataframe with the results

In [2]:
label_files = sorted(os.listdir('../data/img_labeling/'))
print(label_files)
label_files.pop()

['1st_task_labeling_Amelie_08.09.csv', '1st_task_labeling_Gerda_07.09.csv', '1st_task_labeling_Marcos_24.08.csv', '1st_task_labeling_Renato_31.08.csv', '1st_task_labeling_Yalemisew_07.09.csv', '2nd_task_labeling_Amelie_08.09.csv', '2nd_task_labeling_Gerda_07.09.csv', '2nd_task_labeling_Marcos_28.08.csv', '2nd_task_labeling_Renato_07.09.csv', '2nd_task_labeling_Yalemisew_07.09.csv', '3rd_task_labeling_Amelie_08.09.csv', '3rd_task_labeling_Gerda_07.09.csv', '3rd_task_labeling_Marcos_28.08.csv', '3rd_task_labeling_Renato_07.09.csv', '3rd_task_labeling_Yalemisew_08.09.csv', 'duplicates.xlsx']


'duplicates.xlsx'

In [3]:
list_dfs = []
for label_file in label_files:
    task = int(re.findall(r'[1-3]{1}', label_file)[0])
    user = re.findall(r'Amelie|Gerda|Renato|Yalemisew|Marcos', label_file)[0]
    date = re.findall(r'[0-9]{2}.[0-9]{2}', label_file)[0]
    date = f'{date}.2020'
    df = pd.read_csv(f'../data/img_labeling/{label_file}', names=['class','image name', 'w','h'])
    df['task'] = task
    df['user'] = user
    df['date'] = date
    df['date'] = pd.to_datetime(df['date'])
    df.drop(['w','h'], axis=1, inplace=True)
    df.drop_duplicates(inplace=True)
    df = df[['task', 'user', 'date', 'image name', 'class']]
    print(f'Task: {task}, User: {user}, Images Classified: {len(df)}')
    list_dfs.append(df)

Task: 1, User: Amelie, Images Classified: 392
Task: 1, User: Gerda, Images Classified: 393
Task: 1, User: Marcos, Images Classified: 390
Task: 1, User: Renato, Images Classified: 392
Task: 1, User: Yalemisew, Images Classified: 396
Task: 2, User: Amelie, Images Classified: 392
Task: 2, User: Gerda, Images Classified: 402
Task: 2, User: Marcos, Images Classified: 390
Task: 2, User: Renato, Images Classified: 392
Task: 2, User: Yalemisew, Images Classified: 403
Task: 3, User: Amelie, Images Classified: 392
Task: 3, User: Gerda, Images Classified: 398
Task: 3, User: Marcos, Images Classified: 391
Task: 3, User: Renato, Images Classified: 392
Task: 3, User: Yalemisew, Images Classified: 399


In [4]:
df_labeling = pd.concat(list_dfs)
df_labeling['image name'].replace({'08806-~1.JPE': '08806-cgi_bin_gw_chameleon_lng_en_host_localhost_9901_DEFAULT_search_KEYWORD_function_CARDSCR_u1_12101_t1_004573881_1.jpeg', 
                                   '204804~1.JPE': '2048047-Athena_Plus_ProvidedCHO_Institutul_Na_ional_al_Patrimoniului_1C82EC077B694E00B35B9B8044A16EAB_1.jpeg',
                                   '204804~2.JPE':'2048047-Athena_Plus_ProvidedCHO_Institutul_Na_ional_al_Patrimoniului_7000A7800599458180CD0055DB99645F_1.jpeg',
                                  }, inplace=True)
df_labeling['id_image'] = pd.Categorical(df_labeling['image name']).codes
df_labeling = df_labeling[['task', 'user', 'date', 'image name', 'id_image', 'class']]
df_labeling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5914 entries, 0 to 451
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   task        5914 non-null   int64         
 1   user        5914 non-null   object        
 2   date        5914 non-null   datetime64[ns]
 3   image name  5914 non-null   object        
 4   id_image    5914 non-null   int16         
 5   class       5914 non-null   object        
dtypes: datetime64[ns](1), int16(1), int64(1), object(3)
memory usage: 288.8+ KB


In [5]:
df_labeling.head()

Unnamed: 0,task,user,date,image name,id_image,class
0,1,Amelie,2020-08-09,203-resource_document_zuiderzeemuseum_B001601_...,87,Non-Fruits
1,1,Amelie,2020-08-09,285-gam19649_1.jpeg,232,Fruits
2,1,Amelie,2020-08-09,07101-O_389_1.jpeg,6,Fruits
3,1,Amelie,2020-08-09,07101-O_927_1.jpeg,12,Fruits
4,1,Amelie,2020-08-09,07101-O_957_1.jpeg,13,Non-Fruits


#### Checking the numbers of images classified:

In [6]:
for task in df_labeling.task.unique():
    for user in df_labeling.user.unique():
        size = len(df_labeling.loc[(df_labeling.user == user) & (df_labeling.task == task), 'id_image'].unique())
        print(f'Task: {task}, User: {user}, Images Classified: {size}')
    print()

Task: 1, User: Amelie, Images Classified: 392
Task: 1, User: Gerda, Images Classified: 392
Task: 1, User: Marcos, Images Classified: 390
Task: 1, User: Renato, Images Classified: 392
Task: 1, User: Yalemisew, Images Classified: 391

Task: 2, User: Amelie, Images Classified: 392
Task: 2, User: Gerda, Images Classified: 392
Task: 2, User: Marcos, Images Classified: 390
Task: 2, User: Renato, Images Classified: 392
Task: 2, User: Yalemisew, Images Classified: 392

Task: 3, User: Amelie, Images Classified: 392
Task: 3, User: Gerda, Images Classified: 392
Task: 3, User: Marcos, Images Classified: 391
Task: 3, User: Renato, Images Classified: 392
Task: 3, User: Yalemisew, Images Classified: 392



#### Checking which images were not classified:

In [7]:
images = set(df_labeling.id_image.unique())
for task in df_labeling.task.unique():
    for user in df_labeling.user.unique():
        ids = images.difference(set(df_labeling.loc[(df_labeling.user == user) & (df_labeling.task == task), 'id_image']))
        if len(ids) > 0:
            print(f'Missing files for task: {task}, by user: {user}')
            for id in ids:
                print(df_labeling.loc[(df_labeling.task == task) & (df_labeling.id_image == id), 'image name'].values[0])
            print()

Missing files for task: 1, by user: Marcos
07101-O_4536_1.jpeg
07101-O_3908_1.jpeg

Missing files for task: 1, by user: Yalemisew
2048128-58615_1.jpeg

Missing files for task: 2, by user: Marcos
90402-SK_A_1407_1.jpeg
90402-SK_A_1444_1.jpeg

Missing files for task: 3, by user: Marcos
90402-SK_A_752_1.jpeg



#### Checking duplicates

In [8]:
duplicates = df_labeling.loc[df_labeling.duplicated(['task', 'user', 'image name', 'id_image'], keep=False), ['task', 'user', 'image name', 'id_image', 'class']]
duplicates.sort_values(by=['user', 'image name', 'task'], inplace=True)
duplicates.to_excel('../data/img_labeling/duplicates.xlsx')
duplicates.head(30)

Unnamed: 0,task,user,image name,id_image,class
9,2,Gerda,07101-O_3908_1.jpeg,7,Formal
288,2,Gerda,07101-O_3908_1.jpeg,7,Informal
1,2,Gerda,07101-O_927_1.jpeg,12,Formal
286,2,Gerda,07101-O_927_1.jpeg,12,Informal
2,3,Gerda,07101-O_927_1.jpeg,12,Non-appealing
169,3,Gerda,07101-O_927_1.jpeg,12,Appealing
56,3,Gerda,2020903-KMS1518_1.jpeg,17,Non-appealing
314,3,Gerda,2020903-KMS1518_1.jpeg,17,Appealing
59,1,Gerda,2020903-KMS3015_1.jpeg,21,Fruits
317,1,Gerda,2020903-KMS3015_1.jpeg,21,Non-Fruits


### Analysing the [inter-annotator agreement](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html) on the results

In [9]:
l1 = df_labeling.user.unique()
iter_users = list(itertools.product(l1,l1))
df_iaa = pd.DataFrame(index=l1, columns=l1)

In [12]:
for task in df_labeling.task.unique():
    for user1,user2 in iter_users:
        classesA = df_labeling.loc[(df_labeling.user == user1) & (df_labeling.task == task),['id_image', 'class']]
        classesA.sort_values(by=['id_image'], inplace=True)
        classesA.drop('id_image', axis=1, inplace=True)

        classesB = df_labeling.loc[(df_labeling.user == user2) & (df_labeling.task == task),['id_image', 'class']]
        classesB.sort_values(by=['id_image'], inplace=True)
        classesB.drop('id_image', axis=1, inplace=True)

        df_iaa.loc[user1,user2] = cks(classesA[0:390], classesB[0:390])   ### we need to fix the image files before measuring correctly
        df_iaa.index.name = f'Task_{task}'
    print(tabulate(df_iaa, headers='keys', tablefmt='psql'))
    print()

+-----------+----------+-----------+-----------+----------+-------------+
| Task_1    |   Amelie |     Gerda |    Marcos |   Renato |   Yalemisew |
|-----------+----------+-----------+-----------+----------+-------------|
| Amelie    | 1        | 0.116965  | 0.101454  | 0.906939 |    0.171821 |
| Gerda     | 0.116965 | 1         | 0.0911258 | 0.131426 |    0.420274 |
| Marcos    | 0.101454 | 0.0911258 | 1         | 0.124336 |    0.082547 |
| Renato    | 0.906939 | 0.131426  | 0.124336  | 1        |    0.205577 |
| Yalemisew | 0.171821 | 0.420274  | 0.082547  | 0.205577 |    1        |
+-----------+----------+-----------+-----------+----------+-------------+

+-----------+-------------+------------+------------+-----------+-------------+
| Task_2    |      Amelie |      Gerda |     Marcos |    Renato |   Yalemisew |
|-----------+-------------+------------+------------+-----------+-------------|
| Amelie    |  1          | 0.00787026 |  0.242636  | 0.312087  |  -0.0376586 |
| Gerda     |