In [1]:
import os
import re
import itertools
import pandas as pd
from sklearn.metrics import cohen_kappa_score as cks
from tabulate import tabulate

### Building the dataframe with the results

In [22]:
# Folder holding the round-3 label CSVs, relative to the notebook.
# Built with os.path.join so the notebook is not tied to Windows path
# separators; the trailing '' keeps a trailing separator so file names can
# be appended directly to this string.
datapath = os.path.join('..', 'data', 'img_labeling_3rd_round_team', '')

In [23]:
# Keep only the round-3 label files from the data folder, in name order,
# and echo each one as a quick visual check.
label_files = sorted(
    fname for fname in os.listdir(datapath) if 'labels_round3' in fname
)
for fname in label_files:
    print(fname)

labels_round3_A001_2021-02-16-06-26-53.csv
labels_round3_A002_2020-12-29-07-23-48.csv
labels_round3_A004_2021-02-16-02-03-15.csv
labels_round3_A005_2021-02-18-05-40-21.csv
labels_round3_A006_2021-01-28-11-17-26.csv
labels_round3_A007_2021-02-12-12-24-05.csv


In [24]:
# File names look like: labels_round<task>_<user>_<YYYY-MM-DD>-<hh-mm-ss>.csv
# One anchored pattern replaces three independent "first match anywhere"
# searches, so a digit inside the user id or timestamp can never be picked
# up as the task number.
LABEL_FILE_RE = re.compile(
    r'round(?P<task>\d+)_(?P<user>A\d{3})_(?P<date>\d{4}-\d{2}-\d{2})'
)

list_dfs = []
for label_file in label_files:
    name_parts = LABEL_FILE_RE.search(label_file)
    if name_parts is None:
        # Fail loudly with the offending name instead of an opaque IndexError.
        raise ValueError(f'Unexpected label file name: {label_file!r}')
    task = int(name_parts['task'])
    user = name_parts['user']
    date = name_parts['date']

    # The CSVs are read without a header row; column names are supplied here.
    raw = pd.read_csv(os.path.join(datapath, label_file), names=['image name', 'class', 'w', 'h'])
    df = (
        raw.drop(columns=['w', 'h'])
           .drop_duplicates()
           .assign(task=task, user=user, date=date)
           .assign(date=lambda frame: pd.to_datetime(frame['date']))
    )
    df = df[['task', 'user', 'date', 'image name', 'class']]
    print(f'Task: {task}, User: {user}, Images Classified: {len(df)}, Date: {date}')
    list_dfs.append(df)

Task: 3, User: A001, Images Classified: 1010, Date: 2021-02-16
Task: 3, User: A002, Images Classified: 1010, Date: 2020-12-29
Task: 3, User: A004, Images Classified: 1010, Date: 2021-02-16
Task: 3, User: A005, Images Classified: 1011, Date: 2021-02-18
Task: 3, User: A006, Images Classified: 1010, Date: 2021-01-28
Task: 3, User: A007, Images Classified: 1010, Date: 2021-02-12


In [25]:
# Stack the per-file frames into one long table, then derive two columns:
#   id_image - integer category code of the image name
#   class    - the raw label with its first and last character removed
column_order = ['task', 'user', 'date', 'image name', 'id_image', 'class']
stacked = pd.concat(list_dfs)
stacked['id_image'] = pd.Categorical(stacked['image name']).codes
stacked['class'] = stacked['class'].map(lambda raw_label: raw_label[1:-1])
df_labeling = stacked[column_order]

In [26]:
# Preview the first rows of the combined labelling table.
df_labeling.head()

Unnamed: 0,task,user,date,image name,id_image,class
0,3,A001,2021-02-16,18.jpg,102,Non-appealing
1,3,A001,2021-02-16,19.jpg,113,Appealing
2,3,A001,2021-02-16,20.jpg,125,Non-appealing
3,3,A001,2021-02-16,21.jpg,136,Appealing
4,3,A001,2021-02-16,22.jpg,147,Appealing


In [27]:
# List the distinct label values present in the 'class' column.
df_labeling['class'].unique()

array(['Non-appealing', 'Appealing'], dtype=object)

In [28]:
# Count how many rows each image id has across all annotators, then keep
# only the images whose count is different from one.
rows_per_image = df_labeling.groupby('id_image')['id_image'].transform('count')
df_labeling['image_appearances'] = rows_per_image
df_labeling = df_labeling[df_labeling['image_appearances'].ne(1)]

In [29]:
# Summarise columns, dtypes and non-null counts after the filtering step.
df_labeling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6060 entries, 0 to 1009
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   task               6060 non-null   int64         
 1   user               6060 non-null   object        
 2   date               6060 non-null   datetime64[ns]
 3   image name         6060 non-null   object        
 4   id_image           6060 non-null   int16         
 5   class              6060 non-null   object        
 6   image_appearances  6060 non-null   int64         
dtypes: datetime64[ns](1), int16(1), int64(2), object(3)
memory usage: 343.2+ KB


In [30]:
# Preview the first rows, now including the 'image_appearances' column.
df_labeling.head()

Unnamed: 0,task,user,date,image name,id_image,class,image_appearances
0,3,A001,2021-02-16,18.jpg,102,Non-appealing,6
1,3,A001,2021-02-16,19.jpg,113,Appealing,6
2,3,A001,2021-02-16,20.jpg,125,Non-appealing,6
3,3,A001,2021-02-16,21.jpg,136,Appealing,6
4,3,A001,2021-02-16,22.jpg,147,Appealing,6


#### Checking the number of images classified after fixing names:

In [31]:
# Report, for every task/user combination, how many distinct images were labelled.
for task in df_labeling.task.unique():
    in_task = df_labeling.task == task
    for user in df_labeling.user.unique():
        labelled_ids = df_labeling.loc[(df_labeling.user == user) & in_task, 'id_image']
        size = len(labelled_ids.unique())
        print(f'Task: {task}, User: {user}, Images Classified: {size}')
    # Blank line between tasks.
    print()

Task: 3, User: A001, Images Classified: 1010
Task: 3, User: A002, Images Classified: 1010
Task: 3, User: A004, Images Classified: 1010
Task: 3, User: A005, Images Classified: 1010
Task: 3, User: A006, Images Classified: 1010
Task: 3, User: A007, Images Classified: 1010



#### Checking which images were not classified:

In [32]:
# Build a table of images that a given user did not label within a given task.
# Every image id that appears anywhere in the labelled data is the reference set.
images = set(df_labeling.id_image.unique())
dfs = []
for task in df_labeling.task.unique():
    task_rows = df_labeling.loc[df_labeling.task == task]
    # First image name recorded for each id within this task. Hoisted out of
    # the per-user loop because it does not depend on the user.
    name_by_id = task_rows.drop_duplicates('id_image').set_index('id_image')['image name']
    for user in df_labeling.user.unique():
        labelled = set(task_rows.loc[task_rows.user == user, 'id_image'])
        # Sorted so the report order is reproducible between runs.
        ids = sorted(images.difference(labelled))
        # An id with no row in this task has no name to report here, so it is
        # skipped through an explicit membership check; any other error is
        # allowed to surface instead of being silently swallowed.
        id_list = [name_by_id.loc[_id] for _id in ids if _id in name_by_id.index]
        dfm = pd.DataFrame({'images': id_list})
        dfm['task'] = task
        dfm['user'] = user
        dfs.append(dfm)
df_missing = pd.concat(dfs)
df_missing.head()

Unnamed: 0,images,task,user


#### Checking duplicates

In [33]:
# Rows where the same user labelled the same image more than once within a
# task; every occurrence is kept (keep=False) so conflicting labels are visible.
key_cols = ['task', 'user', 'image name', 'id_image']
is_repeated = df_labeling.duplicated(key_cols, keep=False)
duplicates = (
    df_labeling.loc[is_repeated, key_cols + ['class']]
    .sort_values(by=['user', 'image name', 'task'])
)
duplicates.head(30)

Unnamed: 0,task,user,image name,id_image,class


### Analysing the [inter-annotator agreement](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html) on the results

In [34]:
# Distinct annotator ids found in the data.
l1 = df_labeling.user.unique()
# Every ordered pair of annotators, self-pairs included.
iter_users = list(itertools.product(l1, repeat=2))
# Empty annotator-by-annotator grid, filled with agreement scores below.
df_iaa = pd.DataFrame(columns=l1, index=l1)

In [35]:
# For each task, fill the annotator grid with Cohen's kappa computed on the
# images both annotators labelled, formatted as "<kappa>/(<n images>)".
for task in df_labeling.task.unique():
    in_task = df_labeling.task == task
    for user1, user2 in iter_users:
        labels_a = df_labeling.loc[
            (df_labeling.user == user1) & in_task, ['id_image', 'class']
        ].sort_values(by=['id_image'])
        labels_b = df_labeling.loc[
            (df_labeling.user == user2) & in_task, ['id_image', 'class']
        ].sort_values(by=['id_image'])

        # Inner join on the image id; the two label columns come out of the
        # merge as class_x (user1) and class_y (user2).
        paired = (
            pd.merge(labels_a, labels_b, on=['id_image'])
            .drop_duplicates(subset='id_image', keep='first')
            .drop(columns='id_image')
            .dropna()
        )

        agreement = cks(paired['class_x'], paired['class_y'])
        df_iaa.loc[user1, user2] = f'{agreement:.3f}/({len(paired)})'
    # Label the table with the task it belongs to before printing it.
    df_iaa.index.name = f'Task_{task}'
    print(tabulate(df_iaa, headers='keys', tablefmt='psql'))
    print()

+----------+--------------+--------------+--------------+--------------+---------------+---------------+
| Task_3   | A001         | A002         | A004         | A005         | A006          | A007          |
|----------+--------------+--------------+--------------+--------------+---------------+---------------|
| A001     | 1.000/(1010) | 0.293/(1010) | 0.335/(1010) | 0.330/(1010) | 0.164/(1010)  | 0.274/(1010)  |
| A002     | 0.293/(1010) | 1.000/(1010) | 0.475/(1010) | 0.483/(1010) | 0.190/(1010)  | 0.042/(1010)  |
| A004     | 0.335/(1010) | 0.475/(1010) | 1.000/(1010) | 0.648/(1010) | 0.156/(1010)  | 0.082/(1010)  |
| A005     | 0.330/(1010) | 0.483/(1010) | 0.648/(1010) | 1.000/(1010) | 0.123/(1010)  | 0.061/(1010)  |
| A006     | 0.164/(1010) | 0.190/(1010) | 0.156/(1010) | 0.123/(1010) | 1.000/(1010)  | -0.025/(1010) |
| A007     | 0.274/(1010) | 0.042/(1010) | 0.082/(1010) | 0.061/(1010) | -0.025/(1010) | 1.000/(1010)  |
+----------+--------------+--------------+-------------

In [36]:
# Summarise columns, dtypes and non-null counts of the labelling table.
df_labeling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6060 entries, 0 to 1009
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   task               6060 non-null   int64         
 1   user               6060 non-null   object        
 2   date               6060 non-null   datetime64[ns]
 3   image name         6060 non-null   object        
 4   id_image           6060 non-null   int16         
 5   class              6060 non-null   object        
 6   image_appearances  6060 non-null   int64         
dtypes: datetime64[ns](1), int16(1), int64(2), object(3)
memory usage: 343.2+ KB


In [37]:
# The rows were loaded from the labels_round3_* files, so tag them as round 3.
# NOTE(review): this was previously 2, which looks like a leftover from the
# round-2 notebook - confirm.
# assign/drop return a new frame, so the frame produced by the earlier
# boolean filter is not modified in place; the helper column is removed
# because it is not part of the saved result.
df_labeling = df_labeling.assign(round=3).drop(columns='image_appearances')

In [None]:
# Persist the table under a round-3 key: the data comes from the
# labels_round3_* files, and writing it under 'round2' would store it in the
# round-2 slot of this shared file.
# NOTE(review): confirm no downstream reader expects this data under 'round2'.
df_labeling.to_hdf('../data/df_labeling.hdf', key='round3')