In [1]:
import os
import re
import itertools
import matplotlib.pyplot as plt
from matplotlib.image import imread
import seaborn as sns
import pandas as pd
from sklearn.metrics import cohen_kappa_score as cks
from tabulate import tabulate

import imgkit
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.float_format = '{:,.3f}'.format
pd.set_option('precision', 2)

### Building the dataframe with the results

In [2]:
datapath = '../data/img_labeling_3rd_round/'

#### Reading orginal files

In [3]:
label_files = sorted(os.listdir(datapath))
label_files = [f for f in label_files if 'csv' in f and "_" in f]
print(f"{len(label_files)} .csv files in the folder\n")
#for f in label_files:
#    print(f)

47 .csv files in the folder



#### Extracting subjects

In [4]:
subjects = {x[:x.find("_")] for x in label_files}
print(f"{len(subjects)} diferent subjects")

20 diferent subjects


#### Consolidating files per user

In [5]:
n_files = []
for subject in subjects:
    s_files = [os.path.join(datapath, f) for f in label_files if f.startswith(subject)]
    n_file = os.path.join(datapath, subject + ".csv")
    n_files.append(n_file)
    with open(n_file, "w") as fw:
        for s in s_files:
            with open(s, "r") as fr:
                lines = fr.read()
                lines = lines.replace('"','')
                lines = lines.replace("[Appealing,Non-appealing]", "[Non-appealing]")   ## Choosing the last chosen class for ambiguous
                lines = lines.replace("[Non-appealing,Appealing]", "[Appealing]")
                fw.write(lines+"\n");

2136

8916

11729

10969

11541

22775

12165

11708

11077

22697

3981

6604

1045

2180

9601

11257

10681

2164

2257

2252

2240

2252

2260

2268

2252

2256

2557

23614

11424

11773

4224

6520

4900

6341

10753

10797

23364

22213

22390

4245

4492

4664

4532

4801

10596

10973

11338

#### Reading CSV files into Dataframes

In [6]:
list_dfs = []
for idx, n_file in enumerate(n_files):
    user = re.findall("|".join(subjects), n_file)[0]
    userid = idx + 1
    print(f"{userid}: \t {user}")
    df = pd.read_csv(n_file, names=['image_name', 'class','w','h'])
    df['user'] = user
    df['userid'] = userid
    df.drop(['w','h'], axis=1, inplace=True)
    df.drop_duplicates(inplace=True)
    list_dfs.append(df)

1: 	 38205FF03AFF19E008FA123615336213A0ED33AB
2: 	 C003DF0C2EC67886EA6BC835682E3182A2126135
3: 	 F96F6ACF9B40EDA60D4D48FBED7B3512E1C8E48D
4: 	 C733B58CB8AE244885A97E86B09D896A5633674C
5: 	 867D8B4F797167AA1CA27E00F76B51C63A7278ED
6: 	 A6241D7982404859C8D816D8C4A61DC506145A50
7: 	 F0E0055FB33661B91E7C55DAFB0246BCB43D7D8D
8: 	 5FFB96918AA02D6E1EF82257A2828F9A4FD9C702
9: 	 5FEF4C8B719CABDF32C46A4C9C9316376DF7B512
10: 	 6438BC09053F6B5215CA1FC743AB0E3CF2B53984
11: 	 3AB6D9AAC2244ECC4F595743DF6EB54686DE31A0
12: 	 B59DE8F9A0D4C9AADABAFF7EA6A4F61BF89861DA
13: 	 F9CC3D73B8F36057257097AFFE29B884000BABB5
14: 	 73396439E5B11BBCA57A8FCB53FBB6F0FCECE794
15: 	 43166EAC7DE5C6D48FF646B1B12BEB46AC7CA375
16: 	 BE887E16DEB41D06FACD481B5BFE3AF6FDF6F8CB
17: 	 8D396D662D61B164860C081535E30C3E32934406
18: 	 C6C77B9A8025D969F0E14BEBBEECB8B82BAAF284
19: 	 C91C4E40D11000EB5E5BE0EDBE12C74A54FAA40B
20: 	 245DC3767EBDBF721824F2F09B4251D5E0063494


#### Concatenating Dataframes

In [7]:
df_labeling = pd.concat(list_dfs)
df_labeling['id_image'] = df_labeling['image_name'].apply(lambda x:x[:-4])
df_labeling = df_labeling[['userid','user', 'image_name', 'id_image', 'class']]
df_labeling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18156 entries, 0 to 1011
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   userid      18156 non-null  int64 
 1   user        18156 non-null  object
 2   image_name  18156 non-null  object
 3   id_image    18156 non-null  object
 4   class       18156 non-null  object
dtypes: int64(1), object(4)
memory usage: 851.1+ KB


#### Fixing classes names

In [8]:
print(df_labeling['class'].unique())
df_labeling["class"] = df_labeling["class"].str.strip("[] ")
df_labeling["class"].replace({'Npn-appealing':"Non-appealing"}, inplace=True)
df_labeling['class'].value_counts()

['[Appealing]' '[Non-appealing]' '[Npn-appealing]' '[Appealing]    ']


Non-appealing    11076
Appealing         7080
Name: class, dtype: int64

In [9]:
df_labeling.head()

Unnamed: 0,userid,user,image_name,id_image,class
0,1,38205FF03AFF19E008FA123615336213A0ED33AB,1.jpg,1,Appealing
1,1,38205FF03AFF19E008FA123615336213A0ED33AB,2.jpg,2,Non-appealing
2,1,38205FF03AFF19E008FA123615336213A0ED33AB,3.jpg,3,Non-appealing
3,1,38205FF03AFF19E008FA123615336213A0ED33AB,4.jpg,4,Non-appealing
4,1,38205FF03AFF19E008FA123615336213A0ED33AB,5.jpg,5,Appealing


#### Checking images classified

In [10]:
df_labeling['image_appearances'] = df_labeling.groupby('id_image')['id_image'].transform('count')

In [11]:
print(f"{len(df_labeling['image_name'])} images were classified")
print(f"{len(df_labeling['image_name'].unique())} unique images were classified\n")
num_images = 0
image_appearances = df_labeling.image_appearances.value_counts().sort_index(ascending=True)
for idx in image_appearances.index:
    print(f'{image_appearances[idx]} images were classified {idx} times')
    num_images += image_appearances[idx]

18156 images were classified
1012 unique images were classified

11 images were classified 11 times
165 images were classified 15 times
8016 images were classified 16 times
18 images were classified 18 times
266 images were classified 19 times
9680 images were classified 20 times


#### Keeping the images that were classified by all subjects (optional)

In [12]:
#df_labeling = df_labeling[df_labeling['image_appearances'] == 20]
df_labeling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18156 entries, 0 to 1011
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   userid             18156 non-null  int64 
 1   user               18156 non-null  object
 2   image_name         18156 non-null  object
 3   id_image           18156 non-null  object
 4   class              18156 non-null  object
 5   image_appearances  18156 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 992.9+ KB


### Analysing the [inter-annotator agreement](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html) on the results  

### $\kappa = (p_o - p_e) / (1 - p_e)$

#### where $p_o$ is the empirical probability of agreement on the label assigned to any sample (the observed agreement ratio), and $p_e$ is the expected agreement when both annotators assign labels randomly. $p_e$ is estimated using a per-annotator empirical prior over the class labels

In [13]:
l1 = df_labeling.userid.unique()
iter_users = itertools.product(l1,l1)
df_iaa = pd.DataFrame(index=l1, columns=l1)
df_iaa_styled = pd.DataFrame(index=l1, columns=l1)

In [20]:
for user1,user2 in iter_users:
    classesA = df_labeling.loc[(df_labeling.userid == user1),['id_image', 'class']]
    classesA.sort_values(by=['id_image'], inplace=True)

    classesB = df_labeling.loc[(df_labeling.userid == user2),['id_image', 'class']]
    classesB.sort_values(by=['id_image'], inplace=True)

    classesAB = pd.merge(classesA, classesB, on=['id_image'])
    #classesAB.drop_duplicates(subset='id_image', keep = 'first', inplace=True) 
    classesAB.drop('id_image', axis=1, inplace=True)
    classesAB.dropna(inplace=True)

    agreement = cks(classesAB['class_x'], classesAB['class_y'])
    df_iaa.loc[user1,user2] = f'{agreement:.3f}/({len(classesAB)})'
    df_iaa_styled.loc[user1,user2] = f'{agreement:.3f}'
    
df_iaa_styled = df_iaa_styled.apply(pd.to_numeric)
#print(tabulate(df_iaa, headers='keys', tablefmt='psql'))

In [22]:
cm = sns.light_palette("green", as_cmap=True)
cm = 'viridis'
cm = 'plasma'
#cm = 'magma'

s = df_iaa_styled.style.background_gradient(cmap=cm, vmin=0, vmax=1, axis=None)
display(s)
html = s.render()
imgkit.from_string(html, f'../data/outputs/styled_table_round3.png')

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
1,1.0,0.28,-0.3,0.02,-0.24,-0.06,0.29,0.08,0.06,-0.4,-0.12,-0.31,0.34,-0.2,-0.25,0.02,-0.12,-0.0,-0.01,0.02
2,0.28,1.0,-0.07,0.03,-0.01,0.04,0.22,0.13,0.14,-0.22,0.03,-0.13,0.34,-0.1,-0.06,0.03,-0.02,0.11,0.04,0.04
3,-0.3,-0.07,1.0,0.07,0.5,0.2,-0.1,0.12,0.06,0.52,0.28,0.48,-0.17,0.27,0.39,-0.05,0.14,0.27,0.13,0.0
4,0.02,0.03,0.07,1.0,0.14,0.09,0.03,0.05,0.03,0.06,0.11,0.06,0.04,0.04,0.07,-0.02,0.0,0.06,0.01,0.0
5,-0.24,-0.01,0.5,0.14,1.0,0.26,0.02,0.2,0.05,0.56,0.23,0.43,-0.07,0.39,0.42,-0.01,0.16,0.38,0.16,0.03
6,-0.06,0.04,0.2,0.09,0.26,1.0,0.17,0.17,0.02,0.22,0.18,0.18,0.09,0.17,0.25,0.01,0.08,0.32,0.03,-0.02
7,0.29,0.22,-0.1,0.03,0.02,0.17,1.0,0.01,0.13,-0.17,0.02,-0.08,0.38,-0.15,-0.06,0.05,-0.12,0.15,0.03,0.08
8,0.08,0.13,0.12,0.05,0.2,0.17,0.01,1.0,0.06,0.13,0.11,0.12,0.08,0.23,0.13,-0.01,0.05,0.11,0.1,-0.06
9,0.06,0.14,0.06,0.03,0.05,0.02,0.13,0.06,1.0,-0.03,0.02,0.04,0.08,0.05,0.02,0.04,0.05,0.07,0.08,-0.02
10,-0.4,-0.22,0.52,0.06,0.56,0.22,-0.17,0.13,-0.03,1.0,0.29,0.61,-0.3,0.38,0.51,-0.04,0.22,0.21,0.1,0.01


Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


True

In [16]:
df_iaa.to_excel("../data/outputs/round3_results.xls")

In [17]:
df_labeling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18156 entries, 0 to 1011
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   userid             18156 non-null  int64 
 1   user               18156 non-null  object
 2   image_name         18156 non-null  object
 3   id_image           18156 non-null  object
 4   class              18156 non-null  object
 5   image_appearances  18156 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 992.9+ KB


In [18]:
df_labeling['round'] = 3
df_labeling.drop('image_appearances', inplace=True, axis=1)

In [19]:
df_labeling.to_hdf('../data/df_labeling.hdf', key='round3')