In [41]:
import os
import re
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.image import imread
import seaborn as sns
import pandas as pd
from sklearn.metrics import cohen_kappa_score as cks
from tabulate import tabulate

import imgkit
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all"

pd.options.display.float_format = '{:,.3f}'.format
pd.set_option('precision', 2)

### Building the dataframe with the results

#### Reading original files

In [2]:
datapath = '../data/img_labeling_3rd_round/'

In [7]:
label_files = sorted(os.listdir(datapath))
label_files = [f for f in label_files if 'csv' in f and "_" in f]
print(f"{len(label_files)} .csv files in the folder\n")
#for f in label_files:
#    print(f)

47 .csv files in the folder



#### Extracting subjects

In [8]:
subjects = {x[:x.find("_")] for x in label_files}
print(f"{len(subjects)} diferent subjects\n")
#for s in sorted(subjects):
#    print(s)

20 diferent subjects



#### Consolidating files per user

In [9]:
consolidated_files = []
for subject in subjects:
    subject_files = [os.path.join(datapath, f) for f in label_files if f.startswith(subject)]
    consolidated_file = os.path.join(datapath, subject + ".csv")
    consolidated_files.append(consolidated_file)
    with open(consolidated_file, "w") as fw:
        for s in subject_files:
            with open(s, "r") as fr:
                lines = fr.read()
                lines = lines.replace('"','')
                lines = lines.replace("[Appealing,Non-appealing]", "[Non-appealing]")   ## Choosing the last chosen class for ambiguous
                lines = lines.replace("[Non-appealing,Appealing]", "[Appealing]")
                fw.write(lines+"\n");

#### Reading CSV files into Dataframes

In [15]:
list_dfs = []
for idx, consolidated_file in enumerate(consolidated_files):
    user = re.findall("|".join(subjects), consolidated_file)[0]
    userid = idx + 1
    df = pd.read_csv(consolidated_file, names=['image_name', 'class','w','h'])
    print(f"User:{userid}, hash: \t {user} tagged {len(df)} images")
    df['user'] = user
    df['userid'] = userid
    df.drop(['w','h'], axis=1, inplace=True)
    df.drop_duplicates(inplace=True)
    list_dfs.append(df)

User:1, hash: 	 B59DE8F9A0D4C9AADABAFF7EA6A4F61BF89861DA tagged 1012 images
User:2, hash: 	 C6C77B9A8025D969F0E14BEBBEECB8B82BAAF284 tagged 1011 images
User:3, hash: 	 BE887E16DEB41D06FACD481B5BFE3AF6FDF6F8CB tagged 1012 images
User:4, hash: 	 C003DF0C2EC67886EA6BC835682E3182A2126135 tagged 1009 images
User:5, hash: 	 C733B58CB8AE244885A97E86B09D896A5633674C tagged 1010 images
User:6, hash: 	 6438BC09053F6B5215CA1FC743AB0E3CF2B53984 tagged 1012 images
User:7, hash: 	 5FFB96918AA02D6E1EF82257A2828F9A4FD9C702 tagged 1101 images
User:8, hash: 	 245DC3767EBDBF721824F2F09B4251D5E0063494 tagged 1012 images
User:9, hash: 	 F0E0055FB33661B91E7C55DAFB0246BCB43D7D8D tagged 495 images
User:10, hash: 	 38205FF03AFF19E008FA123615336213A0ED33AB tagged 1010 images
User:11, hash: 	 5FEF4C8B719CABDF32C46A4C9C9316376DF7B512 tagged 500 images
User:12, hash: 	 C91C4E40D11000EB5E5BE0EDBE12C74A54FAA40B tagged 500 images
User:13, hash: 	 F9CC3D73B8F36057257097AFFE29B884000BABB5 tagged 1011 images
User:14, ha

#### Concatenating Dataframes

In [16]:
df_labeling = pd.concat(list_dfs)
df_labeling['id_image'] = df_labeling['image_name'].apply(lambda x:x[:-4])
df_labeling = df_labeling[['userid','user', 'image_name', 'id_image', 'class']]
df_labeling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18156 entries, 0 to 1011
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   userid      18156 non-null  int64 
 1   user        18156 non-null  object
 2   image_name  18156 non-null  object
 3   id_image    18156 non-null  object
 4   class       18156 non-null  object
dtypes: int64(1), object(4)
memory usage: 851.1+ KB


#### Fixing classes names

In [17]:
print(df_labeling['class'].unique())
df_labeling["class"] = df_labeling["class"].str.strip("[] ")
df_labeling["class"].replace({'Npn-appealing':"Non-appealing"}, inplace=True)
df_labeling['class'].value_counts()

['[Non-appealing]' '[Appealing]' '[Appealing]    ' '[Npn-appealing]']


Non-appealing    11076
Appealing         7080
Name: class, dtype: int64

In [18]:
df_labeling.head()

Unnamed: 0,userid,user,image_name,id_image,class
0,1,B59DE8F9A0D4C9AADABAFF7EA6A4F61BF89861DA,1.jpg,1,Non-appealing
1,1,B59DE8F9A0D4C9AADABAFF7EA6A4F61BF89861DA,2.jpg,2,Non-appealing
2,1,B59DE8F9A0D4C9AADABAFF7EA6A4F61BF89861DA,3.jpg,3,Non-appealing
3,1,B59DE8F9A0D4C9AADABAFF7EA6A4F61BF89861DA,4.jpg,4,Appealing
4,1,B59DE8F9A0D4C9AADABAFF7EA6A4F61BF89861DA,5.jpg,5,Non-appealing


### Adding users' data

In [54]:
profiles = pd.read_csv(os.path.join(datapath, 'ChIA-CulturalQuestionnaire-results-raw data.csv'), 
                       names=["timestamp", "age", "gender", "country", "discard1", "culturalBack", "culture", "hash", "discard2"])
profiles.drop(["timestamp", "discard1", "discard2"], axis=1, inplace=True)
profiles.drop([0,1], axis=0, inplace=True)

profiles["age"] = profiles.age.str.replace("-",",").str.strip("+")
profiles["age"] = profiles.age.apply(lambda x:eval(f"[{x}]"))
profiles["age"] = [np.mean(np.array(x)).astype(int) for x in profiles.age]

profiles.head(20)

Unnamed: 0,age,gender,country,culturalBack,culture,hash
2,40,female,Austria,European;Asian,European;Asian,
3,22,male,Austria,European,European,3AB6D9AAC2244ECC4F595743DF6EB54686DE31A0
4,22,male,Austria,European,European,43166EAC7DE5C6D48FF646B1B12BEB46AC7CA375
5,35,male,Austria,European,European,B59DE8F9A0D4C9AADABAFF7EA6A4F61BF89861DA
6,22,male,Albania,European,European,8D396D662D61B164860C081535E30C3E32934406
7,22,male,Albania,European,European;American;Latin American,C003DF0C2EC67886EA6BC835682E3182A2126135
8,27,female,Germany,European,European,C733B58CB8AE244885A97E86B09D896A5633674C
9,22,male,Italy,European,European,245DC3767EBDBF721824F2F09B4251D5E0063494
10,22,male,Austria,European,European,BE887E16DEB41D06FACD481B5BFE3AF6FDF6F8CB
11,22,male,Luxembourg,European,European,38205FF03AFF19E008FA123615336213A0ED33AB


In [56]:
df = df_labeling.join(profiles.set_index('hash'), on='user')

#### Users with no profiles

In [60]:
df[df.gender.isnull()].user.unique()

array(['5FFB96918AA02D6E1EF82257A2828F9A4FD9C702',
       '73396439E5B11BBCA57A8FCB53FBB6F0FCECE794',
       '867D8B4F797167AA1CA27E00F76B51C63A7278ED',
       'F96F6ACF9B40EDA60D4D48FBED7B3512E1C8E48D'], dtype=object)

#### Users with profiles

In [61]:
df[df.gender.notnull()].user.unique()

array(['B59DE8F9A0D4C9AADABAFF7EA6A4F61BF89861DA',
       'C6C77B9A8025D969F0E14BEBBEECB8B82BAAF284',
       'BE887E16DEB41D06FACD481B5BFE3AF6FDF6F8CB',
       'C003DF0C2EC67886EA6BC835682E3182A2126135',
       'C733B58CB8AE244885A97E86B09D896A5633674C',
       '6438BC09053F6B5215CA1FC743AB0E3CF2B53984',
       '245DC3767EBDBF721824F2F09B4251D5E0063494',
       'F0E0055FB33661B91E7C55DAFB0246BCB43D7D8D',
       '38205FF03AFF19E008FA123615336213A0ED33AB',
       '5FEF4C8B719CABDF32C46A4C9C9316376DF7B512',
       'C91C4E40D11000EB5E5BE0EDBE12C74A54FAA40B',
       'F9CC3D73B8F36057257097AFFE29B884000BABB5',
       '8D396D662D61B164860C081535E30C3E32934406',
       'A6241D7982404859C8D816D8C4A61DC506145A50',
       '3AB6D9AAC2244ECC4F595743DF6EB54686DE31A0',
       '43166EAC7DE5C6D48FF646B1B12BEB46AC7CA375'], dtype=object)

#### Checking images classified

In [19]:
df_labeling['image_appearances'] = df_labeling.groupby('id_image')['id_image'].transform('count')

In [20]:
print(f"{len(df_labeling['image_name'])} images were classified")
print(f"{len(df_labeling['image_name'].unique())} unique images were classified\n")
num_images = 0
image_appearances = df_labeling.image_appearances.value_counts().sort_index(ascending=True)
for idx in image_appearances.index:
    print(f'{image_appearances[idx]} images were classified {idx} times')
    num_images += image_appearances[idx]

18156 images were classified
1012 unique images were classified

11 images were classified 11 times
165 images were classified 15 times
8016 images were classified 16 times
18 images were classified 18 times
266 images were classified 19 times
9680 images were classified 20 times


#### Keeping the images that were classified by all subjects (optional)

In [21]:
#df_labeling = df_labeling[df_labeling['image_appearances'] == 20]
df_labeling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18156 entries, 0 to 1011
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   userid             18156 non-null  int64 
 1   user               18156 non-null  object
 2   image_name         18156 non-null  object
 3   id_image           18156 non-null  object
 4   class              18156 non-null  object
 5   image_appearances  18156 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 992.9+ KB


### Analysing the [inter-annotator agreement](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html) on the results  

### $\kappa = (p_o - p_e) / (1 - p_e)$

#### where $p_o$ is the empirical probability of agreement on the label assigned to any sample (the observed agreement ratio), and $p_e$ is the expected agreement when both annotators assign labels randomly. $p_e$ is estimated using a per-annotator empirical prior over the class labels

In [22]:
l1 = df_labeling.userid.unique()
iter_users = itertools.product(l1,l1)
df_iaa = pd.DataFrame(index=l1, columns=l1)
df_iaa_styled = pd.DataFrame(index=l1, columns=l1)

In [23]:
for user1,user2 in iter_users:
    classesA = df_labeling.loc[(df_labeling.userid == user1),['id_image', 'class']]
    classesA.sort_values(by=['id_image'], inplace=True)

    classesB = df_labeling.loc[(df_labeling.userid == user2),['id_image', 'class']]
    classesB.sort_values(by=['id_image'], inplace=True)

    classesAB = pd.merge(classesA, classesB, on=['id_image'])
    #classesAB.drop_duplicates(subset='id_image', keep = 'first', inplace=True) 
    classesAB.drop('id_image', axis=1, inplace=True)
    classesAB.dropna(inplace=True)

    agreement = cks(classesAB['class_x'], classesAB['class_y'])
    df_iaa.loc[user1,user2] = f'{agreement:.3f}/({len(classesAB)})'
    df_iaa_styled.loc[user1,user2] = f'{agreement:.3f}'
    
df_iaa_styled = df_iaa_styled.apply(pd.to_numeric)
#print(tabulate(df_iaa, headers='keys', tablefmt='psql'))

In [24]:
cm = sns.light_palette("green", as_cmap=True)
cm = 'viridis'
cm = 'plasma'
#cm = 'magma'

s = df_iaa_styled.style.background_gradient(cmap=cm, vmin=0, vmax=1, axis=None)
display(s)
html = s.render()
imgkit.from_string(html, f'../data/outputs/styled_table_round3.png')

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
1,1.0,0.17,-0.02,-0.13,0.06,0.61,0.12,-0.01,-0.08,-0.31,0.04,0.08,-0.21,0.11,0.26,0.18,0.34,0.46,0.43,0.48
2,0.17,1.0,0.03,0.11,0.06,0.21,0.11,0.02,0.15,-0.0,0.07,0.07,0.1,0.03,0.14,0.32,0.23,0.26,0.38,0.27
3,-0.02,0.03,1.0,0.03,-0.02,-0.04,-0.01,-0.01,0.05,0.02,0.04,0.04,0.02,0.02,0.02,0.01,-0.04,-0.05,-0.01,-0.05
4,-0.13,0.11,0.03,1.0,0.03,-0.22,0.13,0.04,0.22,0.28,0.14,0.04,0.34,-0.02,-0.1,0.04,0.03,-0.06,-0.01,-0.07
5,0.06,0.06,-0.02,0.03,1.0,0.06,0.05,0.0,0.03,0.02,0.03,0.01,0.04,0.0,0.04,0.09,0.11,0.07,0.14,0.07
6,0.61,0.21,-0.04,-0.22,0.06,1.0,0.13,0.01,-0.17,-0.4,-0.03,0.1,-0.3,0.22,0.38,0.22,0.29,0.51,0.56,0.52
7,0.12,0.11,-0.01,0.13,0.05,0.13,1.0,-0.06,0.01,0.08,0.06,0.1,0.08,0.05,0.23,0.17,0.11,0.13,0.2,0.12
8,-0.01,0.02,-0.01,0.04,0.0,0.01,-0.06,1.0,0.08,0.02,-0.02,0.02,0.02,-0.05,-0.02,-0.02,0.02,-0.0,0.03,0.0
9,-0.08,0.15,0.05,0.22,0.03,-0.17,0.01,0.08,1.0,0.29,0.13,0.03,0.38,-0.12,-0.15,0.17,0.02,-0.06,0.02,-0.1
10,-0.31,-0.0,0.02,0.28,0.02,-0.4,0.08,0.02,0.29,1.0,0.06,-0.01,0.34,-0.12,-0.2,-0.06,-0.12,-0.25,-0.24,-0.3


Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


True

In [25]:
df_iaa.to_excel("../data/outputs/round3_results.xls")

### Analysis per gender and country/culture

In [17]:
df_labeling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18156 entries, 0 to 1011
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   userid             18156 non-null  int64 
 1   user               18156 non-null  object
 2   image_name         18156 non-null  object
 3   id_image           18156 non-null  object
 4   class              18156 non-null  object
 5   image_appearances  18156 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 992.9+ KB


In [18]:
df_labeling['round'] = 3
df_labeling.drop('image_appearances', inplace=True, axis=1)

In [19]:
df_labeling.to_hdf('../data/df_labeling.hdf', key='round3')