In [1]:
import os
import re
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.image import imread
import seaborn as sns
import pandas as pd
from sklearn.metrics import cohen_kappa_score as cks
from tabulate import tabulate

import imgkit
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all"

pd.options.display.float_format = '{:,.3f}'.format
pd.set_option('precision', 2)

### Building the dataframe with the results

#### Reading original files

In [2]:
datapath = '../data/img_labeling_3rd_round/'

In [3]:
label_files = sorted(os.listdir(datapath))
label_files = [f for f in label_files if 'csv' in f and "_" in f]
print(f"{len(label_files)} .csv files in the folder\n")
#for f in label_files:
#    print(f)

47 .csv files in the folder



#### Extracting subjects

In [4]:
subjects = {x[:x.find("_")] for x in label_files}
print(f"{len(subjects)} diferent subjects\n")
#for s in sorted(subjects):
#    print(s)

20 diferent subjects



#### Consolidating files per user

In [5]:
consolidated_files = []
for subject in subjects:
    subject_files = [os.path.join(datapath, f) for f in label_files if f.startswith(subject)]
    consolidated_file = os.path.join(datapath, subject + ".csv")
    consolidated_files.append(consolidated_file)
    with open(consolidated_file, "w") as fw:
        for s in subject_files:
            with open(s, "r") as fr:
                lines = fr.read()
                lines = lines.replace('"','')
                lines = lines.replace("[Appealing,Non-appealing]", "[Non-appealing]")   ## Choosing the last chosen class for ambiguous
                lines = lines.replace("[Non-appealing,Appealing]", "[Appealing]")
                fw.write(lines+"\n");

#### Reading CSV files into Dataframes

In [6]:
list_dfs = []
for idx, consolidated_file in enumerate(consolidated_files):
    user = re.findall("|".join(subjects), consolidated_file)[0]
    userid = idx + 1
    df = pd.read_csv(consolidated_file, names=['image_name', 'class','w','h'])
    print(f"User:{userid}, hash: \t {user} tagged {len(df)} images")
    df['user'] = user
    df['userid'] = userid
    df.drop(['w','h'], axis=1, inplace=True)
    df.drop_duplicates(inplace=True)
    list_dfs.append(df)

User:1, hash: 	 245DC3767EBDBF721824F2F09B4251D5E0063494 tagged 1012 images
User:2, hash: 	 F9CC3D73B8F36057257097AFFE29B884000BABB5 tagged 1011 images
User:3, hash: 	 867D8B4F797167AA1CA27E00F76B51C63A7278ED tagged 499 images
User:4, hash: 	 A6241D7982404859C8D816D8C4A61DC506145A50 tagged 1011 images
User:5, hash: 	 C003DF0C2EC67886EA6BC835682E3182A2126135 tagged 1009 images
User:6, hash: 	 C6C77B9A8025D969F0E14BEBBEECB8B82BAAF284 tagged 1011 images
User:7, hash: 	 43166EAC7DE5C6D48FF646B1B12BEB46AC7CA375 tagged 1008 images
User:8, hash: 	 B59DE8F9A0D4C9AADABAFF7EA6A4F61BF89861DA tagged 1012 images
User:9, hash: 	 BE887E16DEB41D06FACD481B5BFE3AF6FDF6F8CB tagged 1012 images
User:10, hash: 	 38205FF03AFF19E008FA123615336213A0ED33AB tagged 1010 images
User:11, hash: 	 3AB6D9AAC2244ECC4F595743DF6EB54686DE31A0 tagged 1012 images
User:12, hash: 	 C733B58CB8AE244885A97E86B09D896A5633674C tagged 1010 images
User:13, hash: 	 C91C4E40D11000EB5E5BE0EDBE12C74A54FAA40B tagged 500 images
User:14, h

#### Concatenating Dataframes

In [7]:
df_labeling = pd.concat(list_dfs)
df_labeling['id_image'] = df_labeling['image_name'].apply(lambda x:x[:-4])
df_labeling = df_labeling[['userid','user', 'image_name', 'id_image', 'class']]
df_labeling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18156 entries, 0 to 499
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   userid      18156 non-null  int64 
 1   user        18156 non-null  object
 2   image_name  18156 non-null  object
 3   id_image    18156 non-null  object
 4   class       18156 non-null  object
dtypes: int64(1), object(4)
memory usage: 851.1+ KB


#### Fixing classes names

In [8]:
print(df_labeling['class'].unique())
df_labeling["class"] = df_labeling["class"].str.strip("[] ")
df_labeling["class"].replace({'Npn-appealing':"Non-appealing"}, inplace=True)
df_labeling['class'].value_counts()

['[Non-appealing]' '[Appealing]' '[Appealing]    ' '[Npn-appealing]']


Non-appealing    11076
Appealing         7080
Name: class, dtype: int64

In [9]:
df_labeling.head()

Unnamed: 0,userid,user,image_name,id_image,class
0,1,245DC3767EBDBF721824F2F09B4251D5E0063494,1.jpg,1,Non-appealing
1,1,245DC3767EBDBF721824F2F09B4251D5E0063494,2.jpg,2,Appealing
2,1,245DC3767EBDBF721824F2F09B4251D5E0063494,3.jpg,3,Non-appealing
3,1,245DC3767EBDBF721824F2F09B4251D5E0063494,4.jpg,4,Non-appealing
4,1,245DC3767EBDBF721824F2F09B4251D5E0063494,5.jpg,5,Non-appealing


### Adding users' data

In [10]:
profiles = pd.read_csv(os.path.join(datapath, 'ChIA-CulturalQuestionnaire-results-raw data.csv'), 
                       names=["timestamp", "age", "gender", "country", "discard1", "culturalBack", "culture", "hash", "discard2"])
profiles.drop(["timestamp", "discard1", "discard2"], axis=1, inplace=True)
profiles.drop([0,1], axis=0, inplace=True)

profiles["age"] = profiles.age.str.replace("-",",").str.strip("+")
profiles["age"] = profiles.age.apply(lambda x:eval(f"[{x}]"))
profiles["age"] = [np.mean(np.array(x)).astype(int) for x in profiles.age]

profiles.head(20)

FileNotFoundError: [Errno 2] No such file or directory: '../data/img_labeling_3rd_round/ChIA-CulturalQuestionnaire-results-raw data.csv'

In [None]:
df = df_labeling.join(profiles.set_index('hash'), on='user')

#### Users with no profiles

In [None]:
df[df.gender.isnull()].user.unique()

#### Users with profiles

In [None]:
df[df.gender.notnull()].user.unique()

#### Checking images classified

In [None]:
df_labeling['image_appearances'] = df_labeling.groupby('id_image')['id_image'].transform('count')

In [None]:
print(f"{len(df_labeling['image_name'])} images were classified")
print(f"{len(df_labeling['image_name'].unique())} unique images were classified\n")
num_images = 0
image_appearances = df_labeling.image_appearances.value_counts().sort_index(ascending=True)
for idx in image_appearances.index:
    print(f'{image_appearances[idx]} images were classified {idx} times')
    num_images += image_appearances[idx]

#### Keeping the images that were classified by all subjects (optional)

In [None]:
#df_labeling = df_labeling[df_labeling['image_appearances'] == 20]
df_labeling.info()

### Analysing the [inter-annotator agreement](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html) on the results  

### $\kappa = (p_o - p_e) / (1 - p_e)$

#### where $p_o$ is the empirical probability of agreement on the label assigned to any sample (the observed agreement ratio), and $p_e$ is the expected agreement when both annotators assign labels randomly. $p_e$ is estimated using a per-annotator empirical prior over the class labels

In [None]:
l1 = df_labeling.userid.unique()
iter_users = itertools.product(l1,l1)
df_iaa = pd.DataFrame(index=l1, columns=l1)
df_iaa_styled = pd.DataFrame(index=l1, columns=l1)

In [None]:
total_agreement = 0
for user1,user2 in iter_users:
    classesA = df_labeling.loc[(df_labeling.userid == user1),['id_image', 'class']]
    classesA.sort_values(by=['id_image'], inplace=True)

    classesB = df_labeling.loc[(df_labeling.userid == user2),['id_image', 'class']]
    classesB.sort_values(by=['id_image'], inplace=True)

    classesAB = pd.merge(classesA, classesB, on=['id_image'])
    #classesAB.drop_duplicates(subset='id_image', keep = 'first', inplace=True) 
    classesAB.drop('id_image', axis=1, inplace=True)
    classesAB.dropna(inplace=True)

    agreement = cks(classesAB['class_x'], classesAB['class_y'])
    total_agreement += agreement
    df_iaa.loc[user1,user2] = f'{agreement:.3f}/({len(classesAB)})'
    df_iaa_styled.loc[user1,user2] = f'{agreement:.3f}'
    
df_iaa_styled = df_iaa_styled.apply(pd.to_numeric)
print(tabulate(df_iaa, headers='keys', tablefmt='psql'))
print(f'\nThe average agreement was {total_agreement/(len(l1)**2):.3f}\n\n')

In [None]:
cm = sns.light_palette("green", as_cmap=True)
cm = 'viridis'
cm = 'plasma'
#cm = 'magma'

s = df_iaa_styled.style.background_gradient(cmap=cm, vmin=0, vmax=1, axis=None)
display(s)
html = s.render()
imgkit.from_string(html, f'../data/outputs/styled_table_round3.png')

In [None]:
df_iaa.to_excel("../data/outputs/round3_results.xls")

### Analysis per gender and country/culture

In [None]:
df_labeling.info()

In [None]:
df_labeling['round'] = 3
df_labeling.drop('image_appearances', inplace=True, axis=1)

In [None]:
#df_labeling.to_hdf('../data/df_labeling.hdf', key='round3')