In [1]:
'''
Analyze elicited soft label information 
'''

import pandas as pd
import numpy as np
import os 
import itertools
import json 
import ast
import importlib 
import label_construction_utils as utils

# directory that outputs of this notebook are meant to be saved in
save_dir = "./"

# exist_ok=True creates the directory only when it is missing, without a separate
# isdir check that could go stale between the check and the creation
os.makedirs(save_dir, exist_ok=True)

# keys of the processed-response dict that hold the selected classes
most_prob_class_txt = "Most Probable Class"
second_prob_class_txt = "Second Most Probable Class" 
imposs_txt = "Impossible Class(es)"

# keys of the processed-response dict that hold the probabilities given to the two selected classes
most_prob_txt = f"{most_prob_class_txt} Prob"
second_prob_txt = f"{second_prob_class_txt} Prob"

# value of the second most probable class selection for which no second probability is kept
none_option = "No"

def participant_completed(subj_df): 
    '''
    Tell whether a participant got to the end of the study
    The closing survey page is the only trial of type "survey-text",
    so having at least one such row means the final page was reached
    Return True if completed, else False
    '''
    survey_text_rows = subj_df.loc[subj_df.trial_type == "survey-text"]
    return len(survey_text_rows) > 0


def process_response(data_entry, subj_id=None): 
    '''
    Parse one raw html-form response into the elicited label information

    data_entry: string holding a python dict literal, as saved from the html inputs
    subj_id: accepted so callers can pass the annotator id; not used by the parsing itself
    Return a dict with keys most_prob_class_txt, most_prob_txt, second_prob_class_txt,
    second_prob_txt (None when no second probability was given) and imposs_txt
    (list of class names, in the order their entries appear in the response)
    '''
    # note: messy output format was used for the html inputs to avoid data saving issues
    # extract dict directly from: https://stackoverflow.com/questions/988228/convert-a-string-representation-of-a-dictionary-to-a-dictionary
    data_entry = ast.literal_eval(data_entry)

    # the selected class is stored after a "mostProb" tag (tag needed to avoid data overlap on saving)
    most_prob_class = data_entry["classSelect"].split("mostProb")[-1]

    # note: two errors came up in prob specification
    # one person typed 8p, which we assume is 80
    # a few annotators selected a most prob class, but wrote 0 for prob
    # we convert this to 100, as any such annotator only did this once (we believe it is an annotation error)
    # however, we note that we make these judgments in processing the annotations
    # which are inherently noisy
    raw_prob = data_entry["prob"]
    if raw_prob == "8p": most_prob = 80 # manually adjust
    else: most_prob = float(raw_prob)
    if most_prob == 0: most_prob = 100 

    # a second probability is only kept when a second class was actually picked
    # and the prob2 field is present and non-empty
    second_most_prob_class = data_entry["classSelect2"].split("secondProb")[-1]
    raw_second_prob = data_entry.get("prob2", "")
    if second_most_prob_class != none_option and raw_second_prob != "":
        second_prob = float(raw_second_prob)
    else:
        second_prob = None

    # all imposs classes selected had "improbClassSelect" as the starter tag, with class name after
    # iterate the dict itself (keys are already unique): going through a set would make the
    # order of the returned list depend on string hashing and differ between runs
    save_tag = "improbClassSelect"
    selected_imposs_classes = [save_txt.split(save_tag)[-1] for save_txt in data_entry if save_tag in save_txt]

    return {most_prob_class_txt: most_prob_class, most_prob_txt: most_prob,
            second_prob_class_txt: second_most_prob_class, second_prob_txt: second_prob,
            imposs_txt: selected_imposs_classes} 

def annotator_accuracy(subj_df, use_top_2=False): 
    '''
    Fraction of one annotator's responses that agree with the cifar10 "gold" label
    The gold label is read from the filename: its last "_"-separated field before ".png"
    A response counts when the most probable class matches; with use_top_2 it also
    counts when the second most probable class matches
    '''
    subj_id = subj_df[id_col].iloc[0]
    num_correct = 0
    for raw_response, filename in zip(subj_df.response, subj_df.filename):
        parsed = process_response(raw_response, subj_id)
        # after dropping the first field, the filename stem must hold exactly three parts
        _eval_set, _example_idx, cifar_label = filename.split(".png")[0].split("_")[1:]
        top1_hit = cifar_label == parsed[most_prob_class_txt]
        if top1_hit or (use_top_2 and cifar_label == parsed[second_prob_class_txt]):
            num_correct += 1
    return num_correct / len(subj_df.response)
    
def clean_axes(ax):
    # strip the tick marks from both axes of ax for display
    for set_ticks in (ax.set_yticks, ax.set_xticks):
        set_ticks([])
    
# note, classes are kept in alphabetical order, as per cifar-10h index matching
# (position in the list is the class index)
class_names = [
    'Airplane', 'Automobile', 'Bird', 'Cat', 'Deer',
    'Dog', 'Frog', 'Horse', 'Ship', 'Truck',
]
num_classes = len(class_names)
# lookups in both directions between class name and class index
idx2class = dict(enumerate(class_names))
class2idx = {class_name: idx for idx, class_name in idx2class.items()}

# column holding the participant identifier of each row
id_col = "subject"

data_path = "./raw_human_data.csv"

df = pd.read_csv(data_path)

'''
Filter to just include main data 
'''
is_main_trial = df.trial_type == 'survey-html-form'
response_df = df[is_main_trial].reset_index()

# store a mapping from image id to batch number (helping for figuring out which batches to investigate or run extra)
# setdefault keeps the batch of the first row seen for each image id
image_id2batch = {}
for image_id, batch_num in zip(response_df.img_id, response_df.condition): 
    image_id2batch.setdefault(int(image_id), batch_num)

# one entry per distinct participant among the main-data rows
subj_ids = set(response_df[id_col])

print(f"Num participants: {len(subj_ids)}\n")

Num participants: 248



In [None]:
'''
Extract data from each annotator
Save in a format amenable to label creation 
'''

filenames = set(response_df.filename)
example_idxs = set([int(idx) for idx in set(response_df.img_id)])

# the example index is the third "_"-separated field of the filename
example_idx2filename = {int(filename.split("_")[2]): filename for filename in filenames}

# save all elicitation data per sample (keys are the int example indices)
all_elicitation_per_example = {example_idx: [] for example_idx in example_idxs}  

# every probability that was elicited (most probable, plus second most probable when given)
all_probs = []
# subj_ids is a set, so its iteration order can change between kernel runs;
# go through the ids in a fixed order so the saved annotation lists are reproducible
for subj_id in sorted(subj_ids, key=str): 
    subj_df = response_df.loc[response_df[id_col] == subj_id]
    for (data_entry, filename, example_idx) in zip(subj_df.response, subj_df.filename, subj_df.img_id):
                
        elicited_data = process_response(data_entry, subj_id)
        
        # cast like the dict keys above, so the lookup does not depend on the raw img_id dtype
        all_elicitation_per_example[int(example_idx)].append(elicited_data)
        
        most_prob = elicited_data[most_prob_txt]
        second_prob = elicited_data[second_prob_txt]
        all_probs.append(most_prob)
        if second_prob is not None: all_probs.append(second_prob)


# write into the configured output directory (json turns the int keys into strings)
with open(os.path.join(save_dir, "human_soft_labels_data.json"), "w") as f:
    json.dump(all_elicitation_per_example, f)

In [None]:
'''
Some additional analyses and stats on elicited info
'''

# Compute annotator accuracy against CIFAR-10 labels
# one value per participant; the printed numbers are means over participants
accs = []
top2_accs = []
for subj_id in subj_ids: 
    subj_df = response_df.loc[response_df[id_col] == subj_id]
    acc = annotator_accuracy(subj_df)
    acc2 = annotator_accuracy(subj_df, use_top_2 = True)
    accs.append(acc)
    top2_accs.append(acc2)
    
# scale to percent first and round last: rounding the fraction and then multiplying by 100
# can bring float noise back into the printed value, and both lines now share one precision
print(f"Accuracy of annotators' Top 1 Most Prob pred: {round(np.mean(accs)*100, 3)}%")
print(f"Accuracy of annotators' combined Top 1 and Top 2 Most prob preds: {round(np.mean(top2_accs)*100, 3)}%")

# Compute elicitation time 
# rt is divided by 1000 to report seconds (presumably recorded in milliseconds; confirm against the data source)
sec_per_img = np.mean(response_df.rt) / (1000)
print(f"Avg seconds per image: {round(sec_per_img,2)} sec")
med_sec_per_img = np.median(response_df.rt) / (1000)
print(f"Median seconds per image: {round(med_sec_per_img,2)} sec")
