In [1]:
import json
import numpy as np
import pandas as pd

In [2]:
# location of the MTurk results CSV this notebook analyzes
mturk_csv_path = './data_files/mturk/moderation0.csv'

# load the CSV into the dataframe used by the report functions below
mturk_df = pd.read_csv(filepath_or_buffer=mturk_csv_path)

In [3]:
# calculate worker accuracy
def _accuracy_or_na(num_correct, num_seen):
    """Return num_correct / num_seen, or the string 'n/a' when num_seen is 0."""
    return num_correct / num_seen if num_seen else 'n/a'


def calc_accuracy(df=None, expert_labels_path='../image_processing/labels.json'):
    """Print worker accuracy measured against the expert labels.

    Each entry of the 'labels' column is a JSON object mapping an image key to
    either the string 'error' or a dict with 'category' and 'type' keys. Every
    non-error annotation is compared with the expert annotation stored under
    the same image key in the expert labels file.

    Three reports are printed: category accuracy, type accuracy and combined
    category+type accuracy, followed by the number of skipped annotations.
    Per-category / per-type lines are printed only when at least one
    annotation fell into that bucket; the "Overall" lines print 'n/a' when
    there was nothing to score (previously this raised ZeroDivisionError).

    Args:
        df: dataframe with a 'labels' column. Defaults to the notebook-level
            ``mturk_df`` when omitted.
        expert_labels_path: path of the JSON file holding the expert labels.

    Raises:
        KeyError: if a worker annotated an image that has no expert label, or
            an annotation lacks one of the expected keys.
    """
    if df is None:
        # fall back to the dataframe loaded at the top of the notebook
        df = mturk_df

    # get expert labels (the with-block closes the file on exit)
    with open(expert_labels_path) as handle:
        expert_labels = json.load(handle)

    num_error = 0
    num_other = 0

    # [correct, incorrect] tallies, bucketed by the EXPERT label.
    # Any expert category other than sex_nudity/graphic is tallied as 'safe',
    # any expert type other than 'realistic' is tallied as 'synthetic'.
    category_counts = {'sex_nudity': [0, 0], 'graphic': [0, 0], 'safe': [0, 0]}
    type_counts = {'realistic': [0, 0], 'synthetic': [0, 0]}

    # combined category + type tallies
    num_total_correct = 0
    num_total = 0

    # iterate through worker submissions
    for labels in df['labels']:
        worker_labels = json.loads(labels)

        # iterate through worker annotations for each image
        for image, annotation in worker_labels.items():
            # skip if no annotations for this image
            if annotation == 'error':
                num_error += 1
                continue

            worker_category = annotation['category']
            worker_type = annotation['type']
            worker_label_type = worker_category + '_' + worker_type

            expert_annotation = expert_labels[image]
            expert_category = expert_annotation['category']
            expert_type = expert_annotation['type']
            expert_label_type = expert_annotation['label_type']

            # type is scored for every non-error annotation, 'other' included
            type_bucket = expert_type if expert_type == 'realistic' else 'synthetic'
            type_counts[type_bucket][0 if worker_type == expert_type else 1] += 1

            # 'other' is excluded from category and combined scoring
            if worker_category == 'other':
                num_other += 1
                continue

            category_bucket = expert_category if expert_category in ('sex_nudity', 'graphic') else 'safe'
            category_counts[category_bucket][0 if worker_category == expert_category else 1] += 1

            num_total += 1
            if worker_label_type == expert_label_type:
                num_total_correct += 1

    # calculate and print results wrt. categories
    category_correct = sum(hits for hits, _ in category_counts.values())
    category_seen = sum(hits + misses for hits, misses in category_counts.values())
    print('Category accuracy:')
    print('\tOverall detection:\t{}'.format(_accuracy_or_na(category_correct, category_seen)))
    category_titles = (
        ('sex_nudity', 'Sex and nudity detection'),
        ('graphic', 'Graphic content detection'),
        ('safe', 'Safe content detection'),
    )
    for bucket, title in category_titles:
        hits, misses = category_counts[bucket]
        if hits + misses:
            print('\t{}:\t{}'.format(title, hits / (hits + misses)))
    print('\n')

    # calculate and print results wrt. types
    type_correct = sum(hits for hits, _ in type_counts.values())
    type_seen = sum(hits + misses for hits, misses in type_counts.values())
    print('Type accuracy:')
    print('\tOverall detection:\t{}'.format(_accuracy_or_na(type_correct, type_seen)))
    type_titles = (
        ('realistic', 'Realistic detection'),
        ('synthetic', 'Synthetic detection'),
    )
    for bucket, title in type_titles:
        hits, misses = type_counts[bucket]
        if hits + misses:
            print('\t{}:\t{}'.format(title, hits / (hits + misses)))
    print('\n')

    # calculate and print results wrt. both category and type
    print('Category and type accuracy:')
    print('\tOverall detection:\t{}'.format(_accuracy_or_na(num_total_correct, num_total)))
    print('\n')

    # print number of 'other' category annotations and number of 'errors' reported
    print('Number of images skipped:')
    print('\tNumber of errored images:\t\t{}'.format(num_error))
    print('\tNumber of "other" categorizations:\t{}'.format(num_other))

In [4]:
# calculate worker confidence statistics
def calc_confidence(df=None):
    """Print the mean and standard deviation of worker confidence values.

    Each entry of the 'labels' column is parsed as a JSON object whose values
    are either the string 'error' (skipped) or a dict with a 'confidence'
    entry. All confidences are pooled across submissions and converted to
    float64 before the statistics are computed.

    Args:
        df: dataframe with a 'labels' column. Defaults to the notebook-level
            ``mturk_df`` when omitted.

    Raises:
        KeyError: if a non-error annotation has no 'confidence' entry.
    """
    if df is None:
        # fall back to the dataframe loaded at the top of the notebook
        df = mturk_df

    # pool the confidence of every non-error annotation in every submission
    confidences = [
        annotation['confidence']
        for labels in df['labels']
        for annotation in json.loads(labels).values()
        if annotation != 'error'
    ]

    # convert list to np array
    confidences = np.asarray(confidences, dtype=np.float64)

    # calculate and print statistics
    print('Confidence statistics:')
    if confidences.size == 0:
        # nothing to average: report n/a instead of nan plus RuntimeWarnings
        print('\tAverage:\t{}'.format('n/a'))
        print('\tStandard dev:\t{}'.format('n/a'))
        return
    print('\tAverage:\t{}'.format(np.mean(confidences)))
    print('\tStandard dev:\t{}'.format(np.std(confidences)))

In [5]:
# calculate worker behavioral statistics
def calc_behavioral(df=None):
    """Print mean and standard deviation of the behavioral columns.

    Reports, in order, the 'clicks_total', 'mousemoves_total' and
    'completion_time' columns. Sections are separated by the output of
    ``print('\\n')``; nothing follows the last section.

    Args:
        df: dataframe holding the three columns above. Defaults to the
            notebook-level ``mturk_df`` when omitted.

    Raises:
        KeyError: if one of the columns is missing (sections for earlier
            columns have already been printed by then).
    """
    if df is None:
        # fall back to the dataframe loaded at the top of the notebook
        df = mturk_df

    sections = (
        ('Clicks total:', 'clicks_total'),
        ('Mousemoves total:', 'mousemoves_total'),
        ('Completion time:', 'completion_time'),
    )

    for position, (heading, column) in enumerate(sections):
        # separator goes between sections only, never after the last one
        if position:
            print('\n')

        values = df[column]
        print(heading)
        # np.mean / np.std are applied to the pandas Series itself, as before
        print('\tAverage:\t{}'.format(np.mean(values)))
        print('\tStandard dev:\t{}'.format(np.std(values)))

In [6]:
# print the accuracy report for the loaded MTurk submissions
calc_accuracy()

Category accuracy:
	Overall detection:	0.9158878504672897
	Safe content detection:	0.9158878504672897


Type accuracy:
	Overall detection:	0.8909090909090909
	Realistic detection:	0.975
	Synthetic detection:	0.8428571428571429


Category and type accuracy:
	Overall detection:	0.8130841121495327


Number of images skipped:
	Number of errored images:		0
	Number of "other" categorizations:	3


In [7]:
# print the confidence statistics for the loaded MTurk submissions
calc_confidence()

Confidence statistics:
	Average:	4.6909090909090905
	Standard dev:	0.4813891743590446


In [8]:
# print the click / mousemove / completion-time statistics for the loaded MTurk submissions
calc_behavioral()

Clicks total:
	Average:	85.54545454545455
	Standard dev:	17.132444692950994


Mousemoves total:
	Average:	92.0
	Standard dev:	35.009089728759925


Completion time:
	Average:	181343.63636363635
	Standard dev:	57932.5320418231
