In [67]:
import glob
import json
import os
import pickle

import jsonlines
import numpy as np

In [92]:
datasets = ['ffhq', 'stylegan2', 'celeba-hq', 'transects']
dataset = datasets[3] #stylegan2-latents' # or celeba-hq or ffhq or transect
BASE_DIR = './datasets/%s/' % dataset
ANNOTATIONS_DIR = BASE_DIR + 'sagemaker/'
LABELS_FILE = '/annotation-tool/data.json'
OUT_MANIFEST_FILE = '/manifests/output/output.manifest'
WORKERS_RESPONSE = '/annotations/worker-response'

In [93]:
class annotationDatabase():
    
    def __init__(self,annotations_file_names, labelScores):
        '''Parameters:
        annotations_file_names - list of names of jupyter files where the annotations are saved.
        Each file refers to one image.'''
        self.annotators = {}           # dictionary that maps the unique ID strings of annotators to local integer IDs
        self.annotator_names = []      # list of the unique string identifiers that AMT uses sorted by integer ID
        self.annotations = []          # list of the number of annotations per annotator
        self.N_ANNOTATORS = len(self.annotators)
        self.N_IMAGES = len(annotations_file_names)
        self.imageScores = []          # scores this image received -- list of lists
        self.imageAnnotators = []      # annotators who worked on a given image -- list of lists
        
        #
        # for each file extract all the useful annotations
        #
        for i, fname in enumerate(annotations_file_names):
            with open(fname, 'r') as read_file:
                data = json.load(read_file)                  # open the json file and read its contents into data
                scores = []                                  # initialize the two lists of scores and IDs for the current image
                annotatorIDs = []
                for a,ans in enumerate(data['answers']):     # read each annotator's annotation
                    ID = self.addAnnotation(ans['workerId']) # mark annotation and retrieve ID of annotator
                    annotatorIDs.append(ID)                  # take note of which annotator it was
                    label = data['answers'][a]['answerContent']['crowd-image-classifier']['label']
                    scores.append(labelScores[label]) # transform label into score
                    
                self.imageScores.append(scores)
                self.imageAnnotators.append(annotatorIDs)
        
        print(f'Found {self.N_IMAGES} images and {self.N_ANNOTATORS} annotators.')
        
    def addAnnotation(self,annotator_name):
        '''Keep track of the annotations and of the annotators'''
        try:
            ID = self.annotators[annotator_name] # the annotator was found, here is her ID
            self.annotations[ID] += 1 # chalk up one more annotation for this annotator
        except: # the annotator was not on the list
            ID = self.N_ANNOTATORS # create a new ID
            self.annotators[annotator_name] = ID
            self.N_ANNOTATORS +=1
            self.annotations.append(1) # add a count of one for the last annotator
            self.annotator_names.append(annotator_name)
        return ID

In [94]:
# Get list of attributes in annotation dir. Alternatively, specify ones you care about.
attributes = [os.path.split(f)[1] for f in glob.glob(os.path.join(ANNOTATIONS_DIR, '*'))]
attributes.sort()

all_scores = []
all_labels = []

for attribute_label in attributes:
    
    print(attribute_label)
    
    ANNOTATIONS_PATH = f'{ANNOTATIONS_DIR}/{attribute_label}{WORKERS_RESPONSE}'
    OUT_MANIFEST_PATH = f'{ANNOTATIONS_DIR}/{attribute_label}{OUT_MANIFEST_FILE}'
    LABELS_PATH = f'{ANNOTATIONS_DIR}/{attribute_label}{LABELS_FILE}'

    #Read labels using annotation-tool/data.json and assign integers to labels 
    with open(LABELS_PATH, 'r') as labels_file:
        labels_data = json.load(labels_file)['labels']
        LABELS = [l['label'] for l in labels_data]
    labelScores = {l:i for (i, l) in enumerate(LABELS)} # generate numerical scores for the labels - useful in regression
    all_labels.append(LABELS)
    
    # Make map from annotation index to image index
    idx_map = []
    with jsonlines.open(OUT_MANIFEST_PATH) as reader:
        for obj in reader:
            _, name = os.path.split(obj['source-ref']) #remove leading path
            idx = name.split('.')[0]
            idx_map.append(idx)

    # Note: This loop assumes alphanumeric order = numeric order, e.g., file names are 0000.jpg, 0001.jpg, etc.)
    annotation_file_names = []
    for i in range(len(idx_map)):
        annotation_file_names += glob.glob(ANNOTATIONS_PATH + '/*/%d/*.json'% i)
    
    # put together the database of the annotator IDs and their work
    annotations = annotationDatabase(annotation_file_names, labelScores)
    all_scores.append(np.array(annotations.imageScores))
    
# Save
dic = {'attributes': attributes, 'responses': all_scores, 'attribute_levels': all_labels}
pickle.dump(dic, open(BASE_DIR + 'annotations.pkl', 'wb'))
output.close()

age
Found 8000 images and 590 annotators.
facial-hair
Found 8000 images and 480 annotators.
gender
Found 8000 images and 559 annotators.
hair-length
Found 8000 images and 514 annotators.
makeup
Found 8000 images and 484 annotators.
skin-color
Found 8000 images and 489 annotators.
smile
Found 8000 images and 534 annotators.
uncanny
Found 8000 images and 587 annotators.
