In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import json
from os.path import exists
from quality_match.types import TaskInput, TaskOutput, Answer


# Input files, relative to the notebook's working directory.
REFERENCES_DATA_PATH = '../data/references.json'
ANONYMIZED_PROJECT_DATA_PATH = '../data/anonymized_project.json'

def get_answer_disagree_score(task_output: TaskOutput) -> int:
    """Translate the answer of a task output into a signed vote.

    ``Answer.YES`` maps to 1, ``Answer.NO`` to -1 and ``Answer.EMPTY`` to 0.
    Any other answer value raises ``KeyError``.
    """
    score_by_answer = {Answer.YES: 1, Answer.NO: -1, Answer.EMPTY: 0}
    given_answer = task_output.answer
    return score_by_answer[given_answer]

def get_reference_key(task_input: TaskInput) -> str:
    """Return the reference key encoded in the task's image URL.

    The key is the last ``/``-separated segment of ``image_url``, cut off at
    its first ``.`` (so ``a/b/img_1.jpg`` yields ``img_1``).
    """
    file_name = task_input.image_url.rpartition("/")[2]
    key, _, _ = file_name.partition(".")
    return key

def load_normalized_data():
    """Load the reference labels and the flattened annotation records.

    Reads ``REFERENCES_DATA_PATH`` (transposed after loading) and
    ``ANONYMIZED_PROJECT_DATA_PATH``. Every annotation record found under
    ``results -> root_node -> results -> <task_id> -> results`` becomes one
    row, enriched with:

    * ``task_id`` - key of the task the record was listed under,
    * ``answer_disagree_score`` - result of ``get_answer_disagree_score``,
    * ``reference`` - the ``is_bicycle`` value looked up in the references
      by ``get_reference_key``.

    Negative ``task_output.duration_ms`` values are clamped to 0 and the
    column is stored as ``int64``.

    Returns:
        tuple: ``(data, references)`` - the normalized annotation frame and
        the references frame.

    Raises:
        FileNotFoundError: if either input file does not exist.
        KeyError: if a record's reference key is not present in the references.
    """
    missing_paths = [
        path
        for path in (REFERENCES_DATA_PATH, ANONYMIZED_PROJECT_DATA_PATH)
        if not exists(path)
    ]
    if missing_paths:
        # FileNotFoundError is still an Exception, so existing handlers keep working.
        raise FileNotFoundError(f"Data files not found: {', '.join(missing_paths)}")

    references = pd.read_json(REFERENCES_DATA_PATH).T

    # Context manager so the file handle is closed even if parsing fails.
    with open(ANONYMIZED_PROJECT_DATA_PATH, encoding='utf-8') as project_file:
        anonymized_project_json_file = json.load(project_file)
    anonymized_project_data = anonymized_project_json_file['results']['root_node']['results']

    records = []
    for task_id, tasks in anonymized_project_data.items():
        for record in tasks['results']:
            record['task_id'] = task_id
            record['answer_disagree_score'] = get_answer_disagree_score(
                TaskOutput.from_dict(record['task_output'])
            )
            ref_key = get_reference_key(TaskInput.from_dict(record['task_input']))
            record['reference'] = references.loc[ref_key, 'is_bicycle']
            records.append(record)

    data = pd.json_normalize(records)

    # Negative durations cannot be real measurements; clamp them to zero.
    data.loc[data['task_output.duration_ms'] < 0, 'task_output.duration_ms'] = 0
    # astype returns a new frame; the result must be assigned to take effect.
    # NOTE(review): this raises if any duration is missing (NaN) - confirm the
    # export always carries a duration.
    data = data.astype({'task_output.duration_ms': 'int64'})

    return data, references

# Build the working frames once; the cells below read `data` and `references`.
data, references = load_normalized_data()
# Bare last expression so the notebook renders the frame.
data

### Task 1
a. How many annotators contributed to the dataset?

In [None]:
# Number of distinct annotator ids; dropna=False counts a missing id as its
# own value, exactly as len(unique()) would.
data['user.id'].nunique(dropna=False)

b. What are the average, minimum, and maximum annotation times (durations)?
Feel free to add visual representations here such as graphs if you like.

In [None]:
# Summary statistics of the annotation durations, rendered with two decimals.
duration_stats = data['task_output.duration_ms'].describe()
duration_stats.map("{0:.2f}".format)

In [None]:
# Draw the box plot from the actual durations. Plotting a hand-copied table of
# summary statistics (which also contained the row count) would describe the
# spread of those eight numbers, not of the annotation times, and would go
# stale whenever the data changes.
fig, ax = plt.subplots(figsize=(10, 6))
duration_plot = data.boxplot(
    column='task_output.duration_ms',
    vert=False,
    meanline=True,   # draw the mean as a line across the box ...
    showmeans=True,  # ... in addition to the median
    ax=ax,
)
ax.set_title("Annotation duration distribution")
ax.set_xlabel("Duration (ms)")
plt.show()

c. Did all annotators produce the same amount of results, or are there differences? 

In [None]:
# Number of results per annotator, most active annotator first.
annotations_per_user = data.groupby("user.id").agg(count=('user.id', 'count'))
annotations_per_user.sort_values('count', ascending=False).reset_index()

d. Are there questions for which annotators highly disagree?

In [None]:
# Sum the signed votes (+1 yes, -1 no, 0 empty) per image; a total of zero
# means the yes and no votes cancelled each other out.
d1 = (
    data.groupby("task_input.image_url")
    .agg(sum=('answer_disagree_score', 'sum'))
    .reset_index()
)
votes_cancel_out = d1['sum'] == 0
d1.loc[votes_cancel_out, 'task_input.image_url'].reset_index()

### Task 2


a.1. How often does 'corrupt_data' occur in the project, and do you see a trend among the annotators who made use of this option?

In [None]:
# Annotations flagged as corrupt data, counted per annotator (highest first).
corrupt_mask = data['task_output.corrupt_data'] == True
c1 = data[corrupt_mask]
(
    c1.groupby("user.id")
    .agg(count=('user.id', 'count'))
    .sort_values('count', ascending=False)
    .reset_index()
)

a.2. How often does 'cant_solve' occur in the project, and do you see a trend among the annotators who made use of this option?

In [None]:
# Annotations flagged as "can't solve", counted per annotator (highest first).
cant_solve_mask = data['task_output.cant_solve'] == True
s1 = data[cant_solve_mask]
(
    s1.groupby("user.id")
    .agg(count=('user.id', 'count'))
    .sort_values('count', ascending=False)
    .reset_index()
)


### Task 3

Is the reference set balanced? Please demonstrate via numbers and visualizations.

In [None]:
# How many reference entries carry each `is_bicycle` value.
is_bicycle_labels = references['is_bicycle']
counts = is_bicycle_labels.value_counts()
counts

In [None]:
# Bar chart of the reference label counts, one bar per `is_bicycle` value.
fig, ax = plt.subplots()
ax.bar(counts.index.astype(str), counts.values)
ax.set_title('Distribution of is_bicycle Column')
ax.set_xlabel('is_bicycle')
ax.set_ylabel('Count')
plt.show()

### Task 4

Using the reference set, can you identify good and bad annotators? Please use statistics and visualizations. Feel free to get creative.

To identify good and bad annotators, we can use the following approach:
1. Count each annotator's correct and incorrect answers against the reference set and derive their accuracy and error rates.
2. Determine a threshold for what constitutes a good annotator versus a bad annotator.
3. Use the threshold to classify each annotator as either good or bad.
4. Validate the classification.

In [None]:
def check_answer(rec):
    """Return True when the annotator's answer matches the reference label.

    Args:
        rec: mapping-like record (e.g. a DataFrame row) providing
            ``'task_output.answer'`` and ``'reference'``.

    Returns:
        bool: True for answer ``'yes'`` with a true reference or answer
        ``'no'`` with a false reference; False for every other answer and
        for references that are not boolean.
    """
    answer = rec['task_output.answer']
    if answer == 'yes':
        expected = True
    elif answer == 'no':
        expected = False
    else:
        # Empty or unexpected answers never count as correct.
        return False

    reference = rec['reference']
    # Compare by value: an identity test (`is True`) fails for numpy booleans,
    # which is what pandas hands out for bool columns in many code paths.
    if not pd.api.types.is_bool(reference):
        return False
    return bool(reference) == expected


def _count_per_user(annotations):
    """Count the rows per `user.id`, largest count first, as a flat frame."""
    return (
        annotations.groupby('user.id')
        .agg(count=('user.id', 'count'))
        .sort_values('count', ascending=False)
        .reset_index()
    )


# Evaluate every annotation once. Anything that is not a correct yes/no
# answer (including empty answers) counts as a bad annotation.
is_correct = data.apply(check_answer, axis=1).astype(bool)

bad_annotations = _count_per_user(data[~is_correct])
good_annotations = _count_per_user(data[is_correct])
users = _count_per_user(data)

# Join the counts on `user.id`. The three frames are each sorted by their own
# count, so assigning the `count` columns directly would align them by row
# position and attribute the numbers to the wrong annotators. Annotators
# without any good (or bad) annotation get an explicit 0.
bad_count_by_user = bad_annotations.set_index('user.id')['count']
good_count_by_user = good_annotations.set_index('user.id')['count']
users['bad_annotation_count'] = users['user.id'].map(bad_count_by_user).fillna(0).astype('int64')
users['good_annotation_count'] = users['user.id'].map(good_count_by_user).fillna(0).astype('int64')

users['accuracy_rate'] = users['good_annotation_count'] / users['count']
users['error_rate'] = users['bad_annotation_count'] / users['count']
sorted_accuracy = users.sort_values("accuracy_rate", ascending=False)
sorted_accuracy


Defining the threshold depends on the specific requirements of the project and the nature of the data.
Here we set the threshold at 0.9 to ensure that annotators classified as good are at least 90% accurate.

In [None]:
threshold = 0.9

# Classify on accuracy alone with complementary conditions. Testing
# `accuracy >= threshold` and `error >= 1 - threshold` separately overlaps at
# the boundary, so an annotator at exactly the threshold would be labelled
# good and then overwritten as bad.
accuracy = users['good_annotation_count'] / users['count']
users["annotation_quality"] = "unknown"  # stays only where accuracy is undefined (NaN)
users.loc[accuracy >= threshold, 'annotation_quality'] = 'good'
users.loc[accuracy < threshold, 'annotation_quality'] = 'bad'
users

In [None]:
# 'unknown' is the initial label assigned before classification; give it a
# colour too so the lookup cannot fail on an unclassified annotator.
colors = {'good': 'green', 'bad': 'red', 'unknown': 'gray'}
point_colors = users['annotation_quality'].map(colors).fillna('gray')

# One point per annotator: x is the share of correct answers, y the row
# position of the annotator in `users`.
fig, ax = plt.subplots()
ax.scatter(users['good_annotation_count'] / users['count'], users.index, c=point_colors)
ax.set_title('Annotation Quality')
ax.set_xlabel('Proportion of Correct Answers')
ax.set_ylabel('Annotator')
plt.show()