In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Input files, given as paths relative to the current working directory.
annotations_filepath ="anonymized_project.json"  # read with pd.read_json into annotations_read
references_filepath = "references.json"  # read with pd.read_json into references_df

## Read annotations json file 

In [None]:
# Load the raw annotation export; the nested payload is unwrapped step by step in the cells below.
annotations_read = pd.read_json(annotations_filepath)

In [None]:
annotations_read.head()

In [None]:
# Keep only the `results` attribute of the loaded frame.
# Note: `annotations_read` is rebound to a narrower object here, so this cell is not re-runnable on its own.
annotations_read = annotations_read.results

In [None]:
annotations_read.head()

In [None]:
# Descend to the `root_node` entry; the cells below call .keys() on the result, i.e. use it as a mapping.
annotations_read = annotations_read.root_node

In [None]:
type(annotations_read)

In [None]:
for key in annotations_read.keys():
    print(key)

In [None]:
annotations_read['gui_type']

In [None]:
type(annotations_read['results'])

In [None]:
# Descend once more: the 'results' entry is the mapping whose keys are iterated in the following cells.
annotations_read = annotations_read['results']

In [None]:
# Print every key and remember how many there are (stays 0 for an empty mapping).
keys_count = 0
for keys_count, entry_key in enumerate(annotations_read.keys(), start=1):
    print(entry_key)

In [None]:
keys_count

In [None]:
for key in annotations_read.keys():
    print(type(annotations_read[key]))

In [None]:
type(annotations_read['7e8984b6-dff7-4015-865a-b721a2faf681'])

In [None]:
# Flatten the nested structure: collect every element of each entry's 'results' list
# into one flat list, preserving the order in which the keys are iterated.
list_of_dicts = []
for entry_key in annotations_read.keys():
    list_of_dicts.extend(annotations_read[entry_key]['results'])

In [None]:
len(list_of_dicts)

In [None]:
# Turn the list of nested records into a flat table; nested fields become dotted
# column names such as 'task_output.answer' (used by the export cell below).
annotations_df = pd.json_normalize(list_of_dicts)

In [None]:
annotations_df.head()

In [None]:
# Export a small sample of the raw (pre-cleaning) columns to Excel.
# A fixed random_state keeps the exported rows identical across "Restart & Run All";
# without it every run writes a different file.
raw_sample_columns = [
    'task_input.image_url', 'task_output.answer', 'task_output.cant_solve',
    'task_output.corrupt_data', 'task_output.duration_ms',
    'user.vendor_id', 'user.id', 'user.vendor_user_id',
]
annotations_df[raw_sample_columns].sample(4, random_state=42).to_excel('annotations_sample_raw.xlsx')

## Clean annotations dataframe

In [None]:
# check for missing values in the columns
annotations_df.isna().sum()

In [None]:
# check for duplicated rows 
annotations_df[annotations_df.duplicated()].shape

In [None]:
annotations_df.columns

In [None]:
# Swap every '.' in the column names for '_' so the columns can be reached
# with attribute-style access (annotations_df.task_output_answer, ...).
annotations_df.columns = [column_name.replace(".", "_") for column_name in annotations_df.columns]

In [None]:
annotations_df.columns

In [None]:
annotations_df.dtypes

In [None]:
annotations_df.sample(4)

In [None]:
# check whether there are rows on which 'task_input_image_url' column value 
# is different than 'root_input_image_url' column value
annotations_df.loc[annotations_df.task_input_image_url != annotations_df.root_input_image_url].shape

In [None]:
annotations_df.info(memory_usage='deep')

In [None]:
# check whether there are rows on which 'project_node_input_id' column value is different
# than 'project_root_node_input_id' column value
annotations_df.loc[annotations_df.project_node_input_id != annotations_df.project_root_node_input_id].shape

In [None]:
annotations_df.workpackage_total_size.unique() # shows unique values in workpackage_total_size column

In [None]:
annotations_df.loss.unique() # shows unique values in loss column

In [None]:
annotations_df.user_vendor_id.unique() # shows unique values in user_vendor_id column

In [None]:
annotations_df.columns

In [None]:
# Columns kept for the analysis; the frame is narrowed to exactly these in a later cell.
only_columns = [
    'created_at',
    'workpackage_total_size',
    'project_node_input_id',
    'project_node_output_id',
    'task_input_image_url',
    'task_output_answer',
    'task_output_cant_solve',
    'task_output_corrupt_data',
    'task_output_duration_ms',
    'user_vendor_id',
    'user_id',
    'user_vendor_user_id',
]

In [None]:
len(only_columns)

In [None]:
# Narrow the frame to the analysis columns.
# .copy() makes the result an independent DataFrame: later cells add new columns to it
# (e.g. 'created_at_datetime', 'image_name'), which on a plain column slice raises
# SettingWithCopyWarning and may not write where intended.
annotations_df = annotations_df[only_columns].copy()

In [None]:
annotations_df.head()

In [None]:
annotations_df.sample()

In [None]:
annotations_df.info(memory_usage='deep')

In [None]:
annotations_df.created_at.dtype

In [None]:
annotations_df.created_at.head()

In [None]:
pd.to_datetime(annotations_df.created_at).head()

In [None]:
# Run 'created_at' through pd.to_datetime and store the result in a new column;
# the original 'created_at' column is left as it is.
created_at_parsed = pd.to_datetime(annotations_df.created_at)
annotations_df['created_at_datetime'] = created_at_parsed

In [None]:
annotations_df.head()

In [None]:
annotations_df.info(memory_usage='deep')

# Tasks:

## Task 1:

1. a)

In [None]:
annotations_df.user_vendor_user_id.nunique() # user_id 

In [None]:
# Distinct annotator ids in sorted order (only the id column is sorted, not the whole frame).
annotations_df['user_vendor_user_id'].sort_values().unique()

1. b)

In [None]:
annotations_df.task_output_duration_ms.describe()

In [None]:
annotations_df.task_output_duration_ms.min() # less than a 1 ms or invalid value?

In [None]:
annotations_df.task_output_duration_ms[annotations_df.task_output_duration_ms > 0].describe()

In [None]:
# Smallest strictly positive duration; values <= 0 are excluded from the minimum.
positive_durations = annotations_df.loc[
    annotations_df['task_output_duration_ms'].gt(0), 'task_output_duration_ms'
]
min_duration_time = positive_durations.min()
min_duration_time

In [None]:
# Largest recorded duration over all rows (no filtering is applied here).
max_duration_time = annotations_df['task_output_duration_ms'].max()
max_duration_time

In [None]:
# Mean duration computed over strictly positive values only.
is_positive_duration = annotations_df['task_output_duration_ms'] > 0
average_duration_time = annotations_df.loc[is_positive_duration, 'task_output_duration_ms'].mean()
average_duration_time

In [None]:
annotations_df.task_output_duration_ms[annotations_df.task_output_duration_ms <500].count()

In [None]:
# Histogram of answer durations using 100 ms bins with edges from 500 ms to 3900 ms.
# The original also passed `by=<the duration column itself>`: grouping a histogram by the very
# value being plotted is meaningless (one group per distinct duration), and pandas >= 1.4
# acts on `by` for hist plots instead of ignoring it, so the argument is dropped.
hist_plot = annotations_df.task_output_duration_ms.plot.hist(
    bins=range(500, 4000, 100), figsize=(14, 8), grid=True
)
hist_plot.set_title('Frequency of duration(ms)', weight='bold')
hist_plot.set_ylabel('Frequency', weight='bold', size = 10)  # label typo fixed (was 'Frequancy')
hist_plot.set_xlabel('Duration (ms)', weight='bold', size=10)

In [None]:
# Write the histogram figure to a PNG file in the current working directory.
hist_plot.figure.savefig('duration_hist_plot.png')

In [None]:
annotations_df.columns

In [None]:
# Mean answer duration per annotator (strictly positive durations only), sorted in
# descending order and drawn as a horizontal bar chart.
mean_duration_by_annotator = (
    annotations_df.loc[annotations_df.task_output_duration_ms > 0]
    .groupby(['user_vendor_user_id'])
    .task_output_duration_ms
    .mean()
    .sort_values(ascending=False)
)
duration_barh_plot = mean_duration_by_annotator.plot.barh(figsize=(14, 8))
duration_barh_plot.set_title(
    'The ranking of annotators by the average duration (ms ) needed to answer the questions',
    weight='bold',
    size=14,
)
duration_barh_plot.set_xlabel('Duration (ms)', weight='bold', size=10)
duration_barh_plot.set_ylabel('Annotators', weight='bold', size=10)

In [None]:
# Write the per-annotator mean-duration chart to a PNG file in the current working directory.
duration_barh_plot.figure.savefig('duration_ranking_plot.png')

1. c)

In [None]:
annotations_df.head()

In [None]:
annotations_df.groupby('user_vendor_user_id').task_output_answer.count().sort_values(ascending =False)

In [None]:
# Number of non-missing answers per annotator, sorted ascending and drawn as horizontal bars.
answers_per_annotator = (
    annotations_df.groupby('user_vendor_user_id')
    .task_output_answer
    .count()
    .sort_values()
)
ann_count_plot = answers_per_annotator.plot(kind='barh', figsize=(14, 8), width=0.5, grid=True)
ann_count_plot.set_title(
    'Ranking of annotators by the number of answers to questions', weight='bold', size=14
)
ann_count_plot.set_xlabel('Questions answered count', weight='bold', size=10)
ann_count_plot.set_ylabel('Annotators', weight='bold', size=10)

In [None]:
# Write the answers-per-annotator chart to a PNG file in the current working directory.
ann_count_plot.figure.savefig('annotators_answers_count_plot.png')

1. d)

In [None]:
annotations_df.columns

In [None]:
# Derive the image name from the URL: the text after the last '/', minus the final extension.
# Vectorised .str accessors replace the per-row lambda. Unlike the rfind-based slice they also
# behave when the last path segment contains no '.' (rfind would then pick up a dot earlier in
# the URL, or -1, and return a wrong or truncated name), and a missing URL propagates as NaN
# instead of raising.
annotations_df['image_name'] = (
    annotations_df.task_input_image_url
    .str.rsplit('/', n=1).str[-1]
    .str.rsplit('.', n=1).str[0]
)

In [None]:
annotations_df.head()

In [None]:
annotations_df.groupby('image_name').task_output_answer.count()

In [None]:
annotations_df[['image_name', 'task_output_answer']].groupby('image_name').head()

In [None]:
# Two-column working copy (image name, answer) for the disagreement analysis.
# The former groupby('image_name').apply(lambda x: x) was an identity transform; on
# pandas >= 2.0 it additionally moves 'image_name' into the index, which makes the following
# groupby('image_name') ambiguous (the name is then both an index level and a column).
task_answer_df = annotations_df[['image_name', 'task_output_answer']].copy()

In [None]:
task_answer_df.columns

In [None]:
task_answer_df.head()

In [None]:
# For every image, compute each answer's share of that image's votes and flag the answers
# that received exactly one half of them.
# `normalize=True` is now explicit: the original passed the string 'yes' positionally, which
# only worked because a non-empty string is truthy for the `normalize` parameter.
answer_shares = task_answer_df.groupby('image_name').task_output_answer.value_counts(normalize=True)
# Name the flag column explicitly: newer pandas names normalized counts 'proportion', while the
# filter cell that follows queries a column called 'task_output_answer'.
highly_disagree_questions_df = answer_shares.eq(0.5).rename('task_output_answer').to_frame()

In [None]:
highly_disagree_questions_df.sample()

In [None]:
#highly_disagree_questions_df = highly_disagree_questions.query('task_output_answer == True')

In [None]:
# Keep only the rows whose 'task_output_answer' flag is True.
# NOTE(review): 'task_output_answer' is presumably also the name of an index level of this
# frame — confirm that query() resolves the name to the boolean column on the pandas version in use.
highly_disagree_questions_df = highly_disagree_questions_df.query('task_output_answer == True')

In [None]:
highly_disagree_questions_df.sample(10)

In [None]:
highly_disagree_questions_df.index.unique(level='image_name')

In [None]:
# number of tasks (questions) on which the annotators strongly disagreed
highly_disagree_questions_df.index.unique(level='image_name').nunique()

In [None]:
annotations_df.columns

In [None]:
# Export every answer given for one example task on which the annotators disagreed.
example_image_answers = annotations_df.loc[
    annotations_df.image_name == 'img_6324', ['image_name', 'task_output_answer']
]
example_image_answers.to_excel('disagree_sample.xlsx')

## Task 2.

In [None]:
annotations_df.head()

In [None]:
# can't solve cases
annotations_df.task_output_cant_solve.eq(True).sum()

In [None]:
# corrupt data cases
annotations_df.task_output_corrupt_data.eq(True).sum()

In [None]:
# Rows in which the annotator set the can't-solve flag and/or the corrupt-data flag.
cant_solve_mask = annotations_df['task_output_cant_solve'].eq(True)
corrupt_data_mask = annotations_df['task_output_corrupt_data'].eq(True)
corrupt_cantsolve_df = annotations_df.loc[cant_solve_mask | corrupt_data_mask]

In [None]:
corrupt_cantsolve_df.sort_values('image_name')

In [None]:
# Durations of the answers flagged as can't-solve or corrupt; the x axis is the row index
# these rows carry over from annotations_df. Title and axis labels are added so the figure
# can be read on its own.
flagged_duration_plot = corrupt_cantsolve_df.task_output_duration_ms.plot.line()
flagged_duration_plot.set_title("Duration of answers flagged as can't-solve or corrupt", weight='bold')
flagged_duration_plot.set_xlabel('Row index', weight='bold', size=10)
flagged_duration_plot.set_ylabel('Duration (ms)', weight='bold', size=10)

In [None]:
# Export the flagged rows, ordered by image name, with the columns relevant for review.
flagged_export_columns = [
    'task_output_answer',
    'task_output_cant_solve',
    'task_output_corrupt_data',
    'task_output_duration_ms',
    'user_vendor_id',
    'user_vendor_user_id',
    'created_at_datetime',
    'image_name',
]
flagged_sorted = corrupt_cantsolve_df.sort_values('image_name')
flagged_sorted[flagged_export_columns].to_csv("corr_cantsolve.csv")

## Task 3

Read the references json

In [None]:
# Load the reference answers; later cells look images up by column name and read row 0,
# i.e. they treat the columns as image names.
references_df = pd.read_json(references_filepath)

In [None]:
type(references_df)

In [None]:
references_df.head()

In [None]:
annotations_df.image_name.nunique()

In [None]:
references_df.columns.nunique() # columns are the image names

In [None]:
# Count the annotation rows whose image name has no matching column in references_df
# (0 means every annotated image has a reference answer).
(~annotations_df.image_name.isin(references_df.columns)).sum()

In [None]:
references_df.iloc[0].unique() # check if there are other values than True or False

In [None]:
references_df.dtypes

In [None]:
# Count how many reference answers are True and how many are False.
# Assumes references_df has a single row with one column per image (the original
# Series.bool() call required exactly one value per column), so row 0 is the full answer key.
# DataFrame.iteritems() was removed in pandas 2.0 and Series.bool() is deprecated, so the
# per-column loop is replaced by a vectorised count.
reference_answers = references_df.iloc[0].astype(bool)
True_values_count = int(reference_answers.sum())
False_values_count = int((~reference_answers).sum())

In [None]:
print(True_values_count)
print(False_values_count)

In [None]:
# Column-oriented data for the True/False balance table: one label and one count per row.
d = dict(
    NAME=['True Values', 'False Values'],
    Correct_Answers=[True_values_count, False_values_count],
)

In [None]:
# Two-row table (label, count) built from the column-oriented dict above.
ref_bool_count = pd.DataFrame(d)

In [None]:
# Pie chart of the True vs. False reference counts.
references_plot = ref_bool_count.plot(
    kind='pie',
    y='Correct_Answers',
    labels=['True', 'False'],
    title="Reference True False Balance",
    figsize=(10, 6),
)


In [None]:
# Write the reference-balance pie chart to a PNG file in the current working directory.
references_plot.figure.savefig('reference_balance.png')

In [None]:
"""
The reference set is balanced: it contains approximately the same number of True and False images.
"""

## Task 4

In [None]:
# create a new column named 'correct_answer' with values from references dataset
# Row 0 of references_df is used as a lookup table (image name -> reference answer) and applied
# with a single vectorised map instead of one DataFrame column access per row.
# NOTE: an image name that is missing from the references now yields NaN instead of a KeyError.
annotations_df['correct_answer'] = annotations_df.image_name.map(references_df.iloc[0])

In [None]:
# Boolean view of the answer: True only for the literal 'yes'; every other value
# (including an empty or missing answer) becomes False.
annotations_df['answer_bool'] = annotations_df.task_output_answer.eq('yes')

In [None]:
# An answer counts as correct when its boolean form equals the reference answer.
annotations_df['is_correct'] = annotations_df['answer_bool'].eq(annotations_df['correct_answer'])

In [None]:
annotations_df.groupby('user_vendor_user_id').answer_bool.count()

In [None]:
annotations_df.shape

In [None]:
# Per annotator: share of correct and incorrect answers, expressed in percent.
correct_share_pct = (
    annotations_df.groupby('user_vendor_user_id')
    .is_correct
    .value_counts(normalize=True)
    .mul(100)
)
annotators_rating = correct_share_pct.to_frame()

In [None]:
annotators_rating.head(10)

In [None]:
# Manual spot check for one annotator: correct answers as a percentage of all their answers.
annotator_18_flags = annotations_df.loc[
    annotations_df.user_vendor_user_id == 'annotator_18', 'is_correct'
]
a = annotator_18_flags.eq(True).sum()
b = annotator_18_flags.count()
a*100/b

In [None]:
annotations_df.head()

In [None]:
# Number of answers for every (annotator, is_correct) combination.
correctness_group_keys = ['user_vendor_user_id', 'is_correct']
temp_ser = annotations_df.groupby(correctness_group_keys)['is_correct'].count()

In [None]:
temp_ser.index

In [None]:
# Pivot the (annotator, is_correct) counts into one row per annotator with one column per
# is_correct value.
# fill_value=0: an annotator with no answers in one category would otherwise get NaN in that
# column, and any percentage computed from the two columns would become NaN as well.
is_correct_df = temp_ser.unstack(fill_value=0)

In [None]:
is_correct_df.head()

In [None]:
is_correct_df.loc['annotator_01']

In [None]:
is_correct_df.dtypes

In [None]:
# Share of correct answers per annotator, in percent: correct / (correct + incorrect) * 100.
correct_counts = is_correct_df.loc[:, True]
incorrect_counts = is_correct_df.loc[:, False]
is_correct_df.loc[:, 'correctness_percentage'] = correct_counts * 100 / (correct_counts + incorrect_counts)

In [None]:
is_correct_df

In [None]:
is_correct_df.loc['annotator_01'][True]

In [None]:
# Horizontal bar chart of the correctness percentage per annotator, sorted ascending.
# The x axis is limited to 85–100, so any bar below 85 % is not visible in the figure.
sorted_percentages = is_correct_df.sort_values(by='correctness_percentage').correctness_percentage
correctness_perc_plot = sorted_percentages.plot(
    kind='barh', grid=True, figsize=(14, 8), xlim=(85, 100)
)
correctness_perc_plot.set_title(
    "The percentage of correct answers given by annotators", weight='bold', size=14
)
correctness_perc_plot.set_xlabel('Percentage %', weight='bold', size=10)
correctness_perc_plot.set_ylabel('Annotators', weight='bold', size=10)

In [None]:
# Write the correctness chart to a PNG file in the current working directory.
# NOTE(review): the file name is spelled 'corectness' — rename only together with whatever consumes it.
correctness_perc_plot.figure.savefig('corectness_perc_plot.png')