# Descriptive statistics for annotations file

## Reading the data

In [None]:
import json, os
from collections import Counter, defaultdict, OrderedDict
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statistics import mean
import pandas as pd

In [None]:
# switch the working directory to the annotations folder; the loading cell
# below lists '.' and therefore depends on this call
os.chdir(r'/data/raw/annotations')

In [None]:
# saving annotations to list of dicts: every entry of the 'results' list of
# every .json file in the working directory ends up in one flat list
all_annotations = []

# keeping track of the files that fail (cannot be opened, are not valid JSON,
# or do not hold a 'results' list)
failed_files = []

# sorted() keeps the order of all_annotations and failed_files reproducible;
# os.listdir() itself returns entries in arbitrary order
for filename in sorted(os.listdir('.')):
    if not filename.endswith('.json'):
        continue

    try:
        # errors='ignore' drops undecodable bytes instead of failing, so a file
        # with a few bad bytes can still be parsed (best effort)
        with open(filename, encoding='utf-8', errors='ignore') as f:
            results = json.load(f)['results']
    except (OSError, ValueError, KeyError, TypeError):
        # OSError: entry cannot be opened (e.g. a directory named *.json)
        # ValueError: invalid JSON (json.JSONDecodeError is a ValueError)
        # KeyError / TypeError: no 'results' key, or top level is not an object
        failed_files.append(filename)
        continue

    if isinstance(results, list):
        all_annotations.extend(results)
    else:
        # 'results' exists but is not a list of annotations
        failed_files.append(filename)

len(failed_files)
            

There are 91 files that could not be read. They are listed below:

In [None]:
# show the names of the files that were recorded as failed while loading
print(failed_files)

## Annotations per Review Group

The number of annotations is counted for each Review Group. This number varies widely: a few groups have a very large number of annotations, while many groups have only a few. Summary statistics are provided below:

In [None]:
# count how many annotations exist per review group
labels_per_group = Counter()

for annotation in all_annotations:
    try:
        label = annotation['reviewGroupLabel']

        # some annotations have the group label duplicated in a list;
        # the first entry is used as the group
        if isinstance(label, list):
            label = label[0]

        labels_per_group[label] += 1

    except (KeyError, IndexError, TypeError):
        # a small number of annotations lack a usable reviewGroupLabel
        # (missing key, empty list, or an unhashable value): they are skipped
        continue

# one row per review group: its label and its number of annotations
labels_per_group_df = pd.DataFrame.from_dict(labels_per_group, orient='index').reset_index()
labels_per_group_df = labels_per_group_df.rename(
    columns={'index': 'review_group', 0: 'annotations'})
labels_per_group_df.describe()

In [None]:
# horizontal bar chart of the ten review groups with the most annotations
top_review_groups = labels_per_group.most_common(10)
group, count = zip(*top_review_groups)

group_axes = sns.barplot(x=list(count), y=list(group), orient="h")
group_axes.set_title("Annotations per Review Group - top 10")

## Average number of labels applied per Review Group

To get a further idea of the variance in the data, we consider the average number of PICO labels that are applied to an individual study/review across the Review Groups. We first look at the variable "outcomeClassification" and observe that the average number of PICO terms applied to one study/review varies a lot across the Review Groups.

In [None]:
# average number of labels of one PICO variable applied per annotation,
# computed for each review group
# (`var` can be replaced by any other variable)

vars_per_group = defaultdict(list)
var = "condition"
# annotations skipped because `var` or a usable group label is missing
no_var = 0

for annotation in all_annotations:

    try:
        value = annotation[var]
        group_label = annotation["reviewGroupLabel"]

        # some annotations have the group label duplicated in a list; use the
        # first entry, as the per-group annotation count does. Without this,
        # a list label is unhashable and the annotation would be dropped.
        if isinstance(group_label, list):
            group_label = group_label[0]

        # a list holds one entry per applied label; a single value counts as one
        label_count = len(value) if isinstance(value, list) else 1
        vars_per_group[group_label].append(label_count)

    except (KeyError, IndexError, TypeError):
        no_var += 1

# replace each group's list of per-annotation counts by its mean
# (only existing keys are reassigned, so iterating over items() is safe)
for group, label_counts in vars_per_group.items():
    vars_per_group[group] = mean(label_counts)

vars_per_group_df = pd.DataFrame.from_dict(vars_per_group, orient='index').reset_index()
vars_per_group_df = vars_per_group_df.rename(
    columns={'index': 'review_group', 0: 'pico_labels'})
vars_per_group_df.describe()

In [None]:
# review groups ordered from highest to lowest average label count
groups_by_average = sorted(vars_per_group.items(), key=lambda k_v: k_v[1], reverse=True)
group, avg = zip(*groups_by_average)

plt.figure(figsize=(16, 12))
average_axes = sns.barplot(x=list(avg), y=list(group), orient="h")

# the title reports the analysed variable and the mean of the group averages
overall_mean = str(round(mean(avg), 2))
title_template = "Average number of PICO labels for {} applied per Review Group (overall mean: {})"
average_axes.set_title(title_template.format(var, overall_mean))

## Count of PICO labels (for a single variable)

To get a sense of how often individual PICO labels are used, we count the total number of times that a label is applied (for an individual variable).

In [None]:
# count the most popular PICO labels for a variable
# (`var` can be replaced by any other variable)

pico_counter = Counter()
var = "outcomeClassification"
# annotations for which `var` is missing or holds a value that cannot be counted
no_var = 0

for annotation in all_annotations:

    try:
        value = annotation[var]

        # a variable holds either a list of labels or one single label
        links = value if isinstance(value, list) else [value]
        for link in links:
            pico_counter[link] += 1

    except (KeyError, TypeError):
        # KeyError: the annotation has no such variable
        # TypeError: the annotation is not a mapping, or a label is unhashable
        no_var += 1


# one row per PICO label with the number of times it was applied
pico_counter_df = pd.DataFrame.from_dict(pico_counter, orient='index').reset_index()
pico_counter_df = pico_counter_df.rename(columns={'index': 'pico_label', 0: 'count'})
pico_counter_df.describe()

In [None]:
# horizontal bar chart of the ten most frequently applied labels
top_labels = pico_counter.most_common(10)
group, count = zip(*top_labels)

label_axes = sns.barplot(x=list(count), y=list(group), orient="h")
title_template = "Most-used PICO labels for variable {} - top 10"
label_axes.set_title(title_template.format(var))

We are also interested in the distribution of PICO labels across all variables. 

In [None]:
# count the most popular PICO labels for all variables

pico_counter = Counter()
# "outcomeClassification" must be spelled exactly like the key in the
# annotations: a misspelled name never matches, and its labels would be
# silently left out of every count below
variables = ["implicitMaterial", "sex", "outcomeClassification", "implicitCondition",
             "condition", "material", "interventionClassification", "age"]
no_var = 0       # (annotation, variable) pairs without a countable value
total_count = 0  # total number of label applications
all_labels = []  # every applied label, duplicates included

for annotation in all_annotations:

    for var in variables:

        try:
            value = annotation[var]

            # a variable holds either a list of labels or one single label
            links = value if isinstance(value, list) else [value]
            for link in links:
                pico_counter[link] += 1
                total_count += 1
                all_labels.append(link)

        except (KeyError, TypeError):
            # KeyError: the annotation has no such variable
            # TypeError: the annotation is not a mapping, or a label is unhashable
            no_var += 1


# one row per PICO label with the number of times it was applied
pico_counter_df = pd.DataFrame.from_dict(pico_counter, orient='index').reset_index()
pico_counter_df = pico_counter_df.rename(columns={'index': 'pico_label', 0: 'count'})
pico_counter_df.describe()


In total, 8446 distinct labels are applied 339406 times. On average, a PICO label is applied 40 times, but this distribution is highly skewed to the right.

In [None]:
# vertical bar chart of the 100 most frequently applied labels
most_used = pico_counter.most_common(100)
group_most, count_most = zip(*most_used)

g = sns.barplot(x=list(group_most), y=list(count_most), orient="v")
# 100 label names do not fit on the axis, so the x ticks are removed
g.set_xticks([])
g.set_title("Distribution of most-used PICO labels - top 100")

The pie chart below shows the distribution of how often the PICO labels are used. For example, 57% of all PICO labels were applied 5 times or fewer.

In [None]:
# number of distinct PICO labels per usage-frequency bucket
label_frequencies = list(pico_counter.values())

# the buckets are contiguous and inclusive at their upper bound, so every
# label falls in exactly one of them (a label applied exactly 5 times
# belongs to 'x <= 5')
pico_counter_tresholds = {
    'x <= 5': sum(1 for n in label_frequencies if n <= 5),
    '5 < x <= 10': sum(1 for n in label_frequencies if 5 < n <= 10),
    '10 < x <= 100': sum(1 for n in label_frequencies if 10 < n <= 100),
    '100 < x <= 500': sum(1 for n in label_frequencies if 100 < n <= 500),
    '500 < x': sum(1 for n in label_frequencies if n > 500),
}

# pie chart with one wedge per bucket; the legend maps wedges to bucket names
fig, ax = plt.subplots()
patches, b, c = ax.pie(list(pico_counter_tresholds.values()), autopct='%1.1f%%')
plt.legend(patches, list(pico_counter_tresholds.keys()), loc='best')
plt.axis('equal')
plt.title("Frequency of use of PICO labels")
plt.tight_layout()