In [None]:
import json, os
import pandas as pd
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
import json

In [None]:
def load_annotations(directory='.'):
    """
    Collects the entries stored under the "results" key of every *.json file
    found directly inside `directory`.

    Files are visited in sorted name order so that repeated runs yield the
    same annotation order (os.listdir alone gives no ordering guarantee).

    A file is recorded as failed (by file name) when it cannot be opened,
    does not contain valid JSON, has no "results" key, or when "results" is
    not a list. Nothing from a failed file is added to the annotations.

    Returns a tuple `(annotations, failed)`: the list of collected entries
    and the list of file names that could not be used.
    """
    annotations = []
    failed = []

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue

        try:
            # errors='ignore' drops undecodable bytes instead of raising
            with open(os.path.join(directory, filename), encoding='utf-8', errors='ignore') as f:
                results = json.load(f)['results']

            if not isinstance(results, list):
                raise TypeError('"results" is not a list')

        except (OSError, ValueError, KeyError, TypeError):
            # OSError: unreadable file; ValueError: invalid JSON (JSONDecodeError);
            # KeyError/TypeError: missing or malformed "results"
            failed.append(filename)

        else:
            # file is read: append its annotations
            annotations.extend(results)

    return annotations, failed


# saving annotations to a list (expected to hold dicts) and
# keeping track of the files that fail
all_annotations, failed_files = load_annotations('.')

# report the failures explicitly; a bare expression in the middle of a cell is not displayed
print("{0} file(s) could not be read: {1}".format(len(failed_files), failed_files))

annotations_df_raw = pd.DataFrame(all_annotations)

In [None]:
# Preview the first rows of the raw annotations frame as a quick sanity check of the load step
annotations_df_raw.head()

In [None]:
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Returns the raw response body (`resp.content`) when the request succeeds
    with a non-error status. The content type is NOT inspected, so JSON and
    other non-HTML payloads are returned as well.

    Returns None (after printing the error) when the request fails, times
    out, or the server answers with a 4xx/5xx status.

    `timeout` is the number of seconds handed to the GET request, so that an
    unresponsive server cannot block the caller forever.
    """
    try:
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            # turn HTTP error statuses into a RequestException instead of
            # handing an error page back to the caller as if it were data
            resp.raise_for_status()
            return resp.content
    except RequestException as e:
        print('The following error occurred during HTTP GET request to {0} : {1}'.format(url, str(e)))

    return None



def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response counts as HTML when its status code is 200 and its
    Content-Type header contains "html" (case-insensitive). A response
    without a Content-Type header is reported as not good instead of
    raising a KeyError.
    """
    # .get() keeps a missing header from raising; the None check has to
    # happen before .lower() to be meaningful
    content_type = resp.headers.get('Content-Type')
    if resp.status_code != 200 or content_type is None:
        return False
    return 'html' in content_type.lower()

In [None]:
# Endpoint that is queried with a concept identifier appended to it
CONCEPT_LOOKUP_URL = "http://data.cochrane.org/search/concepts?ids="

# Labels already fetched in this session, keyed by concept identifier, so that
# a concept occurring in many annotations is only requested once
concept_label_cache = {}


def fetch_concept_label(concept_id):
    """
    Returns the 'label' field of the JSON document served for `concept_id`.

    Successful lookups are memoised in `concept_label_cache`; failed lookups
    are not cached, so they are retried the next time the concept is seen.

    Raises ValueError when no response body could be retrieved or the body is
    not valid JSON, and KeyError/TypeError when the parsed document has no
    'label' entry.
    """
    if concept_id not in concept_label_cache:
        body = simple_get(CONCEPT_LOOKUP_URL + concept_id)
        if body is None:
            raise ValueError("no response for concept " + concept_id)
        concept_label_cache[concept_id] = json.loads(body)['label']
    return concept_label_cache[concept_id]


# separate "raw" and cleaned annotations: copy every annotation dict so that
# replacing values below does not modify the entries of all_annotations
# (a shallow copy is enough because values are only re-assigned, never mutated)
cleaned_annotations = [dict(annotation) for annotation in all_annotations]


for cnt, annotation in enumerate(cleaned_annotations):

    # print progress
    if cnt % 5 == 0:
        print(str(cnt) + " annotations done.")

    # snapshot the items because values are re-assigned while looping
    for key, value in list(annotation.items()):

        try:

            # list of urls
            if isinstance(value, list):

                # check (on the first entry only) if we are looking at PICO
                # concepts that should be extracted from the web; empty lists
                # and non-string entries are left untouched
                if value and isinstance(value[0], str) and '/concepts/' in value[0]:

                    # the list is only replaced if every entry could be resolved
                    annotation[key] = [fetch_concept_label(entry) for entry in value]

            # single URL (non-string values cannot hold a concept URL)
            elif isinstance(value, str) and '/concepts/' in value:

                annotation[key] = fetch_concept_label(value)

        except Exception as err:
            # best effort: keep the original value and carry on. Unlike a bare
            # `except:`, this still lets KeyboardInterrupt stop the loop.
            print("Variable {0} from annotation {1} failed: {2!r}".format(key, cnt, err))
                



In [None]:
# Tabulate the cleaned annotations and persist the table as a CSV file
# in the current working directory
annotations_df_cleaned = pd.DataFrame(data=cleaned_annotations)
annotations_df_cleaned.to_csv(path_or_buf='cleaned_annotations.csv')