# Document Annotations
This notebook combines the output of the backends (Term Definitions, Questions & Answers, and Section Headers) into a format readable by the UI. 

The notebook assumes that the paper as already been parsed by PAWLs and the backends were run to great term definitions, sections headers, and QAs. 

Note that parts of the backend were manually created (e.g., writing the questions or linking multiple answers together) so not everything has a function, these places will be marked.


In [78]:
import os
import sys
import pandas as pd
import json
from spacy.tokens import Doc
import uuid
import random

import spacy
import scispacy
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch
import tqdm

sci_nlp = spacy.load("en_core_sci_scibert")
ner = sci_nlp.get_pipe("ner")

rd = random.Random()
rd.seed(0)


DIR = ''
DATA_DIR = '{}/data'.format(DIR)

sys.path.append('{}/lib'.format(DIR))
import sauce_defs



In [79]:
with open('development_user@example.com_annotations.json', 'r') as f:
    pawls_annotations_json = json.load(f)
    
    
with open('pdf_structure.json', 'r') as f:
    pawls_structure = json.load(f)
    

# Extract terms 

In [494]:
def find_matching_token(pages, docs):
    matching = {}
    for d, p in zip(docs, pages):
        matching_tokens = []
        ents_index = [(s.start, s.end) for s in d.ents]
        for indices in ents_index:
            matching_tokens.append({'tokens': p['tokens'][indices[0]:indices[1]], 'indices':indices})
        matching[p['page']['index']] = matching_tokens
    return matching

def make_keyword_annotation(tokens, page):
    return {
        "page": int(page),
        "id": str(uuid.UUID(int=rd.getrandbits(128), version=4)),
        "label": {'text': 'Keywords', 'color': '#64ED96'},
        "bounds": {
            "left": tokens['tokens'][0]['x'],
            "top": tokens['tokens'][0]['y'],
            "right": tokens['tokens'][-1]['x'] + tokens['tokens'][-1]['width'],
            "bottom": tokens['tokens'][-1]['y'] + tokens['tokens'][-1]['height'],
        },
        "tokens": [{'pageIndex': int(page), 'tokenIndex': i+1} for i in range(*tokens['indices'])],
        "text": ' '.join([t['text'] for t in tokens['tokens']]),
    }

# useful for the section's also
def add_annotation_text(page, annotation):
    if annotation['tokens'] != None:
        page_tokens = page['tokens']
        start = annotation['tokens'][0]['tokenIndex']
        end = annotation['tokens'][-1]['tokenIndex']

        return ' '.join([t['text'] for t in page_tokens[start:end+1]])
    return None

In [495]:
#### getting pretokenized text : https://spacy.io/usage/linguistic-features#custom-tokenizer (close to the right page)
tokens = [[t['text'] for t in p['tokens']] for p in pawls_structure]
docs = []
for p in tokens:
    doc = Doc(sci_nlp.vocab, words=p)
    doc = ner(doc)
    docs.append(doc)

In [496]:
matching_tokens = find_matching_token(pawls_structure, docs)

annotations = [make_keyword_annotation(t, page='1') for t in matching_tokens[1]]

In [497]:
all_paper_annotations = [[make_keyword_annotation(t, page=k) for t in matching_tokens[k]] for k in matching_tokens.keys()]

### Converting from PAWLS structure to what is readable by the UI

In [501]:
# Get just the pawls keyword annotations first
pawls_annotations_json_keywords = list(filter(lambda x: x['label']['text'] == 'Keywords', pawls_annotations_json['annotations']))
pawls_annotations_json_sections = list(filter(lambda x: x['label']['text'] == 'Section', pawls_annotations_json['annotations']))
pawls_annotations_json_headers = list(filter(lambda x: x['label']['text'] == 'Header', pawls_annotations_json['annotations']))


In [504]:
# UI is a ratio for the total page, while PAWLS is an absulote. so changing that is the first step
# takes a PAWLS bounding box and converts to UI bounding box
def get_spui_bounding_box(page, annotation):
    return {
        "page": annotation['page'],
        "left":annotation['bounds']['left']/page['width'],
        "top":annotation['bounds']['top']/page['height'],
        "width": (annotation['bounds']['right'] - annotation['bounds']['left'])/page['width'],
        "height": (annotation['bounds']['bottom'] - annotation['bounds']['top'])/page['height'],
    }
# takes a PAWLS annotation for keywords and makes it into a spui annotation
def make_spui_annotation(index, page_info, pawls_annotation, df_definitions):
    page = page_info['page']
    tokens = page_info['tokens']
    type_mapping = {"Keywords": "experience", "Section": "answerSentence", "Header": "sectionHeader"}
    
    if pawls_annotation['label']['text'] == "Keywords":
        definition = sauce_defs.get_def(pawls_annotation['text'], df_definitions)
        source = None
        if definition is not None:
            source = definition[1]
            definition = definition[0]
        
        return {
            "id":str(uuid.uuid4()),
            "type": type_mapping[pawls_annotation['label']['text']],
            "attributes": {"bounding_boxes": [get_spui_bounding_box(page, pawls_annotation)],
                "experience_id": str(uuid.uuid4()),
                "urls":[source],
                "snippets": [definition],
                "source": "tex-pipeline",
                "tags": []
              },
                "relationships": {},
        }
    # note that the linking will have to be put in manually
    elif pawls_annotation['label']['text'] == "Section":
        # because we had to manually link the questions and answers
        Q_IDS = ['227','228','229','230','231','232','233','234','235','236','237','238','239','240','241']
        
        try:
            str_id = str(Q_IDS[index])
        except:
            str_id = str(index)
        return {
            "id":str_id,
            "type": type_mapping[pawls_annotation['label']['text']],
            "attributes": {"bounding_boxes": [get_spui_bounding_box(page, pawls_annotation)],
                "Name": str(uuid.uuid4()),
                "text":"This is a highlighted FAQ section.",
                "simplified_text": "This is a simplified response.",
                "tex": str(uuid.uuid4()),
                "tex_start": 0,
                "tex_end": 5,
                "source": "tex-pipeline",
                "tags": [],
                "text": add_annotation_text(page_info, pawls_annotation)
              },
                "relationships": {
                  "question": {"type":"question", "id":"242"},
                  "more_details": {},
                  "less_details": {},
                  "coaster": []
                }
        }
        # note that the linking will have to be put in manually
    elif pawls_annotation['label']['text'] == "Header":
        return {
            "id":str(uuid.uuid4()),
            "type": type_mapping[pawls_annotation['label']['text']],
            "attributes": {"bounding_boxes": [get_spui_bounding_box(page, pawls_annotation)],
                "summary": "Summary",
                "points": ["Points"],
                "source": "tex-pipeline",
                "tags": []
              },
                "relationships": {},
        }
    


In [505]:
pages_info = [p['page'] for p in pawls_structure]
pages_info_mapping = {}
for p in pages_info:
    pages_info_mapping[p['index']] = {'width':p['width'], 'height':p['height']}
    

# Add definitions to annotations 

In [570]:
df_definitions_all_sle = pd.read_csv('term_definitions.csv')

all_paper_annotations_flat = [a for page in all_paper_annotations for a in page]

In [574]:
import re
def contains_tags(d):
    wiktionary_tags = ['surgery', 'anatomy', 'statistics',  'medicine', 'pathology', 'biochemistry', 'autoantigen', 'genetics', 'cytology', 'physics', 'chemistry', 'organic chemistry', 'immunology', 'pharmacology', 'anatomy', 'neuroanatomy', 'organism']
    return any([t in d for t in wiktionary_tags])

def clean_HTML(text):
    return re.sub('<[^>]*>', '', text)
# Rule: if there is a wiki def with the given tags, use that, otherwise use UMLS
# tags were found as any tag occuring more than once and related to medicine in either wiki set of defs
def get_UMLS_or_WikT(row):
    # also clean of any html tags
    if (row['wikitionary'] != None) and (contains_tags(row['wikitionary'])):
        return (clean_HTML(row['wikitionary']), 'Wiktionary')
    elif row['UMLS'] != None:
        return (clean_HTML(row['UMLS']), 'UMLS')
        

In [575]:
# convert NaN to none
df_definitions_all_sle = df_definitions_all_sle.where(pd.notnull(df_definitions_all_sle), None)

# add in the tuple for source
df_definitions_all_sle['definition'] = [get_UMLS_or_WikT(r) for _, r in df_definitions_all_sle.iterrows()]

In [576]:
df_definitions_all_sle_filtered = df_definitions_all_sle.dropna(subset=['definition'])


In [577]:
df_definitions_all_sle_filtered['definition_text'] = [d[0] for d in df_definitions_all_sle_filtered['definition']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [582]:
# make UI annotations for term definitions and annotations for the sections (will be filled in with generated text later)

annotations = []

# sections have to go first
annotations = all_paper_annotations_flat

SPUI_annotations = []
for i, annotation in enumerate(annotations):
    page = pawls_structure[annotation['page']]
    SPUI_annotations.append(make_spui_annotation(i, page, annotation, df_definitions_all_sle_filtered))

    

In [583]:
# filter for non-empty definitions
def is_non_empty_def(annotation):
    if annotation['type'] == 'experience':
        return annotation['attributes']['snippets'][0] != None
    return True
    
non_empty_spui = list(filter(is_non_empty_def, SPUI_annotations))


## Add in questions
Manually created

In [584]:
with open('question_annotations.json', 'r') as f:
    questions = json.load(f)

## add in answers
Answers were generated but put into correct format manually

In [585]:
with open('answer_annotations.json', 'r') as f:
    answers = json.load(f)    


## add in section headers

same as answers

In [586]:

with open('header_annotations.json', 'r') as f:
    headers = json.load(f) 
    

In [587]:
# dump all annotations as single file for UI
with open('auto_PAWLS_SPUI_annotations.json', 'w') as f:
    non_empty_spui.extend(questions)
    non_empty_spui.extend(answers)
    non_empty_spui.extend(headers)
    json.dump(non_empty_spui, f)