### basic doc annos retrieved from prod

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#!/usr/bin/env python3
import boto3
import requests

from mmda.types.span import Span
from mmda.types.annotation import SpanGroup, Box
from mmda.types.document import Document

def get_text(url: str) -> str:
    """Download an S3 object and return its body decoded as UTF-8 text.

    Args:
        url: Object location written as ``s3://<bucket>/<key>``. The key may
            itself contain ``/`` characters.

    Returns:
        The object's contents decoded as UTF-8.

    Raises:
        ValueError: If ``url`` does not start with ``s3://`` or is missing the
            bucket or the key. Checked before any AWS call is made.
    """
    prefix = "s3://"
    # Validate instead of blindly dropping the first five characters: a URL with
    # another scheme would otherwise yield a nonsense bucket name.
    if url[:len(prefix)].lower() != prefix:
        raise ValueError(f"expected an s3:// URL, got: {url!r}")
    # Split on the first "/" only, so slashes inside the key are preserved.
    bucket, _, key = url[len(prefix):].partition("/")
    if not bucket or not key:
        raise ValueError(f"S3 URL must contain both a bucket and a key: {url!r}")

    s3 = boto3.resource('s3')
    with s3.Bucket(bucket).Object(key).get()['Body'] as f:
        return f.read().decode('utf-8')
    


sha = '17ada3c96ef888fc89f6b995d2edd19a45383423'

# Source names appear both in the request URLs and as keys in the responses;
# name them once so the two uses cannot drift apart.
pdfplumber_source = 'pdfplumber-0.0.4'
vila_source = 'vila-0.0.2|layout-parser-0.0.2|pdfplumber-0.0.4'


def _json_or_raise(response):
    """Return the decoded JSON body, raising requests.HTTPError on a 4xx/5xx status."""
    # Without this check an error payload is parsed like a normal body and the
    # failure only shows up later as a KeyError on the missing field.
    response.raise_for_status()
    return response.json()


# 1) Ask prod where the pdfplumber plain text for this PDF is stored.
resp1 = _json_or_raise(requests.get(
    f'http://annotations-api.prod.s2.allenai.org/pdf/{sha}/annotations'
    f'?annotationSources=none&attributeSources={pdfplumber_source}'
))
text_url = resp1['attributesFromSource'][pdfplumber_source]['text']
print("text_url: ", text_url)

# 2) Resolve that text location to a plain-text document id.
resp2 = _json_or_raise(requests.post(
    'http://annotations-api.prod.s2.allenai.org/plain-text/',
    json={'s3Url': text_url}
))
text_id = resp2['id']
print(text_id)

# 3) Fetch the VILA span annotations stored against the plain-text document.
resp3 = _json_or_raise(requests.get(
    f'http://annotations-api.prod.s2.allenai.org/plain-text/{text_id}/annotations'
    f'?annotationSources={vila_source}'
    '&attributeSources=none'
))
vila_annos = resp3['annotationsFromSource'][vila_source]['vila_span_groups']

# One single-span SpanGroup per stored annotation, typed with the VILA label.
text = get_text(text_url)
vila_spangroups = [
    SpanGroup(
        spans=[Span(start=anno['startChar'], end=anno['endChar'])],
        type=anno['attributesFromSource'][vila_source]['type'],
    )
    for anno in vila_annos
]

doc = Document(text)
doc.annotate(vila=vila_spangroups)

print("")
print(vila_annos[0])
print("")
# To inspect the bibliography text instead:
#     for spangroup in doc.vila:
#         if spangroup.type == "Bibliography":
#             print(spangroup.symbols)
print(vila_spangroups[0])

text_url:  s3://ai2-s2-science-parse-plus-prod/document/17ada3c96ef888fc89f6b995d2edd19a45383423/pdfplumber-0.0.4/text
740c1d64752b4679a742544046cf5de3452cfef4

{'id': 'vila-0.0.2|layout-parser-0.0.2|pdfplumber-0.0.4/vila_span_groups/0', 'startChar': 0, 'endChar': 125, 'attributesFromSource': {'vila-0.0.2|layout-parser-0.0.2|pdfplumber-0.0.4': {'type': 'Title', '_group': 0}}}

SpanGroup(uuid='08ac912a-1136-4c0d-85d9-bcafa3d5181f', doc=<mmda.types.document.Document object at 0x7f1888303970>, metadata=Metadata({'id': None, 'type': 'Title', 'text': None}), spans=[Span(start=0, end=125, box=None)], box_group=None, id=None, type='Title', text='1\nFunctions of FMS-like tyrosine kinase 3 (flt3) in zebrafish hematopoiesis and its\nrelevance to human acute myeloid leukemia')


### get and store bib entry annos to dev

In [3]:
from annotation_store import client

api = client.client() # host http://annotations-api.dev.s2.allenai.org

In [4]:
api

<annotation_store.client.ApiClient at 0x7f18882f7460>

In [5]:
# Build the PDF location from `sha` rather than repeating the hash by hand, so this
# cell always registers the same document the prod cells above worked on. For the
# current `sha` this yields exactly the previously hard-coded
# "s3://ai2-s2-pdfs/17ad/a3c96ef888fc89f6b995d2edd19a45383423.pdf".
dev_doc_id = api.Pdf.create_document(f"s3://ai2-s2-pdfs/{sha[:4]}/{sha[4:]}.pdf")

In [6]:
dev_doc_id # '17ada3c96ef888fc89f6b995d2edd19a45383423' cool

'17ada3c96ef888fc89f6b995d2edd19a45383423'

In [7]:
# instance requires
#     symbols: str
#     tokens: List[api.SpanGroup]
#     rows: List[api.SpanGroup]
#     pages: List[api.SpanGroup]
#     vila_span_groups: List[api.SpanGroup]
#     page_images: List[str] = Field(description="List of base64-encoded page images")

In [8]:
# prod stuff
# symbols and tokens

def make_span_groups(text_spans, pdf_boxes):
    """Pair plain-text span annotations with PDF box annotations into SpanGroups.

    The two lists are matched by position: ``text_spans[i]`` and ``pdf_boxes[i]``
    are treated as describing the same annotation.

    Args:
        text_spans: Sequence of dicts providing ``startChar``, ``endChar`` and
            ``attributesFromSource['pdfplumber-0.0.4']['id']``.
        pdf_boxes: Sequence of dicts providing ``x``, ``y``, ``width``,
            ``height`` and ``page``.

    Returns:
        A list with one SpanGroup per input pair. Each group holds a single Span
        carrying the matching Box, and uses the pdfplumber ``id`` as its id.

    Raises:
        ValueError: If the two sequences differ in length, since positional
            pairing would then attach boxes to the wrong spans or drop some.
    """
    if len(text_spans) != len(pdf_boxes):
        raise ValueError(
            f"text_spans and pdf_boxes must have the same length, "
            f"got {len(text_spans)} and {len(pdf_boxes)}"
        )

    span_groups = []
    for text_span, box_info in zip(text_spans, pdf_boxes):
        associated_box = Box(
            l=box_info['x'],
            t=box_info['y'],
            w=box_info['width'],
            h=box_info['height'],
            page=box_info['page']
        )
        span_groups.append(
            SpanGroup(
                spans=[Span(start=text_span['startChar'], end=text_span['endChar'], box=associated_box)],
                id=text_span['attributesFromSource']['pdfplumber-0.0.4']['id']
            )
        )
    return span_groups
    

symbols = text

# Both endpoints take the same query string; only the document path differs.
pdfplumber_query = '?annotationSources=pdfplumber-0.0.4&attributeSources=none'

# Plain-text annotations carry character offsets (startChar/endChar).
text_annos_http_response = requests.get(
    f'http://annotations-api.prod.s2.allenai.org/plain-text/{text_id}/annotations{pdfplumber_query}'
)
text_annos_http_response.raise_for_status()  # fail here, not on a KeyError below
pdf_plumber_text_annos_resp = text_annos_http_response.json()

# PDF annotations carry page boxes (x/y/width/height/page).
pdf_annos_http_response = requests.get(
    f'http://annotations-api.prod.s2.allenai.org/pdf/{sha}/annotations{pdfplumber_query}'
)
pdf_annos_http_response.raise_for_status()
pdf_plumber_pdf_annos_resp = pdf_annos_http_response.json()

token_spans = pdf_plumber_text_annos_resp['annotationsFromSource']['pdfplumber-0.0.4']['tokens']
token_boxes = pdf_plumber_pdf_annos_resp['annotationsFromSource']['pdfplumber-0.0.4']['tokens']

instance_tokens = make_span_groups(token_spans, token_boxes)

# Raw (JSON) row and page span annotations; printed for inspection in the next cell.
rows = pdf_plumber_text_annos_resp['annotationsFromSource']['pdfplumber-0.0.4']['rows']
pages = pdf_plumber_text_annos_resp['annotationsFromSource']['pdfplumber-0.0.4']['pages']

# The model instance expects the key "vila_span_groups"; alias the list under that name.
vila_span_groups = vila_spangroups
print(vila_span_groups[0])


SpanGroup(uuid='08ac912a-1136-4c0d-85d9-bcafa3d5181f', doc=<mmda.types.document.Document object at 0x7f1888303970>, metadata=Metadata({'id': None, 'type': 'Title', 'text': None}), spans=[Span(start=0, end=125, box=None)], box_group=None, id=None, type='Title', text='1\nFunctions of FMS-like tyrosine kinase 3 (flt3) in zebrafish hematopoiesis and its\nrelevance to human acute myeloid leukemia')


In [9]:
# prod rows and pages
# Shorthand for the pdfplumber section of each response.
pdfplumber_text_annos = pdf_plumber_text_annos_resp['annotationsFromSource']['pdfplumber-0.0.4']
pdfplumber_pdf_annos = pdf_plumber_pdf_annos_resp['annotationsFromSource']['pdfplumber-0.0.4']

# Rows: character spans come from the plain-text store, boxes from the PDF store.
row_spans = pdfplumber_text_annos['rows']
row_boxes = pdfplumber_pdf_annos['rows']
instance_rows = make_span_groups(row_spans, row_boxes)

print(
    row_spans[0],
    len(row_spans),
    row_boxes[0],
    len(row_boxes),
    "instance row:",
    instance_rows[0],
    sep="\n",
)

print(pages[0])
print('\n\n\n\n')


# Pages: same pairing as for rows.
page_spans = pdfplumber_text_annos['pages']
page_boxes = pdfplumber_pdf_annos['pages']
instance_pages = make_span_groups(page_spans, page_boxes)

print(
    page_spans[0],
    len(page_spans),
    page_boxes[0],
    len(page_boxes),
    "instance page:",
    instance_pages[0],
    sep="\n",
)

print(pages[0])


{'id': 'pdfplumber-0.0.4/rows/0', 'startChar': 0, 'endChar': 1, 'attributesFromSource': {'pdfplumber-0.0.4': {'id': 0, '_group': 0}}}
901
{'id': 'pdfplumber-0.0.4/rows/0', 'page': 0, 'x': 0.4970588235294117, 'y': 0.9170906818181819, 'width': 0.0069117647058823395, 'height': 0.010681818181818237, 'attributesFromSource': {'pdfplumber-0.0.4': {'id': 0, '_group': 0}}}
901
instance row:
SpanGroup(uuid='a543c77b-535a-4b60-a520-6ae2997c9656', doc=None, metadata=Metadata({'id': 0, 'type': None, 'text': None}), spans=[Span(start=0, end=1, box=Box(l=0.4970588235294117, t=0.9170906818181819, w=0.0069117647058823395, h=0.010681818181818237, page=0))], box_group=None, id=0, type=None, text='')
{'id': 'pdfplumber-0.0.4/pages/0', 'startChar': 0, 'endChar': 1236, 'attributesFromSource': {'pdfplumber-0.0.4': {'id': 0, '_group': 0}}}





{'id': 'pdfplumber-0.0.4/pages/0', 'startChar': 0, 'endChar': 1236, 'attributesFromSource': {'pdfplumber-0.0.4': {'id': 0, '_group': 0}}}
28
{'id': 'pdfplumber-0.0.4/p

In [10]:
# now just need to get the images
from mmda.types.image import tobase64

from awscli.customizations.s3.utils import split_s3_bucket_key
from io import BytesIO
from PIL import Image

import boto3

s3 = boto3.client('s3')
image_s3_keys = resp1['attributesFromSource']['pdfplumber-0.0.4']['images']

# Download every page image, decode it with PIL and keep its base64 form.
page_images = []
for image_location in image_s3_keys:
    bucket_name, key_name = split_s3_bucket_key(image_location)
    image_bytes = s3.get_object(Bucket=bucket_name, Key=key_name)["Body"].read()
    page_image = Image.open(BytesIO(image_bytes))
    page_images.append(tobase64(page_image))

In [11]:
instance_pages

[SpanGroup(uuid='6a62f3eb-10f2-454d-a556-49b2f0fef701', doc=None, metadata=Metadata({'id': 0, 'type': None, 'text': None}), spans=[Span(start=0, end=1236, box=Box(l=0.08796004901960783, t=0.011190656565656636, w=0.8240833333333321, h=0.9737462121212122, page=0))], box_group=None, id=0, type=None, text=''),
 SpanGroup(uuid='10021fd5-74f7-49b8-ac63-ad5f0a385033', doc=None, metadata=Metadata({'id': 1, 'type': None, 'text': None}), spans=[Span(start=1237, end=2893, box=Box(l=0.1814683842647059, t=0.011190656565656636, w=0.6383671124836606, h=0.9165818434343435, page=1))], box_group=None, id=1, type=None, text=''),
 SpanGroup(uuid='7839f96a-30fb-45d6-8725-477ef4351c7e', doc=None, metadata=Metadata({'id': 2, 'type': None, 'text': None}), spans=[Span(start=2894, end=6157, box=Box(l=0.18146668784313721, t=0.011190656565656636, w=0.6428722485784314, h=0.9165818434343435, page=2))], box_group=None, id=2, type=None, text=''),
 SpanGroup(uuid='671685ec-5b07-47ee-bce3-ec49502e2495', doc=None, metad

In [12]:
def _to_json_list(span_groups):
    """Serialize each span group via its own ``to_json()``."""
    return [group.to_json() for group in span_groups]


# Request payload for the bib-entry detector (fields listed in the "instance requires" cell).
instance = dict(
    symbols=symbols,
    tokens=_to_json_list(instance_tokens),
    rows=_to_json_list(instance_rows),
    pages=_to_json_list(instance_pages),
    vila_span_groups=_to_json_list(vila_span_groups),
    page_images=page_images,
)

instance['vila_span_groups'][0]

{'spans': [{'start': 0, 'end': 125}], 'type': 'Title'}

In [13]:
# send the request:
bib_detector_response = requests.post(
    'http://bibentry-detector.v0.prod.models.s2.allenai.org/invocations',
    json={'instances': [instance]}
)
# Stop on a failed invocation: later cells index into the JSON body and would
# otherwise fail with a less helpful error. The body is read as
# bib_detector_response.json()['predictions'][0]['bib_entry_boxes'] ('predictions' is a list).
bib_detector_response.raise_for_status()
print(bib_detector_response)

<Response [200]>


In [14]:
bib_detector_response.text

'{"predictions":[{"bib_entry_boxes":[{"boxes":[{"left":0.18146317915032664,"top":0.866992529633839,"width":0.6380922541862745,"height":0.06077997036616102,"page":12}],"id":0,"type":"bib_entry"},{"boxes":[{"left":0.18147058823529413,"top":0.1447024740656565,"width":0.6380905871421568,"height":0.032949968926767735,"page":12}],"id":1,"type":"bib_entry"},{"boxes":[{"left":0.18147058823529413,"top":0.1817488250378788,"width":0.638381949408497,"height":0.05158693280303038,"page":12}],"id":2,"type":"bib_entry"},{"boxes":[{"left":0.1814687359640523,"top":0.23720456310606067,"width":0.6376549329460782,"height":0.032949968926767735,"page":12}],"id":3,"type":"bib_entry"},{"boxes":[{"left":0.18146503142156847,"top":0.5522123347601015,"width":0.6379462952124177,"height":0.05135935602272722,"page":12}],"id":4,"type":"bib_entry"},{"boxes":[{"left":0.18146688369281047,"top":0.34834361602272745,"width":0.6379592611111118,"height":0.05158693280303038,"page":12}],"id":5,"type":"bib_entry"},{"boxes":[{"le

In [15]:
bib_detector_response.json()['predictions'][0]['bib_entry_boxes']

[{'boxes': [{'left': 0.18146317915032664,
    'top': 0.866992529633839,
    'width': 0.6380922541862745,
    'height': 0.06077997036616102,
    'page': 12}],
  'id': 0,
  'type': 'bib_entry'},
 {'boxes': [{'left': 0.18147058823529413,
    'top': 0.1447024740656565,
    'width': 0.6380905871421568,
    'height': 0.032949968926767735,
    'page': 12}],
  'id': 1,
  'type': 'bib_entry'},
 {'boxes': [{'left': 0.18147058823529413,
    'top': 0.1817488250378788,
    'width': 0.638381949408497,
    'height': 0.05158693280303038,
    'page': 12}],
  'id': 2,
  'type': 'bib_entry'},
 {'boxes': [{'left': 0.1814687359640523,
    'top': 0.23720456310606067,
    'width': 0.6376549329460782,
    'height': 0.032949968926767735,
    'page': 12}],
  'id': 3,
  'type': 'bib_entry'},
 {'boxes': [{'left': 0.18146503142156847,
    'top': 0.5522123347601015,
    'width': 0.6379462952124177,
    'height': 0.05135935602272722,
    'page': 12}],
  'id': 4,
  'type': 'bib_entry'},
 {'boxes': [{'left': 0.1814668

### store them to dev

In [16]:
dev_doc_id

'17ada3c96ef888fc89f6b995d2edd19a45383423'

In [17]:
bib_entry_boxgroups_json = bib_detector_response.json()['predictions'][0]['bib_entry_boxes']

In [201]:
def box_group_json_response_to_annotation_array(boxgroups_json):
    """Flatten model box groups into annotation-store PDF annotation dicts.

    Every box of every group becomes one annotation. Boxes of the same group share
    the same ``"group"`` attribute, which is the group's position in
    ``boxgroups_json`` (the model-provided ``"id"`` is not used).

    Args:
        boxgroups_json: Iterable of dicts, each with a ``"boxes"`` list whose items
            provide ``left``, ``top``, ``width``, ``height`` and ``page``.

    Returns:
        A list of dicts with keys ``page``, ``x``, ``y``, ``width``, ``height`` and
        ``attributes`` (``{"group": <group position>}``), in input order. A group
        with no boxes contributes no annotations.
    """
    anno_array = []
    for group_id, box_group_json in enumerate(boxgroups_json):
        # Emit all boxes rather than only the first: the model currently returns
        # one box per group, but nothing should be dropped silently if that changes.
        for box in box_group_json['boxes']:
            anno_array.append({
                "page": box['page'],
                "x": box['left'],
                "y": box['top'],
                "width": box['width'],
                "height": box['height'],
                # TODO: use the box group's own "id" as the group id once the models
                # return reliable ids.
                "attributes": {"group": group_id},
            })
    return anno_array

In [202]:
bib_entry_annos = box_group_json_response_to_annotation_array(bib_entry_boxgroups_json)
bib_entry_annos[0]

{'page': 12,
 'x': 0.18146317915032664,
 'y': 0.866992529633839,
 'width': 0.6380922541862745,
 'height': 0.06077997036616102,
 'attributes': {'group': 0}}

In [203]:
source = "bib_detector_test_3"
bib_entry_post_response = requests.post(
    f'http://annotations-api.dev.s2.allenai.org/pdf/{dev_doc_id}/annotations/{source}',
    json={"bib-entries": bib_entry_annos}
    )
# Fail loudly if the store rejected the write instead of relying on someone
# reading the response repr below.
bib_entry_post_response.raise_for_status()
bib_entry_post_response

<Response [200]>

In [21]:
bib_entry_raw_annos_json = bib_detector_response.json()['predictions'][0]['raw_bib_entry_boxes']
bib_entry_raw_annos_json[0]

{'boxes': [{'left': 0.17995676028183083,
   'top': 0.8574961652659406,
   'width': 0.6271684559342129,
   'height': 0.07612293898457229,
   'page': 12}],
 'id': 0,
 'type': 'raw_model_prediction'}

In [22]:
# also store the "raw" ones
# NOTE: `source` is whatever an earlier cell last assigned; in top-to-bottom order
# that is the bib detector source set when the non-raw entries were posted.
bib_entry_raw_annos = box_group_json_response_to_annotation_array(bib_entry_raw_annos_json)
bib_entry_post_response = requests.post(
    f'http://annotations-api.dev.s2.allenai.org/pdf/{dev_doc_id}/annotations/{source}',
    json={"bib-entry-raw": bib_entry_raw_annos}
    )
# Fail loudly if the store rejected the write.
bib_entry_post_response.raise_for_status()
bib_entry_post_response

<Response [200]>

### create and save mention spans and boxes to anno store

In [23]:
# Payload for the citation-mentions model: document text plus token and page span groups.
mentions_instance = {
    "symbols": symbols,
    "tokens" : [sg.to_json() for sg in instance_tokens],
    "pages" : [sg.to_json() for sg in instance_pages],
}

# send the request:
mentions_response = requests.post(
    'http://citation-mentions.v0.prod.models.s2.allenai.org/invocations',
    json={'instances': [mentions_instance]}
)
# Stop on a failed invocation; later cells read
# mentions_response.json()['predictions'][0]['mentions'].
mentions_response.raise_for_status()
print(mentions_response)

<Response [200]>


In [24]:
mentions_response.json()

{'predictions': [{'mentions': [{'spans': [{'start': 4376,
       'end': 4381,
       'box': {'left': 0.3673529411764706,
        'top': 0.45673090681818185,
        'width': 0.027257940653594773,
        'height': 0.00939974747474745,
        'page': 2}}],
     'box_group': None,
     'id': 0,
     'type': None,
     'text': None},
    {'spans': [{'start': 4840,
       'end': 4845,
       'box': {'left': 0.36411764705882355,
        'top': 0.5862763613636364,
        'width': 0.02755158876470585,
        'height': 0.00939974747474745,
        'page': 2}}],
     'box_group': None,
     'id': 1,
     'type': None,
     'text': None},
    {'spans': [{'start': 5072,
       'end': 5083,
       'box': {'left': 0.23176470588235296,
        'top': 0.6419581795454545,
        'width': 0.05784770473856206,
        'height': 0.00939974747474745,
        'page': 2}}],
     'box_group': None,
     'id': 2,
     'type': None,
     'text': None},
    {'spans': [{'start': 5316,
       'end': 5318,
   

In [25]:
mentions_annos_json = mentions_response.json()['predictions'][0]['mentions']
mentions_annos_json[0]

{'spans': [{'start': 4376,
   'end': 4381,
   'box': {'left': 0.3673529411764706,
    'top': 0.45673090681818185,
    'width': 0.027257940653594773,
    'height': 0.00939974747474745,
    'page': 2}}],
 'box_group': None,
 'id': 0,
 'type': None,
 'text': None}

In [26]:
# spangroups_json to mmda_spangroups:
# NOTE(review): the repr printed below shows
#   Box(l='left', t='top', w='width', h='height', page='page')
# i.e. the box fields hold the key names rather than the numeric values from the
# JSON, so these SpanGroups do not carry usable boxes. A later cell rebuilds the
# mentions by hand from `mentions_annos_json` with explicit Span/Box construction.
mentions_spangroups = [SpanGroup.from_json(sg) for sg in mentions_annos_json] # oh this works ez but extra crap
mentions_spangroups[0]


SpanGroup(uuid='d163bd69-267a-49b7-a965-5633bbfedc9a', doc=None, metadata=Metadata({'id': 0, 'type': None, 'text': None}), spans=[Span(start=4376, end=4381, box=Box(l='left', t='top', w='width', h='height', page='page'))], box_group=None, id=0, type=None, text='')

In [77]:
# def spangroups_json_response_to_text_and_pdf_anno_arrays(spangroups_json):
#     # more lil hackiness
#     import itertools

#     sg_counter = itertools.count()
    
#     text_anno_array = []
#     pdf_anno_array = []
#     for spangroup_json in spangroups_json:
#         current_id = next(sg_counter)
#         span = spangroup_json['spans'][0]
#         box = span['box']
#         pdf_anno_array.append({
#             "page": box['page'],
#             "x": box['left'],
#             "y": box['top'],
#             "width": box['width'],
#             "height": box['height'],
#             "attributes": {"id": current_id} # "_group"?
#             }
#         )
#         text_anno_array.append({
#             "startChar": span['start'],
#             "endChar": span['end'],
#             "attributes": {"id": current_id}
#         })
#         # id_count = next(counter)
#     return text_anno_array, pdf_anno_array

# tryna figure out SpanGroups w/ multiple spans...
def spangroups_json_response_to_text_and_pdf_anno_arrays(spangroups_json):
    """Flatten span-group JSON into plain-text and PDF annotation arrays.

    Every span yields one plain-text annotation. A span that also has a box yields
    one PDF annotation. All annotations produced from the same span group share the
    same ``"group"`` attribute, which is the group's position in ``spangroups_json``.

    Args:
        spangroups_json: Iterable of dicts, each with a ``"spans"`` list. Each span
            provides ``start`` and ``end`` and optionally a ``box`` dict with
            ``left``, ``top``, ``width``, ``height`` and ``page``.

    Returns:
        A ``(text_anno_array, pdf_anno_array)`` tuple.
        ``text_anno_array`` items have ``startChar``, ``endChar`` and ``attributes``.
        ``pdf_anno_array`` items have ``page``, ``x``, ``y``, ``width``, ``height``
        and ``attributes``. Spans whose ``box`` is absent or ``None`` appear only in
        the text array, so the two arrays may differ in length.
    """
    text_anno_array = []
    pdf_anno_array = []
    for group_id, spangroup_json in enumerate(spangroups_json):
        for span in spangroup_json['spans']:
            text_anno_array.append({
                "startChar": span['start'],
                "endChar": span['end'],
                "attributes": {"group": group_id},
            })

            # Span JSON elsewhere in this notebook comes without a "box" key or with
            # "box": None; there is nothing to store on the PDF side for those.
            box = span.get('box')
            if box is None:
                continue
            pdf_anno_array.append({
                "page": box['page'],
                "x": box['left'],
                "y": box['top'],
                "width": box['width'],
                "height": box['height'],
                "attributes": {"group": group_id},  # "_group"?
            })
    return text_anno_array, pdf_anno_array
            


In [78]:
# TODO(at least in SPP) the thingies need to have _group bcuz could be more spans per spangroup
mention_text_annos, mention_pdf_annos = spangroups_json_response_to_text_and_pdf_anno_arrays(mentions_annos_json)
print(mention_text_annos[0])
print()
print(mention_pdf_annos[0])

{'startChar': 4376, 'endChar': 4381, 'attributes': {'group': 0}}

{'page': 2, 'x': 0.3673529411764706, 'y': 0.45673090681818185, 'width': 0.027257940653594773, 'height': 0.00939974747474745, 'attributes': {'group': 0}}


In [29]:
# make dev plain text doc
dev_text_id = requests.post(
    f'http://annotations-api.dev.s2.allenai.org/plain-text/',
    json={"s3Url": text_url}
    )
dev_text_id

<Response [200]>

In [30]:
# The previous cell leaves the POST Response in `dev_text_id`. Keep the Response under
# its own name, then rebind `dev_text_id` to the id string. The isinstance guard makes
# this cell safe to re-run: on a second run `dev_text_id` is already the id string and
# has no .json().
if isinstance(dev_text_id, requests.Response):
    dev_text_id_response = dev_text_id
dev_text_id = dev_text_id_response.json()['id']
print(dev_text_id)

740c1d64752b4679a742544046cf5de3452cfef4


In [79]:
source = "mentions-test-3"
# The same mentions are stored twice: character spans on the plain-text document
# and boxes on the PDF document.
mentions_text_post_response = requests.post(
    f'http://annotations-api.dev.s2.allenai.org/plain-text/{dev_text_id}/annotations/{source}',
    json={"mentions": mention_text_annos}
    )
mentions_pdf_post_response = requests.post(
    f'http://annotations-api.dev.s2.allenai.org/pdf/{dev_doc_id}/annotations/{source}',
    json={"mentions": mention_pdf_annos}
    )

print(mentions_text_post_response)
print(mentions_pdf_post_response)

# Show both statuses first, then fail loudly if either write was rejected.
mentions_text_post_response.raise_for_status()
mentions_pdf_post_response.raise_for_status()

<Response [200]>
<Response [200]>


In [32]:
mentions_text_post_response

<Response [200]>

### get and save citation-links annos

#### turn bibs json into real spangroup bibs

In [33]:
# bib_entry_boxgroups_json = bib_detector_response.json()['predictions'][0]['bib_entry_boxes']
bib_entry_boxgroups_json

[{'boxes': [{'left': 0.18146317915032664,
    'top': 0.866992529633839,
    'width': 0.6380922541862745,
    'height': 0.06077997036616102,
    'page': 12}],
  'id': 0,
  'type': 'bib_entry'},
 {'boxes': [{'left': 0.18147058823529413,
    'top': 0.1447024740656565,
    'width': 0.6380905871421568,
    'height': 0.032949968926767735,
    'page': 12}],
  'id': 1,
  'type': 'bib_entry'},
 {'boxes': [{'left': 0.18147058823529413,
    'top': 0.1817488250378788,
    'width': 0.638381949408497,
    'height': 0.05158693280303038,
    'page': 12}],
  'id': 2,
  'type': 'bib_entry'},
 {'boxes': [{'left': 0.1814687359640523,
    'top': 0.23720456310606067,
    'width': 0.6376549329460782,
    'height': 0.032949968926767735,
    'page': 12}],
  'id': 3,
  'type': 'bib_entry'},
 {'boxes': [{'left': 0.18146503142156847,
    'top': 0.5522123347601015,
    'width': 0.6379462952124177,
    'height': 0.05135935602272722,
    'page': 12}],
  'id': 4,
  'type': 'bib_entry'},
 {'boxes': [{'left': 0.1814668

In [34]:
# First annotate pages onto the doc.
# NOTE(review): the author was unsure why this is needed; annotating the bib box groups
# later appeared to require pages, presumably because boxes carry a page index.
# TODO confirm against Document.annotate.
doc.annotate(pages=instance_pages)
doc.pages[0]

SpanGroup(uuid='6a62f3eb-10f2-454d-a556-49b2f0fef701', doc=<mmda.types.document.Document object at 0x7f1888303970>, metadata=Metadata({'id': 0, 'type': None, 'text': None}), spans=[Span(start=0, end=1236, box=Box(l=0.08796004901960783, t=0.011190656565656636, w=0.8240833333333321, h=0.9737462121212122, page=0))], box_group=None, id=0, type=None, text='1\nFunctions of FMS-like tyrosine kinase 3 (flt3) in zebrafish hematopoiesis and its\nrelevance to human acute myeloid leukemia\nRunning Title: Flt3 in zebrafish hematopoiesis and AML\nBai-Liang He 1 , Xiangguo Shi 1 , Cheuk Him Man 1 , Alvin CH Ma 1,2 , Stephen C. Ekker 2 ,\nHoward CH Chow 1 , Chi Wai Eric So 3 , William WL Choi 4 , Wenqing Zhang 5 , Yiyue\nZhang 5 and Anskar YH Leung 1, *\n1 Division of Haematology, Department of Medicine, LKS Faculty of Medicine, The\nUniversity of Hong Kong\n2 Department of Biochemistry and Molecular Biology, Mayo Clinic, Rochester,\nMinnesota, USA\n3 Department of Haematological Medicine, King’s Coll

In [35]:
# ok guess it needs tokens, adding rows too
doc.annotate(tokens=instance_tokens)
doc.annotate(rows=instance_rows)

In [36]:
# bib_entry_boxgroups_json to spangroups to json:
from mmda.types.annotation import BoxGroup
from copy import copy

def box_groups_json_to_boxgroups(box_groups_json):
    """Convert model box-group JSON into mmda BoxGroup objects.

    Args:
        box_groups_json: Iterable of dicts, each with a ``"boxes"`` list whose items
            provide ``left``, ``top``, ``width``, ``height`` and ``page``.

    Returns:
        A list with one BoxGroup per input group, in input order. Each BoxGroup
        contains every box of its group and has its position in the input as id
        (the model-provided ``"id"`` is not used).

    Side effects:
        Prints the first BoxGroup for inspection when there is at least one.
    """
    boxgroups = [
        BoxGroup(
            # Keep all boxes of the group, not just the first one.
            boxes=[
                Box(
                    l=box_info['left'],
                    t=box_info['top'],
                    w=box_info['width'],
                    h=box_info['height'],
                    page=box_info['page']
                )
                for box_info in bg['boxes']
            ],
            id=group_id  # OR update models?
        )
        for group_id, bg in enumerate(box_groups_json)
    ]
    # Guard the debug print: indexing [0] on an empty result would raise IndexError.
    if boxgroups:
        print("boxgroups0")
        print(boxgroups[0])
    # Do not call doc.annotate(bibs=boxgroups) here; the original author notes that
    # annotating the same field a second time is rejected.
    return boxgroups


In [37]:
bib_box_groups = box_groups_json_to_boxgroups(bib_entry_boxgroups_json)

boxgroups0
BoxGroup(uuid='3d126f60-d7c8-44d8-94c0-237a123377be', doc=None, metadata=Metadata({'id': 0, 'type': None}), boxes=[Box(l=0.18146317915032664, t=0.866992529633839, w=0.6380922541862745, h=0.06077997036616102, page=12)], id=0, type=None)


In [38]:
# PROBLEM!!
for bib in bib_box_groups:
    print(bib.id)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18


In [39]:
doc.annotate(bibs=bib_box_groups)

In [40]:
doc.bibs[0]

SpanGroup(uuid='dcbcbb36-dd3f-4e58-9230-b28b477e4a58', doc=<mmda.types.document.Document object at 0x7f1888303970>, metadata=Metadata({'id': 0, 'type': None, 'text': None}), spans=[Span(start=31511, end=31513, box=Box(l=0.4932352941176471, t=0.9170906818181819, w=0.013970058823529467, h=0.010681818181818228, page=12)), Span(start=34105, end=34218, box=None)], box_group=BoxGroup(uuid='3d126f60-d7c8-44d8-94c0-237a123377be', doc=None, metadata=Metadata({'id': 0, 'type': None}), boxes=[Box(l=0.18146317915032664, t=0.866992529633839, w=0.6380922541862745, h=0.06077997036616102, page=12)], id=0, type=None), id=0, type=None, text='13 16. Stirewalt DL, Radich JP. The role of FLT3 in haematopoietic malignancies. Nat\nRev Cancer . 2003;3(9):650-665.')

In [41]:
[sg.to_json() for sg in doc.bibs][0]

{'spans': [{'start': 31511,
   'end': 31513,
   'box': {'left': 0.4932352941176471,
    'top': 0.9170906818181819,
    'width': 0.013970058823529467,
    'height': 0.010681818181818228,
    'page': 12}},
  {'start': 34105, 'end': 34218}],
 'box_group': {'boxes': [{'left': 0.18146317915032664,
    'top': 0.866992529633839,
    'width': 0.6380922541862745,
    'height': 0.06077997036616102,
    'page': 12}]},
 'id': 0}

#### and also mentions into real spangroups

In [42]:
mentions_annos_json[0]

{'spans': [{'start': 4376,
   'end': 4381,
   'box': {'left': 0.3673529411764706,
    'top': 0.45673090681818185,
    'width': 0.027257940653594773,
    'height': 0.00939974747474745,
    'page': 2}}],
 'box_group': None,
 'id': 0,
 'type': None,
 'text': None}

In [112]:
#lil hack
import itertools

counter = itertools.count()


def _mention_span_from_json(span_json):
    """Build an mmda Span, with its Box, from one span dict of the mentions response."""
    box_json = span_json['box']
    span_box = Box(
        l=box_json['left'],
        t=box_json['top'],
        w=box_json['width'],
        h=box_json['height'],
        page=box_json['page']
    )
    return Span(start=span_json['start'], end=span_json['end'], box=span_box)


# One SpanGroup per predicted mention, numbered consecutively from the counter.
mentions = [
    SpanGroup(
        spans=[_mention_span_from_json(span_json) for span_json in mention_json['spans']],
        id=next(counter)
    )
    for mention_json in mentions_annos_json
]
mentions[0]

SpanGroup(uuid='6b2a9671-5cd2-4756-be2c-8b45a86cd7a9', doc=None, metadata=Metadata({'id': 0, 'type': None, 'text': None}), spans=[Span(start=4376, end=4381, box=Box(l=0.3673529411764706, t=0.45673090681818185, w=0.027257940653594773, h=0.00939974747474745, page=2))], box_group=None, id=0, type=None, text='')

In [113]:
# get mentions from anno store:

# noooo it's too annoying cause they're split into box and spans

for mention in mentions:
    print(mention.id)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


In [71]:
citation_links_instance = {
    "symbols": symbols,
    "mentions" : [sg.to_json() for sg in mentions],
    "bibs" : [sg.to_json() for sg in doc.bibs],
}

citation_links_instance['bibs'][:2]

[{'spans': [{'start': 31511,
    'end': 31513,
    'box': {'left': 0.4932352941176471,
     'top': 0.9170906818181819,
     'width': 0.013970058823529467,
     'height': 0.010681818181818228,
     'page': 12}},
   {'start': 34105, 'end': 34218}],
  'box_group': {'boxes': [{'left': 0.18146317915032664,
     'top': 0.866992529633839,
     'width': 0.6380922541862745,
     'height': 0.06077997036616102,
     'page': 12}]},
  'id': 0},
 {'spans': [{'start': 31653, 'end': 31762}],
  'box_group': {'boxes': [{'left': 0.18147058823529413,
     'top': 0.1447024740656565,
     'width': 0.6380905871421568,
     'height': 0.032949968926767735,
     'page': 12}],
   'id': 1},
  'id': 1}]

In [72]:
# send the request:
citation_links_response = requests.post(
    'http://citation-links.v0.prod.models.s2.allenai.org/invocations',
    json={'instances': [citation_links_instance]}
)
# Stop on a failed invocation; later cells read
# citation_links_response.json()['predictions'][0]['linked_mentions'].
citation_links_response.raise_for_status()
print(citation_links_response) # had to make sure SpanGroup.to_json() includes id

# SpanGroup 'id' is the group id.
# When it goes to the anno store...

<Response [200]>


In [73]:
citation_links_response.json() # returns (mention.id, bib.id)

{'predictions': [{'linked_mentions': [['0', '0'],
    ['1', '14'],
    ['2', '14'],
    ['3', '15'],
    ['4', '15'],
    ['5', '15'],
    ['6', '15'],
    ['7', '15'],
    ['8', '11'],
    ['9', '12'],
    ['10', '13'],
    ['11', '14'],
    ['12', '15'],
    ['13', '18'],
    ['14', '4'],
    ['15', '15'],
    ['16', '0'],
    ['17', '11'],
    ['18', '15'],
    ['19', '0'],
    ['20', '0'],
    ['21', '0'],
    ['22', '15'],
    ['23', '0'],
    ['24', '0'],
    ['25', '12']]}]}

#### now save to plain-text anno store

In [103]:
# map of annotation-id to {"bib-entry"}

# "test_mentions" has fake spangroups w/ multiple spans (sg 0 and 3)
mentions_source = "mentions-test-3"
link_anno_id_base = f"{mentions_source}/mentions"

links_source = "citation-links-test-1"

# This works as long as the annotations come back from the anno store in the same
# order they were written, which they should.
#
# The annotation id in the store can differ from the 'id' returned by the links
# model, because span groups are flattened to one annotation per span when stored.
# For example 'mentions-test-3/mentions/1' belongs to group 0 if group 0 has two
# spans. So we need a map from group id to the annotation id of the group's first
# span, to write attributes onto the correct annotations.

# Build independent SpanGroups for the experiment. `copy(mentions)` would only copy
# the list: its elements are the same objects as in `mentions`, so assigning `.spans`
# below would silently overwrite the real mentions used by later cells.
test_mentions = [SpanGroup(spans=list(sg.spans), id=sg.id) for sg in mentions]
two_spans = [Span(start=4376, end=4381, box=Box(l=0.3673529411764706, t=0.45673090681818185, w=0.027257940653594773, h=0.00939974747474745, page=2)),
             Span(start=4840, end=4845, box=Box(l=0.36411764705882355, t=0.5862763613636364, w=0.02755158876470585, h=0.00939974747474745, page=2))]
test_mentions[0].spans = two_spans
test_mentions[3].spans = two_spans

group_to_anno_id = dict()
counter = 0
for sg in test_mentions:
    # The group's first span receives the running annotation index...
    group_to_anno_id[sg.id] = counter
    # ...and each of its spans consumes one index.
    counter += len(sg.spans)
print("group, anno_id")
for k, v in group_to_anno_id.items():
    print(k, v)

group, anno_id
0 0
1 2
2 3
3 4
4 6
5 7
6 8
7 9
8 10
9 11
10 12
11 13
12 14
13 15
14 16
15 17
16 18
17 19
18 20
19 21
20 22
21 23
22 24
23 25
24 26
25 27


In [114]:
# for reals
# Map each mention group id to the annotation index of its first span, counting one
# index per span in storage order.
group_to_anno_id = {}
anno_id_counter = 0
for mention_group in mentions:
    group_to_anno_id[mention_group.id] = anno_id_counter
    anno_id_counter += len(mention_group.spans)

print("group, anno_id")
for group_id, first_anno_id in group_to_anno_id.items():
    print(group_id, first_anno_id)
    

group, anno_id
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
16 16
17 17
18 18
19 19
20 20
21 21
22 22
23 23
24 24
25 25


In [117]:
mentions_source = "mentions-test-3"
link_anno_id_base = f"{mentions_source}/mentions"

# The endpoint is the same for every link, so build it once outside the loop.
# NOTE: `links_source` is assigned in the earlier cell that builds the test mapping.
link_attributes_url = (
    f'http://annotations-api.dev.s2.allenai.org/plain-text/{dev_text_id}'
    f'/annotations/{links_source}/attributes'
)
linked_mentions = citation_links_response.json()['predictions'][0]['linked_mentions']

for mention_group_id, bib_group_id in linked_mentions:
    # The model returns ids as strings. Go through group_to_anno_id rather than using
    # the group id directly: the two only coincide when every span group has exactly
    # one span.
    anno_id = f"{link_anno_id_base}/{group_to_anno_id[int(mention_group_id)]}"
    linked_bib = int(bib_group_id) # int!
    attr_post_response = requests.post(
        link_attributes_url,
        json={anno_id: {"bib-entry-group-id": linked_bib}}
    )
    # Stop at the first rejected write so a partially stored link set is noticed.
    attr_post_response.raise_for_status()

### get and save bib structuring data to anno store

In [96]:
# symbols, tokens, pages, bib_entry_boxes
bib_struct_instance = {
    "symbols": symbols,
    "tokens": [sg.to_json() for sg in instance_tokens],
    "pages": [sg.to_json() for sg in instance_pages],
    "bib_entry_boxes" : bib_entry_boxgroups_json, # the boxes, not spangroups
}

bib_struct_instance['bib_entry_boxes'][:2]

[{'boxes': [{'left': 0.18146317915032664,
    'top': 0.866992529633839,
    'width': 0.6380922541862745,
    'height': 0.06077997036616102,
    'page': 12}],
  'id': 0,
  'type': 'bib_entry'},
 {'boxes': [{'left': 0.18147058823529413,
    'top': 0.1447024740656565,
    'width': 0.6380905871421568,
    'height': 0.032949968926767735,
    'page': 12}],
  'id': 1,
  'type': 'bib_entry'}]

In [97]:
# Send the instance to the bib entry predictor and print the response object.
bib_struct_request_body = {'instances': [bib_struct_instance]}
bib_struct_response = requests.post(
    'http://bibentry-predictor-mmda.v0.prod.models.s2.allenai.org/invocations',
    json=bib_struct_request_body,
)
print(bib_struct_response)

<Response [200]>


In [98]:
# Display the parsed JSON body returned by the bib entry predictor.
bib_struct_response.json()

{'predictions': [{'bib_entry_number': [{'spans': [{'start': 31511,
       'end': 31513,
       'box': None},
      {'start': 34105, 'end': 34107, 'box': None}],
     'box_group': None,
     'id': None,
     'type': None,
     'text': ''},
    {'spans': [{'start': 31653, 'end': 31654, 'box': None}],
     'box_group': None,
     'id': None,
     'type': None,
     'text': ''},
    {'spans': [{'start': 31763, 'end': 31764, 'box': None}],
     'box_group': None,
     'id': None,
     'type': None,
     'text': ''},
    {'spans': [{'start': 31971, 'end': 31972, 'box': None}],
     'box_group': None,
     'id': None,
     'type': None,
     'text': ''},
    {'spans': [{'start': 32329, 'end': 32330, 'box': None}],
     'box_group': None,
     'id': None,
     'type': None,
     'text': ''},
    {'spans': [{'start': 32821, 'end': 32822, 'box': None}],
     'box_group': None,
     'id': None,
     'type': None,
     'text': ''},
    {'spans': [{'start': 33036, 'end': 33038, 'box': None}],
     

In [127]:
# Display the prediction field names.
# NOTE(review): prediction_fields is assigned in the In [138] cell, which appears
# after this one in the notebook; on a fresh kernel this cell would run before it.
prediction_fields

dict_keys(['bib_entry_number', 'bib_entry_authors', 'bib_entry_title', 'bib_entry_venue_or_event', 'bib_entry_year', 'bib_entry_doi', 'bib_entry_url'])

In [138]:
# spangroups_json to mmda_spangroups:
# Parse the response body once instead of re-parsing it for every field.
bib_struct_predictions = bib_struct_response.json()['predictions'][0]
prediction_fields = bib_struct_predictions.keys()

# For each predicted field, rebuild the mmda span groups from their JSON form.
fields_to_annos = {
    field_name: [SpanGroup.from_json(sg_json) for sg_json in field_preds]
    for field_name, field_preds in bib_struct_predictions.items()
}

# doc.annotate takes the field name as the keyword itself, so writing
# `doc.annotate(f=...)` registers a field literally named "f". Unpacking a
# one-entry dict passes the real field name, which replaces the seven
# hand-written calls. Each field is still annotated in its own call, in
# response order.
# NOTE: fields with no predictions (bib_entry_doi and bib_entry_url in this run)
# are still passed to annotate, as before, and trigger its "empty" warning.
for field_name, field_spangroups in fields_to_annos.items():
    doc.annotate(**{field_name: field_spangroups})












In [139]:
# Display the names of the fields currently registered on the document.
doc.fields

['vila',
 'pages',
 'tokens',
 'rows',
 'bibs',
 'f',
 'field_string',
 'bib_entry_number',
 'bib_entry_authors',
 'bib_entry_title',
 'bib_entry_venue_or_event',
 'bib_entry_year']

In [170]:
# Inspect the author span groups of the first bib entry:
# the whole list, then its first element, then each author's text.
first_bib_authors = doc.bibs[0].bib_entry_authors
print(first_bib_authors)
print()
print(first_bib_authors[0])
print()
for author_group in first_bib_authors:
    print(author_group.text)

[SpanGroup(uuid='a246166a-8fb8-4afe-9c0d-5ebb85cc554a', doc=<mmda.types.document.Document object at 0x7f1888303970>, metadata=Metadata({'id': None, 'type': None, 'text': ''}), spans=[Span(start=34109, end=34121, box=None)], box_group=None, id=None, type=None, text='Stirewalt DL'), SpanGroup(uuid='0d2a5ab0-e827-4550-bfd8-78cdf4b5cfaf', doc=<mmda.types.document.Document object at 0x7f1888303970>, metadata=Metadata({'id': None, 'type': None, 'text': ''}), spans=[Span(start=34123, end=34132, box=None)], box_group=None, id=None, type=None, text='Radich JP')]

SpanGroup(uuid='a246166a-8fb8-4afe-9c0d-5ebb85cc554a', doc=<mmda.types.document.Document object at 0x7f1888303970>, metadata=Metadata({'id': None, 'type': None, 'text': ''}), spans=[Span(start=34109, end=34121, box=None)], box_group=None, id=None, type=None, text='Stirewalt DL')

Stirewalt DL
Radich JP


#### save struct data to anno-store

In [171]:
# Span groups from the structuring model can hold more than one span (the first
# bib_entry_number prediction above has two), so every span is emitted.
def spangroups_to_text_anno_arrays(spangroups):
    """Flatten span groups into one annotation dict per span.

    Args:
        spangroups: iterable of objects exposing ``spans`` (each span having
            ``start`` and ``end``) and ``text``.

    Returns:
        A list of dicts shaped like
        ``{"startChar": ..., "endChar": ..., "attributes": {"group": ..., "text": ...}}``,
        in input order. ``group`` is the 0-based position of the span's group
        within ``spangroups``. ``text`` is the text of the whole group, so all
        spans of a multi-span group carry the same text.
    """
    return [
        {
            "startChar": span.start,
            "endChar": span.end,
            "attributes": {"group": group_index, "text": sg.text},
        }
        for group_index, sg in enumerate(spangroups)
        for span in sg.spans
    ]

In [177]:
# Convert each annotated bib field on the doc into its flat per-span array.
bib_entry_numbers = spangroups_to_text_anno_arrays(doc.bib_entry_number)
bib_entry_authors = spangroups_to_text_anno_arrays(doc.bib_entry_authors)
bib_entry_title = spangroups_to_text_anno_arrays(doc.bib_entry_title)
bib_entry_venue_or_event = spangroups_to_text_anno_arrays(doc.bib_entry_venue_or_event)
bib_entry_year = spangroups_to_text_anno_arrays(doc.bib_entry_year)

# bib_entry_doi and bib_entry_url are skipped: those attributes do not exist on
# the doc since there were no annotations for them.

# in scala make a big dict with these annos types to spans
annos = {
    "bib_entry_numbers": bib_entry_numbers,
    "bib_entry_authors": bib_entry_authors,
    "bib_entry_title": bib_entry_title,
    "bib_entry_venue_or_event": bib_entry_venue_or_event,
    "bib_entry_year": bib_entry_year,
}

In [174]:
# in scala wrap each in Try, then if this attribute error, skip
# The doc has no bib_entry_url attribute when there were no annotations for it,
# so the lookup raises AttributeError. Catch and display it so the demonstration
# does not halt a full notebook run.
try:
    url_spangroups = doc.bib_entry_url
except AttributeError as err:
    print(f"AttributeError: {err}")
else:
    bib_entry_url = spangroups_to_text_anno_arrays(url_spangroups)

AttributeError: 'Document' object has no attribute 'bib_entry_url'

In [176]:
# Show the first converted annotation for the number and title fields, one per line.
print(bib_entry_numbers[0], bib_entry_title[0], sep="\n")

{'startChar': 31511, 'endChar': 31513, 'attributes': {'group': 0, 'text': '13 16'}}
{'startChar': 34134, 'endChar': 34181, 'attributes': {'group': 0, 'text': 'The role of FLT3 in haematopoietic malignancies'}}


In [193]:
# Double-check the payload: first number annotation, the first 300 characters of
# the JSON serialization, then the full dict, each separated by a blank line.
import json

first_number_anno = annos['bib_entry_numbers'][0]
annos_json_preview = json.dumps(annos)[:300]
print(first_number_anno, annos_json_preview, annos, sep="\n\n")

{'startChar': 31511, 'endChar': 31513, 'attributes': {'group': 0, 'text': '13 16'}}

{"bib_entry_numbers": [{"startChar": 31511, "endChar": 31513, "attributes": {"group": 0, "text": "13 16"}}, {"startChar": 34105, "endChar": 34107, "attributes": {"group": 0, "text": "13 16"}}, {"startChar": 31653, "endChar": 31654, "attributes": {"group": 1, "text": "2"}}, {"startChar": 31763, "endC

{'bib_entry_numbers': [{'startChar': 31511, 'endChar': 31513, 'attributes': {'group': 0, 'text': '13 16'}}, {'startChar': 34105, 'endChar': 34107, 'attributes': {'group': 0, 'text': '13 16'}}, {'startChar': 31653, 'endChar': 31654, 'attributes': {'group': 1, 'text': '2'}}, {'startChar': 31763, 'endChar': 31764, 'attributes': {'group': 2, 'text': '3'}}, {'startChar': 31971, 'endChar': 31972, 'attributes': {'group': 3, 'text': '4'}}, {'startChar': 32329, 'endChar': 32330, 'attributes': {'group': 4, 'text': '6'}}, {'startChar': 32821, 'endChar': 32822, 'attributes': {'group': 5, 'text': '9'}}, {'startChar': 3

In [194]:
# Post the bib structuring annotations to the dev annotation store under this source.
source = "bib-struct-test-1"
bib_struct_annos_url = f'http://annotations-api.dev.s2.allenai.org/plain-text/{dev_text_id}/annotations/{source}'
bib_struct_post_response = requests.post(bib_struct_annos_url, json=annos)
bib_struct_post_response

<Response [200]>

In [197]:
# Display the response of the bib structuring annotations POST again.
bib_struct_post_response

<Response [200]>

### store the source info

In [205]:
# to doc level pdf attributes
source = "spp-test-1"

# Sources used for bib entry detection.
bib_entries_sources = {
    "annotation-type": "PDF",
    "bib-detection-source": "bib_detector_test_3",  # annotation and attribute source
    # "bib-grouping-source": what it is  # if any. attribute source
}

# mentions model returns spans w/boxes, so annos are split, grouped by "_group" attribute
mentions_sources = {
    "annotation-type": ["PlainText", "PDF"],  # or token stream
    "mention-detection-source": "mentions-test-3",  # annotation and attribute source
    "bib-mention-linker-source": "citation-links-test-1",  # attribute source
}

# NOTE(review): the "bib-struct-test-1" annotations were posted to the plain-text
# endpoint in the In [194] cell, while "annotation-type" here says "PDF" — confirm.
bib_entry_parses_sources = {
    "annotation-type": "PDF",
    "bib-parsing-source": "bib-struct-test-1",  # annotation and attribute source
}

attributes = {
    "plain-text-doc-id": "740c1d64752b4679a742544046cf5de3452cfef4",
    "pdf-plumber": "pdfplumber-0.0.4",  # pages, tokens, rows (from prod env) -- PlainText and PDF anno stores
    # "dwp": "tbd"  # words
    "bib-entries": bib_entries_sources,
    "mentions": mentions_sources,
    "bib-entry-parses": bib_entry_parses_sources,
}

source_attributes_url = f'http://annotations-api.dev.s2.allenai.org/pdf/{dev_doc_id}/attributes/{source}'
requests.post(source_attributes_url, json=attributes)

<Response [200]>