## Exploring the MMDA data model

In [None]:
import json
from typing import *

from mmda.types import *
from mmda.parsers.pdfplumber_parser import PDFPlumberParser
from mmda.rasterizers.rasterizer import PDF2ImageRasterizer

# Identifier of the paper used throughout this notebook.
sha = 'cca0ef8230a85e9dc233980ebb8d5778b8982394'
# Derive the local path from `sha` so the two cannot drift apart when the
# paper is changed (the PDF file is named after the SHA).
pdf_loc = f'/Users/yogic/git/mmda/pdfs/{sha}.pdf'

# Rasterize the PDF at 72 dpi and keep the result as a list so it can be indexed.
images = list(PDF2ImageRasterizer().rasterize(pdf_loc, dpi=72))
# Bare expression on the cell's last line: the notebook renders the first image.
images[0]

The __parser__ produces rows, tokens, and pages. All are of type List[SpanGroup].

Within each SpanGroup, every span also has a box.

In [None]:
# Parse the PDF at `pdf_loc` with PDFPlumberParser; the result is read below
# through `doc.tokens` and `doc.rows`.
doc = PDFPlumberParser().parse(pdf_loc)

def print_anno(anno):
    """Print an annotation as a type label followed by its indented JSON.

    Only ``SpanGroup`` and ``BoxGroup`` annotations are printed; any other
    value is ignored and nothing is written.

    Args:
        anno: the annotation to print; its ``to_json()`` result is dumped
            with a two-space indent.
    """
    # isinstance (rather than an exact ``type(...) is`` match) so that
    # subclasses of the two annotation types are printed as well.
    if isinstance(anno, SpanGroup):
        label = "SpanGroup "
    elif isinstance(anno, BoxGroup):
        label = "BoxGroup "
    else:
        return
    # Label and JSON share the first output line, hence end=''.
    print(label, end='')
    print(json.dumps(anno.to_json(), indent=2))

# Show the first token produced by the parser, prefixed by a caption.
print("the first token = ", end='')
print_anno(doc.tokens[0])
# Same for the first row.
print("the first row = ", end='')
print_anno(doc.rows[0])

Let's pretend to be some models called layoutparser and vila.

* LayoutParser https://github.com/allenai/mmda/blob/451004a9945646125d18121066517d7631de5618/mmda/predictors/lp_predictors.py#L89
* Vila https://github.com/allenai/mmda/blob/451004a9945646125d18121066517d7631de5618/mmda/predictors/hf_predictors/vila_predictor.py#L176

In [None]:
# from layoutparser, returns groups of boxes
layout_box = Box(0.1, 0.1, 0.2, 0.2, 1)
boxes: List[BoxGroup] = [BoxGroup(id=1, boxes=[layout_box])]
# from vila, returns groups of spans (individual spans have no box)
title_span = Span(start=5, end=45, box=None)
elements: List[SpanGroup] = [SpanGroup(id=1, type='Title', spans=[title_span])]

# Dump both hand-made predictions before attaching them to the document.
for anno in (boxes[0], elements[0]):
    print_anno(anno)

# Attach each prediction to the parsed document under its own field name.
doc.annotate(boxes=boxes)
doc.annotate(elements=elements)

MMDA converts `BoxGroup`s to `SpanGroup`s when annotated on a document. It leaves the original boxes you gave as input as a property called `box_group` on each `SpanGroup`.

In [None]:
# Grab the first annotated box group once and work from its JSON form.
first_annotated_box = doc.boxes[0]
first_box_group = first_annotated_box.to_json()
# Keep only the first two spans so that it prints less.
first_box_group['spans'] = first_box_group['spans'][:2]

print(type(first_annotated_box))
print(json.dumps(first_box_group, indent=2))

We can use MMDA to jump between annotation types.

In [None]:
# Tokens reachable from the first page.
first_page_token_count = len(doc.pages[0].tokens)
print(f"number of tokens in the first page = {first_page_token_count}")
# Rows reachable from the second page.
second_page_row_count = len(doc.pages[1].rows)
print(f"number of rows in the second page = {second_page_row_count}")

In [None]:
# Print the text of every token reachable from the first row, one per line.
first_row = doc.rows[0]
for token in first_row.tokens:
    print(token.symbols)

Every document stores all of its text in a property called `symbols`; each `SpanGroup` derives a property of the same name from it.

The example below really shows the strength of MMDA in action. We annotated the document with a Box and now are able to retrieve the text enclosed by that box.

In [None]:
# `symbols` of the first annotated box group. This is a bare expression on the
# cell's last line, so the notebook displays its value as the cell output.
doc.boxes[0].symbols

The "vila" prediction doesn't contain boxes. Yet, we can find the overlapping tokens and list their boxes instead for a visual perspective.

In [None]:
# Dump the first "vila" element, followed by a blank separator line.
first_element = doc.elements[0]
print_anno(first_element)
print()

# The element was created with box-less spans, so print the boxes of the
# tokens reachable from it instead.
for token in first_element.tokens:
    for span in token.spans:
        print(span.box.to_json())

# Annotation Store

* __Document__ - Some immutable file. Can be text, pdf, or token-stream. The document type determines the annotations it can have.
* __Annotation__ - "sources" provide boxes as annotations for pdf documents, and spans for text documents.
* __Attributes__ - "sources" provide attributes attached to documents and/or annotations.

PDFAnnotation = (page, x, y, width, height, attributes)

TextAnnotation = (start, end, attributes)

## Document

In [None]:
import requests

# Base URL of the annotations API used by every request in this notebook.
url = "https://annotations-api.prod.s2.allenai.org"

# S3 location of the PDF: the first four characters of the SHA form the
# directory, the remainder the file name.
s3_url = f"s3://ai2-s2-pdfs/{sha[:4]}/{sha[4:]}.pdf"
# POST the S3 location to the `/pdf` endpoint. The timeout keeps the cell
# from hanging forever if the service never answers.
anno_doc_resp = requests.post(f"{url}/pdf", json={"s3Url": s3_url}, timeout=60)
# Fail loudly on an HTTP error instead of printing an error body as the document.
anno_doc_resp.raise_for_status()
anno_doc = anno_doc_resp.json()
print(json.dumps(anno_doc, indent=2))

## Attributes

In [None]:
anno_doc_id = anno_doc['id']
# source = 'yogi-data-team-meeting-demo'

# Query the PDF's annotations endpoint with annotationSources="none" and
# attributeSources="all" (presumably: attributes only, from every source).
attrs_resp = requests.get(
    f"{url}/pdf/{anno_doc_id}/annotations",
    params={"annotationSources": "none", "attributeSources": "all"},
    timeout=60,  # do not hang forever on an unresponsive service
)
# Surface HTTP errors here rather than printing an error payload.
attrs_resp.raise_for_status()
resp = attrs_resp.json()
print(json.dumps(resp, indent=2))

Re-creating the MMDA doc from Annotation Store

In [None]:
# first we need to get the doc-id associated with the text file
# The URL is built from `sha` so it always points at the same paper as the PDF.
s3_text_url = f's3://ai2-s2-science-parse-plus-prod/document/{sha}/pdfplumber-0.0.5/text'
text_doc_resp = requests.post(
    f"{url}/plain-text",
    json={"s3Url": s3_text_url},
    timeout=60,  # do not hang forever on an unresponsive service
)
# Fail on an HTTP error instead of raising an obscure KeyError on "id" below.
text_doc_resp.raise_for_status()
anno_text_doc_id = text_doc_resp.json()["id"]
print(anno_text_doc_id)

In [None]:
def _fetch_source_annotations(doc_kind, doc_id, source="pdfplumber-0.0.5"):
    """Return the annotations one source attached to one document.

    Args:
        doc_kind: URL segment of the document type ("pdf" or "plain-text").
        doc_id: id of the document in the annotation store.
        source: annotation source to request; also the key read from the
            reply's ``annotationsFromSource`` mapping.

    Raises:
        requests.HTTPError: if the service answers with an error status.
    """
    reply = requests.get(
        f"{url}/{doc_kind}/{doc_id}/annotations",
        params={"annotationSources": source, "attributeSources": "none"},
        timeout=60,  # do not hang forever on an unresponsive service
    )
    # Fail here rather than with a KeyError on an error payload below.
    reply.raise_for_status()
    return reply.json()['annotationsFromSource'][source]


anno_pdf_annotations = _fetch_source_annotations("pdf", anno_doc_id)
anno_text_annotations = _fetch_source_annotations("plain-text", anno_text_doc_id)

In [None]:
# Show the first page annotation of each kind, side by side for comparison.
first_pdf_page_json = json.dumps(anno_pdf_annotations['pages'][0], indent=2)
print("PDFAnnotation " + first_pdf_page_json)
first_text_page_json = json.dumps(anno_text_annotations['pages'][0], indent=2)
print("TextAnnotation " + first_text_page_json)

In [None]:
from collections import defaultdict

def mk_mmda_annos(text_annos, pdf_annos, source: str = 'pdfplumber-0.0.5') -> List[SpanGroup]:
    """Rebuild MMDA ``SpanGroup``s from annotation-store text and PDF annotations.

    Annotations are bucketed by the ``_group`` attribute stored under
    ``attributesFromSource[source]``. Within a group, the i-th text annotation
    supplies the character range and the i-th PDF annotation supplies the box
    of the i-th span.

    Args:
        text_annos: text annotations; each is read for ``startChar``,
            ``endChar`` and ``attributesFromSource[source]['_group']``.
        pdf_annos: PDF annotations; each is read for ``x``, ``y``, ``width``,
            ``height``, ``page`` and ``attributesFromSource[source]['_group']``.
        source: name of the annotation source whose ``_group`` attribute
            drives the grouping.

    Returns:
        One ``SpanGroup`` per group found in ``text_annos``, in sorted group order.

    Raises:
        KeyError: if a group present in ``text_annos`` has no PDF annotations.
    """
    def group_of(anno):
        return anno['attributesFromSource'][source]['_group']

    text_by_group = defaultdict(list)
    for text_anno in text_annos:
        text_by_group[group_of(text_anno)].append(text_anno)
    pdf_by_group = defaultdict(list)
    for pdf_anno in pdf_annos:
        pdf_by_group[group_of(pdf_anno)].append(pdf_anno)
    # Plain dict from here on: a group missing on the PDF side must raise
    # KeyError rather than be silently created as an empty list.
    pdf_by_group = dict(pdf_by_group)

    span_groups = []
    for key in sorted(text_by_group):
        group_text_annos = text_by_group[key]
        group_pdf_annos = pdf_by_group[key]
        # NOTE(review): zip stops at the shorter list, so a count mismatch
        # between the two sides drops the extra annotations silently.
        spans = [
            Span(
                start=text_anno['startChar'],
                end=text_anno['endChar'],
                box=Box(
                    l=pdf_anno['x'],
                    t=pdf_anno['y'],
                    w=pdf_anno['width'],
                    h=pdf_anno['height'],
                    page=pdf_anno['page'],
                ),
            )
            for text_anno, pdf_anno in zip(group_text_annos, group_pdf_annos)
        ]
        # The group id is taken from the PDF side, as recorded in the store.
        span_groups.append(SpanGroup(spans=spans, id=group_of(group_pdf_annos[0])))
    return span_groups

# Rebuild each layer from the matching text and PDF annotation lists.
tokens, rows, pages = (
    mk_mmda_annos(anno_text_annotations[layer], anno_pdf_annotations[layer])
    for layer in ('tokens', 'rows', 'pages')
)

In [None]:
# Re-use the text from the parsed doc. IRL, we get the text from S3.
new_doc = Document(doc.symbols)
# Attach the rebuilt layers one at a time, in tokens -> rows -> pages order.
for field_name, span_groups in (("tokens", tokens), ("rows", rows), ("pages", pages)):
    new_doc.annotate(**{field_name: span_groups})

In [None]:
def _print_first_row_tokens(document):
    """Print the symbols of each token in the document's first row, one per line."""
    for row_token in document.rows[0].tokens:
        print(row_token.symbols)


# This is the doc we parsed at the start.
_print_first_row_tokens(doc)
print("-------")
# New doc we re-constructed from the data SPP stored in the annotation store.
_print_first_row_tokens(new_doc)

## New!!

Python folk (or those forced into it through the magic of MMDA) no longer need to bother with how SPP stores MMDA annotations in the annotation store.

https://github.com/allenai/spp/tree/main/client

Don't care about MMDA but are still curious about the annotation store?

https://github.com/allenai/annotation-store/tree/main/clients/python

https://github.com/allenai/annotation-store/tree/main/clients/scala

## Thank you!