In [1]:
%load_ext autoreload
%autoreload 2

## Augment basic MMDA document with Grobid Annotations

#### Generate a basic PdfPlumber parsed MMDA doc


In [2]:

PDF_PATH = '../../tests/fixtures/grobid_augment_existing_document_parser/0c027af0ee9c1901c57f6579d903aedee7f4.pdf'
from mmda.parsers import PDFPlumberParser
from mmda.types import Document
# PDF to text
pdf_plumber = PDFPlumberParser()
doc: Document = pdf_plumber.parse(input_pdf_path=PDF_PATH)
doc.fields

['tokens', 'rows', 'pages']

#### Pass the doc as well as the PDF to the Grobid parser

In [3]:

from mmda.parsers.grobid_augment_existing_document_parser import GrobidAugmentExistingDocumentParser
parser = GrobidAugmentExistingDocumentParser(config_path='../../src/mmda/parsers/grobid.config', check_server=True)

GROBID server is up and running


In [4]:
doc = parser.parse(PDF_PATH, doc, ".")



In [5]:
doc.fields

['tokens', 'rows', 'pages', 'authors', 'bib_entries']

## XML Playground

In [6]:
import xml.etree.ElementTree as et
from collections import defaultdict

XML_PATH = '../../tests/fixtures/grobid_augment_existing_document_parser/0c027af0ee9c1901c57f6579d903aedee7f4.xml'

xml = open(XML_PATH).read()

xml_root = et.fromstring(xml)

parser._cache_page_sizes(xml_root)

parser.page_sizes


{0: [612.0, 792.0],
 1: [612.0, 792.0],
 2: [612.0, 792.0],
 3: [612.0, 792.0],
 4: [612.0, 792.0],
 5: [612.0, 792.0],
 6: [612.0, 792.0],
 7: [612.0, 792.0],
 8: [612.0, 792.0],
 9: [612.0, 792.0]}

In [7]:
NS = {"tei": "http://www.tei-c.org/ns/1.0"}

for child in xml_root:
    print(child.tag)

{http://www.tei-c.org/ns/1.0}teiHeader
{http://www.tei-c.org/ns/1.0}facsimile
{http://www.tei-c.org/ns/1.0}text


In [8]:

author_list_root = xml_root.find(".//tei:sourceDesc", NS)
print('author_list_root', author_list_root)

author_names = []
author_name_structs = author_list_root.findall(".//tei:persName", NS)

for a in author_name_structs:
    coords_string = a.attrib["coords"]
    boxes = parser._xml_coords_to_boxes(coords_string)
author_names


author_list_root <Element '{http://www.tei-c.org/ns/1.0}sourceDesc' at 0x15ac39ea0>


[]

### Check out the Bibliography Entries

In [9]:
for bib in doc.bib_entries[:3]:
    print(bib.id, bib.box_group.id)

0 None
1 None
2 None


In [10]:
for bib in doc.bib_entries[:3]:
    print('\n\n', bib.spans, '\n', bib.text)



 [Span(start=34511, end=34621, box=Box(l=0.11442973856209156, t=0.3006296222222226, w=0.35348235294117647, h=0.03977661818181799, page=8))] 
 ISPRS 2D Semantic Labeling Challenge.
http:
//www2.isprs.org/commissions/comm3/wg4/
semantic-labeling.html . 4


 [Span(start=34626, end=34810, box=Box(l=0.11442973856209161, t=0.3449347757575761, w=0.35348859869281046, h=0.05283257575757566, page=8))] 
 N. Audebert, B. Saux, and S. Lefvre. Beyond RGB: Very
High Resolution Urban Remote Sensing with Multimodal
Deep Networks. ISPRS Journal of Photogrammetry and Re-
mote Sensing , 2018. 2


 [Span(start=34815, end=35032, box=Box(l=0.11442973856209161, t=0.4022946242424245, w=0.35348910588235294, h=0.05283257575757577, page=8))] 
 V. Badrinarayanan, A. Kendall, and R. Cipolla. SegNet: A
Deep Convolutional Encoder-Decoder Architecture for Im-
age Segmentation. IEEE Transactions on Pattern Analysis
and Machine Intelligence (TPAMI) , 2017. 2, 4, 7


### Check out the Author names

In [11]:
for author_name in doc.authors:
    print('\n\n', author_name.spans, '\n', author_name.text)



 [Span(start=87, end=99, box=Box(l=0.20770588235294118, t=0.20625141818181814, w=0.11447299346405226, h=0.015094949494949494, page=0))] 
 Xueqing Deng


 [Span(start=100, end=109, box=Box(l=0.2191045751633987, t=0.22386126666666661, w=0.09167606797385619, h=0.015094949494949494, page=0))] 
 UC Merced


 [Span(start=130, end=136, box=Box(l=0.4588872549019608, t=0.20625141818181814, w=0.0548142013071895, h=0.015094949494949494, page=0))] 
 Yi Zhu


 [Span(start=205, end=217, box=Box(l=0.42348039215686284, t=0.26663020606060595, w=0.12562727320261435, h=0.015094949494949494, page=0))] 
 Shawn Newsam
