# SP+ End-to-end Evaluation: Annotation

## 0. Setting up your environment
### Install spp client to your local device: 
`pip install -i https://pip.s2.allenai.org/simple spp_client==0.3.0`
`pip install --extra-index-url https://pip.s2.allenai.org/simple spp_client==0.3.0`

### Then install an upgraded version of MMDA:  
`pip install -i https://pip.s2.allenai.org/simple mmda==0.3.5`
`pip install --extra-index-url https://pip.s2.allenai.org/simple mmda==0.3.5`

## 1. Get data from annotation store via spp_client

### 1a. Get and set sources
Retrieve information about the sources for each annotation - mentions, bib-entries, bib-entry-parses, and pdf plumber

In [1]:
####################################################################
# ANNOTATOR TODO: change this to the sha you're analyzing
####################################################################

# This sha will get overwritten when running the autogenerator script.
# NOTE(review): `sha` is assigned twice below; only the second value takes
# effect, the first assignment is dead.
sha = 'd17961483e3eb55df82cc16a01d4519af1b4dbbd'
sha = '61f1ac16610fcc20ba49432aa7a5943ad5b7fdd8'


In [2]:
# SPP version for Eval Sample.
# Passed to client.get_document() together with `sha` when fetching annotations.
spp_version = "dag-0.0.4"

In [3]:
# See step 0 -- should have MMDA version 0.3.5
! pip show mmda

Name: mmda
Version: 0.3.5
Summary: MMDA - multimodal document analysis
Home-page: 
Author: 
Author-email: Allen Institute for Artificial Intelligence <contact@allenai.org>
License: Apache-2.0
Location: /Users/angelez/miniconda3/envs/mmda_bailey_annot_b058/lib/python3.8/site-packages
Requires: ncls, necessary, pandas, pdf2image, pdfplumber, pydantic, requests, tqdm
Required-by: spp-client


In [9]:
import sys

# Drop a local mmda source checkout from the module search path so imports
# resolve against the pip-installed package instead.
# NOTE(review): the path is machine-specific -- adjust it for your own checkout.
try:
    sys.path.remove('/Users/angelez/src/mmda')
except ValueError:
    # The path is not on sys.path (already removed, or a different machine);
    # there is nothing to undo, so re-running this cell is safe.
    pass

In [10]:
import sys
# Sanity check: show which interpreter is running and where modules are looked up.
print(sys.version)
print("\n".join(sys.path))

3.8.16 | packaged by conda-forge | (default, Feb  1 2023, 16:05:36) 
[Clang 14.0.6 ]
/Users/angelez/miniconda3/envs/mmda_bailey_annot_b058/lib/python38.zip
/Users/angelez/miniconda3/envs/mmda_bailey_annot_b058/lib/python3.8
/Users/angelez/miniconda3/envs/mmda_bailey_annot_b058/lib/python3.8/lib-dynload

/Users/angelez/miniconda3/envs/mmda_bailey_annot_b058/lib/python3.8/site-packages


In [6]:
# See step 0 -- should have spp_client version 0.3.0
# ! pip show spp-client


### 1b. Retrieve Annotations
Using the sources obtained from the previous cell, we will now retrieve the annotations for each source via spp_client.

In [12]:
# Get annotations and data as text using spp client
from spp_client.client import (
    mmda_span_group_to_text_via_dwp_words,
    RelationTypeId,
    SppClient,
)
# Client for the annotation store, built for the "dev" environment.
client = SppClient.mk(env="dev")

# Fetch the document for `sha` at `spp_version`.
# `mmda_doc` carries the requested annotation layers (and page images);
# `relations` holds the requested citation_mentions -> bib_entries "refers_to" links.
mmda_doc, relations = client.get_document(
            sha,
            spp_version,
            annotation_types=[
                "pages",
                "tokens",
                "rows",
                "bib_entries",
                "citation_mentions",
                "bib_entry_titles",
                "words"
            ],
            relation_types=[
                RelationTypeId("refers_to", "citation_mentions", "bib_entries")
            ],
            # NOTE(review): presumably lets the call succeed when some requested
            # layers are absent -- later cells check for 'bib_entries' explicitly.
            allow_missing=True,
            # page images are needed by the drawing cells (mmda_doc.images)
            include_images=True
        )


ImportError: cannot import name 'api' from 'ai2_internal' (/Users/angelez/src/mmda/ai2_internal/__init__.py)

In [None]:
# List the annotation layers that actually came back on the document.
print(mmda_doc.fields)

# Flag documents without bib entries up front; the SPP table cell falls back
# to "no bibs" placeholders in that case.
if 'bib_entries' not in mmda_doc.fields:
    print("No bib_entries!")

## 2. Visualize Mentions (draw boxes)
Use draw_boxes to visualize each mention.

### 2a. Make Layout Rectangles for mentions so that vizes can be made for them.

In [None]:
#imports
from collections import defaultdict
import layoutparser.elements as lpe
import layoutparser.visualization as lpv
from typing import List

#function to create box groups from mentions on the mmda doc
def make_boxgroups_mentions(mmda_doc):
    """Build drawable rectangles for every citation-mention box, grouped by page.

    Args:
        mmda_doc: document exposing `citation_mentions` (each with
            `box_group.boxes`) and `images` (indexable by page, each with `.size`).

    Returns:
        defaultdict(list) mapping page -> list of `lpe.Rectangle`, one per
        mention box on that page, in mention order.
    """
    # Bucket the mention boxes by the page they belong to.
    boxes_by_page = defaultdict(list)
    for mention in mmda_doc.citation_mentions:
        for box in mention.box_group.boxes:
            boxes_by_page[box.page].append(box)

    rectangles_by_page = defaultdict(list)
    for page, page_boxes in boxes_by_page.items():
        page_w, page_h = mmda_doc.images[page].size

        # Boxes were bucketed by `box.page` above, so every box in `page_boxes`
        # is on `page` by construction; no per-box page check is needed.
        # Box coordinates are scaled by the image size (presumably they are
        # page-relative fractions -- TODO confirm).
        for box in page_boxes:
            rectangles_by_page[page].append(lpe.Rectangle(
                box.l * page_w,
                box.t * page_h,
                (box.l + box.w) * page_w,
                (box.t + box.h) * page_h
            ))
    return rectangles_by_page

### 2b. Make boxes for Grobid annotation

In [None]:
import os
import requests
import json
import boto3

####################################################################
# ANNOTATOR TODO: Update with path to your grobid papers and uncomment below
# if the s3 code doesn't work.
# Annos are in s3://ai2-s2-mmda/citation-link-evaluation-sample-2/grobid_annotation
####################################################################
# with open(f'/Users/baileyk/Work/mmda/spp_grobid/{sha}.json') as f:
#     grobid_annotation = json.load(f)

# s3 code
# Load the grobid annotation JSON for `sha` from S3.
s3_resource = boto3.resource('s3')
key = f"citation-link-evaluation-sample-2/grobid_annotation/{sha}.json"
obj = s3_resource.Object("ai2-s2-mmda", key)

try:
    grobid_annotation = json.loads(obj.get()["Body"].read().decode("utf-8"))
    print("grobid annos found.")
except Exception as err:
    # `Exception` rather than a bare `except:` so a kernel interrupt (Ctrl-C)
    # still propagates. Also report the cause: every failure (missing object,
    # credentials, malformed JSON) would otherwise print the same message.
    print(f"no grobid annotations found at s3://ai2-s2-mmda/{key}")
    print(f"  reason: {type(err).__name__}: {err}")
    grobid_annotation = dict()

In [None]:
def make_boxgroup_mentions_from_grobid(grobid_annotation):
    """Build rectangles for grobid reference markers, grouped by page.

    Args:
        grobid_annotation: dict loaded from the grobid JSON. Its "refMarkers"
            entries are read for "p" (page), "x", "y", "w" and "h".

    Returns:
        defaultdict(list) mapping page -> list of `lpe.Rectangle`. Empty when
        the annotation has no "refMarkers".
    """
    grobid_mention_boxes_by_page = defaultdict(list)
    # .get(): the loading cell falls back to an empty dict when no grobid file
    # is found, and that must not raise KeyError here.
    for mention in grobid_annotation.get("refMarkers", []):
        # "p" is shifted down by one so it can index the page images.
        page = int(mention["p"]) - 1
        # NOTE(review): x/y/w/h are used as-is (not rescaled to the image size)
        # -- confirm they are in the same units as the page image.
        box = lpe.Rectangle(
            mention["x"],
            mention["y"],
            mention["x"] + mention["w"],
            mention["y"] + mention["h"]
        )
        grobid_mention_boxes_by_page[page].append(box)
    return grobid_mention_boxes_by_page

#egor version
def draw_boxgroups(image, rectangle_list, color_map=None, id_map=None):
    """Draw the given rectangles on top of `image`.

    Args:
        image: page image to draw on; a falsy value means "nothing to draw on".
        rectangle_list: iterable of rectangles, each wrapped in an `lpe.TextBlock`.
        color_map: optional mapping from block type to colour; a default
            four-colour palette is used when omitted.
        id_map: accepted but not used.

    Returns:
        Whatever `lpv.draw_box` returns, or None when `image` is falsy.
    """
    palette = color_map
    if palette is None:
        palette = {'color_1': 'blue', 'color_2': 'green', 'color_3': 'yellow', 'color_4': 'pink'}

    # Every rectangle gets the same block type, 'color_1'.
    blocks = [lpe.TextBlock(rect, type='color_1') for rect in rectangle_list]

    if not image:
        return None

    return lpv.draw_box(
        image,
        blocks,
        color_map=palette,
        show_element_id=False,
        id_font_size=10,
    )

In [None]:
# annotate the doc with grobid mentions and bibs
from mmda.types.annotation import BoxGroup, Box, Metadata

def _grobid_box_to_relative(grobid_box, annotation):
    """Convert one grobid box dict (keys p/x/y/w/h) to a page-relative mmda Box."""
    # grobid is silly: "p" has to be shifted down by one to index `pages`.
    # int() mirrors the cast make_boxgroup_mentions_from_grobid applies to "p".
    page = int(grobid_box['p']) - 1
    page_info = annotation['pages'][page]
    return Box(
        l=grobid_box['x'], t=grobid_box['y'], w=grobid_box['w'], h=grobid_box['h'],
        page=page
    ).get_relative(
        page_width=page_info['page_width'],
        page_height=page_info['page_height'],
    )


# NOTE: if the loading cell fell back to an empty dict (no grobid file found),
# the lookups below raise KeyError.

# make grobid mention box groups:
grobid_mention_box_groups = []
for anno in grobid_annotation['refMarkers']:
    # remember which grobid bib entry this mention points to
    metadata = Metadata()
    metadata['bib_entry_id'] = anno.get('id')

    grobid_mention_box_groups.append(BoxGroup(
        boxes=[_grobid_box_to_relative(anno, grobid_annotation)],
        metadata=metadata
        )
    )
print(f"{len(grobid_mention_box_groups)} grobid mentions.")


# make grobid bib entry box groups:
grobid_bib_entry_box_groups = []
for anno in grobid_annotation['refBibs']:
    # one bib entry can consist of several boxes ("pos")
    bib_boxes = [_grobid_box_to_relative(box, grobid_annotation) for box in anno['pos']]

    grobid_bib_entry_box_groups.append(BoxGroup(
        boxes=bib_boxes,
        id=anno['id']
        )
    )
print(f"{len(grobid_bib_entry_box_groups)} grobid bibs.")

# attach both layers to the document as `grobid_mentions` / `grobid_bibs`
mmda_doc.annotate(grobid_mentions=grobid_mention_box_groups)
mmda_doc.annotate(grobid_bibs=grobid_bib_entry_box_groups)

print(mmda_doc.fields)

## 3. Get references and s2 id's for each mention.

### 3a. Retrieve citation mention numbers and which bib entries they are linked to.

In [None]:
# Map each citation mention id to the id of the bib entry it is linked to,
# ie: for citation_mention '0', the bib_entry_id it's linked to is '3'. 
# NOTE(review): defaultdict(int) means a mention id with no relation silently
# resolves to bib entry id 0 on lookup -- confirm that is intended.
citation_links = defaultdict(int)

# Only the first relation type in `relations` is read.
for relation in relations[list(relations.keys())[0]]:
    #add to list of citation link data
    citation_links[relation.from_id] = relation.to_id

### 3b. Use API call to find what s2 paper each bib entry is linked to.

In [None]:
def get_index_of_bib_entry_with_id(bib_entry_id):
    """Return the position in `mmda_doc.bib_entries` of the entry whose id is `bib_entry_id`.

    Raises:
        AssertionError: if zero or more than one entry has that id.
    """
    matching_positions = []
    for position, entry in enumerate(mmda_doc.bib_entries):
        if entry.id == bib_entry_id:
            matching_positions.append(position)

    assert len(matching_positions) == 1, f"there should be exactly one bib_entry with the id {bib_entry_id}."
    return matching_positions[0]

def extract_first_group_symbols(span_groups):
    if len(span_groups) > 1:
        print("warning: multiple span_groups detected when 0 or 1 expected")
    try: 
        first_one = span_groups[0] 
        text = mmda_span_group_to_text_via_dwp_words(first_one)
    except IndexError: # case where was no annotation detected so span_groups was empty list
            text = "" # it can be empty or say "no text detected" or something
    return text


def get_bib_entry_title(citation_mention_id):
    """Return the title text of the bib entry linked to the given citation mention.

    The linked bib entry id comes from `citation_links`; the title text is the
    first span group of that entry's `bib_entry_titles` ("" if there is none).
    """
    # The bib entry is located by matching ids rather than by indexing
    # bib_entries with the id directly: MMDA annotations can't be assumed to be
    # sorted by SpanGroup id.
    entry_position = get_index_of_bib_entry_with_id(citation_links[citation_mention_id])
    linked_entry = mmda_doc.bib_entries[entry_position]
    return extract_first_group_symbols(linked_entry.bib_entry_titles)

def get_full_bib_entry_text(citation_mention_id):
    """Return the full text of the bib entry linked to the given citation mention."""
    entry_position = get_index_of_bib_entry_with_id(citation_links[citation_mention_id])
    linked_entry = mmda_doc.bib_entries[entry_position]
    # Wrapped in a one-element list because the helper expects a list of span groups.
    return extract_first_group_symbols([linked_entry])

In [None]:
#make api call to get linked papers
def bibLinkingApiPost(title, end_point='http://pipeline-api.prod.s2.allenai.org/citation/match', timeout=30):
    """POST a bib entry title to the citation-match endpoint and return the decoded JSON.

    Args:
        title: title text to match.
        end_point: URL of the citation-match service.
        timeout: seconds to wait for the service before `requests` raises a
            timeout error. Without it a stalled connection blocks the notebook
            indefinitely.

    Returns:
        The JSON-decoded response body. (Callers stringify it and strip the
        brackets, so presumably a list holding one id -- TODO confirm.)
    """
    # The service takes a JSON list with one {'title': ...} object per query.
    payload = json.dumps([{'title': title}])
    headers = {
        'Content-Type': 'application/json'
    }

    response = requests.post(end_point, headers=headers, data=payload, timeout=timeout)
    return response.json()


#get s2_ids linked to each mention
# One API call per citation mention, in mmda_doc.citation_mentions order, so
# s2_ids lines up index-for-index with the mentions.
s2_ids = []
for citation_mention in mmda_doc.citation_mentions:
    title = get_bib_entry_title(citation_mention.id)
    # stringify the response and strip the list brackets
    s2_ids.append(str((bibLinkingApiPost(title))).replace('[', '').replace(']', ''))

## 4. Annotation Info!

In [None]:
####################################################################
# ANNOTATOR TODO: Record page # from output in annotation spreadsheet
####################################################################

# get random page number to review and annotate
import random

# One entry per mention (the id of the mention's first page), so a page with
# more mentions is proportionally more likely to be picked.
pages_w_mentions = [citation_mention.pages[0].id for citation_mention in mmda_doc.citation_mentions]

# NOTE: random.choice raises IndexError when the document has no mentions.
annot_page = random.choice(pages_w_mentions)
print("page to annotate:", annot_page)

In [None]:
# gives table of SPP mentions and their corresponding bib_entry ids 
# (if the doc is annotated with bib_entries), and s2 linked papers 
import pandas as pd
from IPython.display import HTML

# Build the per-mention columns of the SPP table. Without a bib_entries layer
# the bib/S2 columns are filled with "no bibs" placeholders.
if 'bib_entries' in mmda_doc.fields:
    ref_info = {
        # "Mention ID": [citation_mention.id for citation_mention in mmda_doc.citation_mentions],
        "Page": [citation_mention.pages[0].id for citation_mention in mmda_doc.citation_mentions],
        "Mention text": [citation_mention.symbols for citation_mention in mmda_doc.citation_mentions],
        # Bib entry and title texts are DWP "words".texts
        "Linked Bib Entry Extracted Text": [get_full_bib_entry_text(citation_mention.id) for citation_mention in mmda_doc.citation_mentions],
        "Linked Bib Entry Title": [get_bib_entry_title(citation_mention.id) for citation_mention in mmda_doc.citation_mentions],
        # "Linked Bib id": [citation_links[citation_mention.id] for citation_mention in mmda_doc.citation_mentions],
        # an id of "-1" is shown as blank (presumably it means "no match" -- TODO confirm)
        "S2 id:": [ids if ids != str(-1) else " " for ids in s2_ids],
        "S2 url": [f"http://api.semanticscholar.org/corpusid:{ids}" if ids != str(-1) else " " for ids in s2_ids]
        }
else:
    ref_info = {
        # "Mention ID": [citation_mention.id for citation_mention in mmda_doc.citation_mentions],
        "Page": [citation_mention.pages[0].id for citation_mention in mmda_doc.citation_mentions],
        "Mention text": [citation_mention.symbols for citation_mention in mmda_doc.citation_mentions],
        # TODO: use DWP "words" text instead of doc.symbols (ex: the title "Reaction-Diﬀusion processes and meta-population mod-\nels in Heterogeneous Networks" gives no match, but manually changing "mod-\nels" to "models" results in an S2 id match)
        "Linked Bib Entry Title": ["no bibs" for citation_mention in mmda_doc.citation_mentions],
        "Linked Bib Entry Extracted Text": ["no bibs" for citation_mention in mmda_doc.citation_mentions],
        "S2 id:": ["no bibs" for ids in s2_ids],
        "S2 url": ["no bibs" for ids in s2_ids]
        }

# Each dict key becomes a row (orient='index'); transposing turns the keys
# into columns with one row per mention.
spp_df = pd.DataFrame.from_dict(ref_info, orient='index')
spp_df = spp_df.transpose()

# selecting rows based on random page selected to annotate
# you can comment the line below out to see all mentions across all pages
spp_df = spp_df[spp_df['Page'] == annot_page]


# make s2 links clickable
print("SPP:")
# last expression of the cell, so the notebook renders the HTML table
HTML(spp_df.to_html(render_links=True, escape=False))

In [None]:
# gives table of GROBID mentions and their corresponding bib_entry ids, s2 linked papers 
import pandas as pd
from IPython.display import HTML

def get_index_of_grobid_bib_entry_with_id(grobid_bib_entry_id):
    """Return the position in `mmda_doc.grobid_bibs` of the bib whose box-group id matches.

    Returns:
        The index of the single matching grobid bib, or None when there is no match.

    Raises:
        AssertionError: if more than one grobid bib carries that id.
    """
    matching_positions = [
        position
        for position, grobid_bib in enumerate(mmda_doc.grobid_bibs)
        if grobid_bib.box_group.id == grobid_bib_entry_id
    ]
    assert len(matching_positions) < 2, f"there should be no more than one grobid bib entry with the id {grobid_bib_entry_id}. \ninstead, we got this: {matching_positions}"

    if len(matching_positions) == 1:
        return matching_positions[0]
    return None


def get_full_grobid_bib_entry_text(grobid_mention):
    """Return the text of the grobid bib entry linked to `grobid_mention`.

    The link is the `bib_entry_id` stored in the mention's box-group metadata.

    Returns:
        The bib entry text, or None when no grobid bib has that id.
    """
    linked_grobid_bib_id = grobid_mention.box_group.metadata.bib_entry_id
    linked_grobid_bib_index = get_index_of_grobid_bib_entry_with_id(linked_grobid_bib_id)
    # `is None`, not a truthiness test: index 0 is a valid match.
    if linked_grobid_bib_index is None:
        return None

    # Wrapped in a one-element list because the helper expects a list of span groups.
    grobid_bib_span_groups = [mmda_doc.grobid_bibs[linked_grobid_bib_index]]
    return extract_first_group_symbols(grobid_bib_span_groups)

# get titles from bibentry-predictor API
def get_linked_bib_title_from_bib_predictor(grobid_mention, timeout=30):
    """Predict the title of the bib entry linked to a grobid mention.

    Sends the linked bib entry's text to the bibentry-predictor service.

    Args:
        grobid_mention: grobid mention whose linked bib entry should be parsed.
        timeout: seconds to wait for the service before `requests` raises a
            timeout error. Without it a stalled connection blocks the notebook
            indefinitely.

    Returns:
        The predicted title, 'no title detected' when the prediction has no
        'title' key, or None when the mention has no (non-empty) bib text.
    """
    bib_text = get_full_grobid_bib_entry_text(grobid_mention)
    if not bib_text:
        # nothing to send: no linked bib entry, or its text is empty
        return None

    payload = json.dumps({"instances": [{"bib_entry": bib_text}]})
    headers = {
        'Content-Type': 'application/json'
    }
    end_point = "http://bibentry-predictor.v0.dev.models.s2.allenai.org/invocations"

    response = requests.post(end_point, headers=headers, data=payload, timeout=timeout).json()
    # one instance was sent, so only the first prediction is read
    return response['predictions'][0].get('title', 'no title detected')


# Predicted title per grobid mention (None where the mention has no linked bib text).
linked_bib_entry_titles_grobid = [get_linked_bib_title_from_bib_predictor(grobid_mention) for grobid_mention in mmda_doc.grobid_mentions]


#get grobid_s2_ids linked to each mention
# Kept index-aligned with mmda_doc.grobid_mentions: None marks "no title to look up".
grobid_s2_ids = []
for title in linked_bib_entry_titles_grobid:
    if title:
        # stringify the response and strip the list brackets
        grobid_s2_ids.append(str((bibLinkingApiPost(title))).replace('[', '').replace(']', ''))
    else:
        grobid_s2_ids.append(None)

print(grobid_s2_ids)

# Per-mention columns of the grobid table.
ref_info = {
    "Page": [grobid_mention.box_group.boxes[0].page for grobid_mention in mmda_doc.grobid_mentions], # some mentions don't overlap with tokens so missing span and therefore page info
    "Grobid Mention text": [grobid_mention.symbols for grobid_mention in mmda_doc.grobid_mentions],
    "Linked Bib Entry ID": [grobid_mention.box_group.metadata.bib_entry_id for grobid_mention in mmda_doc.grobid_mentions],
    "Linked Bib Entry Title": linked_bib_entry_titles_grobid,
    "Linked Bib Entry Extracted Text": [get_full_grobid_bib_entry_text(grobid_mention) for grobid_mention in mmda_doc.grobid_mentions],
    # "-1" is shown as blank; a None id is passed through unchanged here
    "S2 id:": [id if id != str(-1) else " " for id in grobid_s2_ids],
    # no url for "-1" or None ids
    "S2 url": [f"http://api.semanticscholar.org/corpusid:{id}" if id not in [str(-1), None] else " " for id in grobid_s2_ids]

     }

# Each dict key becomes a row (orient='index'); transposing turns the keys
# into columns with one row per mention.
grobid_df = pd.DataFrame.from_dict(ref_info, orient='index')
grobid_df = grobid_df.transpose()

# selecting rows based on random page selected to annotate
# you can comment the line below out to see all mentions across all pages
grobid_df = grobid_df[grobid_df['Page'] == annot_page]


#make s2 links clickable
print("Grobid:")
# last expression of the cell, so the notebook renders the HTML table
HTML(grobid_df.to_html(render_links=True, escape=False))

In [None]:
# mentions imgs
import cv2
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure


# Side-by-side figure layout: one row, two columns (SPP left, grobid right).
rows = 1
columns = 2
fontsize = 48

#spp
# build the SPP mention rectangles and draw them on the page being annotated
boxes_by_page = make_boxgroups_mentions(mmda_doc)
image1 = draw_boxgroups(mmda_doc.images[annot_page], boxes_by_page[annot_page])

#plot
fig = plt.figure(figsize=(50, 50))
fig.add_subplot(rows, columns, 1)
plt.axis('off')
plt.title("SPP Mentions",fontdict={'fontsize': fontsize})
plt.imshow(image1)


#grobid
# build the grobid mention rectangles and draw them on the same page
grobid_boxes_by_page = make_boxgroup_mentions_from_grobid(grobid_annotation)
image2 = draw_boxgroups(mmda_doc.images[annot_page], grobid_boxes_by_page[annot_page])

#plot side by side
fig.add_subplot(rows, columns, 2)
plt.axis('off')
plt.title("Grobid Mentions",fontdict={'fontsize': fontsize})
plt.imshow(image2)
plt.subplots_adjust(bottom=0.15, wspace=0.05)

In [None]:
def draw_text_blocks(page, text_blocks_dictionary, color_map=None, id_map=None):
    """Draw the text blocks of one page on that page's image, showing block ids.

    Args:
        page: page key; used to index both `mmda_doc.images` and
            `text_blocks_dictionary`.
        text_blocks_dictionary: mapping page -> list of text blocks to draw.
        color_map: optional mapping from block type to colour; a default
            four-colour palette is used when omitted.
        id_map: accepted but not used.

    Returns:
        Whatever `lpv.draw_box` returns, or None when the page image is falsy.
    """
    page_image = mmda_doc.images[page]

    palette = color_map
    if palette is None:
        palette = {'color_1': 'blue', 'color_2': 'green', 'color_3': 'yellow', 'color_4': 'pink'}

    if not page_image:
        return None

    return lpv.draw_box(
        page_image,
        text_blocks_dictionary[page],
        color_map=palette,
        show_element_id=True,
        id_font_size=10,
    )

#function to create labeled text blocks from the SPP bibs (mmda_doc.bib_entries) on the mmda doc
def make_text_blocks_bibs(mmda_doc):
    """Build id-labelled text blocks for `mmda_doc.bib_entries`, grouped by page.

    Only the first box of each bib entry's box group is used (it's always 1 box).

    Returns:
        defaultdict(list) mapping page -> list of `lpe.TextBlock`, each carrying
        the box group's id and the type 'color_1'.
    """
    # Bucket the box groups by the page of their first box.
    groups_by_page = defaultdict(list)
    for entry in mmda_doc.bib_entries:
        entry_group = entry.box_group
        groups_by_page[entry_group.boxes[0].page].append(entry_group)

    blocks_by_page = defaultdict(list)
    for page_key, page_groups in groups_by_page.items():
        img_w, img_h = mmda_doc.images[page_key].size

        for group in page_groups:
            first_box = group.boxes[0]
            # box coordinates are scaled by the page image size
            outline = lpe.Rectangle(
                first_box.l * img_w,
                first_box.t * img_h,
                (first_box.l + first_box.w) * img_w,
                (first_box.t + first_box.h) * img_h
            )
            labelled_block = lpe.TextBlock(outline, id=group.id, type='color_1')
            blocks_by_page[page_key].append(labelled_block)
    return blocks_by_page

### SPP Bibs Pages

In [None]:
# bibs imgs
# Pages that hold at least one SPP bib entry (page of each entry's first box).
bib_pages = set([bib_entry.box_group.boxes[0].page for bib_entry in mmda_doc.bib_entries])

# NOTE: rows/columns/fontsize are reassigned here but not used in this cell.
rows = len(bib_pages)
columns = 2
fontsize = 48

#spp
# build the bib text blocks and display one annotated image per bib page
print("SPP")

dic_text_blocks = make_text_blocks_bibs(mmda_doc)
bib_pages = sorted(list(bib_pages))
print('bib_pages', bib_pages)
for page in bib_pages:
    print(f"page {page}:")
    display(draw_text_blocks(page, dic_text_blocks))


### Grobid Bibs Pages

In [None]:
#function to create labeled text blocks from the grobid bibs (mmda_doc.grobid_bibs) on the mmda doc
def make_text_blocks_grobid_bibs(mmda_doc):
    """Build id-labelled text blocks for `mmda_doc.grobid_bibs`, grouped by page.

    A grobid bib usually consists of several boxes. Each box is placed on its
    own `box.page` and scaled with that page's image size, so a bib whose boxes
    are on different pages is not drawn entirely on the page of its first box.
    Only the first box of a bib carries the bib's id; the others get " ".

    Returns:
        defaultdict(list) mapping page -> list of `lpe.TextBlock` with type
        'color_1'.
    """
    result_text_blocks_list = defaultdict(list)
    page_sizes = {}  # page -> (width, height), looked up once per page

    for bib in mmda_doc.grobid_bibs:
        box_group = bib.box_group
        for i, box in enumerate(box_group.boxes):
            if box.page not in page_sizes:
                page_sizes[box.page] = mmda_doc.images[box.page].size
            page_w, page_h = page_sizes[box.page]

            # label only the first box so the id is shown once per bib
            box_id = box_group.id if i == 0 else " "
            rect = lpe.Rectangle(
                box.l * page_w,
                box.t * page_h,
                (box.l + box.w) * page_w,
                (box.t + box.h) * page_h
            )
            result_text_blocks_list[box.page].append(lpe.TextBlock(
                rect,
                id=box_id,
                type='color_1'
            ))
    return result_text_blocks_list
    
# bibs imgs
print("Grobid 0.5")
# Pages that hold at least one grobid bib (page of each bib's first box).
bib_pages = set([bib_entry.box_group.boxes[0].page for bib_entry in mmda_doc.grobid_bibs])
# NOTE: rows/columns/fontsize are reassigned here but not used in this cell.
rows = len(bib_pages)
columns = 2
fontsize = 48

#grobid
# build the bib text blocks and display one annotated image per bib page
dic_text_blocks = make_text_blocks_grobid_bibs(mmda_doc)
bib_pages = sorted(list(bib_pages))
print('bib_pages', bib_pages)
for page in bib_pages:
    print(f"page {page}:")
    display(draw_text_blocks(page, dic_text_blocks))


## ANNOTATOR TODO: 
Record each mention on this page (regardless of whether it was recognized by SPP) as 1 row in the spreadsheet.