In [1]:
# notebook is used to download pre-trained BERT and compute NER scores for paper acknowledgments
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import pandas as pd
import json
from pathlib import Path

DATA_DIR = Path(__file__).parent.parent.parent / 'data'

NameError: name '__file__' is not defined

In [None]:
# load data
with open(DATA_DIR / 'papers.json', 'r') as handle:
    input_data = pd.read_json(handle)

In [None]:
# create tokenizer + pipeline
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

In [None]:
# compute NER for all acknowledgements
ner = []
for acknowledgement in input_data.acknowledgents.values:
    ner.append(nlp(acknowledgement))
print(ner)

In [None]:
# filter entities by applying accuracy threshold + filter PER and ORG
print(ner)
for n, paper_entities in enumerate(ner):
    for i in range(len(paper_entities)-1, -1, -1):
        entity = paper_entities[i]
        if entity['entity_group'] not in ['ORG', 'PER'] or entity['score'] < 0.9:
            ner[n].pop(i)

In [None]:
# we can now use these combinations of paper (represented by title) + ORG/PER to create a triple: ORG/PER acknowledged by paper

In [None]:
triple_list = []

ACKNOWLEDGED_BY = 'ACKNOWLEDGED_BY'

for title, paper_entities in zip(input_data.title.values, ner):
    for entity in paper_entities:
        triple_list.append({'entity': entity['word'], 'title': title})
print(np.array(triple_list))
with open(DATA_DIR / 'acknowledgement_triple.json', 'w') as handle:
    json.dump(triple_list, handle)