In [3]:
# notebook is used to download pre-trained BERT and compute NER scores for paper acknowledgments
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import pandas as pd
import json
from pathlib import Path
import os

DATA_DIR = Path(os.path.abspath('')).parent.parent / 'data'

In [5]:
with open(DATA_DIR / 'abstract_ai_data.json', 'r') as handle:
    data = pd.read_json(handle)
print(data.attrs)

{}


In [11]:
# load data
with open(DATA_DIR / 'extracted.json', 'r') as handle:
    input_data = pd.read_json(handle)

In [12]:
# create tokenizer + pipeline
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

In [13]:
# compute NER for all acknowledgements
ner = []
for acknowledgement in input_data.acknowledgents.values:
    ner.append(nlp(acknowledgement))
print(ner)

[[{'entity_group': 'ORG', 'score': 0.9982005, 'word': ' BELSPO', 'start': 39, 'end': 45}, {'entity_group': 'ORG', 'score': 0.9988405, 'word': ' MELiSSA', 'start': 54, 'end': 61}], [{'entity_group': 'MISC', 'score': 0.9331735, 'word': ' National Key Technologies R&D Program of China', 'start': 307, 'end': 353}, {'entity_group': 'MISC', 'score': 0.61934775, 'word': '.', 'start': 363, 'end': 364}, {'entity_group': 'ORG', 'score': 0.8554997, 'word': ' Fundamental Research Funds for the Central Universities', 'start': 386, 'end': 441}, {'entity_group': 'ORG', 'score': 0.91061413, 'word': ' China Postdoctoral Science Foundation', 'start': 482, 'end': 519}, {'entity_group': 'MISC', 'score': 0.52763563, 'word': 'ed', 'start': 524, 'end': 526}, {'entity_group': 'MISC', 'score': 0.7231648, 'word': ' Qing', 'start': 564, 'end': 568}, {'entity_group': 'ORG', 'score': 0.5465146, 'word': ' Lan', 'start': 569, 'end': 572}, {'entity_group': 'MISC', 'score': 0.74676067, 'word': ' Project', 'start': 573

In [14]:
# filter entities by applying accuracy threshold + filter PER and ORG
print(ner)
for n, paper_entities in enumerate(ner):
    for i in range(len(paper_entities)-1, -1, -1):
        entity = paper_entities[i]
        if entity['entity_group'] not in ['ORG', 'PER'] or entity['score'] < 0.9:
            ner[n].pop(i)

[[{'entity_group': 'ORG', 'score': 0.9982005, 'word': ' BELSPO', 'start': 39, 'end': 45}, {'entity_group': 'ORG', 'score': 0.9988405, 'word': ' MELiSSA', 'start': 54, 'end': 61}], [{'entity_group': 'MISC', 'score': 0.9331735, 'word': ' National Key Technologies R&D Program of China', 'start': 307, 'end': 353}, {'entity_group': 'MISC', 'score': 0.61934775, 'word': '.', 'start': 363, 'end': 364}, {'entity_group': 'ORG', 'score': 0.8554997, 'word': ' Fundamental Research Funds for the Central Universities', 'start': 386, 'end': 441}, {'entity_group': 'ORG', 'score': 0.91061413, 'word': ' China Postdoctoral Science Foundation', 'start': 482, 'end': 519}, {'entity_group': 'MISC', 'score': 0.52763563, 'word': 'ed', 'start': 524, 'end': 526}, {'entity_group': 'MISC', 'score': 0.7231648, 'word': ' Qing', 'start': 564, 'end': 568}, {'entity_group': 'ORG', 'score': 0.5465146, 'word': ' Lan', 'start': 569, 'end': 572}, {'entity_group': 'MISC', 'score': 0.74676067, 'word': ' Project', 'start': 573

In [15]:
# we can now use these combinations of paper (represented by title) + ORG/PER to create a triple: ORG/PER acknowledged by paper

In [16]:
triple_list = []

ACKNOWLEDGED_BY = 'ACKNOWLEDGED_BY'

for i, (title, paper_entities) in enumerate(zip(input_data.title.values, ner)):
    for entity in paper_entities:
        triple_list.append({'entity_type': entity['entity_group'], 'entity_name': entity['word'], 'title': title, 'paper_id': i})
print(np.array(triple_list))
with open(DATA_DIR / 'acknowledgement_triple.json', 'w') as handle:
    json.dump(triple_list, handle)

[{'entity_type': 'ORG', 'entity_name': ' BELSPO', 'title': 'A five-stage treatment train for water recovery from urine and shower water for long-term human Space missions', 'paper_id': 0}
 {'entity_type': 'ORG', 'entity_name': ' MELiSSA', 'title': 'A five-stage treatment train for water recovery from urine and shower water for long-term human Space missions', 'paper_id': 0}
 {'entity_type': 'ORG', 'entity_name': ' China Postdoctoral Science Foundation', 'title': 'Multiobjective stochastic programming with recourses for real-time flood water conservation of a multireservoir system under uncertain forecasts', 'paper_id': 1}
 {'entity_type': 'ORG', 'entity_name': 'National Natural Science Foundation of China', 'title': 'Construction risk knowledge management in BIM using ontology and semantic web technology', 'paper_id': 2}
 {'entity_type': 'ORG', 'entity_name': 'Fundamental Research Funds for the Central Universities', 'title': 'Construction risk knowledge management in BIM using ontolog