In [4]:
# notebook is used to download pre-trained BERT and compute NER scores for paper acknowledgments
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import pandas as pd
import json
from pathlib import Path
import os

DATA_DIR = Path(os.path.abspath('')).parent.parent / 'data'

/Users/christian/Desktop/Open Science and Artificial Intelligence/Assignment/PDF_Knowledge_Graph/data


In [5]:
# load data
with open(DATA_DIR / 'papers.json', 'r') as handle:
    input_data = pd.read_json(handle)

In [6]:
# create tokenizer + pipeline
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

In [7]:
# compute NER for all acknowledgements
ner = []
for acknowledgement in input_data.acknowledgents.values:
    ner.append(nlp(acknowledgement))
print(ner)

[[{'entity_group': 'ORG', 'score': 0.9996489, 'word': ' Departmento de Oceanografía', 'start': 45, 'end': 72}, {'entity_group': 'ORG', 'score': 0.99972737, 'word': ' Instituto Oceanográfico de Venezuela', 'start': 80, 'end': 116}, {'entity_group': 'PER', 'score': 0.99912685, 'word': ' LOREN LOCKWOOD', 'start': 175, 'end': 189}, {'entity_group': 'ORG', 'score': 0.99969786, 'word': ' Consejo de Investigación de la Universidad de Oriente', 'start': 263, 'end': 316}], [{'entity_group': 'MISC', 'score': 0.9998764, 'word': ' Vietnamese', 'start': 49, 'end': 59}, {'entity_group': 'MISC', 'score': 0.99983084, 'word': ' Americans', 'start': 108, 'end': 117}, {'entity_group': 'MISC', 'score': 0.99988735, 'word': ' American', 'start': 174, 'end': 182}, {'entity_group': 'MISC', 'score': 0.9998573, 'word': ' Chinese', 'start': 260, 'end': 267}, {'entity_group': 'MISC', 'score': 0.99980253, 'word': ' Soviets', 'start': 276, 'end': 283}, {'entity_group': 'MISC', 'score': 0.9999039, 'word': ' American

In [8]:
# filter entities by applying accuracy threshold + filter PER and ORG
print(ner)
for n, paper_entities in enumerate(ner):
    for i in range(len(paper_entities)-1, -1, -1):
        entity = paper_entities[i]
        if entity['entity_group'] not in ['ORG', 'PER'] or entity['score'] < 0.9:
            ner[n].pop(i)

[[{'entity_group': 'ORG', 'score': 0.9996489, 'word': ' Departmento de Oceanografía', 'start': 45, 'end': 72}, {'entity_group': 'ORG', 'score': 0.99972737, 'word': ' Instituto Oceanográfico de Venezuela', 'start': 80, 'end': 116}, {'entity_group': 'PER', 'score': 0.99912685, 'word': ' LOREN LOCKWOOD', 'start': 175, 'end': 189}, {'entity_group': 'ORG', 'score': 0.99969786, 'word': ' Consejo de Investigación de la Universidad de Oriente', 'start': 263, 'end': 316}], [{'entity_group': 'MISC', 'score': 0.9998764, 'word': ' Vietnamese', 'start': 49, 'end': 59}, {'entity_group': 'MISC', 'score': 0.99983084, 'word': ' Americans', 'start': 108, 'end': 117}, {'entity_group': 'MISC', 'score': 0.99988735, 'word': ' American', 'start': 174, 'end': 182}, {'entity_group': 'MISC', 'score': 0.9998573, 'word': ' Chinese', 'start': 260, 'end': 267}, {'entity_group': 'MISC', 'score': 0.99980253, 'word': ' Soviets', 'start': 276, 'end': 283}, {'entity_group': 'MISC', 'score': 0.9999039, 'word': ' American

In [9]:
# we can now use these combinations of paper (represented by title) + ORG/PER to create a triple: ORG/PER acknowledged by paper

In [10]:
triple_list = []

ACKNOWLEDGED_BY = 'ACKNOWLEDGED_BY'

for title, paper_entities in zip(input_data.title.values, ner):
    for entity in paper_entities:
        triple_list.append({'entity': entity['word'], 'title': title})
print(np.array(triple_list))
with open(DATA_DIR / 'acknowledgement_triple.json', 'w') as handle:
    json.dump(triple_list, handle)

[{'entity': ' Departmento de Oceanografía', 'title': ''}
 {'entity': ' Instituto Oceanográfico de Venezuela', 'title': ''}
 {'entity': ' LOREN LOCKWOOD', 'title': ''}
 {'entity': ' Consejo de Investigación de la Universidad de Oriente', 'title': ''}
 {'entity': ' Spanish National Agrarian Accounting Network', 'title': 'Economic risk assessment of the quality labels and productive efficiency strategies in Spanish extensive sheep farms'}
 {'entity': 'RECAN', 'title': 'Economic risk assessment of the quality labels and productive efficiency strategies in Spanish extensive sheep farms'}
 {'entity': ' Ministry of Agriculture, Fisheries and Food', 'title': 'Economic risk assessment of the quality labels and productive efficiency strategies in Spanish extensive sheep farms'}
 {'entity': 'Geran Penyelidikan Industri', 'title': 'DIVERSITY OF TIME ZONES AT BURJ KHALIFA IN PERFORMING PRAYERS AND FASTING IN SKYSCRAPERS'}
 {'entity': ' Dana Insentif Penerbitan Fakulti Pengajian Islam 2023', 'title'