### Entity Extraction

#### module initialisation

In [None]:
import spacy
import srsly
from tqdm import tqdm

# Locations of the three trained NER pipelines, one per entity type.
# Each path is handed to spacy.load() inside NER().
m_ner_model_path = "./Models/METHOD_NER"
a_ner_model_path = "./Models/ACTIVITY_NER"
g_ner_model_path = "./Models/GOAL_NER"

# Input and output JSONL files of the Entity Extraction stage.
# The output file is the input of the Entity Disambiguation stage.
ee_input_path = "./Dataset/example_subset_20.jsonl"
ee_output_path = "./Dataset/example_subset_20_EE.jsonl"

#### module functions

In [2]:
def NER(model_path, in_data, entity):
    """Run one spaCy NER pipeline over every row and record the entities it finds.

    Parameters
    ----------
    model_path : path of the pipeline, passed to ``spacy.load``.
    in_data : iterable of dict rows; each row must have a ``"text"`` key.
    entity : label written on every span found by this pipeline
        (the labels predicted by the model itself are not used).

    Returns
    -------
    list of the same row objects, updated in place: new spans are appended to
    an existing ``"spans"`` list (or the list is created), and
    ``"_annotator_id"`` / ``"_session_id"`` are set to ``"NER"``.
    """
    print("NER for:", entity)
    nlp = spacy.load(model_path)

    def _describe(ent, text):
        # Character offsets, token offsets and the surface form of one entity
        return {
            "start_char": ent.start_char,
            "end_char": ent.end_char,
            "token_start": ent.start,
            "token_end": ent.end,
            "mention": text[ent.start_char:ent.end_char],
            "label": entity,
        }

    annotated_data = []
    for record in tqdm(in_data):
        text = record["text"]
        found = [_describe(ent, text) for ent in nlp(text).ents]

        # Keep spans written by an earlier NER pass over the same row
        if "spans" in record:
            record["spans"] += found
        else:
            record["spans"] = found

        record["_annotator_id"] = "NER"
        record["_session_id"] = "NER"
        annotated_data.append(record)

    return annotated_data

#### module call

In [None]:
# The rows are not wrapped in list() here; the first NER() call iterates over
# them once and returns a list, which is what the later calls receive.
input_data = srsly.read_jsonl(ee_input_path)

# The three passes are chained: each one receives the rows returned by the
# previous pass, so every row ends up with METHOD, ACTIVITY and GOAL spans
# in a single "spans" list.
sents_with_M = NER(m_ner_model_path, input_data, "METHOD")
sents_with_A = NER(a_ner_model_path, sents_with_M, "ACTIVITY")
sents_with_G = NER(g_ner_model_path, sents_with_A, "GOAL")

srsly.write_jsonl(ee_output_path, sents_with_G)

### Entity Disambiguation

#### module initialisation

In [None]:
import spacy
from zshot import PipelineConfig, MentionsExtractor
from zshot.linker import LinkerRegen
from zshot.linker.linker_regen.utils import load_wikipedia_trie
from zshot.utils.mappings import spans_to_wikipedia
from zshot.utils.data_models import Span
import srsly
from tqdm import tqdm
import ssl
import logging

# Warnings are disabled here: the root logger only lets ERROR (and above) pass.
# Comment the following line out to see warnings again.
logging.getLogger().setLevel(logging.ERROR)
# NOTE(review): this swaps ssl's default HTTPS context factory for the
# unverified one, i.e. certificate verification is switched off for code that
# relies on that default. It also touches private ssl attributes. Confirm this
# workaround is still needed before running outside a trusted network.
ssl._create_default_https_context = ssl._create_unverified_context
# Load the Wikipedia trie (used as the `trie` argument of LinkerRegen in genre_wikipedia)
wikipedia_trie = load_wikipedia_trie()

# Input and output JSONL files of the Entity Disambiguation stage.
# The input is the file written by the Entity Extraction stage.
ed_input_path = "./Dataset/example_subset_20_EE.jsonl"
ed_output_path = "./Dataset/example_subset_20_EE_ED.jsonl"

#### module functions

In [3]:
class SimpleMentionExtractor(MentionsExtractor):
    """Mentions extractor that reports a fixed set of character ranges.

    ``positions`` is an iterable of ``(start, end)`` pairs. The same ranges are
    returned for every document, wrapped in ``Span`` objects from
    ``zshot.utils.data_models``.
    """

    def __init__(self, positions):
        self.positions = positions

    def predict(self, docs, batch_size=None):
        """Return one list of ``Span`` objects per document in ``docs``.

        ``batch_size`` is accepted for interface compatibility and is not used.
        """
        mentions_per_doc = []
        for _ in docs:
            # The document content is ignored: the ranges were fixed at construction
            doc_mentions = [Span(begin, stop) for begin, stop in self.positions]
            mentions_per_doc.append(doc_mentions)
        return mentions_per_doc

# Function to run Entity Disambiguation
def genre_wikipedia(text, start, end):
    """Link the mention ``text[start:end]`` to a Wikipedia ID.

    Parameters
    ----------
    text : the full sentence/document text.
    start, end : character offsets of the mention inside ``text``.

    Returns
    -------
    The first value produced by ``spans_to_wikipedia`` for the linked spans,
    or ``"NIL"`` when the linker produced no span at all or the first value is
    empty or does not contain ``"="``.
    """
    # NOTE(review): the spaCy model and the zshot pipeline are rebuilt on every
    # call because the mention position is baked into the pipeline config.
    # This is expensive when called once per span.
    nlp_wikipedia = spacy.load("en_core_web_sm")
    nlp_config = PipelineConfig(
        mentions_extractor=SimpleMentionExtractor([(start, end)]),
        linker=LinkerRegen(trie=wikipedia_trie)
    )
    nlp_wikipedia.add_pipe("zshot", config=nlp_config, last=True)
    doc = nlp_wikipedia(text)

    linked = spans_to_wikipedia(doc._.spans)
    # No span came back from the linker: report "NIL" instead of raising IndexError
    if not linked:
        return "NIL"

    # A usable ID is non-empty and contains "=" (presumably a "...curid=<id>"
    # style URL -- confirm against spans_to_wikipedia); anything else is "NIL"
    first = linked[0]
    return first if first and "=" in first else "NIL"

#### module call

In [None]:
# Read every row into memory so the spans can be updated in place and written back
data = list(srsly.read_jsonl(ed_input_path))
for item in tqdm(data):
    text = item.get("text")
    for span in item.get("spans", []):
        # Only METHOD mentions are disambiguated; spans with other labels are left as they are
        if span.get("label") == "METHOD":
            # Store whatever genre_wikipedia returns (an ID string or "NIL") on the span itself
            span["wikipedia_url"] = genre_wikipedia(text, span["start_char"], span["end_char"])

srsly.write_jsonl(ed_output_path, data)

### Entity Linking

#### module initialisation

In [None]:
import srsly
from tqdm import tqdm
from linking_information_queries.information_linking_orcid import information_linking_orcid
from linking_information_queries.information_linking_apis import information_linking

# Input and output JSONL files of the Entity Linking stage.
# The input is the file written by the Entity Disambiguation stage.
el_input_path = "./Dataset/example_subset_20_EE_ED.jsonl"
el_output_path = "./Dataset/example_subset_20_EE_ED_EL.jsonl"

#### module functions

In [None]:
def link_author_info(l_row):
    """Replace the author names in ``l_row["meta"]["creator"]`` with author records.

    For each author name, the first and last whitespace-separated words are
    used as given and family name for an ORCID lookup. When the lookup fails
    or its result lacks an expected key, a fallback record is built from the
    name alone, with the string ``'None'`` for the unknown fields.

    Parameters
    ----------
    l_row : dict row. ``l_row["meta"]["creator"]`` is expected to be a list of
        author name strings.

    Returns
    -------
    The same row object. If the row has no ``"meta"`` entry it is returned
    unchanged; otherwise ``meta["creator"]`` is replaced by a list of dicts
    with the keys ``full_name``, ``given_name``, ``family_name``, ``orcid``,
    ``affiliations`` and ``email``.
    """
    meta = l_row.get("meta")
    if meta is None:
        # Nothing to link; previously this raised AttributeError on None.get(...)
        return l_row

    def _unlinked(full_name, given, family):
        # Record used when ORCID provides no usable information
        return {'full_name': full_name, 'given_name': given.capitalize(), 'family_name': family.capitalize(), 'orcid': 'None', 'affiliations': 'None', 'email': 'None'}

    author_list = []
    for a in meta.get("creator", []):
        name_parts = a.split()
        if not name_parts:
            # Blank author string: there is no name to query, keep an empty record
            # (previously a.split()[0] raised IndexError here)
            author_list.append(_unlinked(a, "", ""))
            continue

        # First word is treated as given name, last word as family name
        f_name = name_parts[0]
        l_name = name_parts[-1]

        try:
            orcid_info = information_linking_orcid(f_name, l_name)
            author_list.append({'full_name': a, 'given_name': orcid_info['given-names'], 'family_name': orcid_info['family-names'], 'orcid': orcid_info['orcid-id'], 'affiliations': orcid_info['institution-name'], 'email': orcid_info['email']})
        except Exception:
            # Best effort: any lookup failure falls back to the name-only record.
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still stop the run.
            author_list.append(_unlinked(a, f_name, l_name))
    meta['creator'] = author_list

    return l_row

def link_method_info(l_row):
    """Enrich every METHOD span of ``l_row`` with linked knowledge-base information.

    For each span labelled ``"METHOD"``, ``information_linking`` is called with
    the span's ``"wikipedia_url"`` and five fields of its result are copied onto
    the span. Spans with other labels are not touched. Returns the same row
    object, updated in place.
    """
    # (key written on the span, key read from the lookup result), in write order
    copied_fields = (
        ("description", "description"),
        ("proper_name", "label"),
        ("aliases", "aliases"),
        ("wikidata_url", "wikidata"),
        ("dbpedia_url", "dbpedia"),
    )

    for span in l_row.get("spans", []):
        if span.get("label") != "METHOD":
            continue

        method_info = information_linking(wikipedia_url=span["wikipedia_url"])
        for span_key, info_key in copied_fields:
            span[span_key] = method_info[info_key]

    return l_row

#### module call

In [None]:
# Read every row of the disambiguated dataset into memory
in_data = list(srsly.read_jsonl(el_input_path))
linked_data = []

for row in tqdm(in_data):
    # Authors first, then METHOD spans; both helpers return the row they were given
    row = link_author_info(row)
    row = link_method_info(row)
    linked_data.append(row)

srsly.write_jsonl(el_output_path, linked_data)

### Relation Extraction

#### module initialisation

In [None]:
# Input and output JSONL files of the Relation Extraction stage.
# The input is the file written by the Entity Linking stage.
re_input_path = "./Dataset/example_subset_20_EE_ED_EL.jsonl"
re_output_path = "./Dataset/example_subset_20_EE_ED_EL_RE.jsonl"

#### module functions

In [None]:
def is_overlapping(a_start, a_end, m_start, m_end):
    """Return True when the ranges [a_start, a_end) and [m_start, m_end) share at least one position.

    Ranges that only touch (one ends where the other starts) do not overlap.
    """
    latest_start = max(a_start, m_start)
    earliest_end = min(a_end, m_end)
    return latest_start < earliest_end

def relation_extraction_employs(spans, text):
    """Build EMPLOYS relations between overlapping ACTIVITY and METHOD spans.

    Parameters
    ----------
    spans : list of span dicts with ``"start_char"``, ``"end_char"`` and ``"label"``.
    text : the text the character offsets refer to.

    Returns
    -------
    list of relation dicts, one per (ACTIVITY, METHOD) pair whose character
    ranges overlap according to ``is_overlapping``. The ACTIVITY is the
    ``"domain"`` and the METHOD is the ``"range"``; pairs are ordered by
    activity first, then method, following the order of ``spans``.
    """
    def _bounds(wanted_label):
        # (start_char, end_char, label) for every span carrying the wanted label
        return [
            (s.get("start_char"), s.get("end_char"), s.get("label"))
            for s in spans
            if s.get("label") == wanted_label
        ]

    activities = _bounds("ACTIVITY")
    methods = _bounds("METHOD")

    relation = []
    for act_start, act_end, act_label in activities:
        for meth_start, meth_end, meth_label in methods:
            if not is_overlapping(act_start, act_end, meth_start, meth_end):
                continue
            relation.append({
                "domain": {
                    "start_char": act_start,
                    "end_char": act_end,
                    "span": text[act_start:act_end],
                    "label": act_label,
                },
                "range": {
                    "start_char": meth_start,
                    "end_char": meth_end,
                    "span": text[meth_start:meth_end],
                    "label": meth_label,
                },
                "label": "EMPLOYS",
            })
    return relation

def relation_extraction_hasObjective(spans, text):
    """Build HAS_OBJECTIVE relations between every ACTIVITY and every GOAL span.

    Parameters
    ----------
    spans : list of span dicts with ``"start_char"``, ``"end_char"`` and ``"label"``.
    text : the text the character offsets refer to.

    Returns
    -------
    list of relation dicts, one per (ACTIVITY, GOAL) pair -- no positional
    condition is applied. The ACTIVITY is the ``"domain"`` and the GOAL is the
    ``"range"``; pairs are ordered by activity first, then goal, following the
    order of ``spans``.
    """
    def _bounds(wanted_label):
        # (start_char, end_char, label) for every span carrying the wanted label
        return [
            (s.get("start_char"), s.get("end_char"), s.get("label"))
            for s in spans
            if s.get("label") == wanted_label
        ]

    def _side(begin, stop, tag):
        # One end of a relation: offsets, covered text and entity label
        return {"start_char": begin, "end_char": stop, "span": text[begin:stop], "label": tag}

    goals = _bounds("GOAL")
    return [
        {"domain": _side(*activity), "range": _side(*goal), "label": "HAS_OBJECTIVE"}
        for activity in _bounds("ACTIVITY")
        for goal in goals
    ]

#### module call

In [None]:
# Read every row of the linked dataset into memory so relations can be added in place
data = list(srsly.read_jsonl(re_input_path))
for i in tqdm(data):
    text = i.get("text")
    # A row without a "spans" key (or with a null value) simply has no relations;
    # previously iterating over None raised TypeError and aborted the whole run.
    spans = i.get("spans") or []
    # Labels present in this row, used to skip extractors that cannot match
    checker = [span.get("label") for span in spans]
    relations_employs = []
    relations_hasObjective = []
    if "ACTIVITY" in checker and "METHOD" in checker:
        relations_employs = relation_extraction_employs(spans, text)
    if "ACTIVITY" in checker and "GOAL" in checker:
        relations_hasObjective = relation_extraction_hasObjective(spans, text)
    # EMPLOYS relations come first, then HAS_OBJECTIVE; empty list when nothing matched
    i["relations"] = relations_employs + relations_hasObjective

srsly.write_jsonl(re_output_path, data)