# Research Spotlight Demo (Google Colab)

### Requirements

In [None]:
# Around 2 - 5 minutes depending on the internet connection speed
# If you can, switch to GPU for faster inference (Change runtime -> T4 GPU -> Connect)
%%capture
!pip install -r "https://raw.githubusercontent.com/athenarc/research-spotlight/refs/heads/main/MLE/requirements.txt"

# Google Colab setup
import os
os.makedirs("Dataset", exist_ok=True)
!wget https://raw.githubusercontent.com/athenarc/research-spotlight/refs/heads/main/MLE/example_subset_10.jsonl -P /content/Dataset
!wget https://huggingface.co/facebook/genre-linking-blink/raw/main/trie.py
!wget https://huggingface.co/facebook/genre-linking-blink/resolve/main/kilt_titles_trie_dict.pkl -P /content/Dataset

### Entity Extraction

#### Module initialisation

In [None]:
import spacy
import srsly
from tqdm import tqdm

# Setup module paths
# One spaCy pipeline name per entity type; each is handed to spacy.load() inside NER().
m_ner_model_path = "en_deberta_v3_base_ner_method"
a_ner_model_path = "en_deberta_v3_base_ner_activity"
g_ner_model_path = "en_deberta_v3_base_ner_goal"

# Setup input and output paths
# The output written here is the input file of the Entity Disambiguation stage.
ee_input_path = "./Dataset/example_subset_10.jsonl"
ee_output_path = "./Dataset/example_subset_10_EE.jsonl"

#### Module functions

In [None]:
def NER(model_path, in_data, entity):
    """Tag every row of ``in_data`` with the entities found by one spaCy pipeline.

    Args:
        model_path: Name or path given to ``spacy.load``.
        in_data: Iterable of dict rows, each with a ``"text"`` key. Rows are
            modified in place.
        entity: Label written on every span found by this pipeline.

    Returns:
        A list holding the same row objects, each with the new spans appended
        to (or stored as) ``row["spans"]`` and the annotator/session ids set
        to ``"NER"``.
    """
    print("NER for:", entity)
    pipeline = spacy.load(model_path)

    def _describe(text, ent):
        # Character offsets, token offsets and the surface form of one entity.
        return {
            "start": ent.start_char,
            "end": ent.end_char,
            "token_start": ent.start,
            "token_end": ent.end,
            "mention": text[ent.start_char:ent.end_char],
            "label": entity,
        }

    tagged_rows = []
    for record in tqdm(in_data):
        parsed = pipeline(record["text"])
        found = [_describe(record["text"], ent) for ent in parsed.ents]

        # Keep spans written by an earlier NER pass over the same row.
        if "spans" in record:
            record["spans"] += found
        else:
            record["spans"] = found

        record["_annotator_id"] = "NER"
        record["_session_id"] = "NER"
        tagged_rows.append(record)

    return tagged_rows

#### Module call

In [None]:
# Rows are read once and passed through the three NER pipelines in sequence.
input_data = srsly.read_jsonl(ee_input_path)

# NER() mutates each row in place and returns a list of those same row objects,
# so the three lists below share their rows; after the last call every row
# carries METHOD, ACTIVITY and GOAL spans (in that order) under "spans".
sents_with_M = NER(m_ner_model_path, input_data, "METHOD")
sents_with_A = NER(a_ner_model_path, sents_with_M, "ACTIVITY")
sents_with_G = NER(g_ner_model_path, sents_with_A, "GOAL")

# Persist the annotated rows for the Entity Disambiguation stage.
srsly.write_jsonl(ee_output_path, sents_with_G)

In [None]:
# EXAMPLE: Visualize the NER
import spacy
from spacy import displacy
from spacy.tokens import Span

# Row to display. Every span found for it is drawn, so the cell no longer
# fails with an IndexError when the row has fewer than three spans.
example_row = sents_with_G[5]

# NOTE(review): the stored token offsets come from the NER pipelines; this
# assumes the blank English tokenizer splits the text the same way.
nlp = spacy.blank("en")
doc = nlp(example_row["text"])

doc.spans["sc"] = [
    Span(doc, found["token_start"], found["token_end"], found["label"])
    for found in example_row["spans"]
]

colors = {"METHOD":"cyan", "ACTIVITY":"orange","GOAL":"lime"}
options = {"colors": colors}

displacy.render(doc, jupyter=True, style="span", options=options)

### Entity Disambiguation

#### Module initialisation

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pickle
from trie import Trie
import warnings
import requests
import uuid
import os
from transformers import logging

# Disable HuggingFace & hub progress bars
# (environment variables plus the transformers logging switches)
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
logging.disable_progress_bar()
logging.set_verbosity_error()

# Suppress output warnings
# Only UserWarnings raised from huggingface_hub.utils._auth are silenced.
warnings.filterwarnings("ignore", category=UserWarning, module="huggingface_hub.utils._auth")

# Load Wikipedia trie
# SECURITY NOTE(review): pickle.load executes arbitrary code embedded in the
# file. This pickle is downloaded by the setup cell; only load it from a
# source you trust.
# The absolute /content path assumes the Colab working directory.
with open("/content/Dataset/kilt_titles_trie_dict.pkl", "rb") as f:
    trie = Trie.load_from_dict(pickle.load(f))

# Setup input and output paths
# Input is the Entity Extraction output; output feeds the Entity Linking stage.
ed_input_path = "./Dataset/example_subset_10_EE.jsonl"
ed_output_path = "./Dataset/example_subset_10_EE_ED.jsonl"

#### Module functions

In [None]:
# Tokenizer and seq2seq model are loaded once at module level; the model is
# put in eval mode. safe_prefix_fn reads `tokenizer` as a global.
tokenizer = AutoTokenizer.from_pretrained("facebook/genre-linking-blink")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/genre-linking-blink").eval()

# Sent with every Wikipedia API request; the id is a fresh random UUID each
# time this cell runs.
headers = {"User-Agent": f"GenreDisambiguationBot/1.0 (id={uuid.uuid4()})"}

def safe_prefix_fn(batch_id, sent):
    """Return the token ids allowed after the prefix ``sent``.

    Used as ``prefix_allowed_tokens_fn`` during generation. The candidates
    come from the module-level ``trie``; when the trie offers nothing for
    this prefix, only the EOS token is allowed so the beam can terminate.

    Args:
        batch_id: Index of the batch item (unused).
        sent: Tensor-like prefix; converted with ``.tolist()`` for the lookup.

    Returns:
        The trie's result, or ``[tokenizer.eos_token_id]`` if that is empty.
    """
    candidates = trie.get(sent.tolist())
    return candidates if candidates else [tokenizer.eos_token_id]

def genre_entity_disambiguation(model, tokenizer, text, start, end):
  """Link one mention to a Wikipedia page and return a ``curid`` URL.

  The mention ``text[start:end]`` is wrapped in ``[START_ENT]``/``[END_ENT]``
  markers, a page title is generated with constrained beam search
  (``safe_prefix_fn``), and the title is resolved to a page id through the
  Wikipedia query API.

  Args:
    model: Seq2seq model exposing ``generate``.
    tokenizer: Tokenizer matching ``model``.
    text: Full sentence text.
    start: Character offset where the mention starts.
    end: Character offset where the mention ends (exclusive).

  Returns:
    ``https://en.wikipedia.org/wiki?curid=<id>``, where ``<id>`` is the first
    key of the API's ``pages`` mapping.
    NOTE(review): for unknown titles the API is expected to return a negative
    placeholder id - confirm how downstream linking should treat it.

  Raises:
    requests.RequestException: On network failure, or after the 30 s timeout.
    KeyError: If the API response has no ``query``/``pages`` entry.
  """
  sentence = [text[:start] + "[START_ENT] " + text[start:end] + " [END_ENT]" + text[end:]]
  outputs = model.generate(
    **tokenizer(sentence, return_tensors="pt"),
    num_beams=5,
    num_return_sequences=1,
    prefix_allowed_tokens_fn=safe_prefix_fn
  )
  title = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

  # Let requests URL-encode the title: characters such as '&', '+' or '#'
  # would otherwise corrupt the hand-built query string.
  query_params = {
    "action": "query",
    "prop": "info",
    "inprop": "subjectid",
    "titles": title,
    "format": "json",
  }
  # A timeout keeps one stalled request from hanging the whole loop.
  response = requests.get(
    "https://en.wikipedia.org/w/api.php",
    params=query_params,
    headers=headers,
    timeout=30,
  )
  pages = response.json()["query"]["pages"]
  page_id = list(pages.keys())[0]

  # Built outside the f-string so no quotes are nested inside it
  # (nested same-type quotes are a SyntaxError before Python 3.12).
  return f"https://en.wikipedia.org/wiki?curid={page_id}"


#### Module call

In [None]:
# Around 5 minutes
# Every METHOD span gets a "wikipedia_url"; other spans are left untouched.
data = list(srsly.read_jsonl(ed_input_path))
for item in tqdm(data):
    text = item.get("text")
    for span in item.get("spans", []):
        if span.get("label") != "METHOD":
            continue
        span["wikipedia_url"] = genre_entity_disambiguation(
            model, tokenizer, text, span["start"], span["end"]
        )

srsly.write_jsonl(ed_output_path, data)

### Entity Linking

#### Module initialisation

In [None]:
import srsly
from tqdm import tqdm
from information_linking_queries.information_linking_orcid import information_linking_orcid
from information_linking_queries.information_linking_apis import information_linking

# Setup input and output paths
# Input is the Entity Disambiguation output; output feeds Relation Extraction.
el_input_path = "./Dataset/example_subset_10_EE_ED.jsonl"
el_output_path = "./Dataset/example_subset_10_EE_ED_EL.jsonl"

#### Module functions

In [None]:
def link_author_info(l_row):
    """Replace the author-name strings of a row with enriched author records.

    Each name in ``l_row["meta"]["creator"]`` is split on whitespace; the
    first and last words are sent to ``information_linking_orcid``. When the
    lookup fails, returns an incomplete record, or the name is blank, a
    fallback record is stored with ``'None'`` placeholders.

    Args:
        l_row: Row dict with a ``"meta"`` dict; modified in place.

    Returns:
        The same row, with ``l_row["meta"]["creator"]`` now a list of dicts
        with keys ``full_name``, ``given_name``, ``family_name``, ``orcid``,
        ``affiliations`` and ``email``.
    """
    author_list = []
    for a in l_row.get("meta").get("creator", []):
        # First and last whitespace-separated words of the name. A blank name
        # has neither, so it skips the lookup instead of raising IndexError.
        name_parts = a.split()
        f_name = name_parts[0] if name_parts else ""
        l_name = name_parts[-1] if name_parts else ""

        author = None
        if name_parts:
            try:
                orcid_info = information_linking_orcid(f_name, l_name)
                author = {'full_name':a, 'given_name': orcid_info['given-names'], 'family_name':orcid_info['family-names'], 'orcid': orcid_info['orcid-id'], 'affiliations':orcid_info['institution-name'], 'email':orcid_info['email']}
            except Exception:
                # Best effort: a failed or incomplete lookup falls back to the
                # name-only record. `Exception` (not a bare except) lets
                # KeyboardInterrupt and SystemExit propagate.
                author = None

        if author is None:
            author = {'full_name':a, 'given_name': f_name.capitalize(), 'family_name':l_name.capitalize(), 'orcid': 'None', 'affiliations':'None', 'email':'None'}
        author_list.append(author)
    l_row['meta']['creator'] = author_list

    return l_row

def link_method_info(l_row):
    """Enrich every METHOD span of a row with knowledge-base metadata.

    For each span labelled ``"METHOD"``, ``information_linking`` is called
    with the span's ``"wikipedia_url"`` and selected fields of the result are
    copied onto the span.

    Args:
        l_row: Row dict; spans under ``"spans"`` are modified in place.

    Returns:
        The same row object.
    """
    # (key written on the span, key read from the lookup result)
    copied_fields = (
        ("description", "description"),
        ("proper_name", "label"),
        ("aliases", "aliases"),
        ("wikidata_url", "wikidata"),
        ("dbpedia_url", "dbpedia"),
    )

    for span in l_row.get("spans", []):
        if span.get("label") != "METHOD":
            continue

        method_info = information_linking(wikipedia_url=span["wikipedia_url"])
        for span_key, info_key in copied_fields:
            span[span_key] = method_info[info_key]

    return l_row

#### Module call

In [None]:
# Read the disambiguated rows, enrich authors and METHOD spans, then persist.
in_data = list(srsly.read_jsonl(el_input_path))
linked_data = []

# Both helpers modify the row in place and return it, so `linked_data` ends up
# holding the same row objects as `in_data`.
for row in tqdm(in_data):
    row = link_author_info(row)
    row = link_method_info(row)
    linked_data.append(row)

srsly.write_jsonl(el_output_path, linked_data)

In [None]:
# Visualize the Entity Disambiguation for methods
from spacy import displacy

example_row = linked_data[6]

# Every METHOD span is shown with its *own* Wikidata link (the previous version
# reused the first span's link for all three entities). Spans without a usable
# link are skipped, and entities are ordered by position for the renderer.
method_ents = [
    {
        "start": found["start"],
        "end": found["end"],
        "label": found["label"],
        "kb_id": found["wikidata_url"].split("/")[-1],
        "kb_url": found["wikidata_url"],
    }
    for found in sorted(example_row["spans"], key=lambda s: (s["start"], s["end"]))
    if found.get("label") == "METHOD" and found.get("wikidata_url") not in (None, "None")
]

example = [{"text": example_row["text"], "ents": method_ents}]

options = {"colors": {"METHOD":"cyan"}}

displacy.render(example, style="ent", jupyter=True, manual=True, options=options)

### Relation Extraction

#### Module initialisation

In [None]:
# Setup input and output paths
# Input is the Entity Linking output; the output is read by the RDF conversion.
re_input_path = "./Dataset/example_subset_10_EE_ED_EL.jsonl"
re_output_path = "./Dataset/example_subset_10_EE_ED_EL_RE.jsonl"

#### Module functions

In [None]:
def is_overlapping(a_start, a_end, m_start, m_end):
    """Tell whether two half-open ranges share at least one position.

    Ranges that merely touch (one ends where the other starts) do not overlap.
    """
    latest_start = max(a_start, m_start)
    earliest_end = min(a_end, m_end)
    return latest_start < earliest_end

def relation_extraction_employs(spans):
    """Build EMPLOYS relations between overlapping ACTIVITY and METHOD spans.

    Overlap is tested on the character offsets (``start``/``end``); the
    offsets stored in each relation are the token offsets
    (``token_start``/``token_end``).

    Args:
        spans: List of span dicts with ``label``, ``start``, ``end``,
            ``token_start`` and ``token_end`` keys.

    Returns:
        One ``{"domain": ..., "range": ..., "label": "EMPLOYS"}`` dict per
        overlapping (activity, method) pair, activities in input order.
    """
    grouped = {"ACTIVITY": [], "METHOD": []}
    for candidate in spans:
        bucket = grouped.get(candidate.get("label"))
        if bucket is not None:
            bucket.append(candidate)

    found = []
    for activity in grouped["ACTIVITY"]:
        for method in grouped["METHOD"]:
            if not is_overlapping(activity.get("start"), activity.get("end"),
                                  method.get("start"), method.get("end")):
                continue
            found.append({
                "domain": {
                    "start": activity.get("token_start"),
                    "end": activity.get("token_end"),
                    "label": activity.get("label"),
                },
                "range": {
                    "start": method.get("token_start"),
                    "end": method.get("token_end"),
                    "label": method.get("label"),
                },
                "label": "EMPLOYS",
            })
    return found

def relation_extraction_hasObjective(spans):
    """Build HAS_OBJECTIVE relations between every ACTIVITY and every GOAL span.

    No overlap test is applied: each activity of the sentence is paired with
    each goal. The offsets stored in a relation are token offsets
    (``token_start``/``token_end``).

    Args:
        spans: List of span dicts with ``label``, ``token_start`` and
            ``token_end`` keys.

    Returns:
        One ``{"domain": ..., "range": ..., "label": "HAS_OBJECTIVE"}`` dict
        per (activity, goal) pair; the domain describes the activity and the
        range describes the goal.
    """
    activities = [s for s in spans if s.get("label") == "ACTIVITY"]
    goals = [s for s in spans if s.get("label") == "GOAL"]

    relation = []
    for activity in activities:
        for goal in goals:
            relation.append({
                "domain": {
                    "start": activity.get("token_start"),
                    "end": activity.get("token_end"),
                    "label": activity.get("label"),
                },
                # The range offsets belong to the goal (they were previously
                # copied from the activity by mistake).
                "range": {
                    "start": goal.get("token_start"),
                    "end": goal.get("token_end"),
                    "label": goal.get("label"),
                },
                "label": "HAS_OBJECTIVE",
            })
    return relation

#### Module call

In [None]:
# Attach a "relations" list (EMPLOYS first, then HAS_OBJECTIVE) to every row.
data = list(srsly.read_jsonl(re_input_path))
for record in tqdm(data):
  record_spans = record.get("spans")
  labels_present = [found.get("label") for found in record_spans]
  has_activity = "ACTIVITY" in labels_present

  # Each extractor only runs when both of its entity types occur in the row.
  employs = []
  if has_activity and "METHOD" in labels_present:
    employs = relation_extraction_employs(record_spans)

  objectives = []
  if has_activity and "GOAL" in labels_present:
    objectives = relation_extraction_hasObjective(record_spans)

  record["relations"] = employs + objectives

srsly.write_jsonl(re_output_path, data)

In [None]:
# Visualize the Relation Extraction module [WORK IN PROGRESS, MAY CHANGE]
import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp(data[3]["text"])

# Only the first relation of row 3 is drawn. The "start"/"end" values stored in
# a relation are token offsets, which is what Span() expects.
# The third Span re-marks the domain (activity) span with the relation label.
# NOTE(review): the colour map below has no entry for GOAL / HAS_OBJECTIVE, so
# this presumably expects the first relation to be an EMPLOYS one - confirm.
doc.spans["sc"] = [
    Span(doc, data[3]["relations"][0]["domain"]["start"], data[3]["relations"][0]["domain"]["end"], data[3]["relations"][0]["domain"]["label"]),
    Span(doc, data[3]["relations"][0]["range"]["start"], data[3]["relations"][0]["range"]["end"], data[3]["relations"][0]["range"]["label"]),
    Span(doc, data[3]["relations"][0]["domain"]["start"], data[3]["relations"][0]["domain"]["end"], data[3]["relations"][0]["label"])
]

colors = {"METHOD":"cyan", "ACTIVITY":"orange", "EMPLOYS":"lime"}
options = {"colors": colors}

displacy.render(doc, jupyter=True, style="span", options=options)

### RDF conversion

#### Module initialisation

In [None]:
import srsly
from tqdm import tqdm
import re
import requests
from rdflib import Graph, Namespace, RDF, URIRef, RDFS, Literal, OWL

#### Module functions

In [None]:
def create_sentence_triples(jstor_row, article_uri, schema_ns, instances_ns, GraphObject):
    """Add the Sentence node of a row to the graph, unless it already exists.

    Args:
        jstor_row: Row dict with ``meta.sent_no`` and ``text``.
        article_uri: URI (string) of the article the sentence belongs to.
        schema_ns: Namespace of the schema classes and properties.
        instances_ns: Namespace under which instance URIs are minted.
        GraphObject: rdflib graph; modified in place.

    Returns:
        ``(GraphObject, sent_uri)`` where ``sent_uri`` is the sentence URI as
        a string.
    """
    # Read from the row that was passed in, not from a global `row`.
    sent_uri = str(instances_ns)+'Sentence/'+str(jstor_row['meta']['sent_no'])
    sent_ref = URIRef(sent_uri)

    # Direct membership test against the namespace actually used for typing.
    # The earlier SPARQL query used a different schema prefix, so it never
    # matched anything and cost a full query per row.
    if (sent_ref, RDF.type, schema_ns.Sentence) not in GraphObject:
        GraphObject.add((sent_ref, RDF.type, schema_ns.Sentence))
        GraphObject.add((sent_ref, schema_ns.sentence_text, Literal(jstor_row['text']) ))
        GraphObject.add((sent_ref, schema_ns.is_part_of, URIRef(article_uri) ))

    return(GraphObject, sent_uri)


def create_article_triples(jstor_row, schema_ns, instances_ns, GraphObject):
    """Add the Article node of a row, with its metadata, unless it already exists.

    The article URI is built from the last ``/``-separated segment of
    ``meta.id``. The title is required; all other metadata is added only when
    present in ``meta``.

    Args:
        jstor_row: Row dict with a ``meta`` dict.
        schema_ns: Namespace of the schema classes and properties.
        instances_ns: Namespace under which instance URIs are minted.
        GraphObject: rdflib graph; modified in place.

    Returns:
        ``(GraphObject, article_uri)`` where ``article_uri`` is a string.
    """
    meta = jstor_row['meta']

    # Read from the row that was passed in, not from a global `row`.
    article_uri = str(instances_ns)+'Article/'+re.sub('.*/','',meta['id'])
    article_ref = URIRef(article_uri)

    # Direct membership test against the namespace actually used for typing.
    # The earlier SPARQL query used a different schema prefix, so it never
    # matched anything and cost a full query per row.
    if (article_ref, RDF.type, schema_ns.Article) not in GraphObject:
        GraphObject.add((article_ref, RDF.type, schema_ns.Article))
        GraphObject.add((article_ref, schema_ns.title, Literal(meta['title']) ))
        if 'url' in meta:
            GraphObject.add((article_ref, schema_ns.article_URL, Literal(meta['url']) ))

        # The DOI is the value of the first identifier entry named 'local_doi'.
        local_doi = None
        for id_dict in meta.get('identifier', []):
            if id_dict.get('name') == 'local_doi':
                local_doi = id_dict.get('value')
                break
        if local_doi:
            GraphObject.add((article_ref, schema_ns.article_DOI, Literal(local_doi)))

        # Remaining optional metadata: (key in meta, schema property).
        optional_fields = (
            ('datePublished', schema_ns.publication_date),
            ('publicationYear', schema_ns.publication_year),
            ('issueNumber', schema_ns.issue_number),
            ('publisher', schema_ns.publisher),
            ('pageCount', schema_ns.page_count),
            ('docType', schema_ns.doctype),
        )
        for meta_key, predicate in optional_fields:
            if meta_key in meta:
                GraphObject.add((article_ref, predicate, Literal(meta[meta_key]) ))

    return(GraphObject, article_uri)


def create_aggregation_triples(jstor_row, article_uri, schema_ns, instances_ns, GraphObject):
    """Link an article to the aggregation named in ``meta.isPartOf``.

    The Aggregation node is created on first use; the ``is_member_of`` link
    from the article is always added.

    Args:
        jstor_row: Row dict; ``meta.isPartOf`` must be a string.
        article_uri: URI (string) of the article.
        schema_ns: Namespace of the schema classes and properties.
        instances_ns: Namespace under which instance URIs are minted.
        GraphObject: rdflib graph; modified in place.

    Returns:
        The graph.

    Raises:
        KeyError: If ``meta`` has no ``isPartOf`` entry.
    """
    aggregation_name = jstor_row['meta']['isPartOf']
    aggregation_ref = URIRef(str(instances_ns)+'Aggregation/'+re.sub(' ','_',aggregation_name))

    # Direct membership test against the namespace actually used for typing.
    # The earlier SPARQL query used a different schema prefix, so it never
    # matched anything and cost a full query per row.
    if (aggregation_ref, RDF.type, schema_ns.Aggregation) not in GraphObject:
        GraphObject.add((aggregation_ref, RDF.type, schema_ns.Aggregation))
        GraphObject.add((aggregation_ref, schema_ns.aggregation_name, Literal(aggregation_name) ))

    GraphObject.add((URIRef(article_uri), schema_ns.is_member_of, aggregation_ref ))

    return(GraphObject)


def create_topic_triples(jstor_row, article_uri, schema_ns, instances_ns, GraphObject):
    """Link every topic in ``meta.topics`` to the article.

    A Topic node is created the first time a topic name is seen; the
    ``is_topic_of`` link to the article is always added.

    Args:
        jstor_row: Row dict; ``meta.topics`` must be an iterable of strings.
        article_uri: URI (string) of the article.
        schema_ns: Namespace of the schema classes and properties.
        instances_ns: Namespace under which instance URIs are minted.
        GraphObject: rdflib graph; modified in place.

    Returns:
        The graph.

    Raises:
        KeyError: If ``meta`` has no ``topics`` entry.
    """
    article_ref = URIRef(article_uri)

    for t in jstor_row['meta']['topics']:
        topic_ref = URIRef(str(instances_ns)+'Topic/'+re.sub(' ','_',t))

        # Direct membership test against the namespace actually used for
        # typing. The earlier SPARQL query used a different schema prefix, so
        # it never matched anything and cost a full query per row.
        if (topic_ref, RDF.type, schema_ns.Topic) not in GraphObject:
            GraphObject.add((topic_ref, RDF.type, schema_ns.Topic))
            GraphObject.add((topic_ref, schema_ns.topic_name, Literal(t) ))

        GraphObject.add((topic_ref, schema_ns.is_topic_of, article_ref ))

    return(GraphObject)


def create_author_organization_triples(jstor_row, article_uri, schema_ns, instances_ns, GraphObject):
    """Add Person (and Organization) nodes for the authors of a row.

    Authors with an ORCID get a URI built from it, plus their affiliations;
    authors without one get a URI built from their full name. Every author is
    linked to the article with ``is_author_of``.

    Args:
        jstor_row: Row dict whose ``meta.creator`` is a list of author dicts
            with ``orcid``, ``full_name``, ``family_name``, ``given_name`` and
            ``affiliations`` keys.
        article_uri: URI (string) of the article.
        schema_ns: Namespace of the schema classes and properties.
        instances_ns: Namespace under which instance URIs are minted.
        GraphObject: rdflib graph; modified in place.

    Returns:
        ``(GraphObject, authors)`` where ``authors`` lists the author URIs
        (strings) in input order.
    """
    missing = (None, 'None')
    article_ref = URIRef(article_uri)

    authors = []
    for crt in jstor_row['meta']['creator']:
        # A null ORCID is treated like the 'None' placeholder, so such authors
        # do not all collapse onto one shared ".../Person/None" node.
        has_orcid = crt['orcid'] not in missing
        if has_orcid:
            author_uri = str(instances_ns)+'Person/'+str(crt['orcid'])
        else:
            author_uri = str(instances_ns)+'Person/' + re.sub(' ','_',crt['full_name'])
        author_ref = URIRef(author_uri)

        # Direct membership test against the namespace actually used for
        # typing. The earlier SPARQL queries used a different schema prefix,
        # so they never matched anything and cost two full queries per row.
        if (author_ref, RDF.type, schema_ns.Person) not in GraphObject:
            if has_orcid:
                GraphObject.add((author_ref, schema_ns.orcid, Literal(crt['orcid']) ))
            GraphObject.add((author_ref, RDF.type, schema_ns.Person))
            GraphObject.add((author_ref, schema_ns.full_name, Literal(crt['full_name']) ))
            GraphObject.add((author_ref, schema_ns.family_name, Literal(crt['family_name']) ))
            GraphObject.add((author_ref, schema_ns.given_name, Literal(crt['given_name']) ))

            if has_orcid:
                affiliations = crt['affiliations']
                # A bare string would be iterated character by character;
                # treat it as a single organisation name (or as none, for the
                # 'None' placeholder).
                if isinstance(affiliations, str):
                    affiliations = [] if affiliations == 'None' else [affiliations]

                for org in affiliations or []:
                    org_ref = URIRef(str(instances_ns)+'Organization/'+re.sub(' ', '_', org))
                    if (org_ref, RDF.type, schema_ns.Organization) not in GraphObject:
                        GraphObject.add((org_ref, RDF.type, schema_ns.Organization))
                        GraphObject.add((org_ref, schema_ns.organization_name, Literal(org) ))
                    GraphObject.add((author_ref, schema_ns.is_affiliated_to, org_ref ))

        GraphObject.add((author_ref, schema_ns.is_author_of, article_ref ))

        authors.append(author_uri)

    return(GraphObject, authors)


def create_activity_triples(row, act_span, author_uris, sent_uri, schema_ns, instances_ns, GraphObject):
    """Add one Activity node for a span and connect every author to it.

    The activity URI is ``<instances_ns>Activity/<sent_no>_<start>_<end>``.

    Args:
        row: Row dict with ``meta.sent_no`` and ``text``.
        act_span: Span dict with character offsets ``start`` and ``end``.
        author_uris: URIs (strings) of the article's authors.
        sent_uri: URI (string) of the sentence containing the span.
        schema_ns: Namespace of the schema classes and properties.
        instances_ns: Namespace under which instance URIs are minted.
        GraphObject: rdflib graph; modified in place.

    Returns:
        The graph.
    """
    sent_no = row['meta']['sent_no']
    begin, finish = act_span['start'], act_span['end']
    activity_ref = URIRef(
        str(instances_ns) + 'Activity/' + '_'.join(str(part) for part in (sent_no, begin, finish))
    )

    node_facts = (
        (RDF.type, schema_ns.Activity),
        (schema_ns.textual_span, Literal(row['text'][begin:finish])),
        (schema_ns.has_sentence_context, URIRef(sent_uri)),
        (schema_ns.begin_index, Literal(begin)),
        (schema_ns.end_index, Literal(finish)),
    )
    for predicate, value in node_facts:
        GraphObject.add((activity_ref, predicate, value))

    # Every author of the article participates in the activity.
    for author in author_uris:
        GraphObject.add((URIRef(author), schema_ns.participates_in, activity_ref))

    return GraphObject

def create_goal_triples(row, g_span, author_uris, sent_uri, schema_ns, instances_ns, GraphObject):
    """Add one Goal node for a span and connect every author to it.

    The goal URI is ``<instances_ns>Goal/<sent_no>_<start>_<end>``.

    Args:
        row: Row dict with ``meta.sent_no`` and ``text``.
        g_span: Span dict with character offsets ``start`` and ``end``.
        author_uris: URIs (strings) of the article's authors.
        sent_uri: URI (string) of the sentence containing the span.
        schema_ns: Namespace of the schema classes and properties.
        instances_ns: Namespace under which instance URIs are minted.
        GraphObject: rdflib graph; modified in place.

    Returns:
        The graph.
    """
    sent_no = row['meta']['sent_no']
    begin, finish = g_span['start'], g_span['end']
    goal_ref = URIRef(
        str(instances_ns) + 'Goal/' + '_'.join(str(part) for part in (sent_no, begin, finish))
    )

    node_facts = (
        (RDF.type, schema_ns.Goal),
        (schema_ns.textual_span, Literal(row['text'][begin:finish])),
        (schema_ns.has_sentence_context, URIRef(sent_uri)),
        (schema_ns.begin_index, Literal(begin)),
        (schema_ns.end_index, Literal(finish)),
    )
    for predicate, value in node_facts:
        GraphObject.add((goal_ref, predicate, value))

    # Every author of the article is credited with the goal.
    for author in author_uris:
        GraphObject.add((URIRef(author), schema_ns.has_goal, goal_ref))

    return GraphObject


def create_method_triples(m_span, sent_uri, sent_no, schema_ns, instances_ns, GraphObject, sentence_text):
    """
    Create a unique Method node for each occurrence in a sentence.

    The URI is ``<instances_ns>Method/<key>_<sent_no>_<start>_<end>`` where
    ``<key>`` is the span's ``qid`` when it has one, otherwise its name with
    spaces replaced by underscores.

    Args:
        m_span: METHOD span dict with ``start``/``end`` character offsets and
            optional ``proper_name``, ``qid``, ``wikidata_url``,
            ``description``, ``aliases`` and ``wikipedia_url`` entries.
        sent_uri: URI (string) of the sentence containing the span.
        sent_no: Sentence number used in the URI.
        schema_ns: Namespace of the schema classes and properties.
        instances_ns: Namespace under which instance URIs are minted.
        GraphObject: rdflib graph; modified in place.
        sentence_text: Full text of the sentence.

    Returns:
        ``(m_uri, GraphObject)`` where ``m_uri`` is a string.
    """
    # Placeholders that mean "no value": a null and the 'None' string.
    missing = (None, 'None')

    surface_text = sentence_text[m_span['start']:m_span['end']]

    # `qid` uses the same "missing" test as the other optional fields, so a
    # null qid no longer produces a ".../Method/None_..." URI.
    qid = m_span.get('qid')
    has_qid = qid not in missing

    # Fall back to the mention text when linking produced no proper name, so
    # the URI and method_name are still meaningful (a null name used to crash
    # re.sub).
    proper_name = m_span.get('proper_name')
    if proper_name in missing:
        proper_name = surface_text
    proper_name = str(proper_name)

    # Make URI unique per sentence & span
    position_suffix = "_" + str(sent_no) + "_" + str(m_span["start"]) + "_" + str(m_span["end"])
    if has_qid:
        m_uri = str(instances_ns) + 'Method/' + str(qid) + position_suffix
    else:
        m_uri = str(instances_ns) + 'Method/' + re.sub(' ', '_', proper_name) + position_suffix
    m_ref = URIRef(m_uri)

    # Create the node
    GraphObject.add((m_ref, RDF.type, schema_ns.Method))
    GraphObject.add((m_ref, schema_ns.method_name, Literal(proper_name)))
    GraphObject.add((m_ref, schema_ns.textual_span, Literal(surface_text)))
    GraphObject.add((m_ref, schema_ns.has_sentence_context, URIRef(sent_uri)))

    # Optional metadata
    if m_span.get('wikidata_url') not in missing:
        GraphObject.add((m_ref, schema_ns.wikidata_url, Literal(str(m_span['wikidata_url']))))
    if m_span.get('description') not in missing:
        GraphObject.add((m_ref, schema_ns.description, Literal(str(m_span['description']))))
    if 'aliases' in m_span and m_span['aliases'] not in [None, []]:
        GraphObject.add((m_ref, schema_ns.aliases, Literal(str(m_span['aliases']))))
    if m_span.get('wikipedia_url') not in missing:
        GraphObject.add((m_ref, schema_ns.wikipedia_url, Literal(str(m_span['wikipedia_url']))))
    if has_qid:
        GraphObject.add((m_ref, schema_ns.qid, Literal(str(qid))))

    return m_uri, GraphObject


def spans_overlap(start1, end1, start2, end2):
    """Tell whether two half-open spans ``[start, end)`` share a position.

    Span ends are exclusive, so two spans that merely touch
    (``end1 == start2``) do not overlap. This matches ``is_overlapping`` in
    the Relation Extraction module, so both stages agree on which
    activity/method pairs are related.
    """
    return max(start1, start2) < min(end1, end2)

#### Schema declaration

In [None]:
# Namespaces: schema terms, minted instances, and the NIF core vocabulary.
schema_ns = Namespace("https://scholarlyontology.aueb.gr/resources/so_schema/so_MLE#")
instances_ns = Namespace("https://scholarlyontology.aueb.gr/resources/so_instances/so_MLE#")
nif_ns = Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")

# Graph that holds the schema only; it is serialised at the end of this cell.
g = Graph()

g.bind("so", schema_ns)
g.bind("inst", instances_ns)
g.bind("nif", nif_ns)


# Classes
def declare_class(cls, parent=None):
    """Declare ``cls`` as an rdfs:Class in ``g``, optionally as a subclass of ``parent``."""
    g.add((cls, RDF.type, RDFS.Class))
    if parent:
        g.add((cls, RDFS.subClassOf, parent))

# Higher level
declare_class(schema_ns.SO_Entity)
declare_class(schema_ns.Object, schema_ns.SO_Entity)
declare_class(schema_ns.ConceptualObject, schema_ns.Object)
declare_class(schema_ns.InformationResource, schema_ns.ConceptualObject)
declare_class(schema_ns.Assertion, schema_ns.ConceptualObject)
declare_class(schema_ns.Event, schema_ns.SO_Entity)
declare_class(schema_ns.Actor, schema_ns.SO_Entity)
declare_class(schema_ns.Group, schema_ns.Actor)
declare_class(schema_ns.ContentItem, schema_ns.InformationResource)

# With instances
declare_class(schema_ns.Activity, schema_ns.Event)
declare_class(schema_ns.Method, schema_ns.ConceptualObject)
declare_class(schema_ns.Topic, schema_ns.ConceptualObject)
declare_class(schema_ns.Goal, schema_ns.Assertion)
declare_class(schema_ns.Person, schema_ns.Actor)
declare_class(schema_ns.Organization, schema_ns.Group)
declare_class(schema_ns.Article, schema_ns.ContentItem)
declare_class(schema_ns.Sentence, schema_ns.ContentItem)
declare_class(schema_ns.Aggregation, schema_ns.InformationResource)


# Object properties
def declare_object_property(p, domain, range):
    """Declare ``p`` as an owl:ObjectProperty in ``g`` with the given rdfs:domain and rdfs:range."""
    g.add((p, RDF.type, OWL.ObjectProperty))
    g.add((p, RDFS.domain, domain))
    g.add((p, RDFS.range, range))

declare_object_property(schema_ns.employs, schema_ns.Activity, schema_ns.Method)
declare_object_property(schema_ns.participates_in, schema_ns.Person, schema_ns.Activity)
declare_object_property(schema_ns.has_goal, schema_ns.Person, schema_ns.Goal)
declare_object_property(schema_ns.has_objective, schema_ns.Activity, schema_ns.Goal)
declare_object_property(schema_ns.has_sentence_context, nif_ns.String, nif_ns.Context)
declare_object_property(schema_ns.is_member_of, schema_ns.Article, schema_ns.Aggregation)
declare_object_property(schema_ns.is_topic_of, schema_ns.Topic, schema_ns.Article)
declare_object_property(schema_ns.uses_method, schema_ns.Person, schema_ns.Method)
declare_object_property(schema_ns.is_part_of, schema_ns.Sentence, schema_ns.Article)
declare_object_property(schema_ns.is_author_of, schema_ns.Person, schema_ns.Article)
declare_object_property(schema_ns.is_affiliated_to, schema_ns.Person, schema_ns.Organization)


# Datatype properties
def declare_datatype_property(p, domain):
    """Declare ``p`` as an owl:DatatypeProperty in ``g`` with the given rdfs:domain."""
    g.add((p, RDF.type, OWL.DatatypeProperty))
    g.add((p, RDFS.domain, domain))

declare_datatype_property(schema_ns.full_name, schema_ns.Person)
declare_datatype_property(schema_ns.given_name, schema_ns.Person)
declare_datatype_property(schema_ns.family_name, schema_ns.Person)
declare_datatype_property(schema_ns.orcid, schema_ns.Person)

declare_datatype_property(schema_ns.begin_index, nif_ns.String)
declare_datatype_property(schema_ns.end_index, nif_ns.String)

declare_datatype_property(schema_ns.publication_year, schema_ns.Article)
declare_datatype_property(schema_ns.title, schema_ns.Article)
declare_datatype_property(schema_ns.article_DOI, schema_ns.Article)

declare_datatype_property(schema_ns.organization_name, schema_ns.Organization)
declare_datatype_property(schema_ns.aggregation_name, schema_ns.Aggregation)

declare_datatype_property(schema_ns.sentence_text, schema_ns.Sentence)
declare_datatype_property(schema_ns.topic_name, schema_ns.Topic)

declare_datatype_property(schema_ns.wikidata_url, schema_ns.Method)
declare_datatype_property(schema_ns.qid, schema_ns.Method)


# Save schema
# Written as RDF/XML; the module-call cell parses this file back in.
g.serialize("./Dataset/so_MLE_schema.rdf", format="xml")

#### Module call

In [None]:
# Load schema
# A fresh graph is seeded with the schema written by the declaration cell, so
# re-running this cell does not accumulate instances from an earlier run.
g = Graph()
g.parse("./Dataset/so_MLE_schema.rdf", format="xml")
g.bind("so", schema_ns)
g.bind("inst", instances_ns)
g.bind("nif", nif_ns)

# Load file
input_path = "./Dataset/example_subset_10_EE_ED_EL_RE.jsonl"
in_data = srsly.read_jsonl(input_path)

# Module call
for row in tqdm(in_data):
    # Only rows answered "accept" or "ignore" are converted.
    if row["answer"] not in ("accept", "ignore"):
        continue

    activity_uris = []
    goal_uris = []
    method_uris = []
    g, article_uri = create_article_triples(row, schema_ns, instances_ns, g)
    g, sent_uri = create_sentence_triples(row, article_uri, schema_ns, instances_ns, g)

    # Aggregation and topics are optional metadata. Only the errors raised by
    # a missing or null entry are skipped; anything else is a real failure and
    # is no longer hidden by a bare `except`.
    try:
        g = create_aggregation_triples(row, article_uri, schema_ns, instances_ns, g)
    except (KeyError, TypeError):
        pass
    try:
        g = create_topic_triples(row, article_uri, schema_ns, instances_ns, g)
    except (KeyError, TypeError):
        pass
    g, author_uris = create_author_organization_triples(row, article_uri, schema_ns, instances_ns, g)

    # One node per span; the URI of each node is kept with its span so the
    # relations below can be derived.
    for span in row['spans']:
        if span['label'] == 'ACTIVITY':
            g = create_activity_triples(row, span, author_uris, sent_uri, schema_ns, instances_ns, g)
            # Same URI scheme as create_activity_triples, which returns only the graph.
            activity_uri = str(instances_ns)+'Activity/'+str(row['meta']['sent_no'])+ '_'+ str(span['start'])+'_'+str(span['end'])
            activity_uris.append({'uri': activity_uri, 'span': span})
        elif span['label'] == 'METHOD':
            method_uri, g = create_method_triples(span, sent_uri, row['meta']['sent_no'], schema_ns, instances_ns, g, row['text'])
            method_uris.append({'uri': method_uri, 'span': span})
        elif span['label'] == 'GOAL':
            g = create_goal_triples(row, span, author_uris, sent_uri, schema_ns, instances_ns, g)
            # Same URI scheme as create_goal_triples, which returns only the graph.
            goal_uri = str(instances_ns)+'Goal/'+str(row['meta']['sent_no'])+ '_'+ str(span['start'])+'_'+str(span['end'])
            goal_uris.append({'uri': goal_uri, 'span': span})

    # employs: an activity employs every method whose span overlaps its own.
    for act in activity_uris:
        act_start, act_end = act['span']['start'], act['span']['end']

        for meth in method_uris:
            m_start, m_end = meth['span']['start'], meth['span']['end']
            if spans_overlap(int(act_start), int(act_end), int(m_start), int(m_end)):
                g.add((URIRef(act['uri']), schema_ns.employs, URIRef(meth['uri'])))

    # has_objective: every activity of the sentence is linked to every goal.
    for act in activity_uris:
        act_uri = URIRef(act['uri'])
        for goal in goal_uris:
            goal_uri = URIRef(goal['uri'])
            # If you want to restrict to overlapping spans, uncomment the if:
            # g_start, g_end = goal['span']['start'], goal['span']['end']
            # if spans_overlap(act['span']['start'], act['span']['end'], g_start, g_end):
            g.add((act_uri, schema_ns.has_objective, goal_uri))

    # uses_method: an author who participates in an activity uses every
    # method that activity employs.
    for act in activity_uris:
        act_uri = URIRef(act['uri'])
        for meth in method_uris:
            meth_uri = URIRef(meth['uri'])
            if (act_uri, schema_ns.employs, meth_uri) in g:
                for person_uri in [p for p in author_uris if (URIRef(p), schema_ns.participates_in, act_uri) in g]:
                    g.add((URIRef(person_uri), schema_ns.uses_method, meth_uri))

# Save RDF file
output_path = "./Dataset/MLE_RDF.rdf"
g.serialize(destination=output_path, format='xml')