In [43]:
import requests
from rdflib.namespace import RDF, RDFS, XSD
from rdflib import Graph, Literal, URIRef
import os
import re


In [44]:
class Publication:
    def __init__(self, title, doi, cites, num_pages, publication_date, language, pages, published_in, main_subject, instance_of, author, topic, proba_topic):
        self._title = title
        self._doi = doi
        self._cites = cites
        self._num_pages = num_pages
        self._publication_date = publication_date
        self._language = language
        self._pages = pages
        self._published_in = published_in
        self._main_subject = main_subject
        self._instance_of = instance_of
        self._author = author
        self._topic = topic
        self._proba_topic = proba_topic

    # Getters
    def get_title(self):
        return self._title

    def get_doi(self):
        return self._doi

    def get_cites(self):
        return self._cites

    def get_num_pages(self):
        return self._num_pages

    def get_publication_date(self):
        return self._publication_date

    def get_language(self):
        return self._language

    def get_pages(self):
        return self._pages

    def get_published_in(self):
        return self._published_in

    def get_main_subject(self):
        return self._main_subject

    def get_instance_of(self):
        return self._instance_of

    def get_author(self):
        return self._author
    
    def get_topic(self):
        return self._topic
    
    def get_proba_topic(self):
        return self._proba_topic

    # Setters
    def set_title(self, title):
        self._title = title

    def set_doi(self, doi):
        self._doi = doi

    def set_cites(self, cites):
        self._cites = cites

    def set_num_pages(self, num_pages):
        self._num_pages = num_pages

    def set_publication_date(self, publication_date):
        self._publication_date = publication_date

    def set_language(self, language):
        self._language = language

    def set_pages(self, pages):
        self._pages = pages

    def set_published_in(self, published_in):
        self._published_in = published_in

    def set_main_subject(self, main_subject):
        self._main_subject = main_subject

    def set_instance_of(self, instance_of):
        self._instance_of = instance_of

    def set_author(self, author):
        self._author = author

    def set_topic(self, topic):
        self._topic = topic
    
    def set_proba_topic(self, proba_topic):
        self._proba_topic = proba_topic

    #Other fucntions
    def display_info(self):
        print("Title:", self._title)
        print("DOI:", self._doi)
        print("Cites:", self._cites)
        print("Number of Pages:", self._num_pages)
        print("Publication Date:", self._publication_date)
        print("Language:", self._language)
        print("Pages:", self._pages)
        print("Published In:", self._published_in)
        print("Main Subject:", self._main_subject)
        print("Instance Of:", self._instance_of)
        print("Author:", self._author)
        print("Topic:", self._topic)
        print("Probability of Topic:", self._proba_topic)
        print("\n")



In [45]:

list_papers = []
author_by_doi = {}
topic_by_doi = {}
topic_and_prob_by_title = {}
topic_and_prob_by_doi = {}

wikidata_res = '../../papers/wikidata/results.csv'
openalex_res = '../../papers/openalex/results.csv'
topic_res = '../../papers/probabilities/'
doi_res = '../../papers/doi/'

# read authors
with open(openalex_res, 'r') as f:
    f.readline()
    for line in f:
        line_split = line.split(',')
        doi = line_split[0]
        author = line_split[1]
        author_institution = line_split[2]

        if doi in author_by_doi and author not in author_by_doi[doi]:
            author_by_doi[doi].append(author)
        else:
            author_by_doi[doi] = [author]

# read topics and probabilities
for filename in os.listdir(topic_res):
    with open(topic_res + filename, 'r') as f:
        line = f.read()

        topic_start = line.find("Topic: [") + len("Topic: [")
        topic_end = line.find("]", topic_start)
        prob_start = line.find("Probability: ") + len("Probability: ")
        prob_end = line.find("\n", prob_start)

        topic = line[topic_start:topic_end]
        probabilidad = float(line[prob_start:prob_end])
        topic_and_prob_by_title[filename] = (topic, probabilidad)
    
for filename in os.listdir(doi_res):
    with open(doi_res + filename, 'r') as f:
        if filename in topic_and_prob_by_title:
            doi = f.read()
            topic = topic_and_prob_by_title[filename][0]
            probabilidad = topic_and_prob_by_title[filename][1]
            topic_and_prob_by_doi[doi] = (topic, probabilidad)

# read csv
with open(wikidata_res, 'r') as f:
    # skip header
    f.readline()
    for line in f:
        line_split = line.split(',')
        # save all atributes in paper object
        doi = line_split[1]
        if doi in author_by_doi:
            authors = author_by_doi[doi]
        else:
            authors = []

        if doi in topic_and_prob_by_doi:
            topic, probabilidad = topic_and_prob_by_doi[doi]
        else:
            topic = ""
            probabilidad = 0
        
        line_split[9] = line_split[9].replace('\n', '')
        paper = Publication(line_split[0], line_split[1], line_split[2], line_split[3], line_split[4], line_split[5], line_split[6], line_split[7], line_split[8], line_split[9], authors, topic, probabilidad)
        list_papers.append(paper)
        paper.display_info()

Title: A multilevel review of artificial intelligence in organizations: Implications for organizational behavior research and practice
DOI: 10.1002/JOB.2735
Cites: N/A
Number of Pages: N/A
Publication Date: 2023-08-02T00:00:00Z
Language: N/A
Pages: 159-182
Published In: http://www.wikidata.org/entity/Q1709866
Main Subject: N/A
Instance Of: N/A
Author: ['Simon Lloyd D. Restubog', 'Sang Eun Woo']
Topic: 'ai', 'use', 'technology', 'human', 'organizational', 'theme', 'worker', 'review', 'implication', 'algorithmic'
Probability of Topic: 0.9928


Title: A systematic review of the applications of artificial intelligence and machine learning in autoimmune diseases
DOI: 10.1038/S41746-020-0229-3
Cites: http://www.wikidata.org/entity/Q47173123
Number of Pages: N/A
Publication Date: 2020-03-09T00:00:00Z
Language: N/A
Pages: 30
Published In: http://www.wikidata.org/entity/Q73908508
Main Subject: http://www.wikidata.org/entity/Q2539
Instance Of: N/A
Author: []
Topic: 'data', 'learning', 'fraud', '

In [46]:
g = Graph()

for paper in list_papers:
    paper_uri = URIRef("http://example.org/" + paper.get_doi())
    g.add((paper_uri, RDF.type, URIRef("http://schema.org/ScholarlyArticle")))
    g.add((paper_uri, RDFS.label, Literal(paper.get_title())))
    g.add((paper_uri, URIRef("http://schema.org/doi"), Literal(paper.get_doi())))
    g.add((paper_uri, URIRef("http://schema.org/cites"), Literal(paper.get_cites())))
    g.add((paper_uri, URIRef("http://schema.org/numPages"), Literal(paper.get_num_pages())))
    g.add((paper_uri, URIRef("http://schema.org/publicationDate"), Literal(paper.get_publication_date())))
    g.add((paper_uri, URIRef("http://schema.org/inLanguage"), Literal(paper.get_language())))
    g.add((paper_uri, URIRef("http://schema.org/pages"), Literal(paper.get_pages())))
    g.add((paper_uri, URIRef("http://schema.org/publishedIn"), Literal(paper.get_published_in())))
    g.add((paper_uri, URIRef("http://schema.org/mainSubject"), Literal(paper.get_main_subject())))
    g.add((paper_uri, URIRef("http://schema.org/author"), Literal(paper.get_author())))
    g.add((paper_uri, URIRef("http://schema.org/topic"), Literal(paper.get_topic())))
    g.add((paper_uri, URIRef("http://schema.org/probability"), Literal(paper.get_proba_topic())))

http://example.org/ machine learning does not look like a valid URI, trying to serialize this will break.
http://example.org/ present and future." does not look like a valid URI, trying to serialize this will break.


In [47]:
from rdflib import Namespace
onto = Namespace("http://example.org/")
g.bind("onto", onto)

while True:
    try:
        query = input("Enter a SPARQL query or 'exit' to finish: ")
        if query == 'exit':
            break
        else:
            for row in g.query(query):
                print(row)
    except Exception as e:
        print(e)
        continue

In [48]:
for s, p, o in g:
    print(s, p, o)

http://example.org/10.1136/BJOPHTHALMOL-2018-313173 http://schema.org/mainSubject http://www.wikidata.org/entity/Q2539
http://example.org/10.1002/JOB.2735 http://schema.org/topic 'ai', 'use', 'technology', 'human', 'organizational', 'theme', 'worker', 'review', 'implication', 'algorithmic'
http://example.org/10.1177/0165551519877646 http://schema.org/cites N/A
http://example.org/10.1145/3184558.3191645 http://schema.org/publishedIn http://www.wikidata.org/entity/Q51885042
http://example.org/10.55248/GENGPI.2022.3.8.37 http://schema.org/probability 0.9857
http://example.org/10.1093/NAR/GKP760 http://schema.org/doi 10.1093/NAR/GKP760
http://example.org/ machine learning http://schema.org/publicationDate 10.1590/0100-3984.2019.0049
http://example.org/10.1093/NAR/GKP760 http://schema.org/inLanguage http://www.wikidata.org/entity/Q1860
http://example.org/10.1038/NATURE14539 http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://schema.org/ScholarlyArticle
http://example.org/10.1038/NATURE14