In [1]:
import csv
import os
import re

import requests
from rdflib import Graph, Literal, Namespace, URIRef
from rdflib.namespace import RDF, RDFS, XSD

In [2]:
class Publication:
    """Plain record describing one paper.

    Every constructor argument is stored unchanged on a private attribute of
    the same name (prefixed with an underscore) and exposed through a
    matching ``get_*`` accessor.
    """

    def __init__(self, title, doi, cites, num_pages, publication_date, language, pages, published_in, main_subject, instance_of, author, topic, similar_papers, entities):
        # Identification
        self._title = title
        self._doi = doi

        # Bibliographic fields
        self._cites = cites
        self._num_pages = num_pages
        self._publication_date = publication_date
        self._language = language
        self._pages = pages
        self._published_in = published_in
        self._main_subject = main_subject
        self._instance_of = instance_of

        # Enrichment fields
        self._author = author
        self._topic = topic
        self._similar_papers = similar_papers
        self._entities = entities

    # Accessors
    def get_title(self):
        """Return the stored title."""
        return self._title

    def get_doi(self):
        """Return the stored DOI."""
        return self._doi

    def get_cites(self):
        """Return the stored ``cites`` value."""
        return self._cites

    def get_num_pages(self):
        """Return the stored number of pages."""
        return self._num_pages

    def get_publication_date(self):
        """Return the stored publication date."""
        return self._publication_date

    def get_language(self):
        """Return the stored language."""
        return self._language

    def get_pages(self):
        """Return the stored ``pages`` value."""
        return self._pages

    def get_published_in(self):
        """Return the stored ``published_in`` value."""
        return self._published_in

    def get_main_subject(self):
        """Return the stored main subject."""
        return self._main_subject

    def get_instance_of(self):
        """Return the stored ``instance_of`` value."""
        return self._instance_of

    def get_author(self):
        """Return the stored author value."""
        return self._author

    def get_topic(self):
        """Return the stored topic."""
        return self._topic

    def get_similar_papers(self):
        """Return the stored similar-papers value."""
        return self._similar_papers

    def get_entities(self):
        """Return the stored entities value."""
        return self._entities

    # Other functions
    def display_info(self):
        """Print one ``Label: value`` line per field, then a blank separator."""
        labelled_values = (
            ("Title:", self._title),
            ("DOI:", self._doi),
            ("Cites:", self._cites),
            ("Number of Pages:", self._num_pages),
            ("Publication Date:", self._publication_date),
            ("Language:", self._language),
            ("Pages:", self._pages),
            ("Published In:", self._published_in),
            ("Main Subject:", self._main_subject),
            ("Instance Of:", self._instance_of),
            ("Author:", self._author),
            ("Topic:", self._topic),
            ("Similar Papers:", self._similar_papers),
            ("Entities:", self._entities),
        )
        for label, value in labelled_values:
            print(label, value)
        print("\n")

In [3]:
# --- Input locations (relative to the notebook) ---
# Single result files.
wikidata_res = '../../papers/wikidata/results.csv'
openalex_res = '../../papers/openalex/results.csv'
# Directories; they keep their trailing slash because the reading code below
# appends file names to them by plain string concatenation.
prob_res = '../../papers/probabilities/'
doi_res = '../../papers/doi/'
topic_res = '../../papers/topics/'
similarities_res = '../../papers/similarities/'
ner_res = '../../papers/ner/'

# --- Accumulators filled by the reading code below ---
list_papers = []              # Publication objects, one per CSV row
possible_topics = []          # topic names read from topics.txt
similarities_by_title = []    # (title1, title2, similarity) tuples
aux_list = []                 # distinct titles seen in the similarities file

author_by_doi = {}            # DOI -> list of author names
topic_by_doi = {}             # not populated in this cell
topic_and_prob_by_title = {}  # file name -> (topic, probability)
topic_and_prob_by_doi = {}    # DOI -> (topic, probability)
similarities_by_doi = {}      # DOI -> list of DOIs of similar papers
entities_by_doi = {}          # DOI -> flattened entity string

# read authors
# Builds author_by_doi: DOI -> list of distinct author names in first-seen order.
# Columns are read positionally as doi, author, institution.
with open(openalex_res, 'r') as f:
    # skip header
    f.readline()
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of failing on the column lookup.
            continue

        line_split = line.split(',')
        doi = line_split[0]
        author = line_split[1]
        author_institution = line_split[2]  # parsed but not used further in this cell

        # setdefault returns the existing list or installs a new empty one.
        # The previous if/else reset the list to [author] whenever a
        # (doi, author) pair was repeated, discarding the other authors
        # already collected for that DOI.
        known_authors = author_by_doi.setdefault(doi, [])
        if author not in known_authors:
            known_authors.append(author)

# read topics
# Every line of topics.txt other than a bare newline is split on ":"; the
# second piece is stripped, its commas become underscores and its spaces are
# removed, and the result is appended to possible_topics.
with open(topic_res + 'topics.txt', 'r') as f:
    for raw_line in f.readlines():
        if raw_line == '\n':
            continue
        after_colon = raw_line.replace('\n', '').split(":")[1]
        topic_name = after_colon.strip().replace(",", "_").replace(" ", "")
        possible_topics.append(topic_name)

# read topics and probabilities
# Each file in prob_res is scanned for "Topic: [<topic>]" and
# "Probability: <number>"; the pair is stored under the file name.
for filename in os.listdir(prob_res):
    with open(prob_res + filename, 'r') as f:
        content = f.read()

    topic_marker = "Topic: ["
    prob_marker = "Probability: "

    topic_pos = content.find(topic_marker)
    prob_pos = content.find(prob_marker)
    topic_end = -1
    if topic_pos != -1:
        topic_end = content.find("]", topic_pos + len(topic_marker))

    # str.find returns -1 when nothing is found. Using that as a slice bound
    # silently yields wrong text, so malformed files are reported and skipped.
    if topic_end == -1 or prob_pos == -1:
        print("Skipping", filename, "- expected 'Topic: [...]' and 'Probability: ' entries")
        continue

    topic = content[topic_pos + len(topic_marker):topic_end]

    # Take the rest of the line after the marker. Splitting (instead of
    # find("\n")) keeps the full number when the file has no trailing newline;
    # previously the last digit was cut off in that case.
    prob_text = content[prob_pos + len(prob_marker):].split("\n", 1)[0]
    probabilidad = float(prob_text)

    topic_and_prob_by_title[filename] = (topic, probabilidad)

# read similarities
# Each non-blank line has the form "<title1>;<title2>;<similarity>".
# Fills similarities_by_title with (title1, title2, similarity) tuples and
# aux_list with every distinct title, in first-seen order.
with open(similarities_res + 'similarities.txt', 'r') as f:
    # Set mirror of aux_list so the membership test does not rescan the list.
    seen_titles = set(aux_list)

    for line in f:
        line = line.replace('\n', '')
        if not line.strip():
            # Skip blank lines (as the topics reader does) instead of failing
            # on the field lookup below.
            continue

        line_split = line.split(";")
        title1 = line_split[0]
        title2 = line_split[1]
        similarity = float(line_split[2])

        similarities_by_title.append((title1, title2, similarity))

        for title in (title1, title2):
            if title not in seen_titles:
                seen_titles.add(title)
                aux_list.append(title)
    
# Translate the title-keyed data into DOI-keyed data.
# Files in doi_res are named after a paper title and contain that paper's DOI.
_doi_cache = {}


def _read_doi(title):
    """Return the content of doi_res/<title>, reading each file at most once.

    The file is opened in a ``with`` block so the handle is always closed.
    Raises FileNotFoundError when no DOI file exists for ``title``.
    """
    if title not in _doi_cache:
        with open(doi_res + title, 'r') as doi_file:
            _doi_cache[title] = doi_file.read()
    return _doi_cache[title]


for filename in os.listdir(doi_res):
    if filename in topic_and_prob_by_title:
        topic, probabilidad = topic_and_prob_by_title[filename]
        topic_and_prob_by_doi[_read_doi(filename)] = (topic, probabilidad)

    if filename in aux_list:
        for title1, title2, similarity in similarities_by_title:
            # Only triples that mention this file contribute. The previous
            # code opened both DOI files for every triple, without closing
            # them, even when the triple was unrelated to the current file.
            if title1 == filename:
                other_title = title2
            elif title2 == filename:
                other_title = title1
            else:
                continue

            own_doi = _read_doi(filename)
            similarities_by_doi.setdefault(own_doi, []).append(_read_doi(other_title))

# read ner
# For each file in ner_res, read its text together with the DOI stored in the
# same-named file under doi_res, then store the text (brackets removed, commas
# replaced by " -") under that DOI.
for filename in os.listdir(ner_res):
    with open(ner_res + filename, 'r') as ner_file, open(doi_res + filename, 'r') as doi_file:
        raw_entities = ner_file.read()
        doi = doi_file.read()
        flattened = raw_entities.replace('[', '').replace(']', '')
        entities_by_doi[doi] = flattened.replace(",", " -")
            
# read csv
# Build one Publication per data row of the Wikidata results file and append
# it to list_papers. Columns are read positionally: title, doi, then the eight
# values passed to Publication as cites .. instance_of.
#
# csv.reader replaces the manual quote/comma splitting, so quoted fields that
# contain commas (in any column, not only the title) and escaped quotes are
# parsed correctly, and a title whose text also appears in another column no
# longer gets blanked out there.
with open(wikidata_res, 'r', newline='') as f:
    reader = csv.reader(f)
    # skip header
    next(reader, None)

    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line.
            continue

        title = row[0]
        doi = row[1]

        if doi in author_by_doi:
            # Rendering kept as before: the list repr with brackets removed
            # and commas replaced by " -" (names stay single-quoted).
            authors = str(author_by_doi[doi]).replace(',', ' -').replace('[', '').replace(']', '')
        else:
            authors = ""

        if doi in topic_and_prob_by_doi:
            # Only the topic is needed here; the probability is ignored.
            topic = topic_and_prob_by_doi[doi][0]
        else:
            topic = ""

        entities = entities_by_doi.get(doi, "")

        # An unknown DOI renders as the empty string ("[]" minus brackets).
        similar_papers = str(similarities_by_doi.get(doi, []))
        similar_papers = similar_papers.replace(',', ' -').replace('[', '').replace(']', '')

        title = title.replace(" ", "_").replace(",", "")
        cleaned_topics = topic.replace("'", "").replace(",", "").replace(" ", "_")

        paper = Publication(
            title=title,
            doi=doi,
            cites=row[2],
            num_pages=row[3],
            publication_date=row[4],
            language=row[5],
            pages=row[6],
            published_in=row[7],
            main_subject=row[8],
            instance_of=row[9],
            author=authors,
            topic=cleaned_topics,
            similar_papers=similar_papers,
            entities=entities,
        )
        list_papers.append(paper)

In [4]:
# Build the RDF graph: one node per paper and one node per known topic.
g = Graph()

for publication in list_papers:
    # The paper node is identified by its DOI under http://example.org/.
    subject = URIRef("http://example.org/" + publication.get_doi())
    g.add((subject, RDF.type, URIRef("http://schema.org/paper")))

    # Every remaining field is attached as a plain literal, in this order.
    literal_fields = (
        ("http://schema.org/title", publication.get_title()),
        ("http://schema.org/doi", publication.get_doi()),
        ("http://schema.org/cites", publication.get_cites()),
        ("http://schema.org/numPages", publication.get_num_pages()),
        ("http://schema.org/publicationDate", publication.get_publication_date()),
        ("http://schema.org/inLanguage", publication.get_language()),
        ("http://schema.org/pages", publication.get_pages()),
        ("http://schema.org/publishedIn", publication.get_published_in()),
        ("http://schema.org/mainSubject", publication.get_main_subject()),
        ("http://schema.org/author", publication.get_author()),
        ("http://schema.org/topic", publication.get_topic()),
        ("http://schema.org/similarPapers", publication.get_similar_papers()),
        ("http://schema.org/acknowledges", publication.get_entities()),
    )
    for predicate, value in literal_fields:
        g.add((subject, URIRef(predicate), Literal(value)))

for topic_name in possible_topics:
    topic_node = URIRef("http://example.org/"+topic_name)
    g.add((topic_node, RDF.type, URIRef("http://schema.org/topic")))
    g.add((topic_node, URIRef("http://schema.org/name"), Literal(topic_name)))

# g.serialize(destination='papers.xml', format='xml')

In [5]:
# Bind the "onto" prefix to the namespace used for paper and topic nodes.
# (Namespace is imported in the notebook's import cell.)
onto = Namespace("http://example.org/")
g.bind("onto", onto)

# Interactive prompt: run SPARQL queries typed by the user until 'exit'.
while True:
    try:
        query = input("Enter a SPARQL query or 'exit' to finish: ")
    except (EOFError, NotImplementedError):
        # No usable stdin (end of input, or a front end that does not support
        # input requests, e.g. a non-interactive notebook run). Previously
        # this error was caught by the generic handler below and the loop
        # retried forever; stop prompting instead.
        break

    if query == 'exit':
        break

    try:
        for row in g.query(query):
            print(row)
    except Exception as e:
        # A bad query should not end the session: report it and prompt again.
        print(e)

## Pruebas

In [5]:
print("Querying the graph for a specific paper")
print("\n")

# Title, topic and author of the paper whose schema:doi literal matches.
specific_paper_query = '''PREFIX schema: <http://schema.org/>

    SELECT ?title ?topic ?author
    WHERE {
    ?paper a schema:paper ;
        schema:doi "10.26735/TLYG7256" ;
        schema:title ?title ;
        schema:topic ?topic ;
        schema:author ?author .
    }'''

for row in g.query(specific_paper_query):
    print(row)

Querying the graph for a specific paper


(rdflib.term.Literal('An_Efficient_Deep_Learning_Classification_Model_for_Predicting_Credit_Card_Fraud_on_Skewed_Data'), rdflib.term.Literal('wikipedia_claim_citation_system_card_model_source_human_user_information'), rdflib.term.Literal(''))


In [6]:
print("All possible topics:")
print("\n")

# Name of every node typed as schema:topic.
all_topics_query = '''PREFIX schema: <http://schema.org/>
    SELECT ?topic
    WHERE {
        ?paper a schema:topic ;
               schema:name ?topic .
    }
    '''

for row in g.query(all_topics_query):
    print(row)

All possible topics:


(rdflib.term.Literal('learning_technology_ml_ai_deep_higher_education_artificial_recent_paper'),)
(rdflib.term.Literal('ai_transaction_fraud_technology_card_algorithm_credit_machine_data_use'),)
(rdflib.term.Literal('ai_transaction_development_sustainable_target_card_data_study_authorized_credit'),)
(rdflib.term.Literal('task_text_row_knowledge_base_additional_table_generation_using_gap'),)
(rdflib.term.Literal('wikipedia_claim_citation_system_card_model_source_human_user_information'),)
(rdflib.term.Literal('dl_gene_ha_credit_data_fraud_card_wiki_challenge_community'),)
(rdflib.term.Literal('data_fraud_model_card_ha_disease_ml_credit_using_study'),)
(rdflib.term.Literal('fraud_card_credit_imaging_increase_learning_algorithm_transaction_diagnostic_problem'),)
(rdflib.term.Literal('application_deep_area_learning_processing_information_three_natural_language_use'),)
(rdflib.term.Literal('model_wit_learning_wikipedia_information_performance_article_multimodal_exampl

In [7]:
print("All papers:")
print("\n")

# Title of every node typed as schema:paper.
all_papers_query = '''
    PREFIX schema: <http://schema.org/>
    SELECT ?title
    WHERE {
        ?paper a schema:paper ;
               schema:title ?title .
    }
    '''

for row in g.query(all_papers_query):
    print(row)

All papers:


(rdflib.term.Literal('A_multilevel_review_of_artificial_intelligence_in_organizations:_Implications_for_organizational_behavior_research_and_practice'),)
(rdflib.term.Literal('A_systematic_review_of_the_applications_of_artificial_intelligence_and_machine_learning_in_autoimmune_diseases'),)
(rdflib.term.Literal('Exploring_the_impact_of_artificial_intelligence_on_teaching_and_learning_in_higher_education'),)
(rdflib.term.Literal('Deep_Learning:_Methods_and_Applications'),)
(rdflib.term.Literal('Deep_learning'),)
(rdflib.term.Literal('Surrogate_techniques_for_testing_fraud_detection_algorithms_in_credit_card_operations'),)
(rdflib.term.Literal('use_of_data_mining_of_to_create_of_a_fraud_prevention_and_detection_system_in_credit_card'),)
(rdflib.term.Literal('WIT:_Wikipedia-based_Image_Text_Dataset_for_Multimodal_Multilingual_Machine_Learning'),)
(rdflib.term.Literal('Analysis_and_prediction_for_credit_card_fraud_detection_dataset_using_data_mining_approaches'),)
(rdflib.term