In [1]:
pip install rdflib



In [2]:
import pandas as pd
import random
import csv
import os
from itertools import islice
import urllib.parse
from rdflib import Graph
from rdflib import Literal
from rdflib import Namespace
from rdflib.namespace import RDF, RDFS, FOAF, XSD, URIRef

In [3]:
Abox = Namespace("http://SDM_LAB2.org/abox/")

g = Graph()
g.bind("abox", Abox)

In [4]:
# Read CSV
csv_files = [
    "abstracts-sample.csv",
    "affiliated-to.csv",
    "authors-sample.csv",
    "belongs-to.csv",
    "categoriesRelations.csv",
    "citations-sample.csv",
    "cited-by.csv",
    "companies.csv",
    "conferences.csv",
    "is-from.csv",
    "journals.csv",
    "keywords.csv",
    "paper-ids-sample.csv",
    "papers-processed.csv",
    "papers-sample.csv",
    "publication-venues-sample.csv",
    "published-in.csv",
    "related-to.csv",
    "reviewed-by.csv",
    "reviews.csv",
    "universities.csv",
    "volume-from.csv",
    "withAbstracts.csv",
    "written-by.csv"
]

csv_dir = '/content/drive/MyDrive/SDM_lab2/data/CSVs/'

csv_files = [f for f in os.listdir(csv_dir) if f.endswith('.csv')]

dataframes = {}
for csv_file in csv_files:
    file_path = os.path.join(csv_dir, csv_file)
    df_name = os.path.splitext(csv_file)[0].replace('-', '_')
    dataframes[df_name] = pd.read_csv(file_path)

for df_name in dataframes.keys():
    print(df_name)

paper_belong_to_proceeding
paper
conference
paper_presented_in_conference
paper_cite_paper
scopus_500
sample
journal
journal_in_year
authors
author_write
paper_belong_to_journal
conference_detail
author_review
paper_in_year
paper_has_keywords
year
proceeding
proceeding_in_year
keywords


# Load Class and Properties

## Load Class ResearchPaper

In [5]:
dataframes['paper'].head(3)

Unnamed: 0,title,abstract,pages,DOI,link,year
0,Challenges posed by hijacked journals in Scopus,This study presents and explains the phenomeno...,395-422,10.1002/asi.24855,https://www.scopus.com/inward/record.uri?eid=2...,2023
1,Normalizing Large Scale Sensor-Based MWD Data:...,In the context of geo-infrastructures and spec...,-,10.3390/s24041209,https://www.scopus.com/inward/record.uri?eid=2...,2022
2,Instant learning based on deep neural network ...,Biometric-based identity verification systems ...,32099-32122,10.1007/s11042-023-16751-6,https://www.scopus.com/inward/record.uri?eid=2...,2024


In [6]:
ResearchPaper = Abox.ResearchPaper

paper_properties = {
    "title": Abox.title,
    "abstract": Abox.abstract,
    "pages": Abox.pages,
    "DOI": Abox.DOI,
    "link": Abox.link,
    "year": Abox.year
}

def process_papers(df):
    for _, record in df.iterrows():
        paper_uri = URIRef(Abox["ResearchPaper_" + str(record['DOI'])])
        g.add((paper_uri, RDF.type, ResearchPaper))
        for key, prop in paper_properties.items():
            if pd.notna(record[key]):
                g.add((paper_uri, prop, Literal(record[key])))

process_papers(dataframes['paper'])

## Load Property of Cites

In [7]:
dataframes['paper_cite_paper'].head(3)

Unnamed: 0,start_id,end_id
0,10.1002/asi.24855,10.4230/LIPIcs.ICDT.2024.23
1,10.1002/asi.24855,10.1109/SSI58917.2023.10387755
2,10.1002/asi.24855,10.4230/LIPIcs.ICDT.2023.14


In [8]:
Cites = Abox.Cites

Cites_properties = {
    "start_id": Abox.start_id,
    "end_id": Abox.end_id
}

def process_paper(df):
    for _, record in df.iterrows():
        # Create URIs for the start and end papers
        start_paper_uri = URIRef(Abox["ResearchPaper_" + str(record['start_id'])])
        end_paper_uri = URIRef(Abox["ResearchPaper_" + str(record['end_id'])])

        # Add the type triples for both papers to the graph
        # g.add((start_paper_uri, RDF.type, ResearchPaper)) #Already Added Before
        # g.add((end_paper_uri, RDF.type, ResearchPaper))

        # Add the citation relationship to the graph
        g.add((start_paper_uri, Cites, end_paper_uri))

process_paper(dataframes['paper_cite_paper'])

## Load Class Topics

In [9]:
dataframes['keywords'].head(3)

Unnamed: 0,keywords
0,filtering process
1,measurement while drilling (MWD)
2,normalizing index


In [10]:
Topics = Abox.Topics  # Name in Abox

Topics_properties = {
    "keywords": Abox.topics
}

def process_topics(df):
    for _, record in df.iterrows():
        keywords = urllib.parse.quote("Topics_" + str(record['keywords']))  # URL Code
        keywords_uri = URIRef(Abox[keywords])
        g.add((keywords_uri, RDF.type, Topics))  #
        for key, prop in Topics_properties.items():
            if pd.notna(record[key]):
                g.add((keywords_uri, prop, Literal(record[key])))

process_topics(dataframes['keywords'])

## Load Property HasTopics

In [11]:
dataframes['paper_has_keywords'].head(3)

Unnamed: 0,paper_id,keywords
0,10.3390/s24041209,filtering process
1,10.3390/s24041209,measurement while drilling (MWD)
2,10.3390/s24041209,normalizing index


In [12]:
HasTopics = Abox.HasTopics

HasTopics_properties = {
    "paper_id": Abox.paper_id,
    "keywords": Abox.keywords
}

def process_paper(df):
    for _, record in df.iterrows():
        author_uri = URIRef(Abox["HasTopics_" + str(record['paper_id'])])
        g.add((author_uri, RDF.type, Cites))
        for key, prop in HasTopics_properties.items():
            if pd.notna(record[key]):  # Check not Null
                g.add((author_uri, prop, Literal(record[key])))

process_paper(dataframes['paper_has_keywords'])

## Load

## Author-Write->ResearchPaper

In [13]:
dataframes['author_write'].head(3)

Unnamed: 0,author_id,paper_id,corresponding
0,57219571200,10.1002/asi.24855,True
1,35726817600,10.3390/s24041209,True
2,58897160900,10.3390/s24041209,False


In [14]:
Write = Abox.Write

g.add((Abox.Author, Abox.Write, Abox.ResearchPaper))
def process_authors_papers(df):
    for _, record in df.iterrows():
        author_uri = URIRef(Abox[str(record['author_id'])])
        paper_uri = URIRef(Abox[str(record['paper_id'])])
        g.add((author_uri, Write, paper_uri))

process_authors_papers(dataframes['author_write'])

# Export graph

In [15]:
for stmt in islice(g, 5):
    print(stmt)

(rdflib.term.URIRef('http://SDM_LAB2.org/abox/ResearchPaper_10.1016/j.procs.2023.11.108'), rdflib.term.URIRef('http://SDM_LAB2.org/abox/Cites'), rdflib.term.URIRef('http://SDM_LAB2.org/abox/ResearchPaper_10.1109/TNET.2024.3352029'))
(rdflib.term.URIRef('http://SDM_LAB2.org/abox/ResearchPaper_10.1109/APWiMob59963.2023.10365603'), rdflib.term.URIRef('http://SDM_LAB2.org/abox/Cites'), rdflib.term.URIRef('http://SDM_LAB2.org/abox/ResearchPaper_10.1145/3609956.3609968'))
(rdflib.term.URIRef('http://SDM_LAB2.org/abox/Topics_SQL'), rdflib.term.URIRef('http://SDM_LAB2.org/abox/topics'), rdflib.term.Literal('SQL'))
(rdflib.term.URIRef('http://SDM_LAB2.org/abox/HasTopics_10.1007/s10115-023-01948-w'), rdflib.term.URIRef('http://SDM_LAB2.org/abox/keywords'), rdflib.term.Literal('Memory efficient mining'))
(rdflib.term.URIRef('http://SDM_LAB2.org/abox/Topics_Distortion'), rdflib.term.URIRef('http://SDM_LAB2.org/abox/topics'), rdflib.term.Literal('Distortion'))


In [16]:
# Export as Turtle
g.serialize(destination='output.ttl', format='turtle')

<Graph identifier=N2a36cbf877fc49bdbda46531b20c08de (<class 'rdflib.graph.Graph'>)>