In [1]:
pip install rdflib



In [2]:
import pandas as pd
import random
import csv
import os
from rdflib import Graph
from rdflib import Namespace
from rdflib import Literal
from rdflib.namespace import RDF, RDFS, FOAF, XSD, URIRef

In [3]:
# Namespace under which every resource of this notebook is minted.
Abox = Namespace("http://SDM_LAB2.org/abox/")

# Graph that accumulates the triples; "abox" is the prefix used when serializing.
g = Graph()
g.bind("abox", Abox)

# Class terms. Looking a name up on the Namespace already yields a URIRef,
# so the terms are taken straight from it.
ResearchPaper = Abox.ResearchPaper
SubmittedPaper = Abox.SubmittedPaper
Author = Abox.Author
Person = Abox.Person
Reviewer = Abox.Reviewer
Volumes = Abox.Volumes
Proceedings = Abox.Proceedings
Topics = Abox.Topics
Workshop = Abox.Workshop
RegularConference = Abox.RegularConference
Conference = Abox.Conference
Venue = Abox.Venue
Journal = Abox.Journal

# Names of the CSV exports, in alphabetical order. The next cell rebinds
# `csv_files` to whatever the data directory actually contains.
csv_files = [
    "abstracts-sample.csv", "affiliated-to.csv", "authors-sample.csv",
    "belongs-to.csv", "categoriesRelations.csv", "citations-sample.csv",
    "cited-by.csv", "companies.csv", "conferences.csv",
    "is-from.csv", "journals.csv", "keywords.csv",
    "paper-ids-sample.csv", "papers-processed.csv", "papers-sample.csv",
    "publication-venues-sample.csv", "published-in.csv", "related-to.csv",
    "reviewed-by.csv", "reviews.csv", "universities.csv",
    "volume-from.csv", "withAbstracts.csv", "written-by.csv",
]

In [4]:
# Directory holding the CSV exports. The Colab Drive location remains the
# default; set the SDM_LAB2_CSV_DIR environment variable to run the notebook
# against a copy of the data stored somewhere else.
csv_dir = os.environ.get('SDM_LAB2_CSV_DIR', '/content/drive/MyDrive/SDM_lab2/data/CSVs/')

# Every *.csv file found in that directory is loaded.
csv_files = [f for f in os.listdir(csv_dir) if f.endswith('.csv')]

# One DataFrame per file, keyed by the file name without its extension and
# with '-' replaced by '_' (e.g. "written-by.csv" -> "written_by").
dataframes = {}
for csv_file in csv_files:
    file_path = os.path.join(csv_dir, csv_file)
    df_name = os.path.splitext(csv_file)[0].replace('-', '_')
    dataframes[df_name] = pd.read_csv(file_path)

# List the keys under which the frames can be looked up.
for df_name in dataframes.keys():
    print(df_name)

universities
companies
reviews
keywords
categoriesRelations
citations_sample
papers_sample
paper_ids_sample
journals
conferences
is_from
authors_sample
volume_from
abstracts_sample
publication_venues_sample
published_in
withAbstracts
belongs_to
written_by
reviewed_by
cited_by
papers_processed
related_to
affiliated_to


# Properties

In [5]:
# Class terms that the loaders in the following cells refer to.
Author, ResearchPaper = Abox["Author"], Abox["ResearchPaper"]

## Load authors

In [6]:
# First three author records, to see which columns are available.
dataframes['authors_sample'].iloc[:3]

Unnamed: 0,authorid,externalids,url,name,aliases,affiliations,homepage,papercount,citationcount,hindex
0,2070772945,,https://www.semanticscholar.org/author/2070772945,M. Edwards,"['M Edwards', 'M. A. Professor D. Miall Edward...",,,4,3,1
1,79869264,,https://www.semanticscholar.org/author/79869264,Z. Lone,"['Zubair Lone', 'Zubair Ah Lone']",,,2,0,0
2,2221510848,,https://www.semanticscholar.org/author/2221510848,Carolin Kladt,,,,1,6,1


In [7]:
# Maps a column of the authors CSV to the property that carries its value.
author_properties = {
    column: Abox[term]
    for column, term in (
        ("name", "hasName"),
        ("aliases", "hasAlias"),
        ("affiliations", "hasAffiliation"),
        ("homepage", "hasHomepage"),
        ("papercount", "hasPaperCount"),
        ("citationcount", "hasCitationCount"),
        ("hindex", "hasHIndex"),
    )
}

def _author_cell_values(value):
    """Return the individual values held by one cell of the authors CSV.

    A cell whose text is a Python list of scalars (for example
    "['Zubair Lone', 'Zubair Ah Lone']") is split into its elements; any other
    cell is returned unchanged as a one-element list.
    """
    import ast

    if isinstance(value, str):
        text = value.strip()
        if text.startswith('[') and text.endswith(']'):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError):
                parsed = None  # not a list literal after all: keep the raw text
            if isinstance(parsed, (list, tuple)) and all(
                isinstance(item, (str, int, float, bool)) for item in parsed
            ):
                return list(parsed)
    return [value]


def process_authors(df, graph=None):
    """Add one Author individual per row of ``df`` together with its properties.

    Args:
        df: DataFrame with an ``authorid`` column and one column per key of
            ``author_properties``.
        graph: Graph that receives the triples. Defaults to the notebook-wide
            graph ``g``.

    Rows without an ``authorid`` are skipped, and null cells produce no triple.
    A cell holding a stringified list yields one triple per element instead of
    a single literal containing the list's repr.
    """
    target = g if graph is None else graph
    for _, record in df.iterrows():
        author_id = record['authorid']
        if pd.isna(author_id):
            continue  # no identifier: str() would mint a ".../nan" individual
        # iterrows() does not preserve column dtypes, and an id column with
        # gaps is read as float; without this the URI would end in ".0".
        if isinstance(author_id, float) and author_id.is_integer():
            author_id = int(author_id)
        # NOTE(review): authors and papers are both minted directly under Abox
        # from bare ids - confirm the two id spaces cannot overlap.
        author_uri = Abox[str(author_id)]
        target.add((author_uri, RDF.type, Author))
        for key, prop in author_properties.items():
            if pd.notna(record[key]):  # null cells carry no information
                for value in _author_cell_values(record[key]):
                    target.add((author_uri, prop, Literal(value)))

# Load every author of the sample into the graph.
process_authors(df=dataframes['authors_sample'])

# Relations

In [8]:
# Property linking an author to a paper.
Write = Abox["Write"]

## Author-Write->ResearchPaper

In [9]:
# First three authorship records, to see which columns are available.
dataframes['written_by'].iloc[:3]

Unnamed: 0,paperID,authorID,is_corresponding
0,36833670,97911868,True
1,258790926,146465373,True
2,258790926,108284628,False


In [10]:
# Class-level statement: Author -Write-> ResearchPaper.
# NOTE(review): this relates the two classes themselves rather than
# individuals - confirm it is meant to live in the instance graph.
g.add((Abox["Author"], Abox["Write"], Abox["ResearchPaper"]))
def process_authors_papers(df, graph=None):
    """Add one ``author Write paper`` triple per row of ``df``.

    Args:
        df: DataFrame with ``authorID`` and ``paperID`` columns. No other
            column is read.
        graph: Graph that receives the triples. Defaults to the notebook-wide
            graph ``g``.

    Rows in which either identifier is missing are skipped.
    """
    def _key(raw_id):
        # iterrows() does not preserve column dtypes: ids from an all-numeric
        # frame, or from a column with gaps, arrive as floats, and the URI
        # would otherwise end in ".0" and no longer match the one minted for
        # the same entity elsewhere.
        if isinstance(raw_id, float) and raw_id.is_integer():
            return str(int(raw_id))
        return str(raw_id)

    target = g if graph is None else graph
    for _, record in df.iterrows():
        author_id, paper_id = record['authorID'], record['paperID']
        if pd.isna(author_id) or pd.isna(paper_id):
            continue  # an incomplete pair cannot be turned into a valid triple
        target.add((Abox[_key(author_id)], Write, Abox[_key(paper_id)]))

# Link every author to the papers they wrote.
process_authors_papers(df=dataframes['written_by'])