In [41]:
!pip install rdflib
!pip install opendatasets


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [42]:
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import FOAF, SKOS, RDF, RDFS, XSD, OWL
import opendatasets as od
import pandas as pd

## Download the datasets from kaggle

In [43]:
od.download('https://www.kaggle.com/datasets/nobelfoundation/nobel-laureates')
od.download('https://www.kaggle.com/datasets/xabirhasan/journal-ranking-dataset')
# TODO download the other ones

Skipping, found downloaded files in ".\nobel-laureates" (use force=True to force download)
Skipping, found downloaded files in ".\journal-ranking-dataset" (use force=True to force download)


## Read the csv file

In [44]:
nobels = pd.read_csv('nobel-laureates/archive.csv')
#nobels
journals = pd.read_csv('journal-ranking-dataset/journal_ranking_data.csv')
journals

  journals = pd.read_csv('journal-ranking-dataset/journal_ranking_data.csv')


Unnamed: 0,Rank,Title,OA,Country,SJR-index,CiteScore,H-index,Best Quartile,Best Categories,Best Subject Area,...,2700 Medicine,2800 Neuroscience,2900 Nursing,"3000 Pharmacology, Toxicology and Pharmaceutics",3100 Physics and Astronomy,3200 Psychology,3300 Social Sciences,3400 Veterinary,3500 Dentistry,3600 Health Professions
0,1,Ca-A Cancer Journal for Clinicians,False,United States,86.091,642.9,198,Q1,"['Hematology', 'Oncology']",Medicine,...,1,0,0,0,0,0,0,0,0,0
1,2,Quarterly Journal of Economics,False,United Kingdom,36.730,25.1,292,Q1,['Economics and Econometrics'],"Economics, Econometrics and Finance",...,0,0,0,0,0,0,0,0,0,0
2,3,Nature Reviews Molecular Cell Biology,False,United Kingdom,34.201,164.4,485,Q1,"['Cell Biology', 'Molecular Biology']","Biochemistry, Genetics and Molecular Biology",...,0,0,0,0,0,0,0,0,0,0
3,4,Cell,False,United States,26.494,102.0,856,Q1,"['Biochemistry, Genetics and Molecular Biology...","Biochemistry, Genetics and Molecular Biology",...,0,0,0,0,0,0,0,0,0,0
4,5,New England Journal of Medicine,False,United States,26.015,134.4,1130,Q1,['Medicine (miscellaneous)'],Medicine,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18008,18009,Western Humanities Review,False,United States,0.100,0.0,6,Q4,['Literature and Literary Theory'],Arts and Humanities,...,0,0,0,0,0,0,0,0,0,0
18009,18010,Yale Review,False,United States,0.100,0.0,4,Q4,['Literature and Literary Theory'],Arts and Humanities,...,0,0,0,0,0,0,0,0,0,0
18010,18011,Zeitschrift fur Theologie und Kirche,False,Germany,0.100,0.2,6,Q4,['Religious Studies'],Arts and Humanities,...,0,0,0,0,0,0,0,0,0,0
18011,18012,Zivot Umjetnosti,True,Croatia,0.100,0.1,2,Q4,"['Museology', 'Visual Arts and Performing Arts']",Arts and Humanities,...,0,0,0,0,0,0,0,0,0,0


## Parse our ontology

In [45]:
graph = Graph()
graph.parse('nobelOntology.ttl', format='turtle')
graph.parse('http://eulersharp.sourceforge.net/2003/03swap/countries', format='turtle')

NOBEL = Namespace('http://www.semanticweb.org/a3d/ontologies/2024/10/nobelOntology/')
JUR = Namespace('http://sweet.jpl.nasa.gov/2.3/humanJurisdiction.owl#')

# binding prefixes to URIs
graph.bind('nobel', NOBEL)
graph.bind('jur', JUR)

for ns_prefix, namespace in graph.namespaces():
    print('{}: {}'.format(ns_prefix, namespace))

for s, p, o in graph.triples((None, RDF.type, JUR.Country)):
    print(f"{s} is a country")

brick: https://brickschema.org/schema/Brick#
csvw: http://www.w3.org/ns/csvw#
dcat: http://www.w3.org/ns/dcat#
dcmitype: http://purl.org/dc/dcmitype/
dcam: http://purl.org/dc/dcam/
doap: http://usefulinc.com/ns/doap#
foaf: http://xmlns.com/foaf/0.1/
geo: http://www.opengis.net/ont/geosparql#
odrl: http://www.w3.org/ns/odrl/2/
org: http://www.w3.org/ns/org#
prof: http://www.w3.org/ns/dx/prof/
prov: http://www.w3.org/ns/prov#
qb: http://purl.org/linked-data/cube#
schema: https://schema.org/
sh: http://www.w3.org/ns/shacl#
skos: http://www.w3.org/2004/02/skos/core#
sosa: http://www.w3.org/ns/sosa/
ssn: http://www.w3.org/ns/ssn/
time: http://www.w3.org/2006/time#
vann: http://purl.org/vocab/vann/
void: http://rdfs.org/ns/void#
wgs: https://www.w3.org/2003/01/geo/wgs84_pos#
owl: http://www.w3.org/2002/07/owl#
rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
rdfs: http://www.w3.org/2000/01/rdf-schema#
xsd: http://www.w3.org/2001/XMLSchema#
xml: http://www.w3.org/XML/1998/namespace
jur: http:

## Populate the graph with nobel-laureates

In [46]:
import datetime, unicodedata, html, re
import topic_extraction

def normalize_name(raw_name):
    name = html.unescape(raw_name)
    name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii')
    uri_name = name.replace(',', '')
    uri_name = uri_name.replace('-', ' ')
    uri_name = ''.join(x for x in uri_name.title() if not x.isspace())
    return (name, uri_name)

def handle_city(raw_city):
    (city_name, uri_city_name) = normalize_name(raw_city)
    city = URIRef(NOBEL[uri_city_name])
    if (city, RDF.type, NOBEL.City) not in graph: # new city
        graph.add((city, RDF.type, NOBEL.City))
        graph.add((city, FOAF.name, Literal(city_name, datatype=XSD.string)))
    return city

def handle_org(raw_org):
    (org_name, uri_org_name) = normalize_name(raw_org)
    org = URIRef(NOBEL[uri_org_name])
    if (org, RDF.type, FOAF.Organization) not in graph: # new organization
        graph.add((org, RDF.type, FOAF.Organization))
        graph.add((org, FOAF.name, Literal(org_name, datatype=XSD.string)))
    return org

def handle_city_country(country_name):
    # special cases

    if country_name == 'United States of America':
        country_name = 'United States'
    if country_name == 'Scotland' or country_name == 'Northern Ireland':
        country_name = 'United Kingdom'
    if country_name == 'Guadeloupe Island':
        country_name = 'Guadeloupe'
    if country_name == 'East Timor':
        country_name = 'Timor-Leste'
    if country_name == 'East Germany' or country_name == 'Federal Republic of Germany':
        country_name = 'Germany'
    if country_name == 'Union of Soviet Socialist Republics':
        country_name = 'Russian Federation'
    if country_name == 'Czechoslovakia':
        country_name = 'Czech Republic'
    if country_name == 'Vietnam':
        country_name = 'Viet Nam'
    
    # Regex pattern to capture content inside parentheses
    pattern = r"\((.*?)\)"
    match = re.findall(pattern, country_name)
    if len(match) != 0:
        country_name = match[0]

    if country_name == 'Republic of Macedonia':
        country_name = 'Macedonia, the former Yugoslav Republic of'
    if country_name == 'South Korea':
        country_name = 'Korea, Republic of'
    if country_name == "People's Republic of China":
        country_name = 'China'
    if country_name == 'then Germany, now France':
        country_name = 'France'
    
    country_query = f'''
    SELECT ?country
    WHERE {{
        ?country rdf:type jur:Country;
                foaf:name ?name.
        FILTER(REGEX(?name, "{country_name}"))
    }}'''

    qres = graph.query(country_query)
    if (len(qres) == 0):
        print('Country not found: {}'.format(country_name))
        return None

    return qres


for index, row in nobels.iterrows():
    nobel = URIRef(NOBEL[row['Category'] + str(row['Year'])]) # the URI will be nobelNamespace + Category + Year
    graph.add((nobel, RDF.type, NOBEL.NobelPrize))
    graph.add((nobel, NOBEL.hasYear, Literal(row['Year'], datatype=XSD.gYear)))
    graph.add((nobel, NOBEL.hasNobelCategory, Literal(row['Category'], datatype=XSD.string)))

    # handle Prize Share
    prizeShare = str(row['Prize Share']).split('/')
    if (nobel, NOBEL.hasPrizeShare, None) not in graph:
        graph.add((nobel, NOBEL.hasPrizeShare, Literal(prizeShare[1], datatype=XSD.integer)))

    # handle Motivation
    if ((nobel, NOBEL.hasMotivationTopics, None) not in graph) and (pd.notna(row['Motivation'])):
        topics = topic_extraction.extract_topics(str(row['Motivation']), num_topics=1, num_words=5)
        for idx, topic in enumerate(topics):
            graph.add((nobel, NOBEL.hasMotivationTopics, Literal(','.join(topic), datatype=XSD.string)))
    
    # handle Laureate Type
    if row['Laureate Type'] == 'Organization':
        (org_name, uri_org_name) = normalize_name(str(row['Full Name']))
        laureate = URIRef(NOBEL[uri_org_name])
        graph.add((laureate, RDF.type, FOAF.Organization))
        graph.add((laureate, FOAF.name, Literal(org_name, datatype=XSD.string)))
    elif row['Laureate Type'] == 'Individual':
        laureate = URIRef(NOBEL[str(row['Laureate ID'])]) # the URI will be nobelNamespace + Laureate ID
        graph.add((laureate, RDF.type, FOAF.Person))
        graph.add((laureate, FOAF.name, Literal(row['Full Name'], datatype=XSD.string)))
        graph.add((laureate, FOAF.gender, Literal(row['Sex'], datatype=XSD.string)))

    graph.add((laureate, RDF.type, NOBEL.Laureate))
    
    if pd.notna(row['Birth Date']):
        try:
            datetime.datetime.strptime(str(row['Birth Date']), '%Y-%m-%d')
            graph.add((laureate, NOBEL.birthDate, Literal(row['Birth Date'], datatype=XSD.date)))
        except ValueError:
            splitted_date = str(row['Birth Date']).split('-')
            new_date = splitted_date[0] + '-01-01'
            print('Wrong Birth Date format in {}. The new date will be {}'.format(laureate, new_date))
            graph.add((laureate, NOBEL.birthDate, Literal(new_date, datatype=XSD.date)))

    if pd.notna(row['Death Date']):
        try:
            datetime.datetime.strptime(str(row['Death Date']), '%Y-%m-%d')
            graph.add((laureate, NOBEL.deathDate, Literal(row['Death Date'], datatype=XSD.date)))
        except ValueError:
            splitted_date = str(row['Death Date']).split('-')
            new_date = splitted_date[0] + '-01-01'
            print('Wrong Death Date format in {}. The new date will be {}'.format(laureate, new_date))
            graph.add((laureate, NOBEL.deathDate, Literal(new_date, datatype=XSD.date)))

    # handle birth city
    if pd.notna(row['Birth City']):
        birth_city = handle_city(str(row['Birth City']))
        graph.add((laureate, NOBEL.bornIn, birth_city))

    # handle birth city country
    if pd.notna(row['Birth Country']):
        qres = handle_city_country(str(row['Birth Country']))
        if (qres is not None) and ((birth_city, NOBEL.locatedIn, JUR.Country) not in graph): # new locatedIn
            graph.add((birth_city, NOBEL.locatedIn, next(iter(qres)).country)) # only the first match

    # handle death city
    if pd.notna(row['Death City']):
        death_city = handle_city(str(row['Death City']))
        graph.add((laureate, NOBEL.diedIn, death_city))

    # handle death city country
    if pd.notna(row['Death Country']):
        qres = handle_city_country(str(row['Death Country']))
        if (qres is not None) and ((death_city, NOBEL.locatedIn, JUR.Country) not in graph): # new locatedIn
            graph.add((death_city, NOBEL.locatedIn, next(iter(qres)).country)) # only the first match

    # handle organization
    if pd.notna(row['Organization Name']):
        org = handle_org(str(row['Organization Name']))
        graph.add((laureate, NOBEL.worksFor, org))

        if pd.notna(row['Organization City']):
            org_city = handle_city(str(row['Organization City']))
            graph.add((org, NOBEL.basedIn, org_city))

        if pd.notna(row['Organization Country']):
            qres = handle_city_country(str(row['Organization Country']))
            if (qres is not None) and ((org_city, NOBEL.locatedIn, JUR.Country) not in graph): # new locatedIn
                graph.add((org_city, NOBEL.locatedIn, next(iter(qres)).country)) # only the first match

Wrong Birth Date format in http://www.semanticweb.org/a3d/ontologies/2024/10/nobelOntology/519. The new date will be 1898-01-01


KeyboardInterrupt: 

## Fix laureate type errors

In [None]:
for s, p, o in graph.triples((None, RDF.type, FOAF.Organization)):
    if ((s, NOBEL.bornIn, None)) in graph:
        graph.add((s, RDF.type, FOAF.Person))
        graph.remove((s, RDF.type, FOAF.Organization))

## Serialization

In [None]:
with open('test.ttl', 'w', encoding='utf-8') as out:
    out.write(graph.serialize(format='turtle'))