In [None]:
!pip install rdflib
!pip install opendatasets
!pip install nltk
!pip install gensim

In [None]:
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import FOAF, SKOS, RDF, RDFS, XSD, OWL
import opendatasets as od
import pandas as pd

## Download the datasets from Kaggle

In [None]:
# Kaggle datasets used by this notebook; the cells below read their CSV files
# from folders named after each dataset.
# TODO download the other ones
KAGGLE_DATASET_URLS = (
    'https://www.kaggle.com/datasets/nobelfoundation/nobel-laureates',
    'https://www.kaggle.com/datasets/xabirhasan/journal-ranking-dataset',
    'https://www.kaggle.com/datasets/nechbamohammed/research-papers-dataset',
)
for dataset_url in KAGGLE_DATASET_URLS:
    od.download(dataset_url)

## Read the CSV files

In [None]:
# Load the Nobel laureates table and preview its first five rows.
nobels = pd.read_csv(filepath_or_buffer='nobel-laureates/archive.csv')
nobels.head(n=5)

In [None]:
# Load the journal ranking table and preview its first five rows.
journals = pd.read_csv(filepath_or_buffer='journal-ranking-dataset/journal_ranking_data.csv')
journals.head(n=5)

In [None]:
# Load the research papers table, print its column summary, then preview
# its first five rows.
papers = pd.read_csv(filepath_or_buffer='research-papers-dataset/dblp-v10.csv')
papers.info()
papers.head(n=5)

## Parse our ontology

In [None]:
# Build the working graph from the local ontology file, then merge in the
# remote "countries" resource (both parsed as Turtle).
graph = Graph()
graph.parse(source='nobelOntology.ttl', format='turtle')
graph.parse(source='http://eulersharp.sourceforge.net/2003/03swap/countries', format='turtle')

# Namespaces used to build and look up URIs in the rest of the notebook.
NO = Namespace('http://www.semanticweb.org/a3d/ontologies/2024/10/nobelOntology/')
JUR = Namespace('http://sweet.jpl.nasa.gov/2.3/humanJurisdiction.owl#')

# binding prefixes to URIs
graph.bind('nobel', NO)
graph.bind('jur', JUR)

# Show every prefix currently known to the graph.
for ns_prefix, namespace in graph.namespaces():
    print(f'{ns_prefix}: {namespace}')

# Show every resource typed as jur:Country.
for country in graph.subjects(predicate=RDF.type, object=JUR.Country):
    print(f"{country} is a country")

## Populate the graph with Nobel laureates

In [None]:
import datetime, unicodedata, html, re
import topic_extraction

def normalize_name(raw_name):
    """Clean a raw label and derive a compact identifier from it.

    Args:
        raw_name: Text that may contain HTML entities and non-ASCII characters.

    Returns:
        A ``(name, uri_name)`` tuple. ``name`` is the input with HTML entities
        decoded and every character that has no ASCII form after NFKD
        normalisation dropped. ``uri_name`` is ``name`` with commas removed,
        hyphens turned into spaces, title-cased, and all whitespace removed.
    """
    decoded = unicodedata.normalize('NFKD', html.unescape(raw_name))
    name = decoded.encode('ascii', 'ignore').decode('ascii')

    titled = name.replace(',', '').replace('-', ' ').title()
    # Splitting on whitespace and re-joining drops every whitespace character.
    uri_name = ''.join(titled.split())
    return (name, uri_name)

def handle_city(raw_city):
    """Return the URI for a city, registering it in ``graph`` the first time.

    The URI is the ``NO`` namespace plus the identifier produced by
    ``normalize_name``. A city that is not yet typed as ``NO.City`` gets that
    type and a ``foaf:name`` literal holding the cleaned name.

    Args:
        raw_city: City label as found in the dataset.

    Returns:
        The ``URIRef`` of the city.
    """
    label, local_id = normalize_name(raw_city)
    city = URIRef(NO[local_id])

    if (city, RDF.type, NO.City) in graph:
        return city  # already registered

    graph.add((city, RDF.type, NO.City))
    graph.add((city, FOAF.name, Literal(label, datatype=XSD.string)))
    return city

def handle_org(raw_org):
    """Return the URI for an organization, registering it in ``graph`` if new.

    The URI is the ``NO`` namespace plus the identifier produced by
    ``normalize_name``. An organization that is not yet typed as
    ``foaf:Organization`` gets that type and a ``foaf:name`` literal holding
    the cleaned name.

    Args:
        raw_org: Organization label as found in the dataset.

    Returns:
        The ``URIRef`` of the organization.
    """
    label, local_id = normalize_name(raw_org)
    org = URIRef(NO[local_id])

    if (org, RDF.type, FOAF.Organization) in graph:
        return org  # already registered

    graph.add((org, RDF.type, FOAF.Organization))
    graph.add((org, FOAF.name, Literal(label, datatype=XSD.string)))
    return org

def handle_city_country(country_name):
    """Find the ``jur:Country`` resource(s) matching a raw country label.

    The label is first translated through a table of historical/alternative
    names. If it contains parentheses, the text inside the first pair replaces
    the label and is translated again. The graph is then searched for
    countries whose ``foaf:name`` equals the label; only when there is no
    exact match does the search fall back to names that contain the label,
    shortest name first, so that callers taking the first row get a
    reproducible and closest candidate.

    The label is handed to SPARQL through ``initBindings`` rather than pasted
    into the query text, so quotes or regex metacharacters in the data cannot
    break or change the query.

    Args:
        country_name: Country label as found in the dataset.

    Returns:
        The rdflib query result (rows expose ``.country``), or ``None`` after
        printing a message when nothing matches.
    """
    # Names used in the dataset -> names expected in the country data.
    aliases = {
        'United States of America': 'United States',
        'Scotland': 'United Kingdom',
        'Northern Ireland': 'United Kingdom',
        'Guadeloupe Island': 'Guadeloupe',
        'East Timor': 'Timor-Leste',
        'East Germany': 'Germany',
        'Federal Republic of Germany': 'Germany',
        'Union of Soviet Socialist Republics': 'Russian Federation',
        'Czechoslovakia': 'Czech Republic',
        'Vietnam': 'Viet Nam',
        'Republic of Macedonia': 'Macedonia, the former Yugoslav Republic of',
        'South Korea': 'Korea, Republic of',
        "People's Republic of China": 'China',
        'then Germany, now France': 'France',
    }
    country_name = aliases.get(country_name, country_name)

    # Regex pattern to capture content inside parentheses
    inside_parentheses = re.findall(r"\((.*?)\)", country_name)
    if inside_parentheses:
        country_name = aliases.get(inside_parentheses[0], inside_parentheses[0])

    if not country_name.strip():
        # An empty label would match every country in the substring search.
        print('Country not found: {}'.format(country_name))
        return None

    bindings = {'target': Literal(country_name)}

    exact_query = '''
    SELECT ?country
    WHERE {
        ?country rdf:type jur:Country;
                foaf:name ?name.
        FILTER(STR(?name) = STR(?target))
    }'''
    qres = graph.query(exact_query, initBindings=bindings)

    if len(qres) == 0:
        # No exact name: accept names that contain the label, closest first.
        substring_query = '''
        SELECT ?country
        WHERE {
            ?country rdf:type jur:Country;
                    foaf:name ?name.
            FILTER(CONTAINS(STR(?name), STR(?target)))
        }
        ORDER BY ASC(STRLEN(STR(?name))) ASC(?country)'''
        qres = graph.query(substring_query, initBindings=bindings)

    if len(qres) == 0:
        print('Country not found: {}'.format(country_name))
        return None

    return qres


def _add_date(subject, predicate, raw_date, label):
    """Attach ``raw_date`` to ``subject`` as an ``xsd:date`` literal.

    A value that is not a valid ``YYYY-MM-DD`` date is replaced by 1 January
    of its year part, and a message naming ``label`` is printed.
    """
    try:
        datetime.datetime.strptime(str(raw_date), '%Y-%m-%d')
        graph.add((subject, predicate, Literal(raw_date, datatype=XSD.date)))
    except ValueError:
        new_date = str(raw_date).split('-')[0] + '-01-01'
        print('Wrong {} format in {}. The new date will be {}'.format(label, subject, new_date))
        graph.add((subject, predicate, Literal(new_date, datatype=XSD.date)))


def _link_city_to_country(city, raw_country):
    """Add ``city locatedIn country`` when both the city and the country are known.

    ``city`` must be the city created for the *current* row (or ``None``), so
    a country is never attached to a city left over from a previous row.
    ``graph.add`` ignores triples that already exist, so no membership test
    is needed before adding.
    """
    if city is None or pd.isna(raw_country):
        return
    qres = handle_city_country(str(raw_country))
    if qres is not None:
        graph.add((city, NO.locatedIn, next(iter(qres)).country))  # only the first match


# One row per (prize, laureate): create the prize, the laureate, and the
# cities/organization the laureate is linked to.
for _, row in nobels.iterrows():
    # the URI will be nobelNamespace + Category + Year
    nobel = URIRef(NO[row['Category'] + str(row['Year'])])
    graph.add((nobel, RDF.type, NO.NobelPrize))
    graph.add((nobel, NO.hasYear, Literal(row['Year'], datatype=XSD.gYear)))
    graph.add((nobel, NO.hasNobelCategory, Literal(row['Category'], datatype=XSD.string)))

    # handle Prize Share: keep the part after "/" and set it once per prize;
    # values without a "/" are skipped instead of raising IndexError
    prize_share = str(row['Prize Share']).split('/')
    if len(prize_share) > 1 and (nobel, NO.hasPrizeShare, None) not in graph:
        graph.add((nobel, NO.hasPrizeShare, Literal(prize_share[1], datatype=XSD.integer)))

    # handle Motivation: extracted once per prize
    if ((nobel, NO.hasMotivationTopics, None) not in graph) and (pd.notna(row['Motivation'])):
        topics = topic_extraction.extract_topics(str(row['Motivation']), num_topics=1, num_words=5)
        for topic in topics:
            graph.add((nobel, NO.hasMotivationTopics, Literal(','.join(topic), datatype=XSD.string)))

    # handle Laureate Type
    if row['Laureate Type'] == 'Organization':
        (org_name, uri_org_name) = normalize_name(str(row['Full Name']))
        laureate = URIRef(NO[uri_org_name])
        graph.add((laureate, RDF.type, FOAF.Organization))
        graph.add((laureate, FOAF.name, Literal(org_name, datatype=XSD.string)))
    elif row['Laureate Type'] == 'Individual':
        # the URI will be nobelNamespace + Laureate ID
        laureate = URIRef(NO[str(row['Laureate ID'])])
        graph.add((laureate, RDF.type, FOAF.Person))
        graph.add((laureate, FOAF.name, Literal(row['Full Name'], datatype=XSD.string)))
        if pd.notna(row['Sex']):
            graph.add((laureate, FOAF.gender, Literal(row['Sex'], datatype=XSD.string)))
    else:
        # Without this branch the laureate of the previous row would be reused.
        print('Unknown Laureate Type {!r} for {}; row skipped'.format(row['Laureate Type'], row['Full Name']))
        continue

    graph.add((laureate, RDF.type, NO.Laureate))

    if pd.notna(row['Birth Date']):
        _add_date(laureate, NO.birthDate, row['Birth Date'], 'Birth Date')

    if pd.notna(row['Death Date']):
        _add_date(laureate, NO.deathDate, row['Death Date'], 'Death Date')

    # handle birth city and its country
    birth_city = None
    if pd.notna(row['Birth City']):
        birth_city = handle_city(str(row['Birth City']))
        graph.add((laureate, NO.bornIn, birth_city))
    _link_city_to_country(birth_city, row['Birth Country'])

    # handle death city and its country
    death_city = None
    if pd.notna(row['Death City']):
        death_city = handle_city(str(row['Death City']))
        graph.add((laureate, NO.diedIn, death_city))
    _link_city_to_country(death_city, row['Death Country'])

    # handle organization, its city and that city's country
    if pd.notna(row['Organization Name']):
        org = handle_org(str(row['Organization Name']))
        graph.add((laureate, NO.worksFor, org))

        org_city = None
        if pd.notna(row['Organization City']):
            org_city = handle_city(str(row['Organization City']))
            graph.add((org, NO.basedIn, org_city))
        _link_city_to_country(org_city, row['Organization Country'])

## Fix laureate type errors

In [None]:
# A resource typed as foaf:Organization that also has a bornIn triple is
# retyped as foaf:Person.
# The subjects are collected into a list first so the graph is not modified
# while one of its own iterators is still being consumed.
misclassified = [
    s for s in graph.subjects(RDF.type, FOAF.Organization)
    if (s, NO.bornIn, None) in graph
]
for s in misclassified:
    graph.add((s, RDF.type, FOAF.Person))
    graph.remove((s, RDF.type, FOAF.Organization))

## Populate the graph with journals

In [None]:
# Create one NO.Journal resource per journal title.
# Rows without a title are skipped: they cannot produce a URI.
for title in journals['Title'].dropna():
    title = str(title)
    journal = URIRef(NO[title.replace(" ", "_")])
    graph.add((journal, RDF.type, NO.Journal))
    # The object of a triple must be an rdflib term, so the title is wrapped
    # in a Literal instead of being passed as a plain Python string.
    graph.add((journal, NO.hasTitle, Literal(title, datatype=XSD.string)))

## Serialization

In [None]:
# Write the whole graph to test.ttl as Turtle text (UTF-8).
with open('test.ttl', mode='w', encoding='utf-8') as ttl_file:
    print(graph.serialize(format='turtle'), file=ttl_file, end='')