In [None]:
!pip install rdflib
!pip install opendatasets
!pip install nltk
!pip install gensim
!pip install rapidfuzz
!pip install unidecode

In [None]:
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import FOAF, SKOS, RDF, RDFS, XSD, OWL
import opendatasets as od
import pandas as pd

## Download the datasets from Kaggle

In [None]:
# Kaggle datasets used by this notebook; the CSV-reading cells expect each one
# in a local folder named after the last segment of its URL.
KAGGLE_DATASET_URLS = (
    'https://www.kaggle.com/datasets/nobelfoundation/nobel-laureates',
    'https://www.kaggle.com/datasets/xabirhasan/journal-ranking-dataset',
    'https://www.kaggle.com/datasets/nechbamohammed/research-papers-dataset',
)

for dataset_url in KAGGLE_DATASET_URLS:
    od.download(dataset_url)

## Read the CSV files

In [None]:
# Nobel laureates table, one row per laureate/prize pair; preview the first five rows.
nobels = pd.read_csv('nobel-laureates/archive.csv')
nobels.head()

In [None]:
# Journal ranking table; preview the first five rows.
journals = pd.read_csv('journal-ranking-dataset/journal_ranking_data.csv')
journals.head()

In [None]:
# Research papers table, indexed by the dataset's own `id` column.
papers = pd.read_csv('research-papers-dataset/dblp-v10.csv', index_col='id')
papers.info()  # column dtypes and non-null counts
papers.head()

In [None]:
# Budget allocations table.
# NOTE(review): this file is not fetched by the download cell — presumably it must
# already sit next to the notebook; confirm.
fundings = pd.read_csv('budget_allocations.csv')
fundings.info()  # column dtypes and non-null counts
fundings.head()

## Parse our ontology

In [None]:
# Namespaces used throughout the notebook: our ontology and the jurisdiction
# vocabulary that types the country resources.
NO = Namespace('http://www.semanticweb.org/a3d/ontologies/2024/10/nobelOntology/')
JUR = Namespace('http://sweet.jpl.nasa.gov/2.3/humanJurisdiction.owl#')

# Load the project ontology, then merge the external country list into the same graph.
graph = Graph()
graph.parse('nobelOntology.ttl', format='turtle')
graph.parse('http://eulersharp.sourceforge.net/2003/03swap/countries', format='turtle')

# Sanity check: list every prefix bound in the graph.
for bound_prefix, bound_namespace in graph.namespaces():
    print('{}: {}'.format(bound_prefix, bound_namespace))

# Sanity check: list every resource typed as a country.
for country in graph.subjects(RDF.type, JUR.Country):
    print(f"{country} is a country")

## Shared handling functions

In [None]:
import datetime, unicodedata, html, re

def normalize_name(raw_name):
    """Clean a raw label and derive a URI-friendly identifier from it.

    HTML entities are decoded and the text is reduced to ASCII (characters with
    no ASCII decomposition are dropped). The identifier is built from that text
    by turning hyphens into spaces, deleting commas, periods, double quotes,
    backslashes and slashes, title-casing the result and removing all whitespace.

    Returns:
        tuple: ``(name, uri_name)`` — the cleaned ASCII label and the identifier.
    """
    decoded = html.unescape(raw_name)
    name = unicodedata.normalize('NFKD', decoded).encode('ascii', 'ignore').decode('ascii')

    # One translation pass: '-' becomes a word separator, the rest is deleted.
    punctuation_table = str.maketrans({
        '-': ' ',
        ',': None,
        '.': None,
        '"': None,
        '\\': None,
        '/': None,
    })
    titled = name.translate(punctuation_table).title()
    uri_name = ''.join(filter(lambda ch: not ch.isspace(), titled))
    return (name, uri_name)

# Dataset spelling -> country name as it appears in the country namespace.
# Names that are absent from this table are looked up unchanged.
country_mapping = {
    # Alternative or long-form spellings
    'United States of America': 'United States',
    'Guadeloupe Island': 'Guadeloupe',
    'East Timor': 'Timor-Leste',
    'Czechia': 'Czech Republic',
    'Vietnam': 'Viet Nam',
    'Russia': 'Russian Federation',
    'Venezuela': 'Venezuela, Bolivarian Republic of',
    'Trinidad': 'Trinidad and Tobago',
    'Iran': 'Iran, Islamic Republic of',
    'Slovak Republic': 'Slovakia',
    'Moldova': 'Moldova, Republic of',
    'Libya': 'Libyan Arab Jamahiriya',
    'Republic of Macedonia': 'Macedonia, the former Yugoslav Republic of',
    'Macedonia': 'Macedonia, the former Yugoslav Republic of',
    'South Korea': 'Korea, Republic of',
    'Korea': 'Korea, Republic of',
    "People's Republic of China": 'China',
    'Türkiye': 'Turkey',
    'Taiwan': 'Taiwan, Province of China',
    'Chinese Taipei': 'Taiwan, Province of China',

    # Constituent countries mapped to the United Kingdom
    'Scotland': 'United Kingdom',
    'Northern Ireland': 'United Kingdom',

    # Former or historical designations
    'East Germany': 'Germany',
    'Federal Republic of Germany': 'Germany',
    'Union of Soviet Socialist Republics': 'Russian Federation',
    'Czechoslovakia': 'Czech Republic',
    'then Germany, now France': 'France',
}

def handle_country(country_name):
    """Find the country resource(s) in the graph matching a dataset country name.

    If the name contains parentheses, the text inside the first pair is used
    instead of the whole string. The result is then translated through
    ``country_mapping`` and compared, as an exact case-sensitive string, with
    the ``foaf:name`` of every resource typed ``jur:Country``.

    Args:
        country_name: Country label as found in a dataset.

    Returns:
        The rdflib query result (each row exposes ``.country``), or ``None``
        when no country matches; a message is printed in that case.
    """
    # Keep only the content of the first parenthesised group, when there is one.
    parenthesised = re.findall(r"\((.*?)\)", country_name)
    if parenthesised:
        country_name = parenthesised[0]

    # Map the dataset spelling to the one used by the country namespace.
    country_name = country_mapping.get(country_name, country_name)

    # The name is passed as a bound variable rather than spliced into the query
    # text, so quotes or regex metacharacters in it can neither break the query
    # nor change what it matches.
    country_query = '''
    SELECT ?country
    WHERE {
        ?country rdf:type jur:Country;
                foaf:name ?name.
        FILTER(STR(?name) = STR(?target))
    }'''

    qres = graph.query(
        country_query,
        # Explicit prefixes: do not rely on what the parsed files happened to bind.
        initNs={'rdf': RDF, 'jur': JUR, 'foaf': FOAF},
        initBindings={'target': Literal(country_name)},
    )
    if len(qres) == 0:
        print('Country not found: {}'.format(country_name))
        return None

    return qres

## Populate the graph with nobel-laureates

In [None]:
import topic_extraction

def handle_city(raw_city):
    """Return the URI of a city, registering it in the graph the first time it is seen.

    The URI is the ontology namespace plus the identifier produced by
    ``normalize_name``; a new city gets its ``NO.City`` type and a ``foaf:name``.
    """
    city_name, uri_city_name = normalize_name(raw_city)
    city = URIRef(NO[uri_city_name])

    # Already registered: nothing to add.
    if (city, RDF.type, NO.City) in graph:
        return city

    graph.add((city, RDF.type, NO.City))
    graph.add((city, FOAF.name, Literal(city_name, datatype=XSD.string)))
    return city

def handle_org(raw_org):
    """Return the URI of an organization, registering it in the graph the first time it is seen.

    The URI is the ontology namespace plus the identifier produced by
    ``normalize_name``; a new organization gets the ``foaf:Organization`` type
    and a ``foaf:name``.
    """
    org_name, uri_org_name = normalize_name(raw_org)
    org = URIRef(NO[uri_org_name])

    # Already registered: nothing to add.
    if (org, RDF.type, FOAF.Organization) in graph:
        return org

    graph.add((org, RDF.type, FOAF.Organization))
    graph.add((org, FOAF.name, Literal(org_name, datatype=XSD.string)))
    return org


def _add_date(subject, predicate, raw_date, label):
    """Add an xsd:date triple for `raw_date`, if present.

    Values that do not parse as YYYY-MM-DD are replaced by January 1st of the
    text before the first '-', and a message is printed.
    """
    if pd.isna(raw_date):
        return
    try:
        datetime.datetime.strptime(str(raw_date), '%Y-%m-%d')
        date_value = raw_date
    except ValueError:
        date_value = str(raw_date).split('-')[0] + '-01-01'
        print('Wrong {} format in {}. The new date will be {}'.format(label, subject, date_value))
    graph.add((subject, predicate, Literal(date_value, datatype=XSD.date)))


def _link_city_to_country(city, raw_country):
    """Attach a locatedIn country to `city` unless it already has one.

    Does nothing when the row has no city or no country: without a city from
    the same row there is nothing the country could be attached to.
    """
    if city is None or pd.isna(raw_country):
        return
    qres = handle_country(str(raw_country))
    if (qres is not None) and ((city, NO.locatedIn, None) not in graph):
        graph.add((city, NO.locatedIn, next(iter(qres)).country))  # only the first match


for row_idx, row in nobels.iterrows():
    # The prize URI is the ontology namespace + Category + Year.
    nobel = URIRef(NO[row['Category'] + str(row['Year'])])
    graph.add((nobel, RDF.type, NO.NobelPrize))
    graph.add((nobel, NO.hasYear, Literal(row['Year'], datatype=XSD.gYear)))
    graph.add((nobel, NO.hasNobelCategory, Literal(row['Category'], datatype=XSD.string)))

    # Prize Share is read as "<numerator>/<denominator>"; the denominator is
    # stored once per prize. Values without a '/' are reported and skipped
    # instead of raising IndexError.
    if (nobel, NO.hasPrizeShare, None) not in graph:
        share_parts = str(row['Prize Share']).split('/')
        if len(share_parts) > 1:
            graph.add((nobel, NO.hasPrizeShare, Literal(share_parts[1], datatype=XSD.integer)))
        else:
            print('Unexpected Prize Share value for {}: {}'.format(nobel, row['Prize Share']))

    # Motivation topics are extracted once per prize.
    if ((nobel, NO.hasMotivationTopics, None) not in graph) and (pd.notna(row['Motivation'])):
        topics = topic_extraction.extract_topics(str(row['Motivation']), num_topics=1, num_words=5)
        for topic in topics:
            graph.add((nobel, NO.hasMotivationTopics, Literal(','.join(topic), datatype=XSD.string)))

    (laureate_name, uri_laureate_name) = normalize_name(str(row['Full Name']))
    laureate = URIRef(NO[uri_laureate_name])
    graph.add((laureate, FOAF.name, Literal(laureate_name, datatype=XSD.string)))

    # Laureate Type decides between foaf:Organization and foaf:Person.
    if row['Laureate Type'] == 'Organization':
        graph.add((laureate, RDF.type, FOAF.Organization))
    elif row['Laureate Type'] == 'Individual':
        graph.add((laureate, RDF.type, FOAF.Person))

    if pd.notna(row['Sex']):
        graph.add((laureate, FOAF.gender, Literal(row['Sex'], datatype=XSD.string)))

    graph.add((laureate, RDF.type, NO.Laureate))
    graph.add((laureate, NO.hasWon, nobel))

    _add_date(laureate, NO.birthDate, row['Birth Date'], 'Birth Date')
    _add_date(laureate, NO.deathDate, row['Death Date'], 'Death Date')

    # City variables are reset for every row so that a row with a country but
    # no city can never reuse the city of a previous row.
    birth_city = None
    if pd.notna(row['Birth City']):
        birth_city = handle_city(str(row['Birth City']))
        graph.add((laureate, NO.bornIn, birth_city))
    _link_city_to_country(birth_city, row['Birth Country'])

    death_city = None
    if pd.notna(row['Death City']):
        death_city = handle_city(str(row['Death City']))
        graph.add((laureate, NO.diedIn, death_city))
    _link_city_to_country(death_city, row['Death Country'])

    # Organization city and country are only considered when the row names an organization.
    if pd.notna(row['Organization Name']):
        org = handle_org(str(row['Organization Name']))
        graph.add((laureate, NO.worksFor, org))

        org_city = None
        if pd.notna(row['Organization City']):
            org_city = handle_city(str(row['Organization City']))
            graph.add((org, NO.basedIn, org_city))
        _link_city_to_country(org_city, row['Organization Country'])

## Fix laureate type errors

In [None]:
# An "organization" that has a birth city is really a person: retype it.
# The matches are copied into a list first because the loop body adds and removes
# rdf:type triples, and the graph must not change while it is being iterated.
for s, p, o in list(graph.triples((None, RDF.type, FOAF.Organization))):
    if (s, NO.bornIn, None) in graph:
        graph.add((s, RDF.type, FOAF.Person))
        graph.remove((s, RDF.type, FOAF.Organization))

## Populate the graph with journals

In [None]:
def normalize_category(category):
    """Turn a category label into a camelCase identifier.

    A leading run of digits followed by whitespace is stripped, commas are
    removed and the word "and" (when surrounded by spaces) is dropped. The
    first remaining word is lower-cased; every following word is capitalized
    (first letter upper-case, rest lower-case) and appended without separators.
    """
    without_number = re.sub(r'^\d+\s+', '', category)
    words = without_number.replace(',', '').replace(' and ', ' ').split()

    first_word, other_words = words[0], words[1:]
    return first_word.lower() + ''.join(map(str.capitalize, other_words))

# The last 27 columns of the journals table are treated as category flags.
categories = list(journals.columns[-27:])

for row_label, row in journals.iterrows():
    journal_title, uri_journal_title = normalize_name(str(row['Title']))

    # Special case: the identifier 'TelAviv' is rewritten to 'Telaviv'.
    # NOTE(review): presumably avoids a clash with another resource — confirm.
    uri_journal_title = 'Telaviv' if uri_journal_title == 'TelAviv' else uri_journal_title

    # The journal URI is the ontology namespace + normalized title.
    journal = URIRef(NO[uri_journal_title])
    journal_facts = (
        (RDF.type, NO.Journal),
        (NO.hasTitle, Literal(journal_title, datatype=XSD.string)),
        (NO.hasHIndex, Literal(row['H-index'], datatype=XSD.integer)),
        (NO.hasSJR, Literal(row['SJR-index'], datatype=XSD.decimal)),
        (NO.hasOpenAccess, Literal(row['OA'], datatype=XSD.boolean)),
    )
    for predicate, value in journal_facts:
        graph.add((journal, predicate, value))

    # Country: link the journal to the first matching country, once per journal.
    qres = handle_country(str(row['Country']))
    if qres is not None:
        if (journal, NO.hasCountry, None) not in graph:
            graph.add((journal, NO.hasCountry, next(iter(qres)).country))

    # Categories: one triple per category column whose value is 1.
    flagged_categories = [category for category in categories if row[category] == 1]
    for category in flagged_categories:
        graph.add((journal, NO.hasJournalCategory, NO[normalize_category(category)]))

## Populate the graph with fundings

In [None]:
for index, row in fundings.iterrows():
    funding_country, uri_funding_country = normalize_name(str(row['Reference area']))
    # The funding URI is the ontology namespace + Country + Year. Going through
    # NO[...] matters: without it the URI would be a bare relative string such
    # as "Italy2020" instead of a resource inside the ontology namespace.
    funding = URIRef(NO[uri_funding_country + str(row['TIME_PERIOD'])])
    graph.add((funding, RDF.type, NO.Funding))
    graph.add((funding, NO.hasYear, Literal(row['TIME_PERIOD'], datatype=XSD.gYear)))
    graph.add((funding, NO.hasAmount, Literal(row['OBS_VALUE'], datatype=XSD.decimal)))

    # Link the funding to the first matching country, unless some country
    # already funds it.
    qres = handle_country(str(row['Reference area']))
    if (qres is not None) and ((None, NO.hasFunded, funding) not in graph):
        graph.add((next(iter(qres)).country, NO.hasFunded, funding))  # only the first match

## Populate the graph with papers

In [None]:
from rdflib import URIRef, Literal, RDF
from rdflib.namespace import FOAF, XSD
from unidecode import unidecode
from rapidfuzz import fuzz

def normalize_person_name(name):
    """Return `name` transliterated with unidecode, lower-cased and stripped of surrounding whitespace."""
    transliterated = unidecode(name)
    lowered = transliterated.lower()
    return lowered.strip()

def find_similar_person(new_name, graph, threshold=90):
    """Return the first laureate whose name is similar enough to `new_name`.

    Every resource typed ``NO.Laureate`` is visited; its ``foaf:name`` and
    `new_name` are both normalized with ``normalize_person_name`` and compared
    with ``fuzz.ratio``.

    Args:
        new_name: Name to look for.
        graph: Graph holding the laureates.
        threshold: Minimum similarity score (inclusive) for a match.

    Returns:
        The URI of the first laureate reaching the threshold, or ``None``.
    """
    wanted = normalize_person_name(new_name)
    for laureate in graph.subjects(RDF.type, NO.Laureate):
        known_name = graph.value(laureate, FOAF.name)
        if not known_name:
            # No usable name: nothing to compare against.
            continue
        score = fuzz.ratio(wanted, normalize_person_name(str(known_name)))
        if score >= threshold:
            return laureate
    return None

def add_researchers_to_graph(authors_list, graph):
    """Resolve the first author of `authors_list` to a URI, creating it if needed.

    The author is first matched against existing laureates with
    ``find_similar_person``; when none is similar enough, a new ``foaf:Person``
    is added with the author's name.

    NOTE(review): the function returns during the first loop iteration, so only
    the first element of `authors_list` is ever processed.

    Returns:
        The URI of the matched or newly created person, or ``None`` when
        `authors_list` is empty.
    """
    for author in authors_list:
        matched = find_similar_person(author, graph)
        if matched is not None:
            print(f"Researcher {author} already exists as {matched}")
            return matched

        # No similar laureate: register the author as a new person.
        _, author_uri = normalize_name(author)
        researcher = URIRef(NO[author_uri])
        graph.add((researcher, RDF.type, FOAF.Person))
        graph.add((researcher, FOAF.name, Literal(author, datatype=XSD.string)))
        return researcher

    return None

def find_or_create(graph, venue_name):
    """Return the URI of a venue, adding it to the graph when it is not there yet.

    A venue counts as existing when its URI is already typed ``NO.Journal`` or
    ``NO.Conference``. A new venue is typed ``NO.Conference`` when its
    normalized name contains 'workshop', 'conference' or 'symposium'
    (case-insensitive) and ``NO.Journal`` otherwise, and receives its title.
    """
    conference_keywords = ('workshop', 'conference', 'symposium')

    venue_name, venue_uri = normalize_name(venue_name)
    venue = URIRef(NO[venue_uri])

    # Nothing to do when the venue is already known under either type.
    if (venue, RDF.type, NO.Journal) in graph or (venue, RDF.type, NO.Conference) in graph:
        return venue

    print(f"Creating {venue_uri}")

    # Guess the venue type from its name.
    lowered_name = venue_name.lower()
    looks_like_conference = any(keyword in lowered_name for keyword in conference_keywords)
    venue_type = NO.Conference if looks_like_conference else NO.Journal

    graph.add((venue, RDF.type, venue_type))
    graph.add((venue, NO.hasTitle, Literal(venue_name, datatype=XSD.string)))
    return venue

# TODO: restrict the dataset to: researchers who are laureates, venues that already exist, plus N additional papers
# FOR DEVELOPMENT: only the first 5000 papers are processed
papers = papers.iloc[:5000]

for index, row in papers.iterrows():
    # str() guards against non-string ids, for which NO[...] would not append anything.
    paper = URIRef(NO[str(index)])
    graph.add((paper, RDF.type, NO.Paper))
    graph.add((paper, NO.hasTitle, Literal(row['title'], datatype=XSD.string)))
    graph.add((paper, NO.hasYear, Literal(row['year'], datatype=XSD.gYear)))
    graph.add((paper, NO.hasCitations, Literal(row['n_citation'], datatype=XSD.integer)))

    if pd.notna(row['abstract']):
        topics = topic_extraction.extract_topics(row['abstract'], num_topics=1, num_words=5)
        for topic in topics:
            graph.add((paper, NO.hasAbstractTopics, Literal(','.join(topic), datatype=XSD.string)))

    if pd.notna(row['venue']):
        venue = find_or_create(graph, row['venue'])
        graph.add((paper, NO.publishedIn, venue))

    if pd.notna(row['authors']):
        # NOTE(review): assumes `authors` holds a stringified list such as
        # "['A. Author', 'B. Author']"; brackets and single quotes are stripped
        # before splitting, so apostrophes inside names are lost — confirm.
        author_names = row['authors'].replace('[', '').replace(']', '').replace('\'', '').split(", ")
        for author_name in author_names:
            author_name = author_name.strip()
            if not author_name:
                # An empty name would mint the bare namespace URI.
                continue
            # One call per author, so every author is linked to the paper and
            # not just the first one in the list.
            researcher = add_researchers_to_graph([author_name], graph)
            if researcher is not None:
                graph.add((researcher, NO.hasWritten, paper))


## Serialization

In [None]:
# Write the populated graph to disk as Turtle, encoded as UTF-8.
with open('test.ttl', 'w', encoding='utf-8') as turtle_file:
    turtle_text = graph.serialize(format='turtle')
    turtle_file.write(turtle_text)