Imports

In [7]:
import pandas as pd
import numpy as np
from rdflib import Graph, URIRef, RDF, Literal
from rdflib.namespace import RDF

Retrieve DFs

In [8]:
# Load the prepared DataFrames from the pickles in ../Prepared_DFs.
# NOTE(review): unpickling can execute arbitrary code, so these files must only
# ever come from this project's own preparation step, never from an untrusted source.
moves = pd.read_pickle("../Prepared_DFs/moves.pkl")
# NOTE(review): house_prices is not referenced by any cell visible in this
# notebook (house_prices_new is the one converted to RDF) - confirm it is still needed.
house_prices = pd.read_pickle("../Prepared_DFs/house_prices.pkl")
house_prices_new = pd.read_pickle("../Prepared_DFs/house_prices_new.pkl")
house_availability = pd.read_pickle("../Prepared_DFs/house_availability.pkl")
population = pd.read_pickle("../Prepared_DFs/population.pkl")

RDF Creation Functions

In [13]:
"""
TODO once data cleaning is finished:
    - Check that the generated RDF looks exactly as desired.
    - Consider expanding the RDF, e.g. by typing each township as
      URIRef('http://rdfs.co/juso/Township') and adding similar
      type triples for other attributes.
"""

"""
Note to self: I create IRIs for the table names and the
columns/rows because the W3C direct-mapping spec identifies them by IRI:

https://www.w3.org/TR/rdb-direct-mapping/PR-to-REC#RDF-IRI
"""

def add_all_townships_to_rdf(rdf_graph, townships=None):
    """
    For each township in the data, create a URIRef, and add a type triple.
    E.g. for township GM0034, add triple

        (GM0034, rdf.type, township)
        'GM0034 is a township'

    to the rdf.

    Parameters
    ----------
    rdf_graph : object with an ``add(triple)`` method (e.g. rdflib.Graph)
        Graph that receives one triple per distinct township code.
    townships : iterable of township codes, optional
        Codes to register. When omitted, the codes are taken from the global
        ``moves.RegioVanVestiging`` column (the original behaviour), so that
        DataFrame must already be loaded in that case.

    Missing codes (None / NaN) are skipped, so no 'http://example.org/nan'
    node is ever typed as a township.
    """
    #Node for the township class, will be the object of every triple
    township_URI = URIRef('http://rdfs.co/juso/Township')

    if townships is None:
        #Default source. RegioVanVertrek has exactly the same values
        #(checked by the original author), so one column is enough.
        townships = moves.RegioVanVestiging

    #dropna() removes missing codes, unique() keeps first-appearance order
    unique_townships = pd.Series(townships).dropna().unique()

    #loop over all townships, as we want to create triples for all townships
    for township in unique_townships:

        #this will be the subject of the triple
        township_node = URIRef(f"http://example.org/{township}")

        #add the triple
        rdf_graph.add((township_node, RDF.type, township_URI))


def from_table_to_rdf(rdf_graph, df, table_name):
    """
    Given a table, add all content from the table to the rdf.

    The resulting rdf will look as shown in rdf.pdf in the Images folder.

    For every row the following triples are added:

        (<table_name/ID=...>, rdf.type, <table_name>)
        (<table_name/ID=...>, <table_name#RegioCol>, <http://example.org/GMxxxx>)
        (<table_name/ID=...>, <table_name#col>, literal)   for every other column

    Parameters
    ----------
    rdf_graph : object with an ``add(triple)`` method (e.g. rdflib.Graph)
    df : pandas.DataFrame
        Must contain an 'ID' column and either a 'RegioS' column or both
        'RegioVanVestiging' and 'RegioVanVertrek'.
    table_name : str
        Used as the row type and as prefix of the row / column IRIs.

    Raises
    ------
    ValueError
        If the 'ID' column or the expected region column(s) are absent.

    Notes
    -----
    - Rows are iterated with itertuples() instead of iterrows(): iterrows()
      coerces a whole row to one dtype, which e.g. turns integer IDs into
      floats ("ID=7.0") when all columns are numeric.
    - Missing cells (None / NaN / NaT) produce no triple, in line with the
      NULL handling of the W3C direct mapping.
    """
    all_columns = df.columns.to_list()

    if 'ID' not in all_columns:
        raise ValueError(f"table '{table_name}' has no 'ID' column")

    #the regio column(s) do not become literals, they refer back to the
    #township nodes as added to the rdf by add_all_townships_to_rdf()
    if 'RegioS' in all_columns:
        regio_columns = ['RegioS']
    else:
        regio_columns = ['RegioVanVestiging', 'RegioVanVertrek']
        absent = [col for col in regio_columns if col not in all_columns]
        if absent:
            raise ValueError(
                f"table '{table_name}' needs either 'RegioS' or both "
                f"'RegioVanVestiging' and 'RegioVanVertrek'; missing: {absent}"
            )

    literal_columns = [
        col for col in all_columns if col != 'ID' and col not in regio_columns
    ]

    #Based on slide 12 from lecture 7, the table name will be
    #the type of the common subjects
    table_IRI = URIRef(f"{table_name}")

    #predicate IRIs are identical for every row, so build them only once
    regio_predicates = {col: URIRef(f"{table_name}#{col}") for col in regio_columns}
    literal_predicates = {col: URIRef(f"{table_name}#{col}") for col in literal_columns}

    for values in df.itertuples(index=False, name=None):
        row = dict(zip(all_columns, values))

        #create e.g. <Moves/ID=283955>
        row_IRI = URIRef(f"{table_name}/ID={_to_python_scalar(row['ID'])}")
        rdf_graph.add((row_IRI, RDF.type, table_IRI))

        #E.g. add (HouseAvailability/ID=664, HouseAvailability#RegioS, GM0034)
        #or       (Moves/ID=283955, Moves#RegioVanVestiging, GM1680)
        #and      (Moves/ID=283955, Moves#RegioVanVertrek, GM0034)
        for col in regio_columns:
            township = row[col]
            if _is_missing(township):
                continue
            rdf_graph.add((row_IRI, regio_predicates[col], URIRef(f"http://example.org/{township}")))

        #loop over leftover columns in order to add all other values
        for col in literal_columns:
            value = row[col]
            if _is_missing(value):
                continue
            rdf_graph.add((row_IRI, literal_predicates[col], Literal(_to_python_scalar(value))))


def _to_python_scalar(value):
    """Unwrap numpy numbers/bools to plain Python objects; return anything else unchanged."""
    if isinstance(value, (np.number, np.bool_)):
        return value.item()
    return value


def _is_missing(value):
    """Return True for a scalar None / NaN / NaT / pd.NA, False for everything else."""
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))
        
        
    


Actually creating the RDF

In [10]:
#Create the (initially empty) rdf graph that the cells below add triples to
g = Graph()

In [14]:
#add all township nodes to rdf: one (township, rdf.type, Township) triple per township
add_all_townships_to_rdf(g)

In [None]:
#add info from the tables to rdf
#NOTE: .head() restricts every table to its first rows, so only a sample is converted
for table_df, table_label in (
    (moves, "Moves"),
    (house_prices_new, "HousePrices"),
    (house_availability, "HouseAvailability"),
    (population, "Population"),
):
    from_table_to_rdf(g, table_df.head(), table_label)

In [None]:
#Check results: serialize the whole graph in N3 format and print it
print(g.serialize(format='n3'))

Scratch / ideas below, not important

In [12]:
#Scratch: first manual experiment that adds a single township triple to a graph.
#NOTE(review): this cell rebinds the global names `township` and `g`; running it
#after the cells above replaces the populated graph `g` with a new graph that
#only holds the one triple added below.
township = URIRef('http://rdfs.co/juso/Township')
gm0034 = URIRef("http://example.org/gm0034")

#English summary of the Dutch note below: loop over the different tables and add
#triples in which the township is always the subject, e.g.
#GM0034 -> is a -> township (the URI above). A real online reference for the
#township could probably replace http://example.org/gm0034. Adding all tables
#as triples to one graph puts all info in one place.
"""Ik denk dat we door de verschillende tables moeten loopen en dan triples toevoegen waarbij de gemeentes steeds de subject (eerste ding in de triple) zijn
Zoals bijv hieronder:

GM0034 -> is een -> gemeente (aangegeven door de uri hierboven)
(ik denk eigenlijk dat we voor GM0034 ook online references kunnen vinden naar de wiki oid van de daadwerkelijke gemeente ipv t te doen via http://example.org/gm0034

Door alle tables in triples aan 1 graph toe te voegen heb je dan alle info op 1 plek"""

#fresh graph for the experiment (see the rebinding note at the top of this cell)
g = Graph()

#(gm0034, rdf.type, township): 'gm0034 is a township'
g.add((gm0034, RDF.type, township))

# print(g.serialize(format='n3'))

<Graph identifier=Nc7f70efd9c6f48ddac731444b46d5dbf (<class 'rdflib.graph.Graph'>)>