Imports

In [125]:
import pandas as pd
import numpy as np
from rdflib import Graph, URIRef, RDF, Literal
from rdflib.namespace import RDF, XSD

Retrieve DFs

In [126]:
#Load the four cleaned DataFrames from their pickle files (read in the order listed).
#NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
moves, house_prices, house_availability, population = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../cleanedDFs/moves.pkl",
        "../cleanedDFs/prices.pkl",
        "../cleanedDFs/availability.pkl",
        "../cleanedDFs/population.pkl",
    )
)

Create a dict that maps each municipality code to the name of the municipality

In [127]:
#The sheet has no header row: column 0 is used as the dict key (municipality code)
#and column 1 as the dict value (municipality name).
gemeentes = pd.read_excel("../Data/gemeentes.xlsx", header=None)

#Pair every code with the name on the same row
zip_iter = zip(gemeentes[0].to_list(), gemeentes[1].to_list())

#If a code occurs more than once, the last row wins
municipalities_dict = {code: name for code, name in zip_iter}

RDF Creation Functions

In [128]:
"""
As soon as we're done with cleaning, we should take a look at:
    - Does the rdf look exactly as desired?
    - Expand rdf? Maybe adding stuff like 
    municipality = URIRef('http://rdfs.co/juso/municipality') to municipalities,
    and do similar stuff for other attributes?
"""

"""
Note to myself: I tried to create IRI's for the tablenames and 
columns/rows because IRI's are mentioned here:

https://www.w3.org/TR/rdb-direct-mapping/PR-to-REC#RDF-IRI
"""
def add_all_possible_years(rdf_graph, years=None):
    """
    Add one node per year to the rdf graph.

    For every year two triples are added:

        (<http://example.org/{year}>, rdf:type, xsd:gYear)
        (<http://example.org/{year}>, <http://example.org/hasValue>, Literal(year))

    Parameters
    ----------
    rdf_graph : rdflib Graph
        Graph the triples are added to (modified in place).
    years : iterable, optional
        The years to add. When omitted, the years 2016 up to and including
        2020 are added as np.int64 values.
    """
    if years is None:
        years = [np.int64(year) for year in range(2016, 2021)]

    #NOTE (translated from the original Dutch note): the idea is to take the year,
    #put it in a literal and state that it is an XSD.gYear; probably some kind of
    #blank node / example.org node is what we want here.

    #TODO: we should be able to find a more fitting property than an example.org one
    has_value_URI = URIRef("http://example.org/hasValue")

    for year in years:
        #the node that rows of the tables link to through their Perioden property
        year_node = URIRef(f"http://example.org/{year}")

        rdf_graph.add((year_node, RDF.type, XSD.gYear))
        rdf_graph.add((year_node, has_value_URI, Literal(year)))

    

def add_all_municipalities_to_rdf(rdf_graph, municipalities_dict, municipality_codes=None):
    """
    For each municipality in the data, create a URIRef, and add to triple.
    E.g. for municipality GM0034, add triple 

        (GM0034, rdf.type, municipality)
        'GM0034 is a municipality'
        &
        (GM0034, official_name, Almere)
        'GM0034's official name is Almere'
    
    to the rdf.

    Parameters
    ----------
    rdf_graph : rdflib Graph
        Graph the triples are added to (modified in place).
    municipalities_dict : dict
        Maps a municipality code (e.g. 'GM0034') to its official name.
    municipality_codes : iterable, optional
        The municipality codes to add. When omitted, the codes are taken from
        the RegioVanVertrek and RegioVanVestiging columns of the notebook-level
        `moves` DataFrame.

    A code that has no entry in municipalities_dict still gets its rdf.type
    triple, but no official_name triple; all such codes are reported in a
    single warning instead of aborting the run with a KeyError.
    """
    import warnings

    #Create nodes for the municipalities, will be the object of the triple
    municipality_URI = URIRef('http://rdfs.co/juso/Municipality')

    #name property of 'spatial things' -> e.g. a municipality
    official_name_URI = URIRef('http://rdfs.co/juso/official_name')

    if municipality_codes is None:
        #To be sure, get the values from both RegioVanVertrek and RegioVanVestiging
        municipality_codes = (
            moves.RegioVanVertrek.unique().tolist()
            + moves.RegioVanVestiging.unique().tolist()
        )

    codes_without_name = []

    #loop over all unique municipalities, as we want to create triples for each of them
    for municipality in set(municipality_codes):

        #this will be the subject of the triples
        municipality_node = URIRef(f"http://example.org/{municipality}")
        rdf_graph.add((municipality_node, RDF.type, municipality_URI))

        #without a known name only the type triple is added
        if municipality not in municipalities_dict:
            codes_without_name.append(municipality)
            continue

        #get official name from the dict
        official_name_literal = Literal(municipalities_dict[municipality])
        rdf_graph.add((municipality_node, official_name_URI, official_name_literal))

    if codes_without_name:
        warnings.warn(
            f"No official name found for municipality code(s): {codes_without_name}"
        )


def _cell_to_literal(value):
    """
    Turn one table cell into an rdflib Literal, or return None for a missing cell.
    """
    #pd.isna also recognises None, pd.NA and NaT, not only float NaN.
    #Only test scalars, array-like cells would give an ambiguous truth value.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None

    #numpy integers/booleans are no instances of the Python int/bool types, so
    #convert them to native values to let rdflib assign the matching xsd datatype
    if isinstance(value, (np.integer, np.floating, np.bool_)):
        value = value.item()

    return Literal(value)


def from_table_to_rdf(rdf_graph, df, table_name):
    """
    Given a table, add all content from the table to the rdf.

    The resulting rdf will look as shown in rdf.pdf in the Images folder.

    For every row of df the following triples are added:

        (<{table_name}/ID={row index}>, rdf.type, <{table_name}>)
        (row, <{table_name}#Perioden>, <http://example.org/{Perioden value}>)
        (row, <{table_name}#{regio column}>, <http://example.org/{municipality code}>)
        (row, <{table_name}#{column}>, Literal(value))   for every other column

    Missing values (NaN, None, pd.NA, NaT) in the other columns are left out
    of the rdf.

    Parameters
    ----------
    rdf_graph : rdflib Graph
        Graph the triples are added to (modified in place).
    df : pandas.DataFrame
        Needs a 'Perioden' column and either a 'RegioS' column or both a
        'RegioVanVestiging' and a 'RegioVanVertrek' column.
    table_name : str
        Used as the type of every row and as prefix of the row/column IRIs.

    Raises
    ------
    ValueError
        If one of the required columns is missing from df.
    """
    columns = df.columns.to_list()
    columns.remove('Perioden')

    #remove regio column(s), the triples relating to these columns
    #are added separately because they refer to municipality nodes
    if 'RegioS' in columns:
        regio_columns = ['RegioS']
    else:
        regio_columns = ['RegioVanVestiging', 'RegioVanVertrek']
    for regio_col in regio_columns:
        columns.remove(regio_col)

    #Based on slide 12 from lecture 7, the table name will be
    #the type of the common subjects
    table_IRI = URIRef(f"{table_name}")

    #The predicates are the same for every row, so build them only once
    perioden_IRI = URIRef(f"{table_name}#Perioden")
    regio_IRIs = {col: URIRef(f"{table_name}#{col}") for col in regio_columns}
    value_IRIs = {col: URIRef(f"{table_name}#{col}") for col in columns}

    for row_label, row in df.iterrows():

        #create e.g. <Moves/ID=283955>
        row_IRI = URIRef(f"{table_name}/ID={row_label}")
        rdf_graph.add((row_IRI, RDF.type, table_IRI))

        #Link this row to the correct year
        rdf_graph.add((row_IRI, perioden_IRI, URIRef(f"http://example.org/{row['Perioden']}")))

        #Refer to the municipality nodes (as added to the rdf by
        #add_all_municipalities_to_rdf()) instead of adding a plain literal.
        #E.g. add (HouseAvailability/ID=664, HouseAvailability#RegioS, GM0034)
        #or       (Moves/ID=283955, Moves#RegioVanVestiging, GM1680)
        #and      (Moves/ID=283955, Moves#RegioVanVertrek, GM0034)
        for regio_col in regio_columns:
            municipality = row[regio_col]
            rdf_graph.add((row_IRI, regio_IRIs[regio_col], URIRef(f"http://example.org/{municipality}")))

        #loop over leftover columns in order to add all other values
        for col in columns:
            literal = _cell_to_literal(row[col])

            #If a value is missing, don't put it in the rdf
            if literal is None:
                continue

            rdf_graph.add((row_IRI, value_IRIs[col], literal))
        

Actually creating the RDF

In [129]:
#Create the (initially empty) rdf graph; the cells below add all triples to it
g = Graph()

In [130]:
#add the year nodes to the rdf, the rows of the tables link to these nodes
add_all_possible_years(g)

In [131]:
#Placeholder name for municipality code GM9999, so that it has an official name in the rdf.
#TODO: decide what the real name for this code should be
municipalities_dict['GM9999'] = 'NotSureWhatToDoHereYet'

#add all municipality nodes to rdf
add_all_municipalities_to_rdf(g, municipalities_dict)

In [132]:
#add info from the tables to rdf
#NOTE: .head() limits every table to its first rows, so this is only a preview of the full rdf
for table_name, table_df in (
    ("Moves", moves),
    ("HousePrices", house_prices),
    ("HouseAvailability", house_availability),
    ("Population", population),
):
    from_table_to_rdf(g, table_df.head(), table_name)

In [133]:
#Check results: print the complete graph serialized in the N3 format
print(g.serialize(format='n3'))

@prefix ns1: <http://rdfs.co/juso/> .
@prefix ns2: <HousePrices#> .
@prefix ns3: <HouseAvailability#> .
@prefix ns4: <Population#> .
@prefix ns5: <Moves#> .
@prefix ns6: <http://example.org/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<HouseAvailability/ID=0> a <HouseAvailability> ;
    ns3:EigendomOnbekend_6 2e+00 ;
    ns3:EigendomOverigeVerhuurders_5 3.43e+02 ;
    ns3:EigendomWoningcorporatie_4 2.464e+03 ;
    ns3:Koopwoningen_2 2.813e+03 ;
    ns3:Perioden ns6:2016 ;
    ns3:RegioS ns6:GM0003 ;
    ns3:StatusVanBewoning "A028725" ;
    ns3:TotaalHuurwoningen_3 2.807e+03 ;
    ns3:TotaleWoningvoorraad_1 5.622e+03 .

<HouseAvailability/ID=1> a <HouseAvailability> ;
    ns3:EigendomOnbekend_6 2e+00 ;
    ns3:EigendomOverigeVerhuurders_5 3.58e+02 ;
    ns3:EigendomWoningcorporatie_4 2.477e+03 ;
    ns3:Koopwoningen_2 2.81e+03 ;
    ns3:Perioden ns6:2017 ;
    ns3:RegioS ns6:GM0003 ;
    ns3:StatusVanBewoning "A028725" ;
    ns3:TotaalHuurwoningen_3 2.835e+03 ;
    ns3:Total

Below are scratch notes/ideas, not important