In [8]:
import pandas as pd
import json
import csv
import glob
import os

def list_columns(year, location):
    """
    Returns the sorted column names of the CSV file for the given year.

    year: int or string
    location: string, path prefix; the file read is location + year + ".csv"

    The first column of the file is used as the index and is therefore not
    part of the returned list.
    """
    # DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0
    # produces the same set of columns (the date parsing from_csv applied to
    # the index has no influence on column names, so it is not requested).
    path = location + str(year) + ".csv"
    return sorted(pd.read_csv(path, index_col=0).columns)

def get_vars(csvfile):
    """
    Returns the vars used by the CBS as a dict.

    csvfile: string, path to a ';'-delimited file

    Every row contributes one entry: its first field is the key and its
    second field is the value. Fields beyond the second are ignored, and rows
    with fewer than two fields (e.g. blank lines) are skipped.
    """
    import sys

    # The csv module expects a binary handle on Python 2 and a text handle
    # opened with newline="" on Python 3.
    if sys.version_info[0] >= 3:
        handle = open(csvfile, "r", newline="")
    else:
        handle = open(csvfile, "rbU")

    with handle as f:
        reader = csv.reader(f, delimiter=';')
        return {row[0]: row[1] for row in reader if len(row) >= 2}

# One lookup table per file found in data/csv/vars/, keyed by the file name
# with its last four characters (the ".csv" suffix) stripped. Files whose
# name ends in "~" (editor backups) are left out.
variables = dict(
    (os.path.basename(path)[:-4], get_vars(path))
    for path in glob.glob('data/csv/vars/*')
    if not path.endswith("~")
)

# Province names, in the order in which they are processed.
provinces = [
    "Groningen",
    "Friesland",
    "Drenthe",
    "Overijssel",
    "Flevoland",
    "Gelderland",
    "Utrecht",
    "Noord-Holland",
    "Zuid-Holland",
    "Zeeland",
    "Noord-Brabant",
    "Limburg",
]

In [9]:
def _read_indexed_csv(path):
    """
    Reads a CSV file into a DataFrame, using its first column as the index.
    """
    # DataFrame.from_csv was removed in pandas 1.0; read_csv replaces it.
    # from_csv also tried to parse the index as dates, which is not requested
    # here: get_json only ever compares index labels against name columns.
    return pd.read_csv(path, index_col=0)


def get_json(var, variables, provinces, years, location="data/csv/"):
    """
    Returns a JSON-serialisable nested dict with the data for the given years:

        {year: {province: {municipality name: value}}}

    Years are used as string keys.

    var: string, key into variables
    variables: nested dictionary, variables[name][year] -> column name.
               Must also contain the "regioaanduiding" and "regionaam" entries.
    provinces: list -> strings
    years: tuple -> ints, (first year, last year), both inclusive
    location: string, path prefix of the yearly CSV files; the province
              mapping is read from location + "gemprov/"

    Raises KeyError when a year or column is missing from variables or from
    the data, and IOError/OSError when a yearly data file cannot be read.

    E.g.

    get_json("aantal_mannen", variables, provinces, (2003, 2015))
    """
    result = {}

    for year in range(years[0], years[1] + 1):
        year = str(year)

        # CBS uses lower/upper case inconsistently for its columns throughout
        # the years, so both the requested names and the columns are lowered.
        item = variables[var][year].lower()
        areades = variables["regioaanduiding"][year].lower()
        index = variables["regionaam"][year].lower()

        data = _read_indexed_csv(location + year + ".csv")
        data.columns = [column.lower() for column in data.columns]

        # Keep only the rows describing a gemeente (marked 'Gemeente' or 'G'),
        # reduced to the requested item and the name column. This covers all
        # gemeentes in all provinces.
        is_gemeente = data[areades].isin(['Gemeente', 'G'])
        root = data[is_gemeente].filter([item, index])

        # DataFrame of gemeente -> provincie. Years without a mapping file of
        # their own (2004 and 2005) are assumed to equal the 2006 mapping.
        try:
            gemprov = _read_indexed_csv(location + "gemprov/" + year + ".csv")
        except (IOError, OSError):
            gemprov = _read_indexed_csv(location + "gemprov/2006.csv")

        result[year] = {}
        for prov in provinces:
            # Gemeentes of the current province; these are the index labels
            # (first CSV column) of the matching rows in the mapping.
            gem = list(gemprov[gemprov["PROVINCIE"] == prov].index)

            # Root restricted to those gemeentes, keyed by gemeente name.
            branch = root[root[index].isin(gem)].set_index([index])

            # Round-trip through JSON to end up with plain Python types.
            result[year][prov] = json.loads(branch[item].to_json())

    return result

def write_json(variables, years=(2004, 2014), location="data/csv/", output_dir="data/json/"):
    """
    Writes one JSON file per variable, named output_dir + <variable> + ".json".

    variables: nested dictionary, as expected by get_json
    years: tuple -> ints, (first year, last year), both inclusive
    location: string, path prefix of the input CSV files
    output_dir: string, path prefix of the JSON files that are written

    The contents of each file are produced by get_json for the module-level
    provinces list. Variables on the skip list below are not written.
    """
    # Broken variables and unnecessary ones
    skipped = ("aantal_ao_uitkering", "aantal_geboorte", "meest_voorkomende_postcode",
               "personen_ao_uitkering_totaal", "woningvoorraad_aantal", "regionaam",
               "buurtcode", "regioaanduiding", "gemeentecode")

    for var in variables:
        if var in skipped:
            continue

        print("Currently writing %s.json." % var)

        # Build the data before opening the output file, so that a failure in
        # get_json does not leave an empty or truncated file behind.
        payload = get_json(var, variables, provinces, years, location=location)
        with open(output_dir + var + ".json", 'w') as f:
            json.dump(payload, f)