In [6]:
"""
{"2006": {"Flevoland":
                {"noordoostpolder": "22",
                 "urk": "32",
                 "lelystad": "20",
                 "almere": "23",
                 "dronten": "21",
                 "zeewolde": "26",
                 "__provdata": {
                              "min": ["lelystad", "20"],
                              "max": ["urk", "32"],
                              "avg": "24"
                            }
                },
          "__countrydata":
                   {"min": "Flevoland",
                    "max": "Zeeland",
                    "avg": "39"}
         }
}
"""

# Lookup sketch: minimum of the province that is itself the country-wide minimum.
#var["2006"][var["2006"]["__countrydata"]["min"]]["__provdata"]["min"]
print




In [2]:
import pandas as pd
import json
import csv
import glob
import os


def list_columns(year, location):
    """
    Returns the sorted column names of a specific year.csv file.
    This file is identical to the .xls file provided by the CBS,
    but is converted to CSV format.

    year: int or string, used as the file name (without ".csv")
    location: string, path prefix the file name is appended to
              (plain concatenation, so include the trailing separator)

    The first CSV column is used as the row index and is therefore
    not part of the returned list.
    """
    # pd.DataFrame.from_csv was deprecated and later removed from pandas;
    # read_csv with index_col=0 yields the same set of data columns.
    frame = pd.read_csv(location + str(year) + ".csv", index_col=0)
    return sorted(frame.columns)

def get_vars(csvfile):
    """
    Returns a dictionary built from a two-column, ';'-delimited CSV file.

    Each row contributes one entry: the first field is the key and the
    second field is the value. Rows with fewer than two fields (e.g.
    blank lines) are skipped; fields beyond the second are ignored.

    csvfile: string, path of the file to read

    NOTE(review): callers index the result by year string
    (variables[var][year]), so the files presumably map a year to the
    column name used by the CBS in that year -- confirm against the data.
    """
    import sys

    # The csv module expects a binary handle on Python 2 but a text handle
    # opened with newline="" on Python 3.
    if sys.version_info[0] < 3:
        handle = open(csvfile, "rbU")
    else:
        handle = open(csvfile, "r", newline="")

    with handle as f:
        reader = csv.reader(f, delimiter=';')
        return {row[0]: row[1] for row in reader if len(row) >= 2}

# Per-variable lookup tables: one entry for every file found in
# data/csv/vars/, keyed by the file name minus its last four characters
# (the extension). Editor backup files ending in "~" are ignored.
variables = dict(
    (os.path.basename(path)[:-4], get_vars(path))
    for path in glob.glob('data/csv/vars/*')
    if not path.endswith("~")
)

# Names of the twelve provinces, used as keys in the generated output.
provinces = [
    "Groningen",
    "Friesland",
    "Drenthe",
    "Overijssel",
    "Flevoland",
    "Gelderland",
    "Utrecht",
    "Noord-Holland",
    "Zuid-Holland",
    "Zeeland",
    "Noord-Brabant",
    "Limburg",
]

def _read_indexed_csv(path):
    """Read a CSV file into a DataFrame, using its first column as the index."""
    # Replacement for pd.DataFrame.from_csv, which was removed from pandas.
    # from_csv additionally tried to parse the index as dates; that is left
    # out on purpose because the index is either discarded or compared
    # against region names below.
    return pd.read_csv(path, index_col=0)


def _scale_decimal(value):
    """Convert a decimal-comma value to a float times 100; keep "x" as is."""
    if value == "x":
        # Literal marker is passed through untouched
        # (presumably "no data" -- TODO confirm against the CBS files).
        return value
    # str() lets already-numeric cells (e.g. NaN) through instead of
    # failing on a missing .replace method.
    return float(str(value).replace(',', '.')) * 100


def get_json(var, variables, provinces, years, location = "data/csv/"):
    """
    Returns a JSON-compatible nested dictionary for one variable,
    in which the data for the given years is included.

    var: string, key into `variables`
    variables: nested dictionary {variable: {year string: column name}};
               must also contain "regioaanduiding" and "regionaam"
    provinces: list of Dutch provinces
    years: tuple -> ints, (first, last); both ends are included
    location: string, path prefix of the CSV files (concatenated as is,
              so it must end with a path separator)

    Files read per year: <location><year>.csv and
    <location>gemprov/<year>.csv; when the latter cannot be opened,
    <location>gemprov/2006.csv is used instead.

    The result is {year string: {province: {gemeente (lower case): value}}}.
    For "gemiddelde_huishoudensgrootte" and "personenautos_per_huishouden"
    the values are converted from decimal-comma strings to floats and
    multiplied by 100; the literal "x" is kept unchanged.

    E.g.

    > get_json("aantal_mannen", variables, provinces, (2010, 2014))
    >>> {'2010': {'Drenthe':   {u'aa en hunze': 12685,
                                u'assen': 32605,
                                ...
                               },
                  'Flevoland': {u'almere': 93510,
                                u'dronten': 20185,
                                ...
                               }
                 },

         '2011': {...
         }
    """
    scaled_vars = ("gemiddelde_huishoudensgrootte",
                   "personenautos_per_huishouden")

    result = {}

    # For each year in the inclusive (first, last) range...
    for year in range(years[0], years[1] + 1):
        year = str(year)
        result[year] = {}

        # Column names differ per year, and their casing varies as well,
        # so everything is compared in lower case.
        item = variables[var][year].lower()
        areades = variables["regioaanduiding"][year].lower()
        index = variables["regionaam"][year].lower()

        data = _read_indexed_csv(location + year + ".csv")
        data.columns = data.columns.str.lower()

        # Keep only gemeente rows, then only the value and name columns.
        # The copy makes the column assignment below act on an
        # independent frame rather than on a slice of `data`.
        is_gemeente = data[areades].isin(['Gemeente', 'G'])
        root = data[is_gemeente].filter([item, index]).copy()

        # Decimal variables are scaled by 100.
        if var in scaled_vars:
            root[item] = root[item].apply(_scale_decimal)

        # Gemeente -> province table. For years without their own file
        # (2004 and 2005 according to the original author) the 2006
        # table is assumed to apply. Only "cannot open" errors trigger
        # the fallback; other problems are allowed to surface.
        try:
            gemprov = _read_indexed_csv(location + "gemprov/" + year + ".csv")
        except (IOError, OSError):
            gemprov = _read_indexed_csv(location + "gemprov/2006.csv")

        for prov in provinces:
            # Row labels (first CSV column) of all rows in this province.
            # Same values the original obtained via
            # .filter(["GEMEENTE"]).T.columns.
            gem = list(gemprov[gemprov["PROVINCIE"] == prov].index)

            # Restrict the year's data to those gemeentes, keyed by name.
            branch = root[root[index].isin(gem)].set_index([index])
            branch.index = branch.index.str.lower()

            # Round-trip through JSON to obtain plain Python types.
            result[year][prov] = json.loads(branch[item].to_json())

    return result

In [3]:
# Example run: build the nested year -> province -> gemeente dictionary for
# "aantal_mannen" over 2010 through 2014 (both ends included).
get_json("aantal_mannen", variables, provinces, (2010, 2014))

{'2010': {'Drenthe': {u'aa en hunze': 12685,
   u'assen': 32605,
   u'borger-odoorn': 13035,
   u'coevorden': 17995,
   u'de wolden': 11805,
   u'emmen': 54130,
   u'hoogeveen': 27170,
   u'meppel': 15790,
   u'midden-drenthe': 16750,
   u'noordenveld': 15240,
   u'tynaarlo': 15915,
   u'westerveld': 9660},
  'Flevoland': {u'almere': 93510,
   u'dronten': 20185,
   u'lelystad': 37270,
   u'noordoostpolder': 23360,
   u'urk': 9245,
   u'zeewolde': 10620},
  'Friesland': {u'achtkarspelen': 14265,
   u'ameland': 1760,
   u'boarnsterhim': 9765,
   u'bolsward': 4855,
   u'dantumadiel': 9670,
   u'dongeradeel': 12235,
   u'ferwerderadiel': 4535,
   u'franekeradeel': 10395,
   u'gaasterl\xe2n-sleat': 5220,
   u'harlingen': 7905,
   u'heerenveen': 21325,
   u'het bildt': 5595,
   u'kollumerland en nieuwkruisland': 6660,
   u'leeuwarden': 45915,
   u'leeuwarderadeel': 5275,
   u'lemsterland': 6930,
   u'littenseradiel': 5555,
   u'menaldumadeel': 6970,
   u'nijefurd': 5435,
   u'ooststellingwer