In [1]:
#### Find the HDNG data and restructure it so that it is useful
### Also combine this with a key between different municipality names

## Import libraries
import os
import re
import urllib.request

import numpy as np
import pandas as pd

In [77]:
## First, download the data to disk (the next cell reads it into memory).
## The download is skipped when a local copy already exists, so re-running the
## notebook does not hit the server again; delete the file to force a refresh.
url = "https://datasets.iisg.amsterdam/api/access/datafile/10264"
hdng_path = '../Data/hdng.txt'
if not os.path.exists(hdng_path):
    # urlretrieve does not create missing directories itself.
    os.makedirs(os.path.dirname(hdng_path), exist_ok=True)
    urllib.request.urlretrieve(url, hdng_path)


('../Data/hdng.txt', <http.client.HTTPMessage at 0x7fc5f0faf9d0>)

In [2]:
## Read the downloaded file, keeping only the columns used further down.
hdng = pd.read_csv(
    "../Data/hdng.txt",
    delimiter=",",
    usecols=['amco', 'name', 'variable', 'description', 'information', 'year', 'value'],
    # The dtype keys must match the names in `usecols` to have any effect.
    # 'value' is deliberately left out so that pandas infers its dtype:
    # later cells average, sum and divide this column.
    dtype={'amco': 'Int32', 'name': str, 'variable': str, 'description': str,
           'information': str, 'year': 'Int32'},
)

#os.remove("../Data/hdng.txt")

In [3]:
## Keep rows from before 1940 that have a municipality code (amco), then
## title-case the names and map the two apostrophe spellings to the short names.
hdng = (
    hdng[(hdng['year'] < 1940) & hdng['amco'].notna()]
    # .assign builds the column on the filtered frame itself instead of
    # writing into a slice of the original, which avoids SettingWithCopyWarning.
    .assign(name=lambda df: df['name'].str.title().replace(
        {"'S Gravenhage": 'Den Haag', "'S Hertogenbosch": 'Den Bosch'}))
)

## Taxes

In [204]:
## Now, clean the data
### Fiscal dataframe. According to the author's notes the years for
### 'totaal personele belastingen' are 1859, 1869, 1870, 1879;
### this can be used for birthplace and also for district.

# na=False: a row whose description is missing is simply not a tax row
# (without it, a NaN in the mask makes the boolean indexing raise).
belastingen = hdng[hdng['description'].str.contains('Belastingen', na=False)]
# Select the columns by name rather than by position, so the result does not
# depend on the column order of the downloaded file.
belastingen = belastingen[['amco', 'name', 'information', 'year', 'value']].pivot(
    columns=['information'],
    values='value',
    index=['amco', 'year', 'name'])
# Only the total of the personal taxes is used downstream.
belastingen = belastingen['totaal personele belastingen'].reset_index()

# Population totals for the same years, to express taxes per inhabitant.
years_i_look_for = belastingen['year'].unique()
bevolking = hdng[(hdng['description'] == 'Bevolking')
                 & (hdng['information'] == 'totaal')
                 & (hdng['year'].isin(years_i_look_for))]
bevolking = bevolking.rename(columns={'value': 'bevolking'}).filter(['amco', 'name', 'year', 'bevolking'])


In [205]:
# One population figure per municipality and year: average the 'bevolking'
# values within each (amco, name, year) group. Missing values are skipped.
clean_bevolking = (
    bevolking
    .groupby(['amco', 'name', 'year'])['bevolking']
    .mean()
    .reset_index(name='total_inhabitants')
)

In [207]:
# Inner-join taxes with population and add taxes per inhabitant.
taxes = pd.merge(belastingen, clean_bevolking, on=['amco', 'year', 'name']).assign(
    # A population of zero yields +/-inf; store those as missing so the
    # column stays numeric.
    taxes_percap=lambda df: (
        df['totaal personele belastingen'] / df['total_inhabitants']
    ).replace([np.inf, -np.inf], np.nan))

In [208]:
# Persist the tax/population table; the row index is written as the first column.
taxes.to_csv("../Data/district_data/taxes_and_population.csv", index=True)

## Professional composition

In [140]:
### Professional composition dataframe: the 'totaal' rows of every
### 'Beroepsbevolking...' series for the year 1889.
profcomp = hdng[
    # na=False: rows without a description are excluded instead of putting a
    # NaN into the boolean mask.
    hdng['description'].str.startswith('Beroepsbevolking', na=False)
    & (hdng['information'] == 'totaal')
    & (hdng['year'] == 1889)
]

In [141]:
### Sort the 1889 occupation descriptions into industry, services and
### agriculture by looking for sector keywords inside each description.
beroepen = profcomp.loc[profcomp['year'] == 1889, 'description'].unique()

# Keywords that mark a description as belonging to a sector.
industriele_beroepen = [
    'aardewerk', 'drukkersbedrijven', 'bouwbedrijven', 'chemische nijverheid',
    'diamantbewerking', 'houtbewerking', 'kunstnijverheid', 'leder',
    'metaalbewerking', 'mijnen en veenderijen', 'papier', 'textiel',
    'verlichting', 'voeding',
]
service_beroepen = [
    'huiselijke diensten', 'godsdienst', 'kleding en reiniging', 'krediet',
    'losse werklieden', 'onderwijs', 'verkeerswezen', 'vrije beroepen',
    'verzekeringswezen', 'handel',
]
landbouw_beroepen = ['landbouw', 'visserij']

# A description is listed once for every keyword it contains, in the order of
# `beroepen`; only membership matters to the cell that consumes these lists.
industry = [beroep for beroep in beroepen
            for trefwoord in industriele_beroepen if trefwoord in beroep]
services = [beroep for beroep in beroepen
            for trefwoord in service_beroepen if trefwoord in beroep]
agriculture = [beroep for beroep in beroepen
               for trefwoord in landbouw_beroepen if trefwoord in beroep]

In [142]:
# Label every row with its sector. The conditions are checked in order, so a
# description that appears in both lists is labelled "industry".
# NOTE(review): every description found in neither list is labelled
# "agriculture"; the `agriculture` list itself is not consulted. Confirm that
# no other kind of occupation is present in the data.
profcomp = profcomp.assign(
    category=lambda df: np.select(
        [df['description'].isin(industry), df['description'].isin(services)],
        ["industry", "services"],
        default="agriculture",
    )
)

In [194]:
# Number of workers per municipality, year and sector. Missing values are
# skipped by the sum, i.e. they count as zero.
total_profcomp = (
    profcomp
    .groupby(['amco', 'name', 'year', 'category'])['value']
    .sum()
    .reset_index(name='total_profession_count')
)

In [195]:
# Total population per municipality in 1889, the year of the occupational data.
# The rows are taken straight from `hdng` rather than from `bevolking`: on a
# top-to-bottom run `bevolking` only holds the years of the tax data, so the
# 1889 slice could be empty and 'total_inhabitants' would never be created.
# NOTE(review): restricted to information == 'totaal', as in the tax section -
# confirm this is the intended population measure.
clean_bevolking = (
    hdng[(hdng['description'] == 'Bevolking')
         & (hdng['information'] == 'totaal')
         & (hdng['year'] == 1889)]
    .groupby(['amco', 'name'])['value']
    # .max() skips missing values; the built-in max() gives an
    # order-dependent result when a NaN is present.
    .max()
    .reset_index(name='total_inhabitants')
)

In [196]:
# Attach the 1889 population (left join keeps every sector row) and express
# the sector counts per inhabitant.
total_profcomp = pd.merge(total_profcomp, clean_bevolking,
                          how='left', on=['amco', 'name'])
total_profcomp['prof_count_per_cap'] = (
    total_profcomp['total_profession_count'] / total_profcomp['total_inhabitants']
)


In [197]:
# Persist the sector table; the row index is written as the first column.
total_profcomp.to_csv("../Data/district_data/professional_composition.csv", index=True)

## Other variables from HDNG

### Onderwijsuitgaven + Alfabetisme

In [220]:
# Education spending and literacy rows. Columns are selected by name so the
# result does not depend on the column order of the downloaded file.
others = hdng.loc[
    hdng['description'].isin(['Onderwijsuitgaven in guldens', 'Alfabetisme']),
    ['amco', 'name', 'description', 'year', 'value'],
]

In [226]:
# Reshape to one row per (amco, name, year) with one column per description.
others = others.pivot(index=['amco', 'name', 'year'],
                      columns='description',
                      values='value')
others = others.reset_index()

In [227]:
# Persist literacy / education spending; the row index is written as the first column.
others.to_csv("../Data/district_data/alphabetism_educ_expenses.csv", index=True)

### Religion over time

In [28]:
# Number of adherents per denomination, plus all population rows.
religion = hdng[hdng['description'] == 'Aantal gelovigen'].rename(columns={'value': 'aantal'})
bevolking = (
    hdng[hdng['description'] == 'Bevolking']
    .rename(columns={'value': 'bevolking'})
    .filter(['amco', 'name', 'year', 'bevolking'])
)

# One population figure per municipality and year (missing values are skipped).
# NOTE(review): unlike the tax section, the rows are not restricted to
# information == 'totaal', so the mean runs over every 'Bevolking' row of a
# municipality-year - confirm this is intended.
clean_bevolking = (
    bevolking
    .groupby(['amco', 'name', 'year'])['bevolking']
    .mean()
    .reset_index(name='total_inhabitants')
)

# Left join: keep every religion row, also where no population figure exists.
religion_per_cap = pd.merge(religion, clean_bevolking,
                            on=['amco', 'name', 'year'], how='left')

In [30]:
# Persist the religion table; the row index is written as the first column.
religion_per_cap.to_csv("../Data/district_data/religion_over_time.csv", index=True)