# Imports

In [498]:
import pandas as pd
import numpy as np
import math

# Load in datasets

In [499]:
# Read the prepared moves frame and the municipality code conversion table.
# NOTE: unpickling can execute arbitrary code; only load pickles that were
# produced by this project.
moves = pd.read_pickle("../Prepared_DFs/moves_final.pkl")
conversions_csv = pd.read_csv("../Data/Conversions.csv")
conversions = conversions_csv.astype('string')  # every column as pandas string dtype

## Clean prices

In [500]:
# Load the house price table; the file uses ';' as its field separator.
prices = pd.read_csv("../Data/house_prices_new.csv", sep = ';')

In [501]:
# Keep only the municipality rows and renumber them from 0.
# NOTE(review): 285 is a hard-coded row label in house_prices_new.csv; the
# original comment implies everything before it is not a municipality.
# Re-check this offset whenever the source file is refreshed.
# reset_index(drop=True) returns a new frame, so the column assignments below
# no longer write into a slice of the raw frame.
prices = prices.loc[285:].reset_index(drop=True)

# Turn the period code into an integer year by removing the literal "JJ00"
# suffix. (str.rstrip('00') strips any run of trailing '0' characters rather
# than the substring "00"; it only worked because "JJ" sits in between.)
# Values that are not a bare year afterwards still make to_datetime raise.
prices['Perioden'] = prices['Perioden'].str.replace(r'JJ00$', '', regex=True)
prices['Perioden'] = pd.to_datetime(prices['Perioden'], format='%Y').dt.year

# Treat a lone "." (with any amount of surrounding whitespace) as missing,
# then convert the column to float.
value_column = 'GemiddeldeWoningwaarde_4'
prices[value_column] = (
    prices[value_column]
    .replace(r'^\s*\.\s*$', np.nan, regex=True)
    .astype(float)
)

In [502]:
# Bring every municipality code up to date: a code listed as oldCode in the
# conversion table is replaced by its newCode, any other code is kept as is.
conv_dict = dict(zip(conversions['oldCode'], conversions['newCode']))
prices['RegioS'] = prices['RegioS'].map(lambda code: conv_dict.get(code, code))

# Collapse rows that now share a (RegioS, Perioden) pair: Woningen_2 is summed
# and GemiddeldeWoningwaarde_4 is averaged.
# NOTE(review): the mean is unweighted; if the merged rows differ a lot in
# Woningen_2, a mean weighted by Woningen_2 may be more appropriate (confirm).
prices = (
    prices
    .groupby(['RegioS', 'Perioden'])
    .agg({'Woningen_2': 'sum', 'GemiddeldeWoningwaarde_4': 'mean'})
    .reset_index()
)

In [503]:
# Restrict prices to municipalities that also occur in moves.
# (The original author notes that RegioVanVertrek and RegioVanVestiging are the same size.)
unique_prices = prices.RegioS.unique()
unique_moves = moves.RegioVanVestiging.unique()

# Codes present in prices but absent from moves: these rows are removed.
difference = set(unique_prices).difference(unique_moves)
# Codes present in moves but absent from prices; the original author reports this set as empty.
difference2 = set(unique_moves).difference(unique_prices)

difference_index = prices.index[prices["RegioS"].isin(difference)]
prices = prices.drop(index=difference_index)

In [505]:
# Persist the cleaned prices frame as a pickle in the ikosdfs folder.
prices.to_pickle("../ikosdfs/prices.pkl")

## Clean population

In [506]:
# Load the population table (semicolon-separated CSV).
population = pd.read_csv("../Data/population_size.csv", sep=';')

# Keep only the rows whose Geslacht code is 'T001038'.
# NOTE(review): presumably the code for the total over all sexes; confirm
# against the CBS code list.
# .copy() yields an independent frame, so the column assignments below do not
# write into a filtered slice of the raw frame (avoids SettingWithCopyWarning).
population = population.loc[population['Geslacht'] == 'T001038'].copy()

# Turn the period code into an integer year by removing the literal "JJ00"
# suffix. (str.rstrip('00') strips any run of trailing '0' characters rather
# than the substring "00"; it only worked because "JJ" sits in between.)
# Values that are not a bare year afterwards still make to_datetime raise.
population['Perioden'] = population['Perioden'].str.replace(r'JJ00$', '', regex=True)
population['Perioden'] = pd.to_datetime(population['Perioden'], format='%Y').dt.year

In [507]:
# Sanity check: do the CBS figures agree between the moves and population datasets?
# NOTE(review): RegioVanVestiging and UitAndereGemeente_11 look like
# destination / inflow fields, so the printed "moved from" may need to read
# "moved to"; confirm before relying on the wording.
headers = population.columns.tolist()

# Total of TussenGemeentenVerhuisdePersonen_1 over the 2017 rows of moves that
# have Amsterdam (GM0363) as RegioVanVestiging.
moves_amsterdam = moves.loc[moves["RegioVanVestiging"] == "GM0363"]
moves_amsterdam_2017 = moves_amsterdam.loc[moves_amsterdam["Perioden"] == 2017]
print("People who moved from amsterdam according to moves dataset: ", sum(moves_amsterdam_2017["TussenGemeentenVerhuisdePersonen_1"]))

# The matching figure from the population table (printed as a Series).
population_amsterdam = population.loc[population["RegioS"] == "GM0363"]
population_amsterdam_2017 = population_amsterdam.loc[population_amsterdam["Perioden"] == 2017]
print("People who moved from amsterdam according to population dataset: ", population_amsterdam_2017["UitAndereGemeente_11"])

People who moved from amsterdam according to moves dataset:  36285.0
People who moved from amsterdam according to population dataset:  186    36285.0
Name: UitAndereGemeente_11, dtype: float64


In [508]:
# Aggregation method per population column, applied when rows of several
# municipality codes collapse into one (RegioS, Perioden) group.
#
# Rule used here:
#   * absolute counts            -> 'sum'
#   * relative ("Relatief") and
#     density columns            -> 'mean'
# Rates and densities are not additive, so summing them over merged
# municipalities would inflate the value. The original summed
# 'TotaalVertrekInclusiefCoRelatief_13' (unlike every other Relatief column)
# and 'Bevolkingsdichtheid_2'; both now use 'mean'.
# NOTE(review): an unweighted mean is still an approximation; a population- or
# area-weighted average would be exact. Confirm whether that precision is needed.
header_agg = {
    'BevolkingOp1Januari_1': 'sum',
    'Bevolkingsdichtheid_2': 'mean',                  # density: not additive
    'LevendGeborenKinderen_3': 'sum',
    'LevendGeborenKinderenRelatief_4': 'mean',        # relative figure
    'Overledenen_5': 'sum',
    'OverledenenRelatief_6': 'mean',                  # relative figure
    'Geboorteoverschot_7': 'sum',
    'TotaalVestiging_8': 'sum',
    'TotaalVestigingRelatief_9': 'mean',              # relative figure
    'Immigratie_10': 'sum',
    'UitAndereGemeente_11': 'sum',
    'TotaalVertrekInclusiefCorrecties_12': 'sum',
    'TotaalVertrekInclusiefCoRelatief_13': 'mean',    # relative figure (was 'sum')
    'EmigratieInclusiefAdministratieveC_14': 'sum',
    'SaldoAdministratieveCorrecties_15': 'sum',
    'NaarAndereGemeente_16': 'sum',
    'VestigingsoverschotInclusiefCorrecties_17': 'sum',
    'TotaleGroei_18': 'sum',
    'TotaleGroeiRelatief_19': 'mean',                 # relative figure
    'BevolkingOp31December_20': 'sum'
}

# Cast the columns below to float so the aggregation does not fail on them.
float_columns = [
    'LevendGeborenKinderenRelatief_4',
    'OverledenenRelatief_6',
    'TotaalVestigingRelatief_9',
    'TotaleGroeiRelatief_19',
    'TotaalVertrekInclusiefCoRelatief_13',
    'BevolkingOp1Januari_1',
    'Bevolkingsdichtheid_2',
]
for column in float_columns:
    population[column] = population[column].astype(float)


In [509]:
# Bring every municipality code up to date: a code listed as oldCode in the
# conversion table is replaced by its newCode, any other code is kept as is.
conv_dict = dict(zip(conversions['oldCode'], conversions['newCode']))
population['RegioS'] = population['RegioS'].map(lambda code: conv_dict.get(code, code))

# Collapse rows that now share a (RegioS, Perioden) pair; header_agg decides
# the aggregation method for each column.
population = (
    population
    .groupby(['RegioS', 'Perioden'])
    .agg(header_agg)
    .reset_index()
)

In [510]:
# Restrict population to municipalities that also occur in moves.
# (The original author notes that RegioVanVertrek and RegioVanVestiging are the same size.)
unique_population = population.RegioS.unique()
unique_moves = moves.RegioVanVestiging.unique()

# Codes present in population but absent from moves: these rows are removed.
difference = set(unique_population).difference(unique_moves)
# Codes present in moves but absent from population; the original author reports this set as empty.
difference2 = set(unique_moves).difference(unique_population)

difference_index2 = population.index[population["RegioS"].isin(difference)]
population = population.drop(index=difference_index2)

In [511]:
# Persist the cleaned population frame as a pickle in the ikosdfs folder.
population.to_pickle("../ikosdfs/population.pkl")

## Clean availability

In [512]:
# Load the housing availability table (semicolon-separated CSV).
availy = pd.read_csv("../Data/house_availability.csv", sep=';')

# Turn the period code into an integer year by removing the literal "JJ00"
# suffix. (str.rstrip('00') strips any run of trailing '0' characters rather
# than the substring "00"; it only worked because "JJ" sits in between.)
# Values that are not a bare year afterwards still make to_datetime raise.
availy['Perioden'] = availy['Perioden'].str.replace(r'JJ00$', '', regex=True)
availy['Perioden'] = pd.to_datetime(availy['Perioden'], format='%Y').dt.year

# Drop all 2021 periods. drop() returns a new frame, so later column
# assignments do not write into a filtered slice.
rows_2021 = availy.index[availy['Perioden'] == 2021]
availy = availy.drop(index=rows_2021)


In [513]:
# List of all column names in the availability frame, kept for inspection.
column_headers = availy.columns.tolist()
column_headers  # not the last expression of the cell, so nothing is displayed
# Every availability count column is aggregated with 'sum'.
header_dict = dict.fromkeys(
    [
        'TotaleWoningvoorraad_1',
        'Koopwoningen_2',
        'TotaalHuurwoningen_3',
        'EigendomWoningcorporatie_4',
        'EigendomOverigeVerhuurders_5',
        'EigendomOnbekend_6',
    ],
    'sum',
)

In [514]:
# Bring every municipality code up to date: a code listed as oldCode in the
# conversion table is replaced by its newCode, any other code is kept as is.
conv_dict = dict(zip(conversions['oldCode'], conversions['newCode']))
availy['RegioS'] = availy['RegioS'].map(lambda code: conv_dict.get(code, code))

# Collapse rows that now share a (StatusVanBewoning, RegioS, Perioden) triple;
# header_dict decides the aggregation method for each column.
availy = (
    availy
    .groupby(['StatusVanBewoning', 'RegioS', 'Perioden'])
    .agg(header_dict)
    .reset_index()
)

There are three types of "StatusVanBewoning":

    A028725 = Bewoonde woning.
    A028726 = Onbewoonde woning.
    T001235 = Totale voorraad woningen.
    
For every region and year, A028725 + A028726 = T001235 holds for the values in every column.

In [515]:
# Restrict the availability frame to municipalities that also occur in moves.
# (The original author notes that RegioVanVertrek and RegioVanVestiging are the same size.)
unique_availy = availy.RegioS.unique()
unique_moves = moves.RegioVanVestiging.unique()

# Codes present in availy but absent from moves: these rows are removed.
difference = set(unique_availy).difference(unique_moves)
# Codes present in moves but absent from availy; the original author reports four such municipalities.
difference3 = set(unique_moves).difference(unique_availy)

difference_index3 = availy.index[availy["RegioS"].isin(difference)]
availy = availy.drop(index=difference_index3)

Right now there are 4 municipalities in moves that are missing from the availability data. There is not much we can do about that: they are municipalities that were only recently (2021) converted. To list them, evaluate `difference3`.

In [516]:
# Persist the cleaned availability frame as a pickle in the ikosdfs folder.
availy.to_pickle("../ikosdfs/availability.pkl")

# Endnotes/TODO's 

- Ik weet van sommige kolommen niet zeker of de aggregatie met een som, een gemiddelde of een ander soort functie moet worden gedaan. Als iemand daar een sterke mening over zou kunnen geven, zou dat chill zijn.

- Er gaan volgens mij nog een paar dingen fout bij het cleanen van de prices dataset. Wat de gemiddelde woningwaarde precies inhoudt is mij vrij onduidelijk, en er zijn ook nog een paar NaN-waardes in diezelfde tabel waar we nog mee moeten dealen.

-Er is overlap tussen kolom "Woningen_2" van de prices tabel en "TotaleWoningvoorraad_1" van de availability dataset. De cijfers komen echter niet overeen, wat enigszins raar is gezien andere overlappende waardes in CBS tabellen wel overeenkomen.

-De availability dataset heeft geen informatie over vier specifieke gemeenten die in 2021 zijn gemerged. Wat willen we daar mee doen?

-Alle gecleanede datasets zijn nu opgeslagen in de map ikosdfs.