In [None]:
# Python packages
import sys
sys.path.append('../')
from datetime import datetime
import numpy as np
import pandas as pd
import pickle
# Added (new in developing predict)
from itertools import product


from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold, GridSearchCV, cross_validate, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, LassoCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor, XGBClassifier

# Custom functions
import src.settings as settings
import src.mapper_cols as mapper_cols
from src.run_all.main_get_data import get_data, get_data_predict
from src.run_all.main_preprocess import preprocess_data, preprocess_data_predict
from src.utilities.utilities import get_latest_file, list_filenames

# Tune how pandas renders DataFrames in the notebook output
for option_name, option_value in (
    ('display.max_rows', 500),                    # show all rows
    ('display.max_columns', 500),                 # show all columns
    ('display.width', 1000),                      # total display width
    ("display.precision", 2),                     # decimal precision of the columns
    ('display.float_format', '{:.15f}'.format),   # print floats with 15 decimals
):
    pd.set_option(option_name, option_value)

# Load dataframe to extend features for future

In [None]:
# Folder and file name of the stored 'get data' snapshot that is extended with future periods
datapath = '../data/'
filename = 'df_get_data_WMO_WIJK_HUISHOUDENS_BEVOLKING_HEFFING_202104042111.parquet.gzip'
df_get_data_WMO = pd.read_parquet(f'{datapath}{filename}')

In [None]:
df_get_data = df_get_data_WMO.reset_index().copy()
df_get_data.head()

In [None]:
# datapath = '../data/'
# filename = 'df_preprocessed_202104042151_Boerenverstand_Maikel.parquet.gzip'
# df_preprocessed = pd.read_parquet(datapath + filename)

In [None]:
# df_preprocessed

In [None]:
# print(list(df_preprocessed.reset_index().columns))

# Main_predict

In [None]:
# Settings: the years to predict, the most recently saved model and the forecast input data
periods = list(range(2020, 2023))
trained_model = get_latest_file(datapath=datapath, filetype='pickle', filename_str_contains='best_model_')
df_prognoses = get_data_predict(periods=periods, personal_note="", save_all=True)

In [None]:
df_get_data.empty

In [None]:
## Get data
# Fall back to a fresh download whenever one of the input frames is empty.
if df_get_data.empty:
    df_get_data_WMO = get_data(save=True)
    # The following cells read `df_get_data`, not `df_get_data_WMO`, so refresh it the same
    # way it was derived from the parquet snapshot; otherwise it would stay empty.
    df_get_data = df_get_data_WMO.reset_index().copy()
if df_prognoses.empty:
    df_prognoses = get_data_predict(periods=periods, save_all=True, personal_note="")

In [None]:
## Preprocess
# Preprocess predict
df_preprocessed_predict = preprocess_data_predict(df_get_data, df_prognoses, save_all=True, personal_note="")

In [None]:
df_preprocessed_predict

In [None]:
# General preprocessing of the extended frame; the target columns are removed afterwards
# because the frame is only used as model input here.
df_preprocessed = preprocess_data(personal_note='predict', save_all=False, df=df_preprocessed_predict)
df_preprocessed = df_preprocessed.drop(columns=settings.Y_TARGET_COLS)

In [None]:
## Predict
y_preds = trained_model.predict(df_preprocessed)

In [None]:
y_preds

## Extend strategy

In [None]:
# import src.settings as settings
# from src.preprocess.preprocess import get_and_combine_cbs_tables, rename_and_subset_cols, \
#     get_region_period_spec_val_subtable, downcast_variables_dataframe


# pickle file inladen voor predict
loaded_model = get_latest_file(filename_str_contains='best_model_', datapath=datapath, filetype='pickle')

In [None]:
# Stand-alone version of the "latest file" lookup: pick the alphabetically last file in
# `datapath` whose name contains `filename_str_contains`.
# NOTE(review): this relies on the file names sorting chronologically - confirm.
from os import listdir
from os.path import isfile, join
filename_str_contains='best_model_'
datapath='../data/'
onlyfiles = sorted(f for f in listdir(datapath)
                   if isfile(join(datapath, f)) and filename_str_contains in f)
if not onlyfiles:
    # Fail with an explicit message instead of an IndexError on `[-1]`
    raise FileNotFoundError(f"No file containing '{filename_str_contains}' found in '{datapath}'")
# Get last file
filename = onlyfiles[-1]
filename

In [None]:
from src.run_all.main_predict import predict_data
periods = [2020,2021,2022]

In [None]:
%time df = predict_data(df_get_data=df_get_data, periods=periods, trained_model=loaded_model)
df

In [None]:
df_get_data.shape

In [None]:
loaded_model

In [None]:
2235+936

In [None]:
periods_2 = [2020,2021,2022]

In [None]:
[str(x) for x in periods_2]

In [None]:
# settings.preprocess['MISSING_BOUNDARY'] = 0.99
df_preprocessed = preprocess_data(df=df, save_all=False, personal_note='test')

In [None]:
settings.preprocess

In [None]:
df_preprocessed

In [None]:
## Get data (for extending get data with future)
# Boundaries for the forecast data: round the last requested year up to a multiple of five
# and list every year from the first requested year up to that boundary.
last_requested_period = max(periods)
roundedto5periods = last_requested_period + (-last_requested_period) % 5
total_periods = list(range(min(periods), roundedto5periods + 1))

# These helpers are only imported much further down in the notebook; import them here so
# this cell also works on a fresh kernel (Restart & Run All).
from src.preprocess.preprocess import get_and_combine_cbs_tables, rename_and_subset_cols

print("Get 'regio-indeling'")
# Download the regional classification table(s), then rename/subset the columns as configured.
df_regioindeling = get_and_combine_cbs_tables(dict_tables=settings.predict['DICT_TABLES_REGIOINDELING'],
                                              double_trouble_colnames=settings.predict[
                                                  'DICT_DOUBLETROUBLECOLNAMES_REGIOINDELING'],
                                              url=settings.get_data['CBS_OPEN_URL'])
df_regioindeling = rename_and_subset_cols(df=df_regioindeling,
                                          dict_rename=settings.predict['DICT_COLS_RENAMED_REGIOINDELING'],
                                          list_cols=settings.predict['LIST_COLS_SUBSET_REGIOINDELING'])
# Strip surrounding whitespace from the configured string columns
list_str_strip_cols = settings.predict['LIST_STR_STRIP_COLS_REGIOINDELING']
df_regioindeling[list_str_strip_cols] = df_regioindeling[list_str_strip_cols].apply(lambda x: x.str.strip())

print("Get 'prognose huishoudens' tables")
df_huishouden_prognose = get_and_combine_cbs_tables(dict_tables=settings.predict['DICT_TABLES_HUISHOUDEN'],
                                                    url=settings.get_data['CBS_OPEN_URL'])
df_huishouden_prognose['interval'] = df_huishouden_prognose['perioden']
df_huishouden_prognose = df_huishouden_prognose.rename(columns=settings.predict['DICT_COLS_RENAMED_HUISHOUDEN'])
# Keep only the point forecast rows
df_huishouden_prognose = df_huishouden_prognose[df_huishouden_prognose['prognoseinterval'] == 'Prognose']
# Drop rows whose name carries the '(CR)' / '(PV)' suffix and the national total.
# regex=False matches the suffix literally; as a regex, '(CR)' is a capture group that
# matches any name containing "CR". na=True keeps the old behaviour of dropping rows
# without a name.
huishouden_gemeentenaam = df_huishouden_prognose['gemeentenaam']
df_huishouden_prognose = df_huishouden_prognose[
    ~huishouden_gemeentenaam.str.contains('(CR)', regex=False, na=True) &
    ~huishouden_gemeentenaam.str.contains('(PV)', regex=False, na=True) &
    (huishouden_gemeentenaam != 'Nederland')].copy()
# NOTE(review): the factor 1000 presumably converts "x 1000" source values to absolute
# numbers - confirm against the table metadata.
df_huishouden_prognose['particulierehuishoudens'] = (
    df_huishouden_prognose['particulierehuishoudens'] * 1000).round().astype(int)
# One row per municipality and year, one column per household composition.
# aggfunc='sum' is the non-deprecated spelling of np.sum here.
df_huishouden_prognose_pivot = pd.pivot_table(data=df_huishouden_prognose, values='particulierehuishoudens',
                                              index=['gemeentenaam', 'interval'],
                                              columns=['samenstellingvanhethuishouden'],
                                              aggfunc='sum').reset_index()
df_huishouden_prognose_pivot = df_huishouden_prognose_pivot[
    df_huishouden_prognose_pivot['interval'].astype(int) <= roundedto5periods]
df_huishouden_prognose_pivot = rename_and_subset_cols(df=df_huishouden_prognose_pivot,
                                                      dict_rename=settings.predict[
                                                          'DICT_COLS_RENAMED_HUISHOUDEN_PIVOT'],
                                                      list_cols=settings.predict[
                                                          'LIST_COLS_SUBSET_HUISHOUDING_PIVOT'])

print("Get 'prognose bevolking' tables")
df_population_prognose = get_and_combine_cbs_tables(dict_tables=settings.predict['DICT_TABLES_BEVOLKING'],
                                                    url=settings.get_data['CBS_OPEN_URL'])
df_population_prognose = rename_and_subset_cols(df=df_population_prognose,
                                                dict_rename=settings.predict['DICT_COLS_RENAMED_BEVOLKING'],
                                                list_cols=settings.predict['LIST_COLS_SUBSET_BEVOLKING'])
# The year is the last space-separated token of 'perioden'
df_population_prognose['interval'] = df_population_prognose['perioden'].apply(lambda x: x.split(' ')[-1])
# Drop rows whose name carries the '(CR)' / '(PV)' suffix and the national total.
# regex=False matches the suffix literally; as a regex, '(CR)' is a capture group that
# matches any name containing "CR". na=True keeps the old behaviour of dropping rows
# without a name.
population_gemeentenaam = df_population_prognose['gemeentenaam']
df_population_prognose = df_population_prognose[
    ~population_gemeentenaam.str.contains('(CR)', regex=False, na=True) &
    ~population_gemeentenaam.str.contains('(PV)', regex=False, na=True) &
    (population_gemeentenaam != 'Nederland')].copy()
df_population_prognose = df_population_prognose[df_population_prognose['interval'].astype(int) <= roundedto5periods]
# NOTE(review): the factor 1000 presumably converts "x 1000" source values to absolute
# numbers - confirm against the table metadata.
df_population_prognose['aantalinwoners'] = (df_population_prognose['aantalinwoners'] * 1000).round().astype(int)
df_population_prognose = df_population_prognose.drop(['perioden'], axis=1)

# Merge the regional classification with both forecast tables
df_prognoses = df_regioindeling.merge(df_huishouden_prognose_pivot, how='left', on=['gemeentenaam'])
df_prognoses = df_prognoses.merge(df_population_prognose, how='left', on=['gemeentenaam', 'interval'])

# Append the forecasts to the original 'get data' frame, keeping only the regions that
# occur in the most recent interval of the original dataset
latest_interval = df_get_data['interval'].max()
list_unchanged_multiplicacities = df_get_data.loc[
    df_get_data['interval'] == latest_interval, 'codering_regio'].unique()
df_prognoses = df_prognoses[df_prognoses['codering_regio'].isin(list_unchanged_multiplicacities)]
df_future = pd.concat([df_get_data, df_prognoses])
df_future = df_future.sort_values(['codering_regio', 'interval']).reset_index().drop(columns=['index'])




In [None]:
df_future

In [None]:
# list_cols_prognoses

In [None]:
## Extend dataframe for blanks
# Decide per column which imputing strategy applies
list_cols_prognoses = df_prognoses.columns
# list_cols_prognoses_str = [x for x in list(df_prognoses.loc[:, df_prognoses.dtypes == object].columns) if x!='codering_regio']
prognoses_is_object = df_prognoses.dtypes == object
list_cols_prognoses_num = list(df_prognoses.loc[:, ~prognoses_is_object].columns)
list_all_columns = list(df_future.columns)
# Object (string) columns that do not come from the forecast tables
list_cols_str = list(
    set(df_future.loc[:, df_future.dtypes == object].columns).difference(list_cols_prognoses))
# Columns used by the trained model, with the 'relative_' marker removed
set_cols_trained_model = {x.replace('relative_', '') for x in settings.predict['LIST_COLS_TRAINED_MODEL']}
list_cols_trained_model = list(set_cols_trained_model)
# Model columns that are neither invariable nor part of the forecast tables
list_cols_relate_imputer = list(
    set_cols_trained_model
    .difference(settings.predict['LIST_COLS_TRAINED_MODEL_INVARIABLY'])
    .difference(list_cols_prognoses))
# Every remaining column goes to the group imputer
list_cols_group_imputer = list(
    set(list_all_columns).difference(list_cols_str).difference(list_cols_relate_imputer))

In [None]:
df_future_cop = df_future.copy()

In [None]:
# ffill for string columns
df_future_cop.loc[:, list_cols_str] = df_future_cop.loc[:, list_cols_str].ffill()

In [None]:
from src.utilities.transformers import ColumnSelector, GroupInterpolateImputer, RelativeColumnScaler, \
    CustomScaler, CustomImputer

In [None]:
# Group imputer for available future / invariably columns / columns not used in trained model
GII = GroupInterpolateImputer(groupcols=settings.predict['GROUP_INTERPOLATE_IMPUTER_GROUPCOLS'],
                        interpolate_method=settings.predict['GROUP_INTERPOLATE_IMPUTER_METHOD'],
                        cols=list_cols_group_imputer)
df_future_cop = GII.fit_transform(df_future_cop)

In [None]:
df_future_cop

In [None]:
# Relational imputer for other columns in trained model
list_cols_relate_imputer

In [None]:
# Column the other columns are related to, and the years that have to be filled
base_col = 'aantalinwoners'
future_years = [str(future_year) for future_year in range(2020, 2026)]
all_relate_cols_necessary = (settings.predict['LIST_COLS_GROUPER_RELATE_IMPUTER']
                             + list_cols_relate_imputer
                             + [base_col])

# Rows of the base year (2019), restricted to the columns needed for the relational imputer
df_base_year = df_future_cop.loc[df_future_cop['interval'] == '2019', all_relate_cols_necessary]

In [None]:
df_base_year.loc[:, list_cols_relate_imputer] = df_base_year.loc[:, list_cols_relate_imputer].div(df_base_year['aantalinwoners'], axis=0)

In [None]:
df_base_year

In [None]:
df_base_year = df_base_year[df_base_year['codering_regio'].isin(df_future_cop[df_future_cop['interval']=='2025'].codering_regio.unique())]

In [None]:
# df_base_year.set_index('codering_regio')[col]

In [None]:
# df_future_2 = df_future_cop.copy()
# df_future_2 = df_future_2.set_index('codering_regio')

In [None]:
# df_future_2[df_future_2['interval']=='2021'][base_col]

In [None]:
# df_future_2[df_future_2['interval']=='2021'].loc[:,col] = df_future_2[df_future_2['interval']=='2021'].loc[:,base_col] * df_base_year.set_index('codering_regio')[col]

In [None]:
# df_future_2[df_future_2['interval']=='2021'].loc[:,col]
df_future_2[df_future_2['interval']==year].loc[:,col]

In [None]:
df_future_2[df_future_2['interval']==year].loc[:,base_col]

In [None]:
df_base_year.set_index('codering_regio')[col]

In [None]:
df_future_cop[df_future_cop['interval'].isin(future_years)].loc[:,['codering_regio']+list_cols_relate_imputer+[base_col]]

In [None]:
# Relational imputation: every column in `list_cols_relate_imputer` becomes
# `base_col` multiplied by that column's base-year value from `df_base_year`,
# aligned on 'codering_regio'.
# The per-year loop was commented out, but the code still referenced its loop variable
# `year` for an unused series (NameError); that leftover has been removed.
# NOTE(review): this overwrites all rows, including the historical intervals, not only
# `future_years` - confirm that this is intended.
df_future_2 = df_future_cop.copy()
df_future_2 = df_future_2.set_index('codering_regio')
df_base_year_by_region = df_base_year.set_index('codering_regio')
for col in list_cols_relate_imputer:
    perc_col_series = df_base_year_by_region[col]
    df_future_2.loc[:, col] = df_future_2.loc[:, base_col] * perc_col_series
    


In [None]:
0.507697108383607*9528.333333333333940

In [None]:
df_future_2[~df_future_2['interval'].isin(future_years)].loc[:,list_cols_relate_imputer]

In [None]:
# df_future_cop[df_future_cop['interval'].isin(future_years)].loc[:,col]

In [None]:
# Toy fixture for developing the relational imputer:
#   df_hist_perc - 2019 ratio of each column to 'aantalinwoners', per region
#   df_future    - input: 2019 values known, later years missing, population known
#   df_uitkomst  - expected result: ratio * 'aantalinwoners' for every row of df_future
# Fixes: the expected frame used intervals 2020-2022 while the input uses 2019-2021, and two
# 2019 inputs of 'alleenstaande_mannen' (AB02, AB03) contradicted the 2019 ratios.
# NOTE(review): the column name 'allenstaande_vrouwen' (sic) is kept because the cell that
# consumes this fixture uses the same spelling.
# NOTE(review): this reuses the name `df_future` of the real extended frame - consider a
# dedicated name for the toy data.
df_hist_perc = pd.DataFrame({'code_regio': ['AB01', 'AB02', 'AB03'],
                            'interval': ['2019', '2019', '2019'],
                            'allenstaande_vrouwen': [0.4, 0.15, 0.2],
                            'alleenstaande_mannen': [0.3, 0.1, 0.3]})
df_future = pd.DataFrame({'code_regio': ['AB01', 'AB01','AB01','AB02','AB02','AB02', 'AB03','AB03','AB03'],
                            'interval': ['2019', '2020', '2021','2019', '2020', '2021', '2019', '2020', '2021'],
                            'allenstaande_vrouwen': [4, np.nan, np.nan,15, np.nan, np.nan,5, np.nan, np.nan],
                            'alleenstaande_mannen': [3, np.nan, np.nan,10, np.nan, np.nan,7.5, np.nan, np.nan],
                            'aantalinwoners': [10,20,30, 100,115,130, 25,50,75]})
df_uitkomst = pd.DataFrame({'code_regio': ['AB01', 'AB01','AB01','AB02','AB02','AB02', 'AB03','AB03','AB03'],
                            'interval': ['2019', '2020', '2021','2019', '2020', '2021', '2019', '2020', '2021'],
                            'allenstaande_vrouwen': [4, 8, 12, 15,17.25,19.5, 5,10,15],
                            'alleenstaande_mannen': [3,6,9, 10,11.5,13, 7.5,15,22.5],
                            'aantalinwoners': [10,20,30, 100,115,130, 25,50,75]})

In [None]:
df_hist_perc

In [None]:
df_future

In [None]:
df_uitkomst

In [None]:
# df_hist_perc = df_base_year.copy()
# df_future = df_future_cop[df_future_cop['interval'].isin(future_years)].copy()

In [None]:
# Apply the base-year ratios to the toy data: column = 'aantalinwoners' * ratio, aligned on region
df_uitkomst_test = df_future.copy().set_index('code_regio')
hist_perc_by_region = df_hist_perc.set_index('code_regio')
for col in ('allenstaande_vrouwen', 'alleenstaande_mannen'):
    # Version for the real data:
    #     for col in list_cols_relate_imputer:
    #         df_uitkomst_test.loc[:, col] = df_uitkomst_test[base_col] * df_hist_perc.set_index('codering_regio')[col]
    df_uitkomst_test.loc[:, col] = df_uitkomst_test['aantalinwoners'].mul(hist_perc_by_region[col])

In [None]:
df_uitkomst_test

In [None]:
df_hist_perc.set_index('code_regio')['alleenstaande_mannen']

In [None]:
list(df_prognoses.loc[:, df_prognoses.dtypes == object].columns)

In [None]:
list_cols_prognoses_num

In [None]:
df_future_cop[df_future_cop['interval'].isin(['2017', '2018', '2019'])][settings.predict['LIST_COLS_GROUPER_RELATE_IMPUTER']+list_cols_prognoses_num+list_cols_relate_imputer]


In [None]:
df_future_cop[df_future_cop['interval'].isin(['2017', '2018', '2019'])][settings.predict['LIST_COLS_GROUPER_RELATE_IMPUTER']+list_cols_prognoses_num+list_cols_relate_imputer].dtypes

In [None]:
list_cols_relate

In [None]:
# Slice the last three historical years for the grouper columns, the relate-imputer
# columns and the population column
list_past_period = [str(past_year) for past_year in range(2017, 2020)]
list_cols_relate = settings.predict['LIST_COLS_GROUPER_RELATE_IMPUTER']+list_cols_relate_imputer
is_past_period = df_future_cop['interval'].isin(list_past_period)
df_var = df_future_cop.loc[is_past_period, list_cols_relate+['aantalinwoners']].copy()

# for basecol in list_cols_prognoses_num:
#     print(basecol)
#     df_var.loc[:, list_cols_relate_imputer] = df_var.loc[:, list_cols_relate_imputer] / df_var[basecol]
    
# df_var.loc[:, list_cols_relate_imputer] = df_var.loc[:, list_cols_relate_imputer].div(df_var['aantalinwoners'], axis=0)
# df_var_mean = df_var.groupby(['codering_regio']).mean().drop(['aantalinwoners'], axis=1)
    

In [None]:
df_var

In [None]:
df_var['aantalinwoners']

In [None]:
df_var[df_var['codering_regio'].isin(['GM0085', 'GM0017'])]

In [None]:
df_future_cop[df_future_cop['codering_regio'].isin(['GM0085', 'GM0017'])][['alleenstaande_mannen', 'alleenstaande_vrouwen', 'aantalinwoners', 'gemeentenaam']]

In [None]:
import statistics
# statistics.pvariance

In [None]:
df_var.loc[3]

In [None]:
# Group the past-period values per region.
# The groupby object is bound to `gb`, the name the following cells use; previously `gb`
# was never assigned (NameError) and `df_var` was overwritten by the groupby object, so
# the cell could not be re-run.
gb = (df_var.drop(['interval'], axis=1) * 1).groupby(['codering_regio'])
gb

In [None]:
# Population variance (ddof=0) per region and column.
# statistics.pvariance cannot take a DataFrame group: iterating a frame yields its
# column labels, which raises a TypeError.
gb.var(ddof=0)

In [None]:
list(df_prognoses.loc[:, df_prognoses.dtypes != object].columns)

In [None]:
df_future[df_future['interval'].isin(['2017', '2018', '2019'])]

In [None]:
df_future[df_future['interval'].isin(['2020', '2021', '2022', '2023'])]

In [None]:
import src.settings as settings
from src.preprocess.preprocess import get_and_combine_cbs_tables, rename_and_subset_cols, \
    get_region_period_spec_val_subtable, downcast_variables_dataframe

In [None]:
periods = [2020, 2021, 2022]

In [None]:
round(max(periods), 5)

In [None]:
roundedto5periods = max(periods) + (5 - max(periods)) % 5

In [None]:
total_periods = list(range(min(periods), roundedto5periods+1, 1))

In [None]:
total_periods

In [None]:
print("Get 'prognose bevolking' tables")
df_population_prognose = get_and_combine_cbs_tables(dict_tables=settings.predict['DICT_TABLES_BEVOLKING'],
                                                    url=settings.get_data['CBS_OPEN_URL'])
df_population_prognose = rename_and_subset_cols(df=df_population_prognose,
                                                dict_rename=settings.predict['DICT_COLS_RENAMED_BEVOLKING'],
                                                list_cols=settings.predict['LIST_COLS_SUBSET_BEVOLKING'])
# The year is the last space-separated token of 'perioden'
df_population_prognose['interval'] = df_population_prognose['perioden'].apply(lambda x: x.split(' ')[-1])
# Drop rows whose name carries the '(CR)' / '(PV)' suffix and the national total.
# regex=False matches the suffix literally; as a regex, '(CR)' is a capture group that
# matches any name containing "CR". na=True keeps the old behaviour of dropping rows
# without a name.
population_gemeentenaam = df_population_prognose['gemeentenaam']
df_population_prognose = df_population_prognose[
    ~population_gemeentenaam.str.contains('(CR)', regex=False, na=True) &
    ~population_gemeentenaam.str.contains('(PV)', regex=False, na=True) &
    (population_gemeentenaam != 'Nederland')].copy()
df_population_prognose = df_population_prognose[df_population_prognose['interval'].astype(int)<=roundedto5periods]
# Scale by 1000 before rounding, as the 'prognose bevolking' block of the combined
# "Get data" cell earlier in this notebook does for the same table.
# NOTE(review): presumably the source values are in thousands - confirm against the table metadata.
df_population_prognose['aantalinwoners'] = (df_population_prognose['aantalinwoners'] * 1000).round().astype(int)

In [None]:
df_population_prognose

In [None]:
print("Get 'prognose huishoudens' tables")
df_huishouden_prognose = get_and_combine_cbs_tables(dict_tables=settings.predict['DICT_TABLES_HUISHOUDEN'],
                                                    url=settings.get_data['CBS_OPEN_URL'])
df_huishouden_prognose['interval'] = df_huishouden_prognose['perioden']
df_huishouden_prognose = df_huishouden_prognose.rename(columns=settings.predict['DICT_COLS_RENAMED_HUISHOUDEN'])
# Keep only the point forecast rows
df_huishouden_prognose = df_huishouden_prognose[df_huishouden_prognose['prognoseinterval']=='Prognose']
# Drop rows whose name carries the '(CR)' / '(PV)' suffix and the national total.
# regex=False matches the suffix literally; as a regex, '(CR)' is a capture group that
# matches any name containing "CR". na=True keeps the old behaviour of dropping rows
# without a name.
huishouden_gemeentenaam = df_huishouden_prognose['gemeentenaam']
df_huishouden_prognose = df_huishouden_prognose[
    ~huishouden_gemeentenaam.str.contains('(CR)', regex=False, na=True) &
    ~huishouden_gemeentenaam.str.contains('(PV)', regex=False, na=True) &
    (huishouden_gemeentenaam != 'Nederland')].copy()
# Scale by 1000 before rounding, as the 'prognose huishoudens' block of the combined
# "Get data" cell earlier in this notebook does for the same table.
# NOTE(review): presumably the source values are in thousands - confirm against the table metadata.
df_huishouden_prognose['particulierehuishoudens'] = (
    df_huishouden_prognose['particulierehuishoudens'] * 1000).round().astype(int)
# One row per municipality and year, one column per household composition.
# aggfunc='sum' is the non-deprecated spelling of np.sum here.
df_huishouden_prognose_pivot = pd.pivot_table(data=df_huishouden_prognose, values='particulierehuishoudens',
                                         index=['gemeentenaam', 'interval'],
                                         columns=['samenstellingvanhethuishouden'], aggfunc='sum').reset_index()
df_huishouden_prognose_pivot = df_huishouden_prognose_pivot[df_huishouden_prognose_pivot['interval'].astype(int) <= roundedto5periods]
df_huishouden_prognose_pivot = rename_and_subset_cols(df=df_huishouden_prognose_pivot,
                                                dict_rename=settings.predict['DICT_COLS_RENAMED_HUISHOUDEN_PIVOT'],
                                                list_cols=settings.predict['LIST_COLS_SUBSET_HUISHOUDING_PIVOT'])

In [None]:
df_huishouden_prognose_pivot

In [None]:
print("Get 'regio-indeling'")
# Download the regional classification table(s)
df_regioindeling = get_and_combine_cbs_tables(
    url=settings.get_data['CBS_OPEN_URL'],
    dict_tables=settings.predict['DICT_TABLES_REGIOINDELING'],
    double_trouble_colnames=settings.predict['DICT_DOUBLETROUBLECOLNAMES_REGIOINDELING'])
# Rename the columns and keep the configured subset
df_regioindeling = rename_and_subset_cols(
    df=df_regioindeling,
    list_cols=settings.predict['LIST_COLS_SUBSET_REGIOINDELING'],
    dict_rename=settings.predict['DICT_COLS_RENAMED_REGIOINDELING'])
# Strip surrounding whitespace from the configured string columns
list_str_strip_cols = settings.predict['LIST_STR_STRIP_COLS_REGIOINDELING']
df_regioindeling[list_str_strip_cols] = df_regioindeling[list_str_strip_cols].apply(
    lambda column: column.str.strip())

In [None]:
df_regioindeling

In [None]:
# Merge all dataframes: regional classification <- household forecast <- population forecast

df_prognoses = df_regioindeling.merge(df_huishouden_prognose_pivot, how='left', on=['gemeentenaam'])
df_prognoses = df_prognoses.merge(df_population_prognose, how='left', on=['gemeentenaam', 'interval'])

In [None]:
df_prognoses

In [None]:
import cbsodata
dict_tables=settings.predict['DICT_TABLES_REGIOINDELING']
url=settings.get_data['CBS_OPEN_URL']

In [None]:
print(f"Number of tables to collect: {len(dict_tables)}")

In [None]:
df = pd.DataFrame()
for interval, table in dict_tables.items():
    print(f"Pythonic iteration {interval} for table {table}")
    df_sub = pd.DataFrame(cbsodata.get_data(table, catalog_url=url))


In [None]:
{i:i for i in df_sub.columns}

In [None]:
{'Code_1': 'Code_1gemeente',
 'Naam_2': 'Naam_2gemeente',
 'SorteringNaam_3': 'SorteringNaam_3gemeente',
 'Code_4': 'Code_4arbeidsmarktregio',
 'Naam_5': 'Naam_5arbeidsmarktregio',
 'Code_6': 'Code_6arrondissementenrechtsgebieden',
 'Naam_7': 'Naam_7arrondissementenrechtsgebieden',
 'Code_8': 'Code_8corop',
 'Naam_9': 'Naam_9corop',
 'Code_10': 'Code_10coropsub',
 'Naam_11': 'Naam_11coropsub',
 'Code_12': 'Code_12coropplus',
 'Naam_13': 'Naam_13coropplus',
 'Code_14': 'Code_14ggdregio',
 'Naam_15': 'Naam_15ggdregio',
 'Code_16': 'Code_16jeugdzorgregio',
 'Naam_17': 'Naam_17jeugdzorgregio',
 'Code_18': 'Code_18kvk',
 'Naam_19': 'Naam_19jkvk',
 'Code_20': 'Code_20landbouwgebieden',
 'Naam_21': 'Naam_21landbouwgebieden',
 'Code_22': 'Code_22landbouwgebiedengroepen',
 'Naam_23': 'Naam_23landbouwgebiedengroepen',
 'Code_24': 'Code_24landsdelen',
 'Naam_25': 'Naam_25landsdelen',
 'Code_26': 'Code_26nutseen',
 'Naam_27': 'Naam_27nutseen',
 'Code_28': 'Code_28nutstwee',
 'Naam_29': 'Naam_29nutstwee',
 'Code_30': 'Code_30nutsdrie',
 'Naam_31': 'Naam_31nutsdrie',
 'Code_32': 'Code_32provincies',
 'Naam_33': 'Naam_33provincies',
 'Code_34': 'Code_34regionaleeenheden',
 'Naam_35': 'Naam_35regionaleeenheden',
 'Code_36': 'Code_36regionaleenergiestrategieregios',
 'Naam_37': 'Naam_37regionaleenergiestrategieregios',
 'Code_38': 'Code_38regionalemeldencoordinatiepunten',
 'Naam_39': 'Naam_39regionalemeldencoordinatiepunten',
 'Code_40': 'Code_40regioplusarbeidsmarktregios',
 'Naam_41': 'Naam_41regioplusarbeidsmarktregios',
 'Code_42': 'Code_42ressortenrechtsgebieden',
 'Naam_43': 'Naam_43ressortenrechtsgebieden',
 'Code_44': 'Code_44subresregios',
 'Naam_45': 'Naam_45subresregios',
 'Code_46': 'Code_46toeristengebieden',
 'Naam_47': 'Naam_47toeristengebieden',
 'Code_48': 'Code_48veiligheidsregios',
 'Naam_49': 'Naam_49veiligheidsregios',
 'Code_50': 'Code_50zorgkantoorregios',
 'Naam_51': 'Naam_51zorgkantoorregios',
 'Code_52': 'Code_52gemeentegrootte',
 'Omschrijving_53': 'Omschrijving_53gemeentegrootte',
 'Code_54': 'Code_54stedelijksheidsklase',
 'Omschrijving_55': 'Omschrijving_55stedelijkheidsklasse',
 'Inwonertal_56': 'Inwonertal_56',
 'Omgevingsadressendichtheid_57': 'Omgevingsadressendichtheid_57'}

In [None]:
# Draft of a rename mapping for the 'regio-indeling' columns (work in progress).
# The original literal mixed set and dict syntax and had an empty value, so it could not be
# parsed. The finished entries are kept as a dict and the columns that have no new name
# yet are kept in a separate list.
dict_cols_renamed_draft = {
    'Code_1': 'codegemeente',
    'Naam_2': 'naamgemeente',
}
list_cols_without_new_name = [
    'ID', 'RegioS', 'SorteringNaam_3',
    'Code_4', 'Naam_5', 'Code_6', 'Naam_7', 'Code_8', 'Naam_9', 'Code_10', 'Naam_11',
    'Code_12', 'Naam_13', 'Code_14', 'Naam_15', 'Code_16', 'Naam_17', 'Code_18', 'Naam_19',
    'Code_20', 'Naam_21', 'Code_22', 'Naam_23', 'Code_24', 'Naam_25', 'Code_26', 'Naam_27',
    'Code_28', 'Naam_29', 'Code_30', 'Naam_31', 'Code_32', 'Naam_33', 'Code_34', 'Naam_35',
    'Code_36', 'Naam_37', 'Code_38', 'Naam_39', 'Code_40', 'Naam_41', 'Code_42', 'Naam_43',
    'Code_44', 'Naam_45', 'Code_46', 'Naam_47', 'Code_48', 'Naam_49', 'Code_50', 'Naam_51',
    'Code_52', 'Omschrijving_53', 'Code_54', 'Omschrijving_55',
    'Inwonertal_56', 'Omgevingsadressendichtheid_57',
]
dict_cols_renamed_draft

In [None]:
print(f"Number of tables to collect: {len(dict_tables)}")

# Cell-level version of the table-combining loop: download every table in `dict_tables`,
# normalise its column names, tag it with its interval and stack the results in `df`.
# `double_trouble_colnames` was undefined here; use the mapping that is passed together
# with DICT_TABLES_REGIOINDELING in the 'regio-indeling' cells of this notebook.
double_trouble_colnames = settings.predict['DICT_DOUBLETROUBLECOLNAMES_REGIOINDELING']

list_df_sub = []
for interval, table in dict_tables.items():
    print(f"Pythonic iteration {interval} for table {table}")
    try:
        df_sub = pd.DataFrame(cbsodata.get_data(table, catalog_url=url))
        if double_trouble_colnames:
            # Rename first the columns that would otherwise collide after stripping
            df_sub = df_sub.rename(columns=double_trouble_colnames)
        # Drop the trailing digits and underscores, and lower-case every column name
        cols_wijk_stripped = [i.rstrip('0123456789').replace("_", "").lower() for i in list(df_sub.columns)]
        dict_wijk_cols_renamed = dict(zip(df_sub.columns, cols_wijk_stripped))
        df_sub = df_sub.rename(columns=dict_wijk_cols_renamed)
        df_sub['interval'] = interval
    except Exception as error:
        # Still best effort (a failing table is skipped), but no longer silent
        print(f"Skipping table {table} for interval {interval}: {error!r}")
        continue
    list_df_sub.append(df_sub)
# Concatenate once instead of growing the frame inside the loop
df = pd.concat(list_df_sub, sort=True) if list_df_sub else pd.DataFrame()
# A cell cannot `return`; display the result instead
df

In [None]:
df_huishouden_prognose_pivot = pd.pivot_table(data=df_huishouden_prognose, values='particulierehuishoudens',
                                         index=['regioindeling', 'interval'],
                                         columns=['samenstellingvanhethuishouden'], aggfunc=np.sum).reset_index()

In [None]:
df_huishouden_prognose_pivot

In [None]:
# Drop rows whose region name carries the '(CR)' / '(PV)' suffix and the national total.
# regex=False matches the suffix literally; as a regex, '(CR)' is a capture group that
# matches any name containing "CR". na=True keeps the old behaviour of dropping rows
# without a name.
huishouden_regioindeling = df_huishouden_prognose['regioindeling']
df_huishouden_prognose = df_huishouden_prognose[
    ~huishouden_regioindeling.str.contains('(CR)', regex=False, na=True) &
    ~huishouden_regioindeling.str.contains('(PV)', regex=False, na=True) &
    (huishouden_regioindeling != 'Nederland')].copy()

In [None]:
# Template for combining two text filters: col1 contains neither 'this' nor 'that' AND
# col2 contains 'foo' or 'bar'. The alternatives belong inside one regex string, and
# boolean Series are combined with `&` (not `and`). The `na` arguments make rows with a
# missing value fail both conditions, as the `== False` / `== True` comparisons did.
df[~df["col1"].str.contains('this|that', na=True) & df["col2"].str.contains('foo|bar', na=False)]

In [None]:
df_population_prognose = rename_and_subset_cols(df=df_population_prognose,
                                    dict_rename=settings.get_data['DICT_COLS_RENAMED_WMO'],
                                    list_cols=settings.get_data['LIST_COLS_SUBSET_WMO'])

In [None]:
df_population_prognose['interval'] = df_population_prognose['perioden'].apply(lambda x: x.split(' ')[-1])

# Extend dataframe with future

In [None]:
# Regions that occur in the most recent interval of `df`
latest_interval = df['interval'].max()
list_unique_regions = list(df.loc[df['interval'] == latest_interval, 'codering_regio'].unique())
list_future_years = ['2020', '2021', '2022']
# One empty row per (region, future year) combination
region_year_pairs = [(region, future_year)
                     for region in list_unique_regions
                     for future_year in list_future_years]
df_future = pd.DataFrame(region_year_pairs, columns=['codering_regio', 'interval'])
# Append the empty future rows, then order everything by region and year
df_extended = pd.concat([df, df_future])
df_extended['interval'] = df_extended['interval'].astype(int)
df_extended = df_extended.sort_values(['codering_regio', 'interval']).reset_index().drop(columns=['index'])
df_extended

# Strategy one: Use GroupInterpolateImputer

In [None]:
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline

import src.settings as settings
from src.preprocess.preprocess import make_df_missing
from src.utilities.transformers import ColumnSelector, GroupInterpolateImputer, RelativeColumnScaler, \
    CustomScaler, CustomImputer

In [None]:
df_preprocess = df_extended.reset_index().copy()

# Columns with too many missing values (more than 99 percent)
df_missing = make_df_missing(df_preprocess)
has_too_many_missing = df_missing['perc_missing'] > 0.99
list_drop_missing_cols = list(df_missing.loc[has_too_many_missing].index)

# Columns that are not numeric but objects
is_object_col = df_preprocess.dtypes == object
list_drop_object_cols = list(df_preprocess.loc[:, is_object_col].columns)

# Columns for the first ColumnSelector: everything that is not dropped. The original
# index columns are always kept.
drop_cols_total = [c for c in set(list_drop_missing_cols + list_drop_object_cols)
                   if c not in settings.preprocess['ORIGINAL_INDEX']]
list_column_selector_1 = [c for c in df_preprocess.columns if c not in drop_cols_total]

# Build the pipeline: select the columns, then interpolate the missing values per group
column_selector_step = ColumnSelector(cols=list_column_selector_1)
group_imputer_step = GroupInterpolateImputer(
    groupcols=settings.preprocess['GROUP_INTERPOLATE_IMPUTER_GROUPCOLS'],
    interpolate_method='values',
    cols=settings.preprocess['GROUP_INTERPOLATE_IMPUTER_COLS'])
pl_preprocess = make_pipeline(column_selector_step, group_imputer_step)

In [None]:
df_preprocessed = pl_preprocess.fit_transform(df_preprocess)

In [None]:
df_preprocessed[df_preprocessed['codering_regio']=='GM0197']

In [None]:
# # Create sample set for developing visualisation
# df_test_set_for_visualise = df_preprocessed[['codering_regio', 'interval', 'wmoclientenper1000inwoners']].copy()

# mu, sigma = 0, 0.1 
# noise = np.random.normal(mu, sigma, len(df_test_set_for_visualise)) 

# df_test_set_for_visualise['wmoclientenper1000inwoners'] = df_test_set_for_visualise['wmoclientenper1000inwoners'] + noise
# df_test_set_for_visualise.to_csv('../data/sampleset_y_predict.csv')

In [None]:
pd.DataFrame.interpolate.__code__.co_varnames

# Strategy 2: ffill + percentage
This includes designing a method to determine the growth percentage. Open questions:
   
* Population growth percentage per type of region? Whole country?
* Fixed?
* Certain age?

# Strategy 3: Prognose CBS

Candidate tables:
* 84525NED -> does not cover all municipalities
* 84528NED -> only available per 5 years
* 84526NED -> households

In [None]:
import cbsodata

In [None]:
table = '84526NED'
url = settings.get_data['CBS_OPEN_URL']
df_prognose_bevolking = pd.DataFrame(cbsodata.get_data(table, catalog_url=url))

In [None]:
table = '84528NED'
url = settings.get_data['CBS_OPEN_URL']
df_prognose_bevolking2 = pd.DataFrame(cbsodata.get_data(table, catalog_url=url))

In [None]:
df_prognose_bevolking.head()

In [None]:
df_prognose_bevolking2.head()

In [None]:
df_prognose_bevolking.SamenstellingVanHetHuishouden.unique()

In [None]:
df_prognose_bevolking.RegioIndeling2018.unique()

# Load model, select columns

In [None]:
def _drop_aggregate_regions(df, name_col='gemeentenaam'):
    """Return a copy of ``df`` without aggregate-region rows.

    A row is dropped when its ``name_col`` value contains the literal text
    '(CR)' or '(PV)', or equals 'Nederland'. Rows with a missing name are
    dropped as well.

    The markers are matched literally (``regex=False``): as regular
    expressions the parentheses would be capture groups, so any name that
    merely contains "CR" or "PV" would match and pandas would warn about
    the unused groups.
    """
    names = df[name_col]
    is_aggregate = (
        names.str.contains('(CR)', regex=False, na=True)
        | names.str.contains('(PV)', regex=False, na=True)
        | (names == 'Nederland')
    )
    return df[~is_aggregate].copy()


def predict_data(df_get_data, periods, trained_model, save_all=False, personal_note="", base_year='2019'):
    """Extend the 'get data' frame with CBS forecasts and predict on the requested periods.

    Steps: download the region classification plus the household and
    population forecast tables, merge them, append them to ``df_get_data``,
    fill the future rows (forward fill for text columns, GroupInterpolateImputer
    for forecast / invariable / unused columns, base-year ratio times forecast
    population for the remaining model columns), run ``preprocess_data`` and
    call ``trained_model.predict``.

    Parameters
    ----------
    df_get_data : pandas.DataFrame
        Historical data with at least the columns 'codering_regio' and
        'interval' (interval holds year strings, e.g. '2019').
    periods : iterable of int
        Years to predict. The forecast window runs from ``min(periods)`` up to
        ``max(periods)`` rounded up to the next multiple of five.
    trained_model : object
        Fitted estimator exposing ``predict``.
    save_all : bool, optional
        Currently unused; ``preprocess_data`` is always called with ``save_all=False``.
    personal_note : str, optional
        Currently unused; ``preprocess_data`` is always called with ``personal_note='predict'``.
    base_year : str, optional
        Value of 'interval' whose column-to-population ratios are projected
        onto the future population. Defaults to '2019'.

    Returns
    -------
    The value returned by ``trained_model.predict`` for the preprocessed frame.

    Raises
    ------
    ValueError
        If ``periods`` is empty.
    """
    periods = list(periods)
    if not periods:
        raise ValueError("periods must contain at least one year")

    cfg = settings.predict
    cbs_url = settings.get_data['CBS_OPEN_URL']

    ## Get data (for extending get data with future)
    # Determine boundaries for the forecast data: round the last year up to a multiple of 5
    last_period = max(periods)
    roundedto5periods = last_period + (5 - last_period) % 5
    total_periods = list(range(min(periods), roundedto5periods + 1, 1))
    total_periods_str = [str(x) for x in total_periods]

    print("Get 'regio-indeling'")
    df_regioindeling = get_and_combine_cbs_tables(dict_tables=cfg['DICT_TABLES_REGIOINDELING'],
                                                  double_trouble_colnames=cfg[
                                                      'DICT_DOUBLETROUBLECOLNAMES_REGIOINDELING'],
                                                  url=cbs_url)
    df_regioindeling = rename_and_subset_cols(df=df_regioindeling,
                                              dict_rename=cfg['DICT_COLS_RENAMED_REGIOINDELING'],
                                              list_cols=cfg['LIST_COLS_SUBSET_REGIOINDELING'])
    strip_cols = cfg['LIST_STR_STRIP_COLS_REGIOINDELING']
    df_regioindeling[strip_cols] = df_regioindeling[strip_cols].apply(lambda x: x.str.strip())

    print("Get 'prognose huishoudens' tables")
    df_huishouden_prognose = get_and_combine_cbs_tables(dict_tables=cfg['DICT_TABLES_HUISHOUDEN'],
                                                        url=cbs_url)
    df_huishouden_prognose['interval'] = df_huishouden_prognose['perioden']
    df_huishouden_prognose = df_huishouden_prognose.rename(columns=cfg['DICT_COLS_RENAMED_HUISHOUDEN'])
    df_huishouden_prognose = df_huishouden_prognose[df_huishouden_prognose['prognoseinterval'] == 'Prognose']
    df_huishouden_prognose = _drop_aggregate_regions(df_huishouden_prognose)
    # Source values are multiplied by 1000 and stored as whole numbers
    df_huishouden_prognose['particulierehuishoudens'] = (
        df_huishouden_prognose['particulierehuishoudens'] * 1000).round().astype(int)
    df_huishouden_prognose_pivot = pd.pivot_table(data=df_huishouden_prognose, values='particulierehuishoudens',
                                                  index=['gemeentenaam', 'interval'],
                                                  columns=['samenstellingvanhethuishouden'],
                                                  aggfunc='sum').reset_index()
    df_huishouden_prognose_pivot = df_huishouden_prognose_pivot[
        df_huishouden_prognose_pivot['interval'].astype(int) <= roundedto5periods]
    df_huishouden_prognose_pivot = rename_and_subset_cols(df=df_huishouden_prognose_pivot,
                                                          dict_rename=cfg['DICT_COLS_RENAMED_HUISHOUDEN_PIVOT'],
                                                          list_cols=cfg['LIST_COLS_SUBSET_HUISHOUDING_PIVOT'])

    print("Get 'prognose bevolking' tables")
    df_population_prognose = get_and_combine_cbs_tables(dict_tables=cfg['DICT_TABLES_BEVOLKING'],
                                                        url=cbs_url)
    df_population_prognose = rename_and_subset_cols(df=df_population_prognose,
                                                    dict_rename=cfg['DICT_COLS_RENAMED_BEVOLKING'],
                                                    list_cols=cfg['LIST_COLS_SUBSET_BEVOLKING'])
    # The year is the last space-separated token of 'perioden'
    df_population_prognose['interval'] = df_population_prognose['perioden'].apply(lambda x: x.split(' ')[-1])
    df_population_prognose = _drop_aggregate_regions(df_population_prognose)
    df_population_prognose = df_population_prognose[df_population_prognose['interval'].astype(int) <= roundedto5periods]
    df_population_prognose['aantalinwoners'] = (
        df_population_prognose['aantalinwoners'] * 1000).round().astype(int)
    df_population_prognose = df_population_prognose.drop(['perioden'], axis=1)

    # Merge all dataframes
    df_prognoses = pd.merge(df_regioindeling, df_huishouden_prognose_pivot, how='left', on=['gemeentenaam'])
    df_prognoses = pd.merge(df_prognoses, df_population_prognose, how='left', on=['gemeentenaam', 'interval'])
    print(f"Shape of df_prognoses = {df_prognoses.shape}")

    # Concat with the original 'get data' dataframe, keeping only the regions
    # that occur in the most recent interval of the original dataset
    latest_interval = df_get_data['interval'].max()
    regions_in_latest_interval = df_get_data.loc[
        df_get_data['interval'] == latest_interval, 'codering_regio'].unique()
    df_prognoses = df_prognoses[df_prognoses['codering_regio'].isin(regions_in_latest_interval)]
    print(f"Shape of df_prognoses = {df_prognoses.shape}")
    df_future = pd.concat([df_get_data, df_prognoses], axis=0)
    df_future = df_future.sort_values(['codering_regio', 'interval']).reset_index(drop=True)
    print(f"Shape of df_future = {df_future.shape}")

    ## Extend dataframe for blanks
    print("Start extending blancs in DataFrame with future values")
    # Determine columns for each imputing strategy
    list_cols_prognoses = df_prognoses.columns
    list_all_columns = list(df_future.columns)
    list_cols_str = list(df_future.loc[:, df_future.dtypes == object].columns)
    list_cols_str = list(set(list_cols_str) - set(list_cols_prognoses))
    # Model columns with the 'relative_' text removed, de-duplicated
    list_cols_trained_model = list({x.replace('relative_', '') for x in cfg['LIST_COLS_TRAINED_MODEL']})
    list_cols_relate_imputer = list(
        set(list_cols_trained_model) - set(cfg['LIST_COLS_TRAINED_MODEL_INVARIABLY']) - set(list_cols_prognoses))
    list_cols_group_imputer = list(set(list_all_columns) - set(list_cols_str) - set(list_cols_relate_imputer))

    # ffill for string columns
    print("ffill for string columns")
    df_future.loc[:, list_cols_str] = df_future.loc[:, list_cols_str].ffill()
    print(f"Shape of df_future = {df_future.shape}")

    # Group imputer for available future / invariably columns / columns not used in trained model
    print("Group imputer for available future / invariably columns / columns not used in trained model")
    GII = GroupInterpolateImputer(groupcols=cfg['GROUP_INTERPOLATE_IMPUTER_GROUPCOLS'],
                                  interpolate_method=cfg['GROUP_INTERPOLATE_IMPUTER_METHOD'],
                                  cols=list_cols_group_imputer)
    df_future = GII.fit_transform(df_future)
    print(f"Shape of df_future = {df_future.shape}")

    # Relational imputer for other columns in trained model
    print("Relational imputer for other columns in trained model")
    base_col = 'aantalinwoners'
    all_relate_cols_necessary = cfg['LIST_COLS_GROUPER_RELATE_IMPUTER'] + list_cols_relate_imputer + [base_col]
    # .copy() so the ratio assignment below writes to an independent frame
    df_base_year = df_future[df_future['interval'] == base_year][all_relate_cols_necessary].copy()
    # Ratio of every relate column to the base column in the base year
    df_base_year.loc[:, list_cols_relate_imputer] = df_base_year.loc[:, list_cols_relate_imputer].div(
        df_base_year[base_col], axis=0)
    # 'interval' holds strings, so compare with the string form of the last period;
    # comparing with the int never matched and emptied df_base_year.
    regions_in_last_period = df_future.loc[
        df_future['interval'] == total_periods_str[-1], 'codering_regio'].unique()
    df_base_year = df_base_year[df_base_year['codering_regio'].isin(regions_in_last_period)]
    df_future = df_future.set_index('codering_regio')
    df_ratios = df_base_year.set_index('codering_regio')
    for col in list_cols_relate_imputer:
        # Look up each row's region ratio explicitly; regions without a base-year
        # ratio become NaN. This avoids relying on implicit alignment over the
        # non-unique 'codering_regio' index.
        # NOTE(review): this overwrites the column for every interval, including
        # historical rows; confirm that is intended.
        ratio_per_row = df_ratios[col].reindex(df_future.index).to_numpy()
        df_future[col] = df_future[base_col] * ratio_per_row
    print(f"Shape of df_future = {df_future.shape}")
    df_future = df_future[df_future['interval'].isin(total_periods_str)].reset_index()
    df_future = df_future.set_index(['codering_regio', 'interval'])
    print(f"Shape of df_future = {df_future.shape}")

    ## Preprocess
    df_preprocessed = preprocess_data(df=df_future, save_all=False, personal_note='predict')
    df_preprocessed = df_preprocessed.drop(settings.Y_TARGET_COLS, axis=1)

    ## Predict
    y_preds = trained_model.predict(df_preprocessed)

    # TODO: saving the predictions is not implemented yet
    return y_preds