# Imports and settings

In [None]:
# Python packages
import sys
sys.path.append('../')
from datetime import datetime
import numpy as np
import pandas as pd
import pickle

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import altair as alt
from sklearn import preprocessing
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Custom functions
import src.settings as settings
from src.run_all.get_data import get_data, get_data_predict
from src.run_all.preprocess import preprocess_data
from src.run_all.train import train_and_fit_models
from src.run_all.predict import predict_data
from src.utilities.utilities import get_latest_file, list_filenames

# Display settings for DataFrames
pd.options.display.max_rows = 500     # upper limit on rows shown before the output is truncated
pd.options.display.max_columns = 500  # upper limit on columns shown before the output is truncated
pd.options.display.width = 1000       # total width of the display, in characters
# pd.set_option("display.precision", 2) # precision of column
# pd.set_option('display.float_format', lambda x: '{:.15f}'.format(x)) # float format

In [None]:
# Run configuration
PROCESS = 'train_and_predict'  # forecast data is loaded when this contains 'predict'
SOURCE = 'hardcoded'           # 'new' or 'hardcoded'; selects how the source data is loaded
PERSONAL_NOTE = f'{PROCESS}_{SOURCE}'
PREDICT_PERIODS = [2020, 2021, 2022, 2023]
SAVE_ALL = False

# Directory the data files are read from and written to
settings.DATAPATH = '../data/'

# Municipalities that get their own row in the result table
gemeentenamen = [
    'Berg en Dal',
    'Beuningen',
    'Druten',
    'Heumen',
    'Mook en Middelaar',
    'Nijmegen',
    'Wijchen',
]

# Load data

## Load source data (saved from CBS Statline)

In [None]:
%%time
# Load the historical data (needed for training and predicting) and, when
# PROCESS contains 'predict', the forecast data as well. Without a prediction
# step df_get_data_predict is an empty DataFrame.
if SOURCE == 'new':
    # Fetch the data through the project helpers
    df_get_data = get_data(save_all=SAVE_ALL, personal_note=PERSONAL_NOTE)
    if 'predict' in PROCESS:
        # Forecast data for the periods to predict
        df_get_data_predict = get_data_predict(periods=PREDICT_PERIODS,
                                               save_all=SAVE_ALL,
                                               personal_note=PERSONAL_NOTE)
    else:
        df_get_data_predict = pd.DataFrame()
elif SOURCE == 'hardcoded':
    # Read previously saved files from settings.DATAPATH
    filename = 'df_get_data_WMO_WIJK_HUISHOUDENS_BEVOLKING_HEFFING_202104241837_train_and_predict_new.parquet.gzip'
    df_get_data = pd.read_parquet(settings.DATAPATH + filename)
    if 'predict' in PROCESS:
        # Forecast data for the periods to predict
        filename = 'df_get_data_predict_202104241838_train_and_predict_new.parquet.gzip'
        df_get_data_predict = pd.read_parquet(settings.DATAPATH + filename)
    else:
        df_get_data_predict = pd.DataFrame()
else:
    # Fail here instead of with a NameError on df_get_data in a later cell
    raise ValueError(f"Unknown SOURCE {SOURCE!r}; expected 'new' or 'hardcoded'")

## Load predicted data

In [None]:
# Read the stored predictions from the data directory
filename = 'df_predict_202104291100_train_and_predict_hardcoded.parquet.gzip'
df_predict = pd.read_parquet(f'{settings.DATAPATH}{filename}')

In [None]:
# df_predict[df_predict['prediction'].isnull()]

# Make one table with historical and predicted values

In [None]:
# Historical table: the rows of the selected municipalities plus one
# 'Nederland' row per interval.
df_hist = df_get_data[df_get_data['gemeentenaam'].isin(gemeentenamen)][['wmoclientenper1000inwoners', 'gemeentenaam']].copy()

# The 'Nederland' value is the unweighted mean of the municipal values per
# interval (every row of df_get_data counts equally), rounded to a whole number.
df_hist_total = df_get_data[['wmoclientenper1000inwoners', 'gemeentenaam']].copy()
df_hist_total = df_hist_total.reset_index()
# numeric_only=True keeps the text columns out of the mean; without it
# pandas >= 2.0 raises a TypeError instead of silently dropping them.
df_hist_total = df_hist_total.groupby('interval').mean(numeric_only=True)
df_hist_total['gemeentenaam'] = 'Nederland'
df_hist_total['codering_regio'] = 'NL'
df_hist_total['wmoclientenper1000inwoners'] = df_hist_total['wmoclientenper1000inwoners'].round().astype(int)
# Rebuild the (codering_regio, interval) index so the rows can be stacked under df_hist
df_hist_total = df_hist_total.reset_index()
df_hist_total = df_hist_total.set_index(['codering_regio', 'interval'])
df_historical = pd.concat([df_hist, df_hist_total])
df_historical

In [None]:
# Predictions table: the rows of the selected municipalities plus one
# 'Nederland' row per interval. The 'prediction' column is renamed so it lines
# up with the historical column name.
df_preds = df_predict[df_predict['gemeentenaam'].isin(gemeentenamen)].copy()
df_preds = df_preds.rename(columns={'prediction':'wmoclientenper1000inwoners'})

# The 'Nederland' value is the unweighted mean of the municipal predictions per
# interval (every row of df_predict counts equally), rounded to a whole number.
df_preds_total = df_predict.copy()
df_preds_total = df_preds_total.rename(columns={'prediction':'wmoclientenper1000inwoners'})
df_preds_total = df_preds_total.reset_index()
# numeric_only=True averages the numeric columns only; without it
# pandas >= 2.0 raises a TypeError on text columns such as 'gemeentenaam'.
df_preds_total = df_preds_total.groupby('interval').mean(numeric_only=True)
df_preds_total['gemeentenaam'] = 'Nederland'
df_preds_total['codering_regio'] = 'NL'
df_preds_total['wmoclientenper1000inwoners'] = df_preds_total['wmoclientenper1000inwoners'].round().astype(int)
# Rebuild the (codering_regio, interval) index so the rows can be stacked under df_preds
df_preds_total = df_preds_total.reset_index()
df_preds_total = df_preds_total.set_index(['codering_regio', 'interval'])
df_predictions = pd.concat([df_preds, df_preds_total])
df_predictions

In [None]:
# Region codes used further down to filter the frames on 'codering_regio'.
# NOTE(review): five codes here versus seven names in gemeentenamen - confirm the subset is intended.
list_coderings = ['GM0225', 'GM0268', 'GM0296', 'GM0944', 'GM1945']

In [None]:
# Report layout: one row per municipality, one column per interval, with the
# historical values followed by the predicted ones.
df_one_table = pd.concat([df_historical, df_predictions])
df_one_table = df_one_table.reset_index()
df_one_table = df_one_table.sort_values(['gemeentenaam', 'interval'])
df_one_table = df_one_table.drop(['codering_regio'], axis=1)
# Round to whole numbers, like the 'Nederland' rows. A plain astype(int) would
# truncate (12.9 -> 12) and raise on missing predictions; the nullable Int64
# dtype keeps missing values as <NA>.
df_one_table['wmoclientenper1000inwoners'] = df_one_table['wmoclientenper1000inwoners'].round().astype('Int64')
df_right_format = df_one_table.pivot(index='gemeentenaam', columns='interval', values='wmoclientenper1000inwoners')
# Show gaps as empty cells; the object cast lets integers and '' share a column
df_right_format = df_right_format.astype(object).fillna('')
df_right_format

In [None]:
# Write the report table to the data directory as CSV
filename_table = 'result_table.csv'
df_right_format.to_csv(path_or_buf=f'{settings.DATAPATH}{filename_table}')

# Try making a boxplot

In [None]:
# Plot data: historical values and predictions for every municipality in the
# source frames, stacked into one frame. Values are left as they are (no integer cast).
df_hist_total_plot = df_get_data[['wmoclientenper1000inwoners', 'gemeentenaam']].copy()
df_preds_total_plot = df_predict.copy().rename(columns={'prediction': 'wmoclientenper1000inwoners'})
df_plot = (
    pd.concat([df_hist_total_plot, df_preds_total_plot])
    .reset_index()
    .sort_values(['gemeentenaam', 'interval'])
    .drop(['codering_regio'], axis=1)
)

In [None]:
# Inspect the combined frame before plotting it
df_plot

In [None]:
# One box per interval, showing the spread of the values over the municipalities.
# orient='h' fixes the horizontal layout so it does not depend on the dtype of 'interval'.
ax = sns.boxplot(data=df_plot, x='wmoclientenper1000inwoners', y='interval', orient='h')
ax.set_title('Wmo clients per 1000 inhabitants, by interval');

# Explaining the difference

In [None]:
# Preprocessed frame used for predicting, with the index moved back into columns
file_preprocess_predict = 'df_preprocess_predict_202104291100_train_and_predict_hardcoded.parquet.gzip'
df_preprocess_predict = pd.read_parquet(settings.DATAPATH + file_preprocess_predict).reset_index()

# Preprocessed frame, read the same way
file_preprocess = 'df_preprocessed_202104291100_train_and_predict_hardcoded.parquet.gzip'
df_preprocess = pd.read_parquet(settings.DATAPATH + file_preprocess).reset_index()


In [None]:
# Column names of interest: the two key columns, the two client counts and the
# 'relative_*' columns.
list_features = [
    # keys
    'codering_regio',
    'interval',
    # client counts
    'wmoclienten',
    'wmoclientenper1000inwoners',
    # relative columns
    'relative_huishoudensmetkinderen',
    'relative_poparbeidsongeschiktheidtotaal',
    'relative_ongehuwd',
    'relative_poptotaleoppervlakte',
    'relative_huishoudenszonderkinderen',
    'relative_ouder_in_eenouderhuishouden_vrouwen',
    'relative_alleenstaande_mannen',
    'relative_popafstandtothuisartsenpraktijk',
    'relative_popbevolkingsdichtheid',
]

In [None]:
# Historical rows for the regions in list_coderings, interval '2019' only.
# reset_index() already returns a new frame, so no extra copy is made.
df_hist_all = df_get_data.reset_index()
df_hist_all.loc[lambda frame: frame['codering_regio'].isin(list_coderings) & frame['interval'].eq('2019')]

In [None]:
# Rows of the prediction input for the regions in list_coderings
df_preprocess_predict.loc[lambda frame: frame['codering_regio'].isin(list_coderings)]

In [None]:
# Rows of the preprocessed frame for the regions in list_coderings
df_preprocess.loc[lambda frame: frame['codering_regio'].isin(list_coderings)]