# Imports & settings

In [None]:
# Python packages
import sys
sys.path.append('../../')
from datetime import datetime
import numpy as np
import pandas as pd
import pickle

# Custom functions
import src.settings as settings
from src.run_all.get_data import get_data, get_data_predict
from src.run_all.preprocess import preprocess_data
from src.run_all.train import train_and_fit_models
from src.run_all.predict import predict_data
from src.utilities.utilities import get_latest_file, list_filenames

# Display options for rendering DataFrames in the notebook output
pd.set_option('display.max_rows', 500)  # maximum number of rows to show
pd.set_option('display.max_columns', 500)  # maximum number of columns to show
pd.set_option('display.width', 1000)  # total display width in characters
pd.set_option('display.precision', 2)  # default number of decimals for floats
# NOTE: an explicit float formatter is set last; floats are rendered with 15 decimals.
pd.set_option('display.float_format', '{:.15f}'.format)

# Base settings
For this project, all settings are included in the file `src/settings.py`. This notebook summarizes all the code developed and serves as a showcase. The base settings below ensure that the code in the rest of the notebook is run in the right order.


`PROCESS` : str

    String value of the process to run. Must be one of the following options: 
    `['train', 'predict', 'train_and_predict']`
    
`SOURCE` : str

    String value of the source of the data. As a user you can choose to get the newest data from CBS Statline 
    by running the total script from scratch (this will take a number of minutes) or to use the collected data 
    as saved in this project. IMPORTANT NOTE: When collecting new data, it is possible that CBS has changed 
    column names, deleted tables, etc. In that case the code may run into an error!
    
    Options:
    
    1. Get (new) data from CBS or apply gridsearch on preprocessed data: 'new'
    2. From hardcoded files of get data: 'hardcoded'
    
`PERSONAL_NOTE` : str

    String value to add to the different filenames to make sure the user recognizes the files they have generated.

In [None]:
# Which pipeline steps to run: 'train', 'predict' or 'train_and_predict'
PROCESS = 'predict'
# Where the input data comes from: 'new' or 'hardcoded'
SOURCE = 'hardcoded'
# Suffix added to generated filenames so the user can recognize their own files
PERSONAL_NOTE = f"{PROCESS}_{SOURCE}"

# Get data
This step will load and combine several tables from CBS statline. 

Note: This step takes a number of minutes and, without changes to the settings, will give the same result. Therefore, by default (`SOURCE = 'hardcoded'`) the previously saved dataset is loaded instead of collecting the data again.

In [None]:
%%time
if SOURCE == 'new':
    # Get historical data (needed for training and predicting)
    df_get_data = get_data(save_all=True, personal_note=PERSONAL_NOTE)
    if 'predict' in PROCESS:
        # Get prognosed data
        df_get_data_predict = get_data_predict(save_all=True, personal_note=PERSONAL_NOTE)
    else:
        df_get_data_predict = pd.DataFrame()
elif SOURCE == 'hardcoded':
    # Get historical data (needed for training and predicting)
    filename = 'df_get_data_WMO_WIJK_HUISHOUDENS_BEVOLKING_HEFFING_202104241837_train_and_predict_new.parquet.gzip'
    df_get_data = pd.read_parquet(settings.DATAPATH + filename)
    if 'predict' in PROCESS:
        # Get prognosed data
        filename = 'df_get_data_predict_202104241838_train_and_predict_new.parquet.gzip'
        df_get_data_predict = pd.read_parquet(settings.DATAPATH + filename)
    else:
        df_get_data_predict = pd.DataFrame()

In [None]:
# Report the dimensions of the historical dataset, then display five randomly sampled rows
print(f"The shape of the dataframe from step 'Get Data': {df_get_data.shape}")
df_get_data.sample(n=5)

In [None]:
# Report the dimensions of the prognosis dataset, then display its first five rows
print(f"The shape of the dataframe from step 'Get Data Predict': {df_get_data_predict.shape}")
df_get_data_predict.head(n=5)

# Preprocess data

In [None]:
%%time
if 'train' in PROCESS:
    # This step is included in the code of the predict step
    df_preprocessed = preprocess_data(df=df_get_data, save_all=True, personal_note=PERSONAL_NOTE)
    print(f"The shape of the dataframe from step 'Preprocess': {df_preprocessed.shape}")
else:
    df_preprocessed = pd.DataFrame()
df_preprocessed.head()

# Train

In [None]:
%%time
if 'train' in PROCESS:
    gridsearch_object = train_and_fit_models(df_preprocessed=df_preprocessed,
                                            filename_input=PERSONAL_NOTE,
                                            save_all=True,
                                            personal_note=PERSONAL_NOTE)
    best_trained_model = gridsearch_object.best_estimator_
else:
    best_trained_model = get_latest_file(filename_str_contains='best_model_202104241838_train_and_predict_new', 
                                         datapath=settings.DATAPATH, filetype='pickle')
    print(f"Loaded model is: {best_trained_model}")

# Predict

In [None]:
%%time
if 'predict' in PROCESS:
    df_predict = predict_data(trained_model=best_trained_model,
                              periods=settings.get_data_predict['LIST_PERIODS'],
                              df_get_data=df_get_data,
                              df_get_data_predict=df_get_data_predict,
                              save_all=True, personal_note=PERSONAL_NOTE)

    print(f"The shape of the dataframe from step 'Predict': {df_predict.shape}")
else:
    df_predict = pd.DataFrame()
df_predict.head()