# Imports

In [None]:
# Python packages
import sys
sys.path.append('../../')
from datetime import datetime
import numpy as np
import pandas as pd

# Custom functions
import src.settings as settings
import src.mapper_cols as mapper_cols
from src.run_all.main_get_data import get_data
from src.run_all.main_preprocess import preprocess_data
from src.utilities.utilities import get_latest_file

# Raise the pandas display limits (rows, columns, line width) for notebook output
for _option_name, _option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

# Get data
This step loads and combines several tables from CBS StatLine.

Note: This step takes a number of minutes and, as long as the settings are unchanged, gives the same result every time. Therefore the code that creates a new dataset is commented out and a previously saved dataset is loaded instead.

In [None]:
%%time 
# Three ways to obtain the 'Get Data' dataframe; exactly one should be active.

# ## OPTION 1: CREATE NEW DATASET (takes a number of minutes; left disabled)
# df_get_data_WMO = get_data(save=True)

# ## OPTION 2 (active): HARDCODED - load one specific, previously saved file
datapath = '../../data/'
filename = 'df_get_data_WMO_WIJK_HUISHOUDENS_BEVOLKING_HEFFING_202104040949.parquet.gzip'
df_get_data_WMO = pd.read_parquet(datapath + filename)

# ## OPTION 3: SELECT LAST FILE (left disabled)
# The result is assigned to df_get_data_WMO so the statements below keep working
# when this option is enabled.
# NOTE(review): 'df_WMO_' is not a substring of the hardcoded filename above -
# confirm that this pattern matches the files written by get_data.
# datapath = '../../data/'
# df_get_data_WMO = get_latest_file(filename_str_contains='df_WMO_', datapath=datapath, filetype='parquet')

print(f"The shape of the dataframe from step 'Get Data': {df_get_data_WMO.shape}")
# Fixed seed so the preview shows the same five rows on every Restart & Run All
df_get_data_WMO.sample(5, random_state=42)

# Preprocess
This step transforms the dataframe (selecting columns, imputing, scaling) so it can be used in the train/predict steps.

In [None]:
# Each testset name maps to a three-item list. The creation loop further down
# copies the items into settings.preprocess in this order:
#   [0] -> 'DICT_RELATIVELY_COLS'
#   [1] -> 'LIST_CUSTOMSCALER_COLS'
#   [2] -> 'LIST_COLUMNSELECTOR_COLS_2'
testsets = {}

testsets['All'] = [
    mapper_cols.DICT_WMO_RELATIVELY_COLS_ALL,
    mapper_cols.LIST_WMO_GET_DATA_ALL,
    None,
]

# Same as 'All' but with an empty relative-columns dictionary
testsets['No_Relative'] = [
    {},
    mapper_cols.LIST_WMO_GET_DATA_ALL,
    None,
]

testsets['Boerenverstand_Maikel'] = [
    mapper_cols.DICT_WMO_RELATIVELY_COLS_BOERENVERSTAND_MAIKEL,
    mapper_cols.LIST_WMO_GET_DATA_BOERENVERSTAND_MAIKEL,
    mapper_cols.LIST_COLUMNSELECTOR_2_BOERENVERSTAND_MAIKEL,
]

# Same first two items as 'Boerenverstand_Maikel', with an explicit column list
testsets['Minimum_Maikel'] = [
    mapper_cols.DICT_WMO_RELATIVELY_COLS_BOERENVERSTAND_MAIKEL,
    mapper_cols.LIST_WMO_GET_DATA_BOERENVERSTAND_MAIKEL,
    [
        'codering_regio',
        'interval',
        'aantalinwoners',
        'gescheiden',
        'verweduwd',
        'alleenstaande_mannen',
        'alleenstaande_vrouwen',
        'poparbeidsongeschiktheidtotaal',
        'popbevolkingsdichtheid',
        'popk65tot80jaarrelatieveleeftijdsgroep',
        'popk80jaarofouderrelatieveleeftijdsgroep',
    ],
]

In [None]:
# Keys of settings.preprocess, listed in the same order as the items of each testset
preprocess_setting_names = (
    'DICT_RELATIVELY_COLS',
    'LIST_CUSTOMSCALER_COLS',
    'LIST_COLUMNSELECTOR_COLS_2',
)

for testset_name, testset_params in testsets.items():
    print(f"Testset {testset_name} will be created")

    # Overwrite the preprocess settings with this testset's parameters
    for position, setting_name in enumerate(preprocess_setting_names):
        settings.preprocess[setting_name] = testset_params[position]

    # Run the preprocessing on the 'Get Data' dataframe; the testset name is passed as personal_note
    df_preprocessed = preprocess_data(
        df=df_get_data_WMO,
        save_all=True,
        personal_note=testset_name,
    )
    print(f"The shape of the dataframe from step 'Preprocess': {df_preprocessed.shape}")

# Appendix
## Code examples to get a subset of the DataFrame based on a MultiIndex

In [None]:
# Reference snippets, intentionally left commented out.
# `df` stands for a DataFrame with a two-level MultiIndex; the examples use
# string labels such as 'WK026801' (first level) and '2019' (second level).

# # Subset on column value:
# df[df['gemeentenaam']=='Nijmegen']

# # One row / record
# df.loc[('WK026801', '2019')]

# # Multiple rows / records based on combination of the multiindex
# df.loc[[('WK026801', '2018'), ('WK026802', '2018')]]

# # Multiple rows / records for one column (works only for series)
# df['codering_regio'].loc[(['WK026801', 'WK026802'], ['2018', '2019'])]

# # Multiple rows based on both indexes:
# df.loc(axis=0)[['WK026801', 'WK026802'], ['2018', '2019']]

# # Subset with IndexSlice
# idx = pd.IndexSlice
# # Subset on one of the multiindex and select a column
# df.loc[idx[:, ['2018', '2019']], idx["codering_regio"]]
# # Subset on one of the multiindex and select all columns
# df.loc[idx['WK026801', :], idx[:]]

In [None]:
# df[df['gemeentenaam']=='Nijmegen']

## Code to get current versions of loaded packages

In [None]:
# print('\n'.join(f'{m.__name__} {m.__version__}' for m in globals().values() if getattr(m, '__version__', None)))

# Create a huge dictionary with all possible columns to scale with RelativeColumnScaler

In [None]:
# # Create a huge dictionary with all possible columns to scale with RelativeColumnScaler
# # (Disabled helper snippet.) Merges the four per-table dictionaries from mapper_cols.
# # When a key occurs in more than one dictionary, the value lists are concatenated and
# # de-duplicated with set(), so the order of the combined list is not guaranteed.
# dict_combined = {}
# dicts = [mapper_cols.DICT_RELATIVELY_COL_WIJK,
#          mapper_cols.DICT_RELATIVELY_COL_HUISHOUDEN,
#          mapper_cols.DICT_RELATIVELY_COL_BEVOLKING,
#          mapper_cols.DICT_RELATIVELY_COL_HEFFING]

# for D in dicts:
#     for key, value in D.items():
#         if key in dict_combined.keys():
#             # Key seen before: union the existing list with the new one
#             value_dict_combined = dict_combined[key]
#             new_value = list(set(value_dict_combined+value))
#             dict_combined[key] = new_value
#         else:
#             # First occurrence of this key: take the list as-is
#             dict_combined[key] = value
# dict_combined