# Preprocess - Get data predict --> CHANGES NEEDED LATER
This notebook collects the data that is needed for prediction purposes. It gathers data from different sources. The goal is an X-dataset in parquet format, with:
- A number of variables that are used to predict the number of WMO clients. 
- Index contains a region and time interval
- Filename contains a datetime suffix

## Content
* **Imports**: Imports of needed Python packages
* **Settings**: Hard coded variables needed to collect data like sources, tablenames, columnnames, etc. 
* **Functions**: Reusable functions
* **Load data from sources**: Separate paragraph for each source
    * CBS: WIJK_TABLES
* **Combine multiple sources**: Combining all data to one table
* **Extend and subset right timewindow**: Code to extend some columns to ensure that all
* **Write result**: Writing result to '../data'
* **Appendix**: Useful code to preserve
    * Code examples to get a subset of the DataFrame based on multiindex
    * Code to get current versions of loaded packages

## Requirements
The packages to be installed (besides standard Python packages) are:
* pandas >=1.1.5
* cbsodata >=1.3.3

# Imports

In [1]:
import pandas as pd
import cbsodata
from datetime import datetime

# Widen the pandas display limits so large tables are shown without truncation
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Settings

In [2]:
# Catalog URL of the CBS Statline open data service
CBS_OPEN_URL = "opendata.cbs.nl"

# CBS table identifiers with the WMO client information, keyed by year
WMO_TABLES = {
    "2020": "84907NED",
    "2019": "84664NED",
    "2018": "84421NED",
    "2017": "83818NED",
    "2016": "83620NED",
    "2015": "83267NED",
}

# CBS table identifiers with the key figures per neighbourhood, keyed by year
WIJK_TABLES = {
    "2020": "84799NED",
    "2019": "84583NED",
    "2018": "84286NED",
    "2017": "83765NED",
    "2016": "83487NED",
    "2015": "83220NED",
}

# Neighbourhood columns that have to be renamed before their numeric suffix is
# stripped, because otherwise they would end up with duplicate names.
# Both 'EigenWoning_*' and 'Koopwoning_*' map onto the same target name.
DOUBLETROUBLECOLNAMES_WIJK = {
    # Average electricity consumption
    "GemiddeldElektriciteitsverbruikTotaal_47": "GemiddeldElektriciteitsverbruikTotaal_47",
    "Appartement_48": "GemElectriciteitsverbruikAppartement_48",
    "Tussenwoning_49": "GemElectriciteitsverbruikTussenwoning_49",
    "Hoekwoning_50": "GemElectriciteitsverbruikHoekwoning_50",
    "TweeOnderEenKapWoning_51": "GemElectriciteitsverbruikTweeOnderEenKapWoning_51",
    "VrijstaandeWoning_52": "GemElectriciteitsverbruikVrijstaandeWoning_52",
    "Huurwoning_53": "GemElectriciteitsverbruikHuurwoning_53",
    "EigenWoning_54": "GemElectriciteitsverbruikEigenWoning_54",
    "Koopwoning_54": "GemElectriciteitsverbruikEigenWoning_54",
    # Average natural gas consumption
    "GemiddeldAardgasverbruikTotaal_55": "GemiddeldAardgasverbruikTotaal_55",
    "Appartement_56": "GemGasverbruikAppartement_56",
    "Tussenwoning_57": "GemGasverbruikTussenwoning_57",
    "Hoekwoning_58": "GemGasverbruikHoekwoning_58",
    "TweeOnderEenKapWoning_59": "GemGasverbruikTweeOnderEenKapWoning_59",
    "VrijstaandeWoning_60": "GemGasverbruikVrijstaandeWoning_60",
    "Huurwoning_61": "GemGasverbruikHuurwoning_61",
    "EigenWoning_62": "GemGasverbruikEigenWoning_62",
    "Koopwoning_62": "GemGasverbruikEigenWoning_62",
    # District heating
    "PercentageWoningenMetStadsverwarming_63": "PercentageWoningenMetStadsverwarming_63",
}

# TO DO:
# * ADD list with columns to keep for model!

# Functions

In [3]:
def get_and_combine_cbs_tables(dict_tables, double_trouble_colnames=None, url='opendata.cbs.nl'):
    """
    Method to get multiple similar tables from the CBS database and stack them into one DataFrame.

    For every table the columns are first renamed with double_trouble_colnames (if given). After that
    every column name is normalised: trailing digits are stripped, underscores are removed and the
    name is lower-cased. Finally a column 'interval' is added that holds the key of dict_tables.

    Collecting is best effort: a table that cannot be downloaded or processed is skipped, but the
    failure is printed so that missing periods do not go unnoticed.

    :params dict(str: str) dict_tables: Dictionary with as key the period and as value the table name
    :params dict(str: str) double_trouble_colnames: Dictionary with columnnames that will cause trouble if the suffix is deleted
    :params str url: URL of the catalog of the CBS databases, i.e.: 'opendata.cbs.nl'

    return: pd.DataFrame with the rows of all tables that were collected successfully
            (an empty DataFrame if no table could be collected)
    """

    print(f"Number of tables to collect: {len(dict_tables)}")

    # Collect the frames in a list and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all earlier rows every iteration.
    frames = []
    for interval, table in dict_tables.items():
        print(f"Pythonic iteration {interval} for table {table}")
        try:
            df_sub = pd.DataFrame(cbsodata.get_data(table, catalog_url=url))
            if double_trouble_colnames:
                # Rename first, otherwise stripping the suffix below creates duplicate names
                df_sub = df_sub.rename(columns=double_trouble_colnames)
            df_sub = df_sub.rename(columns=lambda name: name.rstrip('0123456789').replace("_", "").lower())
            df_sub['interval'] = interval
        except Exception as error:
            # Keep going with the other tables, but make the failure visible
            print(f"Skipping table {table} for interval {interval}: {type(error).__name__}: {error}")
            continue
        frames.append(df_sub)

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty DataFrame instead
        return pd.DataFrame()
    return pd.concat(frames)

def rename_and_subset_cols(df, dict_rename, list_cols, include=True):
    """
    Rename columns of a DataFrame and then keep or drop a given set of columns.

    The renaming is applied first, so list_cols has to refer to the column names
    as they are after renaming.

    :params pd.DataFrame df: DataFrame with several columns
    :params dict(str:str) dict_rename: Dictionary where the keys are the original column names
                                       and the values are the new column names
    :params list(str) list_cols: List of columns to keep/drop
    :params bool include: True (default) keeps only the columns in list_cols, False drops them.

    return: pd.DataFrame
    """

    renamed = df.rename(columns=dict_rename)
    if not include:
        return renamed.drop(columns=list_cols)
    return renamed[list_cols]

def downcast_variables_dataframe(df):
    """
    Method to downcast the variables in a DataFrame

    Works on a copy, so the DataFrame that is passed in is not changed:
    - object columns are converted to 'category'
    - integer columns are downcast with pd.to_numeric(downcast='unsigned')
    - float columns are downcast with pd.to_numeric(downcast='float')
    Columns that pandas cannot represent in a smaller type are left as they are.

    :params pd.DataFrame: df: DataFrame to downcast

    return: pd.DataFrame
    """
    df_downy = df.copy()
    # Downcast dataset
    for col in df_downy.select_dtypes(include='object').columns:
        df_downy[col] = df_downy[col].astype('category')

    for old, new in [('integer', 'unsigned'), ('float', 'float')]:
        for col in df.select_dtypes(include=old).columns:
            # Assign the column as a whole: df.loc[:, col] = ... writes into the
            # existing column in place on pandas >= 2.0 and keeps the old (wide) dtype,
            # which silently undoes the downcast.
            df_downy[col] = pd.to_numeric(df_downy[col], downcast=new)
    return df_downy

# Load data from sources

## CBS: WIJK_TABLES

In [4]:
%%time
# Download the neighbourhood key figures of all years and stack them into one table
df_wijk = get_and_combine_cbs_tables(
    dict_tables=WIJK_TABLES,
    double_trouble_colnames=DOUBLETROUBLECOLNAMES_WIJK,
    url=CBS_OPEN_URL,
)

# Give the region code and the period their final names and drop the columns that are not needed
DICT_WIJK_COLS_RENAMED = {
    'codering': 'codering_regio',
    'interval': 'perioden',
}
df_wijk_sub = rename_and_subset_cols(
    df=df_wijk,
    dict_rename=DICT_WIJK_COLS_RENAMED,
    list_cols=['id', 'wijkenenbuurten', 'soortregio', 'indelingswijzigingwijkenenbuurten'],
    include=False,
)

# Remove the surrounding whitespace from the region code and the municipality name
df_wijk_sub = df_wijk_sub.assign(
    codering_regio=df_wijk_sub['codering_regio'].str.strip(),
    gemeentenaam=df_wijk_sub['gemeentenaam'].str.strip(),
)

# Keep only the rows whose region code starts with 'WK', downcast them and index on region and period
df_wijk_total = (
    downcast_variables_dataframe(
        df_wijk_sub[df_wijk_sub['codering_regio'].str.startswith('WK', na=False)]
    )
    .set_index(['codering_regio', 'perioden'])
)
df_wijk_total.sample(5)

Number of tables to collect: 6
Pythonic iteration 2020 for table 84799NED
Pythonic iteration 2019 for table 84583NED
Pythonic iteration 2018 for table 84286NED
Pythonic iteration 2017 for table 83765NED
Pythonic iteration 2016 for table 83487NED
Pythonic iteration 2015 for table 83220NED
Wall time: 3min 47s


## Source: Type of information

In [5]:
# Possible other source (to be added later):

# Combine multiple sources

In [7]:
# df_dataset_WMO = pd.merge(df_wmo_total, df_wijk_total, how='inner', left_index=True, right_index=True)

# Extend and subset right timewindow

In [None]:
# TODO!

In [None]:
# df_dataset_WMO.sample(5)

In [None]:
# df_dataset_WMO.shape

# Write result

In [None]:
# suffix_datetime = datetime.strftime(datetime.now(), format='%Y%m%d%H%M')

# df_dataset_WMO.to_parquet(f'../data/df_get_for_predict_WMO_{suffix_datetime}.parquet.gzip',
#               compression='gzip')

# Appendix

## Code to get current versions of loaded packages

In [None]:
# Print "<name> <version>" for every object in the notebook namespace that has a non-empty __version__
print(*(f'{obj.__name__} {obj.__version__}' for obj in globals().values() if getattr(obj, '__version__', None)), sep='\n')