# Imports

In [None]:
from os import listdir
from os.path import isfile, join
import pandas as pd
import cbsodata
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import altair as alt
from sklearn import preprocessing
import plotly.express as px

# from typing import Union
# from sklearn import preprocessing
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.pipeline import Pipeline, make_pipeline

# Raise pandas' display limits (row count, column count, line width) for notebook output.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

# Settings

In [None]:
## Dataframe parameters
# Location of the dataset.
# NOTE(review): absolute, machine-specific Windows path; it is not referenced in the visible part of this notebook.
DF_LOCATION = 'C:/_NoBackup/Git/__JADS/WMO_execute_group_project/data/df_dataset_WMO.parquet.gzip'
# Directory that holds all data files (relative to the notebook).
datapath = '../data/'
# Callable used to load the dataset, e.g. pd.read_parquet or pd.read_csv.
DF_READ = pd.read_parquet

## X & Y parameters
# Columns that must be removed from the X dataset: at least the y column, optionally more.
# X_DROP_VALUES = ['wmoclienten', 'eenpersoonshuishoudens', 'huishoudenszonderkinderen', 'huishoudensmetkinderen']
X_DROP_VALUES = ['wmoclienten', 'percentagewmoclienten', 'eenpersoonshuishoudens', 'huishoudenszonderkinderen', 'huishoudensmetkinderen']
# Column used as the y value (kept as a one-element list).
Y_VALUE = ['wmoclientenper1000inwoners']
# Test size for the train/test split.
TEST_SIZE = 0.3
# Random state for the train/test split, e.g. 42 as a fixed seed for reproducibility.
RANDOM_STATE = 42

## Pipeline parameters
# Strategy and fill value for missing values in categorical columns.
NAN_VALUES_CAT_STRATEGY = 'constant'
NAN_VALUES_CAT_VALUES = 'Missing'
# Strategy for filling missing values in numerical columns, e.g. mean or median.
NAN_VALUES_NUM_STRATEGY = 'mean'

## Model parameters
# Cross-validation setting for the models, e.g. 10 or RepeatedKFold(n_splits=30, n_repeats=5, random_state=1).
CROSS_VALIDATE = 10
# Scoring method used by the models.
MODEL_SCORING = 'neg_mean_squared_error'

## Scoring parameters
# These can be added later once there are several ways of scoring, i.e. not only the RMSLE.

# Functions

In [None]:
def get_latest_file(mypath='../data/'):
    """
    Load the latest prepared training dataset found in a directory.

    Every regular file in ``mypath`` whose name contains
    ``"df_prep_for_train_WMO"`` is a candidate. The candidate whose file name
    sorts last is read with ``pd.read_parquet``. "Latest" therefore means
    "last in file-name order" (presumably the names carry a sortable
    timestamp -- verify against the code that writes these files).

    :params str mypath: Directory where the data can be found, with or without a trailing path separator. Default = '../data/'

    :raises FileNotFoundError: if ``mypath`` does not exist or contains no matching file

    return: pd.DataFrame
    """
    pattern = "df_prep_for_train_WMO"
    candidates = sorted(
        name for name in listdir(mypath)
        if pattern in name and isfile(join(mypath, name))
    )
    if not candidates:
        # Fail with an actionable message instead of a bare IndexError from [-1].
        raise FileNotFoundError(
            f"No file containing '{pattern}' found in directory '{mypath}'"
        )
    # join() rather than string concatenation, so a directory given without a
    # trailing separator still yields a valid path.
    return pd.read_parquet(join(mypath, candidates[-1]))

# EDA

In [None]:
# Load the dataset selected by get_latest_file from the data directory.
df = get_latest_file(mypath='../data/')

In [None]:
# Show five randomly sampled rows as a quick look at the loaded data.
df.sample(5)

In [None]:
# Column names used for the first exploratory plot (y: vertical axis, x: horizontal axis).
y = 'percentagewmoclienten'
x = 'aantalinwoners'

In [None]:
# Scatter plot of y against x, with points coloured by the 'perioden' column.
sns.scatterplot(data=df, x=x, y=y, hue='perioden')

In [None]:
def plotly_scatter_xy(df, col_x, col_y, annotation, show=True, save=False, save_as='img', save_dir='../img'):
    """
    Draw an 800x800 plotly scatter plot of two columns and optionally show/save it.

    The output file name is ``<YYYYmmddHHMM>_scatter_x_<col_x>_y_<col_y>``; the
    same name plus ``annotation`` is printed below the plot area.

    :params pd.DataFrame df: Data passed to ``px.scatter``
    :params str col_x: Column plotted on the x axis
    :params str col_y: Column plotted on the y axis
    :params str annotation: Free text added below the plot (may be empty)
    :params bool show: Display the figure. Default = True
    :params bool save: Write the figure to ``save_dir``. Default = False
    :params str save_as: 'html' for an HTML file, 'img' for a JPEG. Only used when ``save`` is True. Default = 'img'
    :params str save_dir: Output directory, created if missing. Default = '../img'

    :raises ValueError: if ``save`` is True and ``save_as`` is not 'html' or 'img'

    return: None
    """
    # Validate up front: an unknown format used to be ignored silently, so the
    # caller believed a file had been written when nothing was saved.
    if save and save_as not in ('html', 'img'):
        raise ValueError(f"save_as must be 'html' or 'img', got {save_as!r}")

    fig = px.scatter(df, x=col_x, y=col_y, width=800, height=800)
    timestamp = datetime.now().strftime('%Y%m%d%H%M')
    filename = f"{timestamp}_scatter_x_{col_x}_y_{col_y}"
    annotation_below = f"x: {filename}<br>{annotation}"
    # Paper coordinates place the text relative to the plot area; y=-0.2 puts
    # it below the x axis.
    fig.add_annotation(dict(
        font=dict(color='blue', size=15),
        x=0,
        y=-0.2,
        showarrow=False,
        text=annotation_below,
        textangle=0,
        xanchor='left',
        align='left',
        valign='top',
        xref="paper",
        yref="paper",
    ))
    if save:
        import os

        # The writers below do not create missing directories themselves.
        os.makedirs(save_dir, exist_ok=True)
        target = os.path.join(save_dir, filename)
        if save_as == 'html':
            fig.write_html(f"{target}.html")
        else:
            fig.write_image(f"{target}.jpeg")
    if show:
        fig.show()

In [None]:
# Save one scatter plot per column of df against the target column y.
y = 'percentagewmoclienten'
# A dedicated loop variable is used on purpose: looping with the notebook-level
# name `x` would leave it set to the last column for every later cell.
for col in df.columns:
    if col == y:
        # Plotting the target against itself only yields a diagonal line.
        continue
    plotly_scatter_xy(df=df, col_x=col, col_y=y, annotation="", save=True, show=False)

In [None]:
# Ad-hoc scatter of df[x] against df[y], using whatever x and y currently hold in the session.
# NOTE(review): plotly.express is already imported as px in the imports cell; this re-import is redundant but harmless.
import plotly.express as px
fig = px.scatter(x=df[x], y=df[y])
fig.show()

In [None]:
# Export the figure created above as an HTML file.
fig.write_html("../img/file.html")

In [None]:
# Export the same figure as a JPEG.
# NOTE(review): plotly's static image export needs an image-export engine (e.g. kaleido) installed -- confirm it is available.
fig.write_image("../img/file.jpeg")