# Imports

In [None]:
from os import listdir
from os.path import isfile, join
import pandas as pd
import cbsodata
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import altair as alt
from sklearn import preprocessing
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# from typing import Union
# from sklearn import preprocessing
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.pipeline import Pipeline, make_pipeline

# Raise pandas' display limits so long/wide frames are shown in full.
for _opt, _val in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_opt, _val)

# Settings

In [None]:
## Dataframe parameters
# Location of the dataset
# DF_LOCATION = 'C:/_NoBackup/Git/__JADS/WMO_execute_group_project/data/df_dataset_WMO.parquet.gzip'
# Location of all data
datapath = '../data/'
# Function used to load the dataset, e.g. read_parquet or read_csv
DF_READ = pd.read_parquet

## X & Y parameters
# Columns to remove from the X dataset: at least the y column, and optionally more columns.
# X_DROP_VALUES = ['wmoclienten', 'eenpersoonshuishoudens', 'huishoudenszonderkinderen', 'huishoudensmetkinderen']
X_DROP_VALUES = ['wmoclienten', 'percentagewmoclienten', 'eenpersoonshuishoudens', 'huishoudenszonderkinderen', 'huishoudensmetkinderen']
# Column used as the y value
Y_VALUE = ['wmoclientenper1000inwoners']
# Test size for the train/test split
TEST_SIZE = 0.3
# Random state for the train/test split, e.g. random_state = 42 as a fixed seed for reproducibility
RANDOM_STATE = 42

## Pipeline parameters
# Strategy and fill value for empty categorical columns
NAN_VALUES_CAT_STRATEGY = 'constant'
NAN_VALUES_CAT_VALUES = 'Missing'
# Strategy for filling empty numeric columns, e.g. mean or median
NAN_VALUES_NUM_STRATEGY = 'mean'

## Model parameters
# Cross-validation scheme used by the models, e.g. 10 or RepeatedKFold(n_splits=30, n_repeats=5, random_state=1)
CROSS_VALIDATE = 10
# Scoring metric used by the models
MODEL_SCORING = 'neg_mean_squared_error'

## Scoring parameters
# These can be added later once we have several scoring methods, i.e. not only the RMSLE

# Functions

In [None]:
def get_latest_file(mypath='../data/', step='prep', goal='train'):
    """
    Load the most recent dataset file for a given pipeline step and goal.

    Regular files in ``mypath`` whose name contains ``df_{step}_for_{goal}_WMO``
    are collected; the one that sorts last by name is read with
    ``pd.read_parquet``.
    NOTE(review): "latest" relies on the file names sorting chronologically
    (e.g. through a timestamp suffix) - confirm against the writer of these files.

    :params str mypath: Directory where the data files can be found. Default = '../data/'
    :params str step: Pipeline step that is part of the file name. Default = 'prep'
    :params str goal: Goal that is part of the file name. Default = 'train'

    :raises FileNotFoundError: When no file in ``mypath`` matches the pattern.

    return: pd.DataFrame
    """
    pattern = f"df_{step}_for_{goal}_WMO"
    # Sorted by name so that the last entry is the newest file.
    candidates = sorted(
        f for f in listdir(mypath)
        if pattern in f and isfile(join(mypath, f))
    )
    if not candidates:
        raise FileNotFoundError(
            f"No file containing '{pattern}' found in '{mypath}'"
        )
    # join() instead of string concatenation: works with or without a trailing slash.
    return pd.read_parquet(join(mypath, candidates[-1]))

def single_scatter(df, x, y, ymin=None, ymax=None, xmin=None, xmax=None, show=True, save=False, save_as='img', **kwargs):
    """
    Draw a single plotly express scatter plot of column ``y`` against column ``x``.

    :params pd.DataFrame df: Data to plot
    :params str x: Column name for the x-axis
    :params str y: Column name for the y-axis
    :params ymin: Lower y-axis limit; only applied when ymin and ymax are both numbers. Default = None
    :params ymax: Upper y-axis limit; only applied when ymin and ymax are both numbers. Default = None
    :params xmin: Lower x-axis limit; only applied when xmin and xmax are both numbers. Default = None
    :params xmax: Upper x-axis limit; only applied when xmin and xmax are both numbers. Default = None
    :params bool show: Display the figure. Default = True
    :params bool save: Write the figure to '../img/'. Default = False
    :params str save_as: 'html' for an HTML file, 'img' for a JPEG; any other value saves nothing. Default = 'img'
    :params kwargs: Extra keyword arguments passed on to ``px.scatter``

    return: None
    """
    from numbers import Real

    def _is_number(value):
        # Real also covers numpy scalar types; bool is excluded on purpose.
        return isinstance(value, Real) and not isinstance(value, bool)

    fig = px.scatter(df, x=x, y=y, **kwargs)
    if _is_number(ymin) and _is_number(ymax):
        fig.update_yaxes(range=[ymin, ymax], row=1, col=1)
    if _is_number(xmin) and _is_number(xmax):
        fig.update_xaxes(range=[xmin, xmax], row=1, col=1)
    if save:
        # Minute-resolution timestamp prefix keeps output of different runs apart.
        suffix_datetime = datetime.now().strftime('%Y%m%d%H%M')
        filename = f"{suffix_datetime}_scatter_x_{x}_y_{y}"
        if save_as == 'html':
            fig.write_html(f"../img/{filename}.html")
        elif save_as == 'img':
            fig.write_image(f"../img/{filename}.jpeg")
    if show:
        fig.show()
        
def subplot_scatter(df, xlist, y, show=True, save=False, save_as='img', **kwargs):
    """
    Draw one row of scatter subplots, one per column in ``xlist``, sharing the y-axis.

    Keyword arguments that are parameters of ``go.Scatter`` are applied to every
    trace; all remaining keyword arguments are passed to ``fig.update_layout``.

    :params pd.DataFrame df: Data to plot
    :params list xlist: Column names, one subplot per column
    :params str y: Column name for the shared y-axis
    :params bool show: Display the figure. Default = True
    :params bool save: Write the figure to '../img/'. Default = False
    :params str save_as: 'html' for an HTML file, 'img' for a JPEG; any other value saves nothing. Default = 'img'
    :params kwargs: Trace and/or layout keyword arguments (see above)

    return: None
    """
    import inspect

    fig = make_subplots(rows=1, cols=len(xlist),
                        shared_yaxes=True)

    # Split kwargs once: real go.Scatter parameters go to the traces, the rest to the layout.
    scatter_params = set(inspect.signature(go.Scatter.__init__).parameters) - {'self', 'arg', 'kwargs'}
    go_scatter_kwargs = {k: v for k, v in kwargs.items() if k in scatter_params}
    update_layout_kwargs = {k: v for k, v in kwargs.items() if k not in scatter_params}

    for i, x in enumerate(xlist):
        fig.add_trace(go.Scatter(x=df[x], y=df[y], mode="markers", name=x, **go_scatter_kwargs), row=1, col=i+1)
        fig.update_xaxes(title_text=x, row=1, col=i+1)

    fig.update_layout(**update_layout_kwargs)
    fig.update_yaxes(title_text=y, row=1, col=1)

    if save:
        # Minute-resolution timestamp prefix keeps output of different runs apart.
        suffix_datetime = datetime.now().strftime('%Y%m%d%H%M')
        x_part = '_'.join(str(x) for x in xlist)
        filename = f"{suffix_datetime}_subplot_scatter_x_{x_part}_y_{y}"
        if save_as == 'html':
            fig.write_html(f"../img/{filename}.html")
        elif save_as == 'img':
            fig.write_image(f"../img/{filename}.jpeg")
    if show:
        fig.show()

# EDA

## Load data

In [None]:
# Load the newest 'prep' file from the data directory configured in the Settings cell.
df = get_latest_file(mypath=datapath, step='prep')
# df = pd.read_parquet('../data/df_prep_for_train_WMO_202103112150.parquet.gzip')
# df = pd.read_parquet('../data/df_get_for_train_WMO_202103141516.parquet.gzip')
# df_get_for_train_WMO_202103112139.parquet.gzip

In [None]:
# String version of the numeric period column; used further down as the colour column in plots.
df['periodencat'] = df['periodennum'].astype(str)
# Show five random rows as a quick sanity check.
df.sample(5)

In [None]:
# Summary statistics, transposed so that every dataframe column becomes a row.
df.describe().T

## Play with Seaborn

In [None]:
# Seaborn scatter of column y against column x, coloured by the 'perioden' column.
y = 'percentagewmoclienten'
x = 'aantalinwoners'
sns.scatterplot(data=df, x=x, y=y, hue='perioden')

## Play with plotly

### Single scatterplot
* [Plotly scatterplot](https://plotly.com/python/line-and-scatter/)
* [Scatter params](https://plotly.com/python-api-reference/generated/plotly.graph_objects.Scatter.html)

In [None]:
# One interactive scatter plot; z is the column used for the marker colour.
y = 'percentagewmoclienten'
x = 'vrouwen'
z = 'periodencat'
# Everything after y is forwarded by single_scatter to px.scatter.
single_scatter(df=df, x=x, y=y, color=z, opacity=0.4, size=None, hover_data=[], width=800, height=800, show=True)

### Single scatterplot in a for loop over all columns

In [None]:
# Save (without showing) one scatter plot of y against every column of df.
# With save=True and the default save_as='img', each plot is written as a JPEG to ../img/.
# NOTE(review): the loop also visits y and z themselves and any non-numeric columns - confirm that is intended.
y = 'percentagewmoclienten'
z = 'periodencat'
for x in df.columns:
    single_scatter(df=df, x=x, y=y, color=z, opacity=0.4, size=None, hover_data=[], width=800, height=800, show=False, save=True)

### Subplot scatterplot
[Plotly subplots](https://plotly.com/python/subplots/)

In [None]:
# Weighted sum of the five age-band columns.
# NOTE(review): the weights 7.5/20/35/55/75 look like approximate midpoints of the age bands - confirm.
df['leeftijd_mix_sum'] = (7.5*df['k0tot15jaar'])+(20*df['k15tot25jaar'])+(35*df['k25tot45jaar'])+(55*df['k45tot65jaar'])+(75*df['k65jaarofouder'])
# Weighted sum divided by the 'aantalinwoners' column.
# NOTE(review): rows where 'aantalinwoners' is 0 or missing yield inf/NaN - confirm whether such rows can occur.
df['leeftijd_mix_avg'] = df['leeftijd_mix_sum'] / df['aantalinwoners']

In [None]:
# Alternative column sets that can be swapped in for xlist:
# xlist=['k0tot15jaar', 'k15tot25jaar', 'k25tot45jaar', 'k45tot65jaar', 'k65jaarofouder']
# xlist=['vrouwen', 'mannen']
# One subplot per column in xlist, all plotted against the same y column.
xlist = ['leeftijd_mix_sum', 'leeftijd_mix_avg']
y = 'wmoclienten'
subplot_scatter(df=df, xlist=xlist, y=y, opacity=0.5, height=500, width=500, title_text="Test", show=True, save=False)

### Histograms/density plots
* [Plotly histograms](https://plotly.com/python/histograms/)
* [Plotly 2D histogram contour](https://plotly.com/python/2d-histogram-contour/)
* [Plotly density plot](https://plotly.com/python/distplot/)

In [None]:
# 2D histogram contour of the columns currently named by the notebook variables x and y.
# The data columns are passed (df[x], df[y]); x and y themselves only hold column names.
fig = go.Figure(go.Histogram2dContour(
        x = df[x],
        y = df[y],
        colorscale = 'Jet',
        contours = dict(
            showlabels = True,
            labelfont = dict(
                family = 'Raleway',
                color = 'white'
            )
        ),
        hoverlabel = dict(
            bgcolor = 'white',
            bordercolor = 'black',
            font = dict(
                family = 'Raleway',
                color = 'black'
            )
        )

))

fig.show()

In [None]:
# 2D histogram contour for an explicitly chosen pair of columns.
y = 'bevolkingsdichtheid'
x = 'huishoudenszonderkinderen'
# z is assigned here but not used in this cell.
z = 'percentagewmoclienten'

fig = go.Figure(go.Histogram2dContour(
        x = df[x],
        y = df[y],
        colorscale = 'Jet',
        # Label the contour lines.
        contours = dict(
            showlabels = True,
            labelfont = dict(
                family = 'Raleway',
                color = 'white'
            )
        ),
        # Styling of the hover tooltip.
        hoverlabel = dict(
            bgcolor = 'white',
            bordercolor = 'black',
            font = dict(
                family = 'Raleway',
                color = 'black'
            )
        )

))

fig.show()

In [None]:
# Combined figure: density contour plus raw points on the main axes,
# with a histogram of each variable on the secondary axes (x2 / y2).
fig = go.Figure()
# Density contour on the main axes.
fig.add_trace(go.Histogram2dContour(
        x = df[x],
        y = df[y],
        colorscale = 'Blues',
        reversescale = True,
        xaxis = 'x',
        yaxis = 'y'
    ))
# Individual observations drawn over the contour.
fig.add_trace(go.Scatter(
        x = df[x],
        y = df[y],
        xaxis = 'x',
        yaxis = 'y',
        mode = 'markers',
        marker = dict(
            color = 'rgba(0,0,0,0.3)',
            size = 3
        )
    ))
# Histogram of the y column, drawn against the secondary x-axis.
fig.add_trace(go.Histogram(
        y = df[y],
        xaxis = 'x2',
        marker = dict(
            color = 'rgba(0,0,0,1)'
        )
    ))
# Histogram of the x column, drawn against the secondary y-axis.
fig.add_trace(go.Histogram(
        x = df[x],
        yaxis = 'y2',
        marker = dict(
            color = 'rgba(0,0,0,1)'
        )
    ))
# Main axes take 0-0.85 of the figure, the secondary axes the remaining 0.85-1 strip.
fig.update_layout(
    autosize = False,
    xaxis = dict(
        zeroline = False,
        domain = [0,0.85],
        showgrid = False
    ),
    yaxis = dict(
        zeroline = False,
        domain = [0,0.85],
        showgrid = False
    ),
    xaxis2 = dict(
        zeroline = False,
        domain = [0.85,1],
        showgrid = False
    ),
    yaxis2 = dict(
        zeroline = False,
        domain = [0.85,1],
        showgrid = False
    ),
    height = 600,
    width = 600,
    bargap = 0,
    hovermode = 'closest',
    showlegend = False
)

fig.show()

### Sunburst
[Plotly sunburst docs](https://plotly.com/python/sunburst-charts/)

In [None]:
import plotly.express as px
import numpy as np
# Sunburst demo on plotly's gapminder sample data. The sample is kept under its own
# name so that the notebook-wide `df` is not overwritten for the cells that follow.
df_gapminder = px.data.gapminder().query("year == 2007")
fig = px.sunburst(df_gapminder, path=['continent', 'country'], values='pop',
                  color='lifeExp', hover_data=['iso_alpha'],
                  color_continuous_scale='RdBu',
                  color_continuous_midpoint=np.average(df_gapminder['lifeExp'], weights=df_gapminder['pop']))
fig.show()

## Play with finding most important features

In [None]:
# Work on a copy so the experiments below leave df untouched.
df_copy = df.copy()

In [None]:
# Replace every missing value with a small non-zero constant.
# NOTE(review): why 0.00001 rather than 0 is not visible here - confirm.
df_copy = df_copy.fillna(value=0.00001)

In [None]:
# Count the remaining missing values per column.
df_copy.isnull().sum()

In [None]:
# Features: all columns except the target column, 'wmoclienten' and 'perioden'.
X = df_copy.copy().drop([ 'wmoclientenper1000inwoners', 'wmoclienten', 'perioden'], axis=1)
# Target column; from here on y holds a Series instead of a column name.
y = df_copy.copy()['wmoclientenper1000inwoners']

In [None]:
# Keep the values of X only where the matching cell of df equals the string '         .',
# then drop every column that still contains a NaN.
# NOTE(review): the mask is built from df rather than df_copy/X, and the string is
# presumably a missing-value marker in the source data - confirm both.
X.where(df == '         .').dropna(axis=1)

In [None]:
# Summary statistics of the filled copy, one row per column.
df_copy.describe().T

In [None]:
# Rank features by the absolute size of their Lasso coefficient.
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
# df_copy = df_copy.fillna(value=0)
# X = df_copy.copy().drop([ 'wmoclientenper1000inwoners', 'wmoclienten', 'perioden', 'typemaatwerkarrangement', 'gemeentenaam', 'meestvoorkomendepostcode', 'periodennum'], axis=1)
# Features: df_copy without the target column and the other columns listed here.
X = df_copy.copy().drop([ 'wmoclientenper1000inwoners', 'wmoclienten', 'perioden', 'periodennum', 'periodencat', 'eenpersoonshuishoudens', 'huishoudensmetkinderen'], axis=1)
y = df_copy.copy()['wmoclientenper1000inwoners']
# Fit a Lasso with default hyperparameters through SelectFromModel.
# NOTE(review): no scaling is applied in this cell, so coefficient sizes depend on
# each column's units - confirm whether that is intended.
selector = SelectFromModel(estimator=Lasso()).fit(X, y)

# One row per feature: the coefficient (column 0) and its absolute value; show the 25 largest.
df_features2 = pd.DataFrame(data=selector.estimator_.coef_, index=X.columns)
df_features2['abs'] = df_features2[0].abs()
df_features2.sort_values(by='abs', ascending=False).head(25)

In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
# All column names of the working copy.
list(df_copy.columns)

In [None]:
# Weighted sum of the five age-band columns, with weights 0.25 / 0.5 / 1 / 1.5 / 2 from youngest to oldest band.
# NOTE(review): the rationale for these weights is not visible here - confirm.
df_copy['leeftijd_mix'] = (0.25*df_copy['k0tot15jaar'])+(0.5*df_copy['k15tot25jaar'])+df_copy['k25tot45jaar']+(1.5*df_copy['k45tot65jaar'])+(2*df_copy['k65jaarofouder'])

In [None]:
# Scatter of the target column against the weighted age mix.
y = 'wmoclientenper1000inwoners'
x = 'leeftijd_mix'
# single_scatter is the scatter helper defined in this notebook; plotly_scatter_xy is not defined here.
single_scatter(df=df_copy, x=x, y=y, save=False, show=True)

In [None]:
import plotly.express as px
# Read both columns from df_copy: 'leeftijd_mix' was added to df_copy, not to df.
fig = px.scatter(x=df_copy[x], y=df_copy[y])
fig.show()

In [None]:
# Save the current figure as an HTML file.
fig.write_html("../img/file.html")

In [None]:
# Save the current figure as a JPEG image.
fig.write_image("../img/file.jpeg")