# Imports

In [1]:
#!pip install plotly
import inspect
import numbers
from datetime import datetime
from os import listdir
from os.path import isfile, join

import altair as alt
import cbsodata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from sklearn import preprocessing

%matplotlib inline

# Raise pandas' display limits (rows, columns, line width) for cell output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Settings

# Functions

In [2]:
def single_scatter(df, x, y, ymin=None, ymax=None, xmin=None, xmax=None, show=True, save=False, save_as='img', **kwargs):
    """Draw a single Plotly Express scatter plot of column ``y`` against column ``x``.

    Parameters
    ----------
    df : data passed unchanged to ``px.scatter``.
    x, y : column names for the horizontal and vertical axis.
    ymin, ymax : optional y-axis limits; applied only when BOTH are real numbers.
    xmin, xmax : optional x-axis limits; applied only when BOTH are real numbers.
    show : call ``fig.show()`` when True.
    save : write the figure to ``../img/`` when True.
    save_as : ``'html'`` (``fig.write_html``) or ``'img'`` (``fig.write_image``, JPEG).
    **kwargs : forwarded to ``px.scatter`` (e.g. ``color``, ``opacity``, ``width``).

    Raises
    ------
    ValueError
        If ``save`` is True and ``save_as`` is not ``'html'`` or ``'img'``.
    """
    def _is_number(value):
        # Accept Python and numpy numeric scalars (e.g. the result of
        # df[col].max()); bool is excluded, as in the original exact-type check.
        return isinstance(value, numbers.Real) and not isinstance(value, bool)

    # Fail loudly instead of silently saving nothing for an unknown format.
    if save and save_as not in ('html', 'img'):
        raise ValueError(f"save_as must be 'html' or 'img', got {save_as!r}")

    fig = px.scatter(df, x=x, y=y, **kwargs)

    if _is_number(ymin) and _is_number(ymax):
        fig.update_yaxes(range=[ymin, ymax], row=1, col=1)
    if _is_number(xmin) and _is_number(xmax):
        fig.update_xaxes(range=[xmin, xmax], row=1, col=1)

    if save:
        # Minute-resolution timestamp prefix keeps successive runs apart.
        suffix_datetime = datetime.now().strftime('%Y%m%d%H%M')
        filename = f"{suffix_datetime}_scatter_x_{x}_y_{y}"
        if save_as == 'html':
            fig.write_html(f"../img/{filename}.html")
        else:
            fig.write_image(f"../img/{filename}.jpeg")
    if show:
        fig.show()
        
def subplot_scatter(df, xlist, y, show=True, save=False, save_as='img', **kwargs):
    """Draw one row of scatter subplots, one per column in ``xlist``, sharing the y-axis.

    Parameters
    ----------
    df : object indexable by column name (``df[x]``, ``df[y]``).
    xlist : sequence of column names; each gets its own subplot column.
    y : column name plotted on the shared y-axis.
    show : call ``fig.show()`` when True.
    save : write the figure to ``../img/`` when True.
    save_as : ``'html'`` (``fig.write_html``) or ``'img'`` (``fig.write_image``, JPEG).
    **kwargs : keywords that are ``go.Scatter`` constructor parameters
        (e.g. ``opacity``) are passed to every trace; all remaining keywords
        (e.g. ``title_text``, ``height``, ``width``) go to ``fig.update_layout``.

    Raises
    ------
    ValueError
        If ``save`` is True and ``save_as`` is not ``'html'`` or ``'img'``.
    """
    # Fail loudly instead of silently saving nothing for an unknown format.
    if save and save_as not in ('html', 'img'):
        raise ValueError(f"save_as must be 'html' or 'img', got {save_as!r}")

    fig = make_subplots(rows=1, cols=len(xlist),
                        shared_yaxes=True)

    # Real constructor parameters only (no locals, no self, no **kwargs);
    # computed once because it does not depend on the loop variable.
    scatter_params = {
        name
        for name, param in inspect.signature(go.Scatter.__init__).parameters.items()
        if name != 'self' and param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD)
    }
    go_scatter_kwargs = {k: v for k, v in kwargs.items() if k in scatter_params}
    # Whatever is not a trace option is treated as a layout option.
    update_layout_kwargs = {k: v for k, v in kwargs.items() if k not in scatter_params}

    for col_number, x in enumerate(xlist, start=1):
        fig.add_trace(go.Scatter(x=df[x], y=df[y], mode="markers", name=x, **go_scatter_kwargs), row=1, col=col_number)
        fig.update_xaxes(title_text=x, row=1, col=col_number)

    fig.update_layout(**update_layout_kwargs)
    fig.update_yaxes(title_text=y, row=1, col=1)

    if save:
        # Minute-resolution timestamp prefix keeps successive runs apart.
        suffix_datetime = datetime.now().strftime('%Y%m%d%H%M')
        x_part = "_".join(str(x) for x in xlist)
        filename = f"{suffix_datetime}_subplot_scatter_x_{x_part}_y_{y}"
        if save_as == 'html':
            fig.write_html(f"../img/{filename}.html")
        else:
            fig.write_image(f"../img/{filename}.jpeg")
    if show:
        fig.show()

# EDA

EDA outside dataset

In [3]:
# Load the semicolon-separated CSV into df_geslacht.
df_geslacht = pd.read_csv("../data/Cliënten huishoudelijke hulp leeftijd en geslacht Nederland Nijmegen 25 mrt 2021.csv", sep=';')

In [None]:
# Display the full df_geslacht table.
df_geslacht

In [None]:
# Load the semicolon-separated CSV into df_manvrouw.
df_manvrouw = pd.read_csv("../data/Huishoudens alleenstaand en mannen en vrouwen 2012-2020 25 mrt 2021 - Nijmegen.csv", sep=';')

In [None]:
# Display the full df_manvrouw table.
df_manvrouw

Load data

In [5]:
# Main dataset used by every cell below, read from a gzip-compressed parquet file.
df = pd.read_parquet('../data/df_WMO_WIJK_HOUSEHOLDS_POP_LEVY_absolute_gemeente.parquet.gzip')

Omvang dataset

In [None]:
# (number of rows, number of columns) of the dataset.
df.shape

Check kolommen

In [None]:
# All column names as a plain list, so the output is not abbreviated.
list(df.columns)

Tel aantal unieke gemeenten

In [None]:
# codering_regio is read after reset_index(), i.e. it is taken from the index levels;
# nunique() counts the distinct region codes.
df.reset_index().codering_regio.nunique()

Uitzoeken hoeveel gemeenten er alle jaren info hebben

In [None]:
# Keep only the region code and interval of every row.
df_gem = df.reset_index()[['codering_regio', 'interval']]

In [None]:
# Number of rows per region code. The column is named explicitly because
# pandas >= 2.0 labels the value_counts() result 'count' instead of reusing the
# source column name, which would leave no 'codering_regio' column to read later.
df_counts = df_gem['codering_regio'].value_counts().rename('codering_regio').to_frame()
df_counts.head()

In [None]:
# How many regions have 1, 2, 3, ... rows. The single count column is selected
# by position because its label differs between pandas versions
# ('codering_regio' before 2.0, 'count' from 2.0 on).
df_counts.iloc[:, 0].value_counts()

Aantal string kolommen

In [None]:
# Columns excluded from the numeric analysis below: 'perioden', the
# popcode*/popnaam* families (suffixes a..r) and four further identifier columns.
pop_suffixes = "abcdefghijklmnopqr"
list_exclude = (
    ['perioden']
    + [f'popcode{suffix}' for suffix in pop_suffixes]
    + [f'popnaam{suffix}' for suffix in pop_suffixes]
    + ['popkoppelvariabeleregiocode', 'typemaatwerkarrangement',
       'gemeentenaam', 'meestvoorkomendepostcode']
)
len(list_exclude)

Aantal missende waarden numerieke kolommen bepalen

In [None]:
# # search certain value
# df.drop(list_exclude, axis=1)[df.drop(list_exclude, axis=1) == "JZ01      "].sum()>1

In [None]:
# Convert every column that is not in list_exclude with pd.to_numeric
# (which raises if a value cannot be parsed as a number).
columns_to_convert = df.drop(columns=list_exclude).columns
for column_name in columns_to_convert:
    df[column_name] = pd.to_numeric(df[column_name])

In [None]:
# Per-column count of missing values among the non-excluded columns;
# only columns with at least one missing value are kept.
null_counts = df.drop(columns=list_exclude).isnull().sum(axis=0)
s_num_missing = null_counts[null_counts > 0]
# Share of rows that are missing, relative to the full dataset length.
s_perc_missing = s_num_missing / len(df)
df_missing = pd.DataFrame({
    'num_missing': s_num_missing,
    'perc_missing': s_perc_missing,
})
df_missing.sort_values('perc_missing', ascending=False)

Aantal kolommen met missing value > 25%

In [None]:
# Number of columns where more than 25% of the rows are missing.
int((df_missing['perc_missing'] > 0.25).sum())

In [None]:
# Columns in which every row is missing (missing share exactly 1).
fully_missing = df_missing['perc_missing'] == 1
all_nan_cols = df_missing.index[fully_missing].tolist()
all_nan_cols

Aantal missing values voor target variabele

In [None]:
# df_missing only lists columns that have at least one missing value, so a
# plain .loc lookup raises KeyError when the target is complete; reindex
# reports zeros in that case and the unchanged row otherwise.
df_missing.reindex(['wmoclienten'], fill_value=0).loc['wmoclienten']

In [None]:
# df_missing only lists columns that have at least one missing value, so a
# plain .loc lookup raises KeyError when the target is complete; reindex
# reports zeros in that case and the unchanged row otherwise.
df_missing.reindex(['wmoclientenper1000inwoners'], fill_value=0).loc['wmoclientenper1000inwoners']

# Verkennen targetvariabele

_Histograms/density plots_
* [Plotly histograms ](https://plotly.com/python/histograms/)
* [Plotly histogram contour](https://plotly.com/python/2d-histogram-contour/)
* [Plotly density plot](https://plotly.com/python/distplot/)

In [None]:
## Mocht je het tof vinden, kun je ook kijken of je een mooie visualisatie in dezelfde stijl kunt krijgen 
## voor de targetvariabele

In [None]:
# Quick pandas/matplotlib histogram of the target variable.
df['wmoclientenper1000inwoners'].hist()

In [None]:
# Distribution plot of the target variable; create_distplot is called with its
# default settings. `ff` is imported in the import cell at the top of the notebook.
x = df['wmoclientenper1000inwoners'].dropna()  # drop missing values before plotting
hist_data = [x]
group_labels = ['wmoclientenper1000inwoners'] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels)
fig.show()

In [None]:
# Largest observed value of the target variable.
df['wmoclientenper1000inwoners'].max()

In [None]:
# Same distribution plot, but with a fitted normal curve instead of the KDE.
# `ff` is imported in the import cell at the top of the notebook.
x = df['wmoclientenper1000inwoners'].dropna()  # drop missing values before plotting
group_labels = ['wmoclientenper1000inwoners']

# Create distplot with curve_type set to 'normal'
fig = ff.create_distplot([x], group_labels, bin_size=.5,
                         curve_type='normal') # override default 'kde')

# Add title
fig.update_layout(title_text='Distplot with Normal Distribution')
fig.show()

# Correlatie bepalen

### HIER GRAAG CORRELATIEMATRIX INVOEGEN NICK

In [None]:
# Correlation of every numeric/boolean column with the target. The string
# columns are dropped explicitly: from pandas 2.0 on, DataFrame.corr() no longer
# skips non-numeric columns by default and raises on them instead.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()['wmoclientenper1000inwoners']

In [None]:
# Order the correlations with the target from most negative to most positive,
# then keep those whose absolute value exceeds 0.4.
sorted_pairs = corr_matrix.sort_values(kind='quicksort')
is_strong = sorted_pairs.abs() > 0.4
strong_pairs = sorted_pairs[is_strong]
print(strong_pairs)

In [None]:
# Column names of the strongly correlated variables, as a plain list.
list_strong_pairs = strong_pairs.index.tolist()

In [None]:
# Pairwise correlations between the strongly correlated columns, drawn as a heatmap.
df_corr = df[list_strong_pairs].corr()
f, ax = plt.subplots(figsize=(40,20))
# Diverging palette between hue 230 and hue 20, returned as a matplotlib colormap.
cmap = sns.diverging_palette(230,20, as_cmap=True)
# NOTE(review): sns.set() changes seaborn/matplotlib settings globally, so it
# also affects figures drawn by later cells.
sns.set(font_scale=2.0)
sns.heatmap(df_corr,vmax=1 ,cmap=cmap, square=True, linewidth=.5, ax = ax)
plt.title("Correlatiematrix")

# Verdieping middels scatterplots

_Single scatterplot_
* [Plotly scatterplot](https://plotly.com/python/line-and-scatter/)
* [Scatter params](https://plotly.com/python-api-reference/generated/plotly.graph_objects.Scatter.html)

In [None]:
# Scatter of the target (y) against one explanatory column (x), coloured by z.
y = 'wmoclientenper1000inwoners'
x = 'vrouwen'
z = 'perioden'
single_scatter(df=df, x=x, y=y, color=z, opacity=0.4, size=None, hover_data=[], width=800, height=800, show=True)

_Single scatterplots in a for loop: one saved plot per column_

In [None]:
# list_cols_with_interesting_corr = [df]

In [None]:
# Save (without showing) one scatter plot of the target against every column.
y = 'wmoclientenper1000inwoners'
z = 'perioden'
for x in df.columns:
    # The target plotted against itself is a meaningless diagonal; skip it
    # rather than writing that image to disk.
    if x == y:
        continue
    single_scatter(df=df, x=x, y=y, color=z, opacity=0.4, size=None, hover_data=[], width=800, height=800, show=False, save=True)

# Appendix: Play with plotly

### Subplot scatterplot
[Plotly subplots](https://plotly.com/python/subplots/)

In [None]:
# Weight per age-bracket column. NOTE(review): the weights look like
# representative ages for each bracket (e.g. 7.5 for 0-15) - confirm.
age_weights = {
    'k0tot15jaar': 7.5,
    'k15tot25jaar': 20,
    'k25tot45jaar': 35,
    'k45tot65jaar': 55,
    'k65jaarofouder': 75,
}
weighted_terms = [weight * df[column] for column, weight in age_weights.items()]
# Add the terms left to right, in the bracket order listed above.
leeftijd_mix_sum = weighted_terms[0]
for term in weighted_terms[1:]:
    leeftijd_mix_sum = leeftijd_mix_sum + term
df['leeftijd_mix_sum'] = leeftijd_mix_sum
# Weighted sum divided by the number of inhabitants.
df['leeftijd_mix_avg'] = df['leeftijd_mix_sum'] / df['aantalinwoners']

In [8]:
# Side-by-side scatter plots of the target against each column in xlist.
# The commented-out lines are alternative column sets to try.
# xlist=['k0tot15jaar', 'k15tot25jaar', 'k25tot45jaar', 'k45tot65jaar', 'k65jaarofouder']
xlist=['vrouwen', 'mannen']
# xlist = ['leeftijd_mix_sum', 'leeftijd_mix_avg']
y = 'wmoclientenper1000inwoners'
subplot_scatter(df=df, xlist=xlist, y=y, opacity=0.5, height=500, width=500, title_text="Test", show=True, save=False)