### Bayesian network construction

In [54]:
import pandas as pd
import numpy as np
from PrivBayes import greedy_bayes
from utils import preprocessing, encoding, get_school_list, decoding, display_bayesian_network
import plotly.express as px
from scipy import stats

In [25]:
# Display the dataframe
pd.set_option('display.max_columns', None)  # or 1000
# pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

In [4]:
# Read both sheets of the 2021 workbook in one call so the file is opened and
# parsed only once; given a list of sheet names, read_excel returns a dict of
# DataFrames keyed by sheet name.
workbook_sheets = pd.read_excel('data/2021.xlsx', sheet_name=['Resultaten', 'Klassen'])
schools_df = workbook_sheets['Klassen']
schools = get_school_list(schools_df)

# Run the raw choices through the utils helpers: preprocessing first, then
# encoding against the school list.
choices_df = preprocessing(workbook_sheets['Resultaten'])
encoded_choices_df = encoding(choices_df, schools)

# Distinct 'Basisschool advies' values, in order of first appearance.
edu_types = encoded_choices_df['Basisschool advies'].unique().tolist()
# Every column whose name contains 'Voorkeur'.
choice_cols = [col for col in encoded_choices_df.columns if 'Voorkeur' in col]

In [35]:
# Number of leading preference columns ('Voorkeur 1' .. 'Voorkeur k') that are
# handed to the network construction.
k = 5
# The column list is the same for every advice type, so build it once.
chosen_cols = list(map('Voorkeur {}'.format, range(1, k + 1)))

# Construct and display one Bayesian network per 'Basisschool advies' value.
for edu_type in edu_types:
    print(edu_type, ":")
    advice_mask = encoded_choices_df['Basisschool advies'] == edu_type
    first_choices_subset = encoded_choices_df.loc[advice_mask, chosen_cols]

    # The k=0 keyword below is greedy_bayes' own parameter, distinct from the
    # notebook-level k above.
    # NOTE(review): presumably 0 lets PrivBayes pick the number of parents
    # itself — confirm against the PrivBayes module.
    bn = greedy_bayes(first_choices_subset, k=0, epsilon=0.1 / 2, seed=0)
    display_bayesian_network(bn)

havo :
Constructed Bayesian network:
    Voorkeur 3 has parents ['Voorkeur 4'].
    Voorkeur 5 has parents ['Voorkeur 3', 'Voorkeur 4'].
    Voorkeur 2 has parents ['Voorkeur 3', 'Voorkeur 5', 'Voorkeur 4'].
    Voorkeur 1 has parents ['Voorkeur 5', 'Voorkeur 2', 'Voorkeur 4'].
havo/vwo :
Constructed Bayesian network:
    Voorkeur 3 has parents ['Voorkeur 4'].
    Voorkeur 5 has parents ['Voorkeur 3', 'Voorkeur 4'].
    Voorkeur 2 has parents ['Voorkeur 3', 'Voorkeur 5', 'Voorkeur 4'].
    Voorkeur 1 has parents ['Voorkeur 5', 'Voorkeur 2', 'Voorkeur 4'].
vwo :
Constructed Bayesian network:
    Voorkeur 3 has parents ['Voorkeur 4'].
    Voorkeur 5 has parents ['Voorkeur 4'].
    Voorkeur 2 has parents ['Voorkeur 4'].
    Voorkeur 1 has parents ['Voorkeur 5'].
vmbo-b/k :


### Graphical comparison

In [6]:
# Load the two synthetic datasets (model alpha and model beta) from CSV.
alpha_synthetic, beta_synthetic = (
    pd.read_csv(csv_path)
    for csv_path in (
        'synthetic_datasets/model_alpha_synthetic.csv',
        'synthetic_datasets/model_beta_synthetic.csv',
    )
)

In [7]:
# Encode the alpha synthetic data with the same helper and school list used
# for the real choices, then preview the first five rows.
encoded_alpha_synthetic = encoding(alpha_synthetic, schools)
encoded_alpha_synthetic.head(n=5)

Unnamed: 0,Basisschool advies,Voorkeur 1,Voorkeur 2,Voorkeur 3,Voorkeur 4,Voorkeur 5,Voorkeur 6,Voorkeur 7,Voorkeur 8,Voorkeur 9,...,Voorkeur 13,Voorkeur 14,Voorkeur 15,Voorkeur 16,Voorkeur 17,Voorkeur 18,Voorkeur 19,Voorkeur 20,Voorkeur 21,Voorkeur 22
0,havo,68,99,81,102,20,132,137,42,30,...,181,181,181,181,181,181,181,181,181,181
1,havo,138,104,56,28,37,73,45,96,153,...,181,181,181,181,181,181,181,181,181,181
2,havo,30,20,13,127,137,5,40,109,139,...,181,181,181,181,181,181,181,181,181,181
3,havo,104,37,95,42,73,11,35,131,62,...,181,181,181,181,181,181,181,181,181,181
4,havo,129,11,162,40,45,113,151,2,135,...,181,181,181,181,181,181,181,181,181,181


In [8]:
# Encode the beta synthetic data with the same helper and school list used
# for the real choices, then preview the first five rows.
encoded_beta_synthetic = encoding(beta_synthetic, schools)
encoded_beta_synthetic.head(n=5)

Unnamed: 0,Basisschool advies,Voorkeur 1,Voorkeur 2,Voorkeur 3,Voorkeur 4,Voorkeur 5,Voorkeur 6,Voorkeur 7,Voorkeur 8,Voorkeur 9,...,Voorkeur 13,Voorkeur 14,Voorkeur 15,Voorkeur 16,Voorkeur 17,Voorkeur 18,Voorkeur 19,Voorkeur 20,Voorkeur 21,Voorkeur 22
0,havo,37,20,113,52,81,127,42,180,46,...,181,181,181,181,181,181,181,181,181,181
1,havo,81,40,129,102,29,115,180,48,68,...,181,181,181,181,181,181,181,181,181,181
2,havo,121,138,137,45,44,88,110,22,99,...,181,181,181,181,181,181,181,181,181,181
3,havo,127,64,109,135,5,35,142,104,73,...,181,181,181,181,181,181,181,181,181,181
4,havo,104,99,126,135,142,87,30,7,72,...,181,181,181,181,181,181,181,181,181,181


### First choice column for VWO group

In [42]:
# Frequency of each 'Voorkeur 1' value in the real data and in both synthetic
# datasets, placed side by side and aligned on the value.
# NOTE(review): the section heading mentions the VWO group, but no filter on
# 'Basisschool advies' is applied here — confirm which one is intended.
# NOTE(review): the real counts use encoded_choices_df while the synthetic
# counts use alpha_synthetic / beta_synthetic rather than the encoded_* frames;
# confirm both sides hold the same codes.
real_choice_col_bins = encoded_choices_df['Voorkeur 1'].value_counts().rename('real')
alpha_synthetic_choice_col_bins = alpha_synthetic['Voorkeur 1'].value_counts().rename('alpha_synthetic')
beta_synthetic_choice_col_bins = beta_synthetic['Voorkeur 1'].value_counts().rename('beta_synthetic')

# A value that never occurs in one dataset has a count of zero there, so the
# gaps created by the outer alignment are filled with 0 rather than left as NaN.
compare_choice_col_bins = (
    pd.concat(
        [real_choice_col_bins, alpha_synthetic_choice_col_bins, beta_synthetic_choice_col_bins],
        axis=1,
    )
    .fillna(0)
    .astype(float)
)

In [43]:
# Preview the first five rows of the side-by-side count table.
compare_choice_col_bins.head(n=5)

Unnamed: 0,real,alpha_synthetic,beta_synthetic
0,11.0,10.0,15.0
1,39.0,51.0,36.0
2,30.0,30.0,30.0
3,235.0,243.0,263.0
4,77.0,79.0,76.0


In [44]:
import plotly.graph_objects as go

# Grouped bar chart of the first-choice counts: one bar series per dataset.
indices = compare_choice_col_bins.index

# (count-table column, legend label, bar colour) for each series, in draw order.
bar_series = (
    ('real', 'Real', 'indianred'),
    ('alpha_synthetic', 'Synthetic G_alpha', 'lightsalmon'),
    ('beta_synthetic', 'Synthetic G_beta', 'deepskyblue'),
)

fig = go.Figure()
for counts_column, legend_name, bar_colour in bar_series:
    fig.add_trace(go.Bar(
        x=indices,
        y=compare_choice_col_bins[counts_column],
        name=legend_name,
        marker_color=bar_colour,
    ))

# barmode='group' draws the series side by side for each x value.
fig.update_layout(barmode='group', title_text='First choice variable - Bar chart comparison')
fig.show()

### Bar chart comparison for each choice column

In [None]:
# (count-table column, legend label, bar colour) for each series, in draw order.
bar_series = (
    ('real', 'Real', 'indianred'),
    ('alpha_synthetic', 'Synthetic G_alpha', 'lightsalmon'),
    ('beta_synthetic', 'Synthetic G_beta', 'deepskyblue'),
)

# One grouped bar chart per preference column, comparing value frequencies in
# the real data against both synthetic datasets.
for choice_col in choice_cols:
    real_choice_col_bins = encoded_choices_df[choice_col].value_counts().rename('real')
    alpha_synthetic_choice_col_bins = alpha_synthetic[choice_col].value_counts().rename('alpha_synthetic')
    beta_synthetic_choice_col_bins = beta_synthetic[choice_col].value_counts().rename('beta_synthetic')

    # A value that never occurs in one dataset has a count of zero there, so
    # the gaps created by the outer alignment are filled with 0, not NaN.
    compare_choice_col_bins = (
        pd.concat(
            [real_choice_col_bins, alpha_synthetic_choice_col_bins, beta_synthetic_choice_col_bins],
            axis=1,
        )
        .fillna(0)
        .astype(float)
    )

    indices = compare_choice_col_bins.index

    fig = go.Figure()
    for counts_column, legend_name, bar_colour in bar_series:
        fig.add_trace(go.Bar(
            x=indices,
            y=compare_choice_col_bins[counts_column],
            name=legend_name,
            marker_color=bar_colour,
        ))

    # barmode='group' draws the series side by side for each x value.
    fig.update_layout(barmode='group', title_text='{} variable - Bar chart comparison'.format(choice_col))
    fig.show()

### KS test

#### First choice column

In [49]:
# Rebuild the 'Voorkeur 1' count table (real vs. both synthetic datasets) that
# the KS tests below consume.
real_choice_col_bins = encoded_choices_df['Voorkeur 1'].value_counts().rename('real')
alpha_synthetic_choice_col_bins = alpha_synthetic['Voorkeur 1'].value_counts().rename('alpha_synthetic')
beta_synthetic_choice_col_bins = beta_synthetic['Voorkeur 1'].value_counts().rename('beta_synthetic')

# A value that never occurs in one dataset has a count of zero there. Filling
# with 0 keeps NaN out of the samples passed to the statistical test.
compare_choice_col_bins = (
    pd.concat(
        [real_choice_col_bins, alpha_synthetic_choice_col_bins, beta_synthetic_choice_col_bins],
        axis=1,
    )
    .fillna(0)
    .astype(float)
)

In [51]:
# Two-sample Kolmogorov-Smirnov test between the 'real' and 'alpha_synthetic'
# columns of the count table.
# NOTE(review): the inputs are per-value counts, not the raw choices, so this
# compares how the bin counts are distributed — confirm that is the intended test.
stats.ks_2samp(compare_choice_col_bins['real'], compare_choice_col_bins['alpha_synthetic'])

KstestResult(statistic=0.058394160583941604, pvalue=0.9745005471247125)

In [53]:
# Two-sample Kolmogorov-Smirnov test between the 'real' and 'beta_synthetic'
# columns of the count table.
# NOTE(review): the inputs are per-value counts, not the raw choices, so this
# compares how the bin counts are distributed — confirm that is the intended test.
stats.ks_2samp(compare_choice_col_bins['real'], compare_choice_col_bins['beta_synthetic'])

KstestResult(statistic=0.058394160583941604, pvalue=0.9745005471247125)

### KS test for each choice column

In [None]:
# p-values above this threshold are reported as "cannot reject".
significance_level = 0.05

# (count-table column, model label used in the printed verdict), in print order.
synthetic_models = (('alpha_synthetic', 'G_alpha'), ('beta_synthetic', 'G_beta'))

# For every preference column, run a two-sample KS test of the real counts
# against each synthetic dataset's counts and print the verdict.
# NOTE(review): the test inputs are per-value counts, not the raw choices —
# confirm that this is the intended comparison.
for choice_col in choice_cols:

    real_choice_col_bins = encoded_choices_df[choice_col].value_counts().rename('real')
    alpha_synthetic_choice_col_bins = alpha_synthetic[choice_col].value_counts().rename('alpha_synthetic')
    beta_synthetic_choice_col_bins = beta_synthetic[choice_col].value_counts().rename('beta_synthetic')

    # A value missing from one dataset has a count of zero there. Left as NaN it
    # would reach ks_2samp, and a NaN p-value fails the `>` comparison below,
    # which would be reported as a rejection.
    compare_choice_col_bins = (
        pd.concat(
            [real_choice_col_bins, alpha_synthetic_choice_col_bins, beta_synthetic_choice_col_bins],
            axis=1,
        )
        .fillna(0)
        .astype(float)
    )

    for counts_column, model_label in synthetic_models:
        pvalue = stats.ks_2samp(compare_choice_col_bins['real'], compare_choice_col_bins[counts_column]).pvalue

        if pvalue > significance_level:
            print('{}: cannot reject null hypothesis, no significant differences between real and synthetic from {}'.format(choice_col, model_label))
        else:
            print('{}: reject null hypothesis, significant differences between real and synthetic from {}'.format(choice_col, model_label))

### Correlation matrix comparison

In [74]:
# Keep the first five columns whose name contains 'Voorkeur' for the
# correlation comparison.
preference_cols = [col for col in encoded_choices_df.columns if 'Voorkeur' in col]
selected_cols = preference_cols[:5]

In [75]:
# Pairwise Pearson correlation (the pandas default) of the selected columns in
# the real data, shown with a coolwarm colour gradient.
real_corr = encoded_choices_df[selected_cols].corr(method='pearson')
real_corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Voorkeur 1,Voorkeur 2,Voorkeur 3,Voorkeur 4,Voorkeur 5
Voorkeur 1,1.0,0.068394,0.036399,0.006425,-0.021909
Voorkeur 2,0.068394,1.0,0.057559,0.083189,-0.030292
Voorkeur 3,0.036399,0.057559,1.0,0.060853,0.002634
Voorkeur 4,0.006425,0.083189,0.060853,1.0,0.004656
Voorkeur 5,-0.021909,-0.030292,0.002634,0.004656,1.0


In [76]:
# Pairwise Pearson correlation (the pandas default) of the selected columns in
# the encoded alpha synthetic data, shown with a coolwarm colour gradient.
alpha_synthetic_corr = encoded_alpha_synthetic[selected_cols].corr(method='pearson')
alpha_synthetic_corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Voorkeur 1,Voorkeur 2,Voorkeur 3,Voorkeur 4,Voorkeur 5
Voorkeur 1,1.0,0.018564,-0.009046,-0.016361,-0.054041
Voorkeur 2,0.018564,1.0,0.017172,0.005552,0.006594
Voorkeur 3,-0.009046,0.017172,1.0,0.070809,0.066751
Voorkeur 4,-0.016361,0.005552,0.070809,1.0,0.167993
Voorkeur 5,-0.054041,0.006594,0.066751,0.167993,1.0


In [77]:
# Pairwise Pearson correlation (the pandas default) of the selected columns in
# the encoded beta synthetic data, shown with a coolwarm colour gradient.
beta_synthetic_corr = encoded_beta_synthetic[selected_cols].corr(method='pearson')
beta_synthetic_corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Voorkeur 1,Voorkeur 2,Voorkeur 3,Voorkeur 4,Voorkeur 5
Voorkeur 1,1.0,-0.004905,-0.017386,-0.003547,-0.020036
Voorkeur 2,-0.004905,1.0,0.018103,0.020372,-0.006314
Voorkeur 3,-0.017386,0.018103,1.0,0.101495,0.073939
Voorkeur 4,-0.003547,0.020372,0.101495,1.0,0.173625
Voorkeur 5,-0.020036,-0.006314,0.073939,0.173625,1.0


In [78]:
# Element-wise difference between the real and alpha synthetic correlation
# matrices; values near zero mean the pairwise correlation was reproduced.
real_corr.sub(alpha_synthetic_corr)

Unnamed: 0,Voorkeur 1,Voorkeur 2,Voorkeur 3,Voorkeur 4,Voorkeur 5
Voorkeur 1,0.0,0.04983,0.045445,0.022785,0.032132
Voorkeur 2,0.04983,0.0,0.040387,0.077637,-0.036886
Voorkeur 3,0.045445,0.040387,0.0,-0.009956,-0.064117
Voorkeur 4,0.022785,0.077637,-0.009956,0.0,-0.163337
Voorkeur 5,0.032132,-0.036886,-0.064117,-0.163337,0.0


In [79]:
# Element-wise difference between the real and beta synthetic correlation
# matrices; values near zero mean the pairwise correlation was reproduced.
real_corr.sub(beta_synthetic_corr)

Unnamed: 0,Voorkeur 1,Voorkeur 2,Voorkeur 3,Voorkeur 4,Voorkeur 5
Voorkeur 1,0.0,0.073298,0.053785,0.009972,-0.001873
Voorkeur 2,0.073298,0.0,0.039456,0.062817,-0.023978
Voorkeur 3,0.053785,0.039456,0.0,-0.040642,-0.071305
Voorkeur 4,0.009972,0.062817,-0.040642,0.0,-0.168969
Voorkeur 5,-0.001873,-0.023978,-0.071305,-0.168969,0.0
