In [1]:
import pandas as pd
import numpy as np
from PrivBayes import greedy_bayes, construct_noisy_conditional_distributions
from utils import preprocessing, encoding, display_bayesian_network

In [2]:
# Show dataframes in full: passing None lifts pandas' display limit on the
# number of columns, the number of rows and the width of each cell.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [3]:
# Both sheets live in the same workbook, so open and parse it once: passing a
# list to ``sheet_name`` makes read_excel return a dict of DataFrames keyed by
# sheet name.
sheets_21 = pd.read_excel('data/2021.xlsx', sheet_name=['Resultaten', 'Klassen'])
application_21 = sheets_21['Resultaten']  # 'Basisschool advies' and 'Voorkeur N' columns are read from this frame
schools_21 = sheets_21['Klassen']  # the 'Key' column is read from this frame

In [4]:
# Map every value of the 'Key' column to the row label it sits on in schools_21.
school_indices_dict = {
    school_key: row_label
    for school_key, row_label in zip(schools_21['Key'], schools_21.index)
}
school_indices_dict

{'SvPO Amsterdam - vwo': 0,
 'Amsterdams Beroepscollege Noorderlicht - v.a. vmbo-b': 1,
 'Amsterdams Beroepscollege Noorderlicht - vmbo-k': 2,
 'Barlaeus Gymnasium - vwo': 3,
 'Berlage Lyceum - Tweetalig - v.a. vmbo-t': 4,
 'Berlage Lyceum - Tweetalig - v.a. havo': 5,
 'Berlage Lyceum - Tweetalig - vwo': 6,
 'Bindelmeer College - v.a. vmbo-b': 7,
 'Bredero Beroepscollege - v.a. vmbo-b': 8,
 'Bredero Mavo - v.a. vmbo-t': 9,
 'Bredero Mavo - Tweetalig - v.a. vmbo-t': 10,
 'Calandlyceum - vmbo-t': 11,
 'Calandlyceum - vmbo-t/havo': 12,
 'Calandlyceum - havo': 13,
 'Calandlyceum - havo/vwo': 14,
 'Calandlyceum - Technasium - havo/vwo': 15,
 'Calandlyceum - vwo': 16,
 'Calandlyceum - Technasium - vwo': 17,
 'Calvijn College - v.a. vmbo-b': 18,
 'Cartesius Lyceum - vwo': 19,
 'Cartesius Lyceum - v.a. havo': 20,
 'Cburg College - v.a. vmbo-b': 21,
 'Clusius College - v.a. vmbo-b': 22,
 'College De Meer - v.a. vmbo-b': 23,
 'College De Meer - v.a. vmbo-t': 24,
 'Comenius Lyceum Amsterdam - v.a

In [None]:
# choice_cols = [c for c in list(application_21.columns) if 'Voorkeur' in c]

In [4]:
# Run the project's preprocessing helper on the raw applications, then encode
# the result against the school table.
# NOTE(review): judging by the preview, the 'Voorkeur N' columns come out as
# integer codes (presumably school indices, with 182 filling unused slots) --
# confirm against utils.encoding.
preprocessed_df = preprocessing(application_21)
encoded_df = encoding(preprocessed_df, schools_21)
encoded_df.head()

Unnamed: 0,Basisschool advies,Voorkeur 1,Voorkeur 2,Voorkeur 3,Voorkeur 4,Voorkeur 5,Voorkeur 6,Voorkeur 7,Voorkeur 8,Voorkeur 9,Voorkeur 10,Voorkeur 11,Voorkeur 12,Voorkeur 13,Voorkeur 14,Voorkeur 15,Voorkeur 16,Voorkeur 17,Voorkeur 18,Voorkeur 19,Voorkeur 20,Voorkeur 21,Voorkeur 22
0,havo,135,37,138,104,52,109,102,81,130,5,40,42,182,182,182,182,182,182,182,182,182,182
1,havo/vwo,135,64,137,40,5,42,99,102,109,104,35,52,182,182,182,182,182,182,182,182,182,182
2,havo/vwo,138,135,102,137,37,68,99,81,20,105,180,95,148,109,141,40,182,182,182,182,182,182
3,havo/vwo,135,95,137,138,134,32,104,99,35,20,62,113,40,182,182,182,182,182,182,182,182,182
4,havo,135,40,42,109,137,132,30,37,127,180,45,102,182,182,182,182,182,182,182,182,182,182


In [43]:
# Number of distinct encoded values that occur in the 'Voorkeur 1' column.
encoded_df['Voorkeur 1'].nunique()

137

In [45]:
# Number of distinct encoded values that occur in the 'Voorkeur 4' column.
encoded_df['Voorkeur 4'].nunique()

168

## Synthetic data using PrivBayes (correlated attribute mode)

In [None]:
# Distinct values of the 'Basisschool advies' column, as a plain Python list.
edu_types = application_21['Basisschool advies'].unique().tolist()
# edu_types

In [None]:
# How many distinct 'Voorkeur 1' values occur among rows whose advice is 'vwo'.
advice_is_vwo_raw = application_21['Basisschool advies'] == 'vwo'
len(application_21.loc[advice_is_vwo_raw, 'Voorkeur 1'].unique())

In [None]:
# max_choice = 22
# for e in edu_types:
#     domain_sizes = []
#     print(e, ':')
#     for i in range(max_choice):
#         domain_size = len(application_21[application_21['Basisschool advies'] == e]['Voorkeur {}'.format(i+1)].unique().tolist())
#         domain_sizes.append(('Voorkeur {}'.format(i+1), domain_size))
#     print("Max: ", max([ds[1] for ds in domain_sizes]))
#     print(domain_sizes, '\n')

In [5]:
# Keep the rows whose advice is 'vwo', then only their first five preference columns.
advice_is_vwo = encoded_df['Basisschool advies'] == 'vwo'
vwo_encoded = encoded_df[advice_is_vwo]
top_choice_columns = [f'Voorkeur {rank}' for rank in range(1, 6)]
top_vwo_df = vwo_encoded[top_choice_columns]

# Learn the network structure on those five columns; epsilon is half of 0.1.
bn = greedy_bayes(top_vwo_df, k=0, epsilon=0.1 / 2, seed=0)
display_bayesian_network(bn)

Adding ROOT Voorkeur 4
Adding attribute Voorkeur 3
Adding attribute Voorkeur 5
Adding attribute Voorkeur 2
Adding attribute Voorkeur 1
Constructed Bayesian network:
    Voorkeur 3 has parents ['Voorkeur 4'].
    Voorkeur 5 has parents ['Voorkeur 4'].
    Voorkeur 2 has parents ['Voorkeur 4'].
    Voorkeur 1 has parents ['Voorkeur 5'].


In [20]:
# Show the raw network structure: a list of (child, [parents]) pairs.
bn

[('Voorkeur 3', ['Voorkeur 4']),
 ('Voorkeur 5', ['Voorkeur 4']),
 ('Voorkeur 2', ['Voorkeur 4']),
 ('Voorkeur 1', ['Voorkeur 5'])]

In [23]:
# Number of parents recorded for the last (child, parents) entry of the network.
len(bn[-1][1])

1

In [6]:
# Overall epsilon for this run (presumably the differential-privacy budget --
# confirm in PrivBayes). Half of it is passed to the distribution step here; the
# structure-learning call was given the other half as the literal 0.1 / 2.
epsilon=0.1
conditional_probabilities = construct_noisy_conditional_distributions(bn, top_vwo_df, epsilon/2)

In [7]:
# Bundle the learned structure and its distributions under the two keys that
# generate_encoded_dataset reads.
data_description = {
    'bayesian_network': bn,
    'conditional_probabilities': conditional_probabilities,
}

# data_description['meta'] = {"num_tuples": df_input.shape[0],
#                                     "num_attributes": self.df_input.shape[1],
#                                     "num_attributes_in_BN": len(attributes_in_BN),
#                                     "all_attributes": self.df_input.columns.tolist(),
#                                     "candidate_keys": list(candidate_keys),
#                                     "non_categorical_string_attributes": non_categorical_string_attributes,
#                                     "attributes_in_BN": attributes_in_BN}

In [8]:
# Attributes for which a distribution was constructed.
conditional_probabilities.keys()

dict_keys(['Voorkeur 4', 'Voorkeur 3', 'Voorkeur 5', 'Voorkeur 2', 'Voorkeur 1'])

In [31]:
# Length of the 'Voorkeur 1' distribution stored under the parent key '[100]'.
len(conditional_probabilities['Voorkeur 1']['[100]'])

137

In [33]:
# Inspect the distributions of 'Voorkeur 3': a dict keyed by the string form of
# the parent values, each entry a list of probabilities.
# NOTE(review): this displays every distribution in full, which is a very long output.
conditional_probabilities['Voorkeur 3']

{'[0]': [0.0,
  0.003770600435698451,
  0.0,
  0.016725328818824284,
  0.028652988822121778,
  0.0,
  0.009570966555217376,
  0.0006505634294496779,
  0.0015987403735543448,
  0.020820710010883658,
  0.0,
  0.0,
  0.0,
  0.011959930073500064,
  0.008881223272957157,
  0.01472304491321102,
  0.034448686239782006,
  0.009968159887658035,
  0.0,
  0.008998727942351915,
  0.0,
  0.003587788379706727,
  0.0,
  0.024057479727940503,
  0.0004883040015808054,
  0.0,
  0.0,
  0.008689646413334242,
  0.0,
  0.0016085967062823311,
  0.0,
  0.00293149790197548,
  0.002774293959996593,
  0.0029114657171339623,
  0.023877094297102785,
  0.00493976971922987,
  0.0,
  0.0,
  0.005496802738249339,
  0.0,
  0.00443456548710473,
  0.0045622690891732294,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0016533294711541407,
  0.0,
  0.04110739336923162,
  0.0,
  0.0,
  0.0,
  0.0039955545365843665,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.004097515258897532,
  0.0,
  0.0,
  0.0,
  0.011225989039940305,
  0.0,
  0.01231327478

In [9]:
def update_distribution(dist, indices):
    '''
    Set the probabilities of schools that already appeared in previous choices to zero,
    to avoid duplicate choices, and spread the removed probability mass equally over
    the remaining entries.

    Parameters
    ----------
    dist : list of float (or any sequence with ``copy()`` and item assignment)
        Probabilities, indexed by position. It is NOT modified: the stored
        conditional distributions stay intact when a cell is re-run.
    indices : iterable of int
        Positions in ``dist`` to exclude. A position listed more than once is
        counted once.

    Returns
    -------
    A copy of ``dist`` in which the excluded positions are 0.0 and every other
    position is increased by an equal share of the removed mass. If every
    position is excluded, all entries are 0.0.

    Raises
    ------
    IndexError
        If an entry of ``indices`` is out of range for ``dist``.
    '''
    # De-duplicate first: a repeated index would otherwise be summed twice and
    # would also shrink the denominator below the number of remaining entries.
    excluded = set(indices)
    removed_mass = sum(dist[index] for index in excluded)

    remaining = len(dist) - len(excluded)
    share = removed_mass / remaining if remaining else 0.0

    updated = dist.copy()
    for index, probability in enumerate(dist):
        updated[index] = 0.0 if index in excluded else probability + share
    return updated

In [24]:
from numpy import random
from pandas import DataFrame

def get_sampling_order(bn):
    """Return the attributes in the order they are sampled.

    ``bn`` is a list of ``(child, parents)`` pairs. The result starts with the
    first parent of the first pair (the root), followed by every child in the
    order the pairs appear.
    """
    root_attribute = bn[0][1][0]
    return [root_attribute, *(child for child, _ in bn)]

def generate_encoded_dataset(n, description, seed=None):
    """Sample ``n`` encoded rows from a Bayesian network description.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    description : dict
        Must provide
        ``'bayesian_network'``: a list of ``(child, parents)`` pairs, ordered so
        that every parent is sampled before its children, and
        ``'conditional_probabilities'``: for the root attribute a list of
        weights; for every child a dict mapping the string form of the parent
        values (e.g. ``'[10]'``) to a list of weights.
    seed : int, optional
        When given, sampling uses a dedicated ``RandomState(seed)`` so the draw
        is reproducible. When omitted, NumPy's global random state is used.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows of integer codes, with the root attribute as first column
        followed by the children in network order.

    Raises
    ------
    ValueError
        If some rows could not be sampled because a parent configuration that
        was drawn has no conditional distribution.
    """
    # Function-scope import keeps this cell self-contained.
    from ast import literal_eval

    bn = description['bayesian_network']
    conditional_probabilities = description['conditional_probabilities']
    bn_root_attr = bn[0][1][0]
    rng = random if seed is None else random.RandomState(seed)

    def _as_probabilities(weights):
        # Rescale so the weights sum to 1; otherwise choice() rejects them.
        weights = np.asarray(weights, dtype='float64')
        return weights / weights.sum()

    # Root attribute first, then every child in network order.
    sampling_order = [bn_root_attr] + [child for child, _ in bn]
    encoded_df = DataFrame(index=range(n), columns=sampling_order)

    # The root is normalised like every child distribution below.
    root_attr_dist = _as_probabilities(conditional_probabilities[bn_root_attr])
    encoded_df[bn_root_attr] = rng.choice(len(root_attr_dist), size=n, p=root_attr_dist)

    for child, parents in bn:
        for parents_key, weights in conditional_probabilities[child].items():
            # Keys are the string form of the parent values; literal_eval parses
            # them without executing arbitrary code.
            parents_instance = list(literal_eval(parents_key))
            dist = _as_probabilities(weights)

            # Rows whose parents all match this configuration. Building the mask
            # directly also works for column names containing quotes and for an
            # empty parent list.
            matches = np.ones(n, dtype=bool)
            for parent, value in zip(parents, parents_instance):
                matches &= (encoded_df[parent] == value).to_numpy()

            size = int(matches.sum())
            if size:
                encoded_df.loc[matches, child] = rng.choice(len(dist), size=size, p=dist)

    # Fail with an explicit message instead of an opaque NaN-to-int cast error.
    unsampled = [column for column in encoded_df.columns if encoded_df[column].isna().any()]
    if unsampled:
        raise ValueError(
            f"No conditional distribution matched some rows for attribute(s): {unsampled}"
        )

    return encoded_df.astype(int)

In [11]:
# Column order used by the sampler: root attribute first, then the children in network order.
get_sampling_order(bn)

['Voorkeur 4', 'Voorkeur 3', 'Voorkeur 5', 'Voorkeur 2', 'Voorkeur 1']

In [12]:
# Scratch check: length of the 'Voorkeur 3' distribution stored under the
# parent key '[10]'.
parents_instance = [3]
voorkeur_3_distributions = conditional_probabilities['Voorkeur 3']
dist = voorkeur_3_distributions['[10]']
# dist[list(eval(parents_instance))[0]]
len(dist)

183

In [25]:
# Draw as many synthetic rows as there are 'vwo' rows in the encoded data.
# NOTE(review): sampling goes through NumPy's random state, so the rows shown
# differ between runs unless that state is seeded beforehand.
synthetic_vwo = generate_encoded_dataset(n=len(vwo_encoded), description=data_description)
synthetic_vwo.head()

Voorkeur 3 ['Voorkeur 4'] [0] 183
Voorkeur 3 ['Voorkeur 4'] [1] 183
Voorkeur 3 ['Voorkeur 4'] [2] 183
Voorkeur 3 ['Voorkeur 4'] [3] 183
Voorkeur 3 ['Voorkeur 4'] [4] 183
Voorkeur 3 ['Voorkeur 4'] [5] 183
Voorkeur 3 ['Voorkeur 4'] [6] 183
Voorkeur 3 ['Voorkeur 4'] [7] 183
Voorkeur 3 ['Voorkeur 4'] [8] 183
Voorkeur 3 ['Voorkeur 4'] [9] 183
Voorkeur 3 ['Voorkeur 4'] [10] 183
Voorkeur 3 ['Voorkeur 4'] [11] 183
Voorkeur 3 ['Voorkeur 4'] [12] 183
Voorkeur 3 ['Voorkeur 4'] [13] 183
Voorkeur 3 ['Voorkeur 4'] [14] 183
Voorkeur 3 ['Voorkeur 4'] [15] 183
Voorkeur 3 ['Voorkeur 4'] [16] 183
Voorkeur 3 ['Voorkeur 4'] [17] 183
Voorkeur 3 ['Voorkeur 4'] [18] 183
Voorkeur 3 ['Voorkeur 4'] [19] 183
Voorkeur 3 ['Voorkeur 4'] [20] 183
Voorkeur 3 ['Voorkeur 4'] [21] 183
Voorkeur 3 ['Voorkeur 4'] [22] 183
Voorkeur 3 ['Voorkeur 4'] [23] 183
Voorkeur 3 ['Voorkeur 4'] [24] 183
Voorkeur 3 ['Voorkeur 4'] [25] 183
Voorkeur 3 ['Voorkeur 4'] [26] 183
Voorkeur 3 ['Voorkeur 4'] [27] 183
Voorkeur 3 ['Voorkeur 4'] [28]

Unnamed: 0,Voorkeur 4,Voorkeur 3,Voorkeur 5,Voorkeur 2,Voorkeur 1
0,177,106,141,154,24
1,104,139,170,154,79
2,154,62,54,130,40
3,61,145,35,105,85
4,19,140,121,131,0
