In [1]:
import pandas as pd
import numpy as np
import pgmpy
from pgmpy.estimators import HillClimbSearch, TreeSearch, BicScore, K2Score, BDeuScore
from pgmpy.estimators import BayesianEstimator
from pgmpy.models import BayesianModel
from pgmpy.inference import VariableElimination, BeliefPropagation
from graphviz import Digraph
from sklearn.preprocessing import KBinsDiscretizer
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score




In [2]:
def discretize(df_data, num_bins, binning_strategy):
    """Bin every column of ``df_data`` into ordinal bin indices.

    A ``KBinsDiscretizer`` with ordinal encoding is fitted on ``df_data`` and
    applied to the same data.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Numeric data to discretize; every column is binned.
    num_bins : int
        Passed to ``KBinsDiscretizer`` as ``n_bins``.
    binning_strategy : str
        Passed to ``KBinsDiscretizer`` as ``strategy`` (the notebook uses
        ``'quantile'``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column labels as ``df_data``
        holding the bin index of each value. ``df_data`` itself is left
        unmodified, so the call can be repeated on the same input and
        always yields the same result.
    """
    discretizer = KBinsDiscretizer(n_bins=num_bins, encode='ordinal', strategy=binning_strategy)
    binned = discretizer.fit_transform(df_data)

    # Build a fresh frame instead of overwriting the caller's columns in place;
    # the raw values stay available and re-running the cell is idempotent.
    return pd.DataFrame(binned, columns=df_data.columns, index=df_data.index)

In [3]:
# Evaluate a Bayesian-network classifier with k-fold cross validation.
# For every fold: learn the graph structure on the training rows, fit the
# conditional probability tables, then predict the last column of each test
# row from its other columns with a MAP query.
num_bins = 3
num_folds = 3
binning_strategy = 'quantile'

# Load the data (skipping the first CSV column) and fill missing values with
# the column means, without mutating a frame in place.
df_data = pd.read_csv('communitytesting.csv').iloc[:, 1:]
df_data = df_data.fillna(df_data.mean())

# NOTE(review): the imputation means and the bin edges are computed on the
# full data set before it is split, so each test fold influences its own
# preprocessing. Consider fitting both on the training fold only.
df_discretized_data = discretize(df_data, num_bins, binning_strategy)

target_column = df_discretized_data.columns[-1]
feature_columns = df_discretized_data.columns[:-1]

kfold = KFold(n_splits = num_folds, shuffle = True, random_state = 123)

# K fold cross validation
acc_total = []
print('Cross validation')
for train_index, test_index in kfold.split(df_discretized_data):
    training_samples = df_discretized_data.iloc[train_index, :]
    testing_samples = df_discretized_data.iloc[test_index, :]

    # Construct a graph for each fold (progress bar silenced to keep the output readable)
    structure_estimator = HillClimbSearch(training_samples, scoring_method = K2Score(training_samples))
    model_structure = structure_estimator.estimate(show_progress = False)

    # Only variables that take part in at least one learned edge become nodes
    # of the model, so evidence and query must be restricted to those nodes.
    model_structure_bayes = BayesianModel(ebunch = model_structure.edges())
    model_nodes = set(model_structure_bayes.nodes())

    if target_column in model_nodes:
        # Learn the conditional probability distributions of the graph built from this fold
        model_structure_bayes.fit(data = training_samples, estimator = BayesianEstimator, prior_type='K2')
        inference_engine = BeliefPropagation(model_structure_bayes)

        # Features without any edge are not part of the model; they are left
        # out of the evidence because the model has no node for them.
        evidence_columns = [column for column in feature_columns if column in model_nodes]

        # Query each testing sample: the evidence is the sample's feature values,
        # the query variable is the one we are interested in predicting.
        pred_classes = []
        for _, sample in testing_samples[evidence_columns].iterrows():
            query_result = inference_engine.map_query(
                variables = [target_column],
                evidence = sample.to_dict(),
                show_progress = False,
            )
            # Maximum a posteriori (MAP) state is the most likely prediction
            pred_classes.append(query_result[target_column])
    else:
        # The structure search attached no edge to the target, so no feature
        # can inform it; fall back to the most frequent class in the training fold.
        majority_class = training_samples[target_column].mode().iloc[0]
        pred_classes = [majority_class] * len(test_index)

    target_classes = testing_samples[target_column]
    fold_accuracy = accuracy_score(target_classes, pred_classes)
    acc_total.append(fold_accuracy)
    print('Accuracy of this fold is: ', fold_accuracy)
print('Mean accuracy is: ', np.mean(acc_total))
print('Std accuracy is: ', np.std(acc_total))


  0%|          | 0/1000000 [00:00<?, ?it/s]

Cross validation


  0%|          | 9/1000000 [00:00<16:33:42, 16.77it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?i

Accuracy of this fold is:  0.9819819819819819


  0%|          | 7/1000000 [00:00<18:32:23, 14.98it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?i

Accuracy of this fold is:  0.8828828828828829


  0%|          | 6/1000000 [00:00<21:38:42, 12.83it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?i

Accuracy of this fold is:  0.8648648648648649
Mean accuracy is:  0.9099099099099099
Std accuracy is:  0.05149077537382053



