In [151]:
import pandas as pd
import numpy as np
from itertools import combinations, product
from progressbar import ProgressBar

In [148]:
# load data
occurence = np.loadtxt('./data/chowliu-input.txt')
names = np.loadtxt('./data/names.txt',dtype=str)

In [152]:
def calculate_empirical_coocurence_probability(occurence):
    n_records, n_classes = occurence.shape
    matrix_11 = np.zeros((n_classes, n_classes))
    matrix_00 = np.zeros((n_classes, n_classes))
    matrix_10 = np.zeros((n_classes, n_classes))
    matrix_01 = np.zeros((n_classes, n_classes))
    bar = ProgressBar()
    for row in bar(occurence):
        idx_1 = np.where(row==1)[0]
        idx_0 = np.where(row==0)[0]
        idx_11 = combinations(idx_1, 2)
        idx_00 = combinations(idx_0, 2)
        idx_10 = product(idx_1, idx_0)
        idx_01 = product(idx_0, idx_1)
        for idx in idx_11:
            matrix_11[idx]+=1
        for idx in idx_00:
            matrix_00[idx]+=1
        for idx in idx_10:
            matrix_10[idx]+=1
        for idx in idx_01:
            matrix_01[idx]+=1
            
    return matrix_11/n_records, matrix_00/n_records, matrix_10/n_records, matrix_01/n_records

def calculate_empirical_marginal_probability(occurence):
    return occurence.mean(0), 1-occurence.mean(0)

def calculate_mutial_information(occurence):
    n_records, n_classes = occurence.shape
    mutial_information_matrix = np.zeros((n_classes, n_classes))
    prob_11, prob_00, prob_10, prob_01 = calculate_empirical_coocurence_probability(occurence)
    prob_1, prob_0 = calculate_empirical_marginal_probability(occurence)
    all_idx = combinations(np.arange(n_classes), 2)
    for i,j in all_idx:
        mutial_information_matrix[i,j]+=prob_11[i,j]*np.log(prob_11[i,j]/(prob_1[i]*prob_1[j]))
        mutial_information_matrix[i,j]+=prob_00[i,j]*np.log(prob_00[i,j]/(prob_0[i]*prob_0[j]))
        mutial_information_matrix[i,j]+=prob_10[i,j]*np.log(prob_10[i,j]/(prob_1[i]*prob_0[j]))
        mutial_information_matrix[i,j]+=prob_01[i,j]*np.log(prob_01[i,j]/(prob_0[i]*prob_1[j]))
    return np.nan_to_num(mutial_information_matrix)

In [183]:
mutial_information_matrix = calculate_mutial_information(occurence)

100% (4367 of 4367) |#####################| Elapsed Time: 0:00:15 Time: 0:00:15
  
  
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


In [187]:
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree

In [218]:
def get_tree_edges_from_mutual_information(mutial_information_matrix):
    neg_mut_info_csr = csr_matrix(-mutial_information_matrix)
    mstree_adj = minimum_spanning_tree(neg_mut_info_csr).todense()
    return [edge for edge in zip(*np.where(mstree_adj<0))]

def calculate_edge_potential(edges, occurence,name=None):
    prob_11, prob_00, prob_10, prob_01 = calculate_empirical_coocurence_probability(occurence)
    prob_1, prob_0 = calculate_empirical_marginal_probability(occurence)
    pot_dict = {}
    for edge in edges:
        if name is None:
            key = edge
        else:
            key = '--'.join(name[np.array(edge)])
        i,j = edge
        pot_11 =prob_11[i,j]/(prob_1[i]*prob_1[j])
        pot_00 =prob_00[i,j]/(prob_0[i]*prob_0[j])
        pot_10 =prob_10[i,j]/(prob_1[i]*prob_0[j])
        pot_01 =prob_01[i,j]/(prob_0[i]*prob_1[j])
        pot_dict[key] = np.array([[pot_00, pot_01], [pot_10, pot_11]])
    return pot_dict

def calculate_node_potentials(occurence):
    prob_1, prob_0 = calculate_empirical_marginal_probability(occurence)
    return np.stack([prob_0, prob_1]).T

In [241]:
edges = get_tree_edges_from_mutual_information(mutial_information_matrix)
edge_potential = calculate_edge_potential(edges, occurence)

100% (4367 of 4367) |#####################| Elapsed Time: 0:00:15 Time: 0:00:15


In [252]:
node_potential = calculate_node_potentials(occurence)

In [260]:
def get_uai_str(edges, node_potentials, edge_potentials):
    num_vars = node_potentials.shape[0]
    network_type = 'MARKOV'
    num_vars_str = str(num_vars)
    var_cardinals = ' '.join(['2']*num_vars)
    num_cliques = str(len(edges))
    edge_cliques = [' '.join(['2', str(i), str(j)]) for i, j in edges]
    preamble = ([network_type, num_vars_str, var_cardinals, num_cliques]+ edge_cliques)
    function_tables = []
    # node potentials
    for i in range(num_vars):
        prob_str = ' '.join([' ', str(node_potentials[i][0]), str(node_potentials[i][1])])
        function_tables += ['', '2', prob_str]
    # edge potentials
    for i, j in edges:
        prob00 = edge_potentials[(i,j)][0][0]
        prob01 = edge_potentials[(i,j)][0][1]
        prob_str1 = ' '.join([' ', str(prob00), str(prob01)])
        prob10 = edge_potentials[(i,j)][1][0]
        prob11 = edge_potentials[(i,j)][1][1]
        prob_str2 = ' '.join([' ', str(prob10), str(prob11)])
        function_tables += ['', '4', prob_str1, prob_str2]
    return '\n'.join(preamble + function_tables + [''])

In [261]:
uai_str = get_uai_str(edges, node_potential, edge_potential)

In [259]:
# Output
with open ('p4.uai', 'w') as f:
    f.write(uai_str)