In [3]:
import numpy as np
import math
from numpy.random import beta

# Element-wise exponential: wraps math.exp with np.vectorize so that it can be
# applied to every cell of an array/matrix in one call.
# NOTE(review): not referenced by any code visible in this section -- confirm
# it is used elsewhere before relying on it.
exp = np.vectorize(math.exp)


def MV(Psi):
    """
    The majority voting method.

    Parameters
    ----------
    Psi : list
        Psi[obj] is a list of (source_id, value) votes on object ``obj``.

    Returns
    -------
    list of numpy arrays
        One array per object; entry ``v`` is the fraction of that object's
        votes that were cast for value ``v``. Every array has at least two
        entries (values 0 and 1).
    """
    shares = []
    for votes in Psi:
        values = [val for _, val in votes]
        # minlength=2 keeps a slot for both binary values even when every
        # vote is 0; without it the result has a single entry and indexing
        # [1] on it fails.
        counts = np.bincount(values, minlength=2)
        shares.append(counts / (0.0 + len(votes)))
    return shares


def EM(N, M, Psi, inv_Psi):
    """
    The expectation maximization method (EM) from Dong et al., 2013.

    Parameters
    ----------
    N : int
        Number of sources.
    M : int
        Number of objects.
    Psi : list
        Psi[obj] is a list of (source_id, value) votes on object ``obj``;
        values are 0 or 1.
    inv_Psi : list
        inv_Psi[source] is a list of (obj_id, value) votes cast by ``source``.

    Returns
    -------
    (A, p)
        A[s] is the estimated accuracy of source ``s``. p[obj] is the pair of
        normalised scores [score of value 0, score of value 1] computed in
        the last E-step (i.e. from the accuracies of the previous iteration).
    """
    # convergence eps
    eps = 0.0001
    # Accuracies are clamped into [tiny, 1 - tiny] only when taking logs, so
    # an estimate of exactly 0 or 1 no longer raises "math domain error".
    tiny = np.finfo(float).eps

    # init accuracies
    A = [0.8]*N
    while True:
        # E-step
        p = []
        for obj in range(M):
            C = [0.0, 0.0]
            for s, val in Psi[obj]:
                acc = min(max(A[s], tiny), 1.0 - tiny)
                C[val] += math.log(acc)
                C[1 - val] += math.log(1.0 - acc)
            # Normalise relative to the larger log-score: one of the two
            # weights is then exactly 1, so the denominator cannot underflow
            # to zero however many votes an object has.
            top = max(C)
            w0 = math.exp(C[0] - top)
            w1 = math.exp(C[1] - top)
            p.append([w0 / (w0 + w1), w1 / (w0 + w1)])

        # M-step: a source's accuracy is the mean score of the values it voted
        A_new = [np.average([p[obj][val] for obj, val in votes])
                 for votes in inv_Psi]

        # convergence check (total absolute change of the accuracies)
        converged = sum(abs(np.subtract(A, A_new))) < eps
        A = A_new
        if converged:
            break

    return A, p


def log_likelihood(Psi, A, p):
    """
    Computes the log likelihood of the Psi using A and p.

    Every vote (source, value) on an object adds one log term:
      * value == 1:  log(A[source] * p[obj][1])
      * otherwise:   log((1 - A[source]) * (1 - p[obj][value]))

    Parameters
    ----------
    Psi : list
        Psi[obj] is a list of (source_id, value) votes on object ``obj``.
    A : sequence of float
        A[source] is the accuracy of ``source``.
    p : sequence
        p[obj][v] is the probability assigned to value ``v`` of object ``obj``.

    Returns
    -------
    float
        The sum of the log terms; ``-inf`` if any term is exactly zero
        (0 is returned when there are no votes at all).
    """
    res = 0
    # Walk the observations themselves rather than range(M): the number of
    # objects is whatever Psi contains, independent of any module-level M.
    for obj_id, votes in enumerate(Psi):
        for source_id, value_id in votes:
            if value_id == 1:
                term = A[source_id] * p[obj_id][value_id]
            else:
                term = (1 - A[source_id]) * (1 - p[obj_id][value_id])
            if term == 0:
                # an impossible observation: log(0) is -inf, not an error
                return float('-inf')
            res += math.log(term)
    return res


def random_log_likelihood(N, M, Psi, n_iter=10000):
    """
    Searches for the max log likelihood at random.

    Draws ``n_iter`` random candidates and keeps the one with the highest
    value of ``log_likelihood``.

    Parameters
    ----------
    N : int
        Number of sources; accuracies are drawn uniformly from [0.8, 1.0).
    M : int
        Number of objects; the probability of value 1 is drawn uniformly
        from [0, 1) for each object.
    Psi : list
        Psi[obj] is a list of (source_id, value) votes on object ``obj``.
    n_iter : int, optional
        Number of random candidates to try (default 10000).

    Returns
    -------
    (bf_A, bf_p)
        The best accuracies and per-object [Pr(0), Pr(1)] pairs found. Both
        are empty lists if no candidate was evaluated or none scored above
        ``-inf``.
    """
    # Start from -inf so the best candidate is always kept; a fixed floor
    # such as -100 silently discards every candidate once the data set is
    # large enough for all log likelihoods to fall below it.
    max_log_likelihood = float('-inf')
    bf_A = []
    bf_p = []
    for _ in range(n_iter):
        A = np.random.uniform(0.8, 1.0, N)
        p = [[1 - x, x] for x in np.random.uniform(0, 1, M)]
        cur_ll = log_likelihood(Psi, A, p)
        if cur_ll > max_log_likelihood:
            max_log_likelihood = cur_ll
            bf_A = A
            bf_p = p

    return bf_A, bf_p


def mcmc(N, M, Psi, inv_Psi, n_iter=1000, burnin=10, thin=30):
    """
    MCMC for log-likelihood maximum search.

    Each iteration first samples a binary value for every object from the
    normalised scores implied by the current source accuracies, then samples
    every source accuracy from Beta(agreements + 1, disagreements + 1) given
    those sampled values.

    Parameters
    ----------
    N : int
        Number of sources.
    M : int
        Number of objects.
    Psi : list
        Psi[obj] is a list of (source_id, value) votes on object ``obj``;
        values are 0 or 1.
    inv_Psi : list
        inv_Psi[source] is a list of (obj_id, value) votes cast by ``source``.
    n_iter : int, optional
        Total number of sampling iterations (default 1000).
    burnin : int, optional
        Iterations with index <= burnin are never retained (default 10).
    thin : int, optional
        Only iterations whose index is a multiple of ``thin`` are retained
        (default 30). Must be positive.

    Returns
    -------
    (mcmc_A, mcmc_p)
        mcmc_p[obj] is [share of retained samples with value 0, share with
        value 1]; mcmc_A[s] is the mean of mcmc_p[obj][val] over the votes
        (obj, val) cast by source ``s``.

    Raises
    ------
    ValueError
        If the combination of n_iter, burnin and thin retains no sample.
    """
    # Accuracies are clamped into [tiny, 1 - tiny] only when taking logs, so
    # a sampled accuracy of exactly 0 or 1 cannot raise "math domain error".
    tiny = np.finfo(float).eps

    # random init
    A = np.random.uniform(0.8, 1.0, N)

    # MCMC sampling
    kept = 0
    counts = [[0, 0] for _ in range(M)]
    for _iter in range(n_iter):
        # update objects
        O = []
        for obj in range(M):
            C = [0.0, 0.0]
            for s, val in Psi[obj]:
                acc = min(max(A[s], tiny), 1.0 - tiny)
                C[val] += math.log(acc)
                C[1 - val] += math.log(1.0 - acc)
            # normalise relative to the larger log-score so the denominator
            # cannot underflow to zero
            top = max(C)
            w0 = math.exp(C[0] - top)
            w1 = math.exp(C[1] - top)
            p1 = w1 / (w0 + w1)
            O.append(1 if np.random.rand() < p1 else 0)

        # update sources
        for source_id in range(N):
            votes = inv_Psi[source_id]
            agree = sum(1 for obj, val in votes if val == O[obj])
            disagree = len(votes) - agree
            A[source_id] = beta(agree + 1, disagree + 1)

        if _iter > burnin and _iter % thin == 0:
            kept += 1
            for obj in range(M):
                counts[obj][O[obj]] += 1

    if kept == 0:
        raise ValueError('no sample retained: increase n_iter or lower '
                         'burnin/thin')

    # Normalise by the number of samples actually retained instead of the
    # derived (n_iter - burnin) / thin, which matches it only for some
    # parameter combinations.
    mcmc_p = [[c0 / float(kept), c1 / float(kept)] for c0, c1 in counts]

    # mcmc output
    mcmc_A = [0]*N
    for s in range(N):
        for obj, val in inv_Psi[s]:
            # TODO take advantage of priors (as in Zhao et al., 2012)
            mcmc_A[s] += mcmc_p[obj][val]
        mcmc_A[s] /= (0.0+len(inv_Psi[s]))

    return mcmc_A, mcmc_p


# problem size: N sources voting on M objects
N = 3
M = 6

# observations indexed by object: Psi[obj] = [(source, value), ...]
# chain
Psi = [
    [(0, 0), (1, 1), (2, 1)],
    [(0, 1), (1, 1), (2, 1)],
    [(0, 1), (1, 1), (2, 1)],
    [(0, 1), (1, 1), (2, 1)],
    [(0, 1), (1, 1), (2, 1)],
    [(0, 1), (1, 1), (2, 1)],
]

# the same observations indexed by source: inv_Psi[source] = [(obj, value), ...]
inv_Psi = [
    [(0, 0), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)],
    [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)],
    [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)],
]

# run every method on the same observations
mv_p = MV(Psi)
em_A, em_p = EM(N, M, Psi, inv_Psi)
bf_A, bf_p = random_log_likelihood(N, M, Psi)
mcmc_A, mcmc_p = mcmc(N, M, Psi, inv_Psi)

# report, per method, the mean probability of value 1 over all objects and
# (where accuracies are available) the log likelihood of the estimate
mv_pr = np.average([row[1] for row in mv_p])
print('MV   Pr: {:1.3f}'.format(mv_pr))

em_pr = np.average([row[1] for row in em_p])
em_ll = log_likelihood(Psi, em_A, em_p)
print('EM   Pr: {:1.3f}\tlog-likelihood: {:3.2f}\tA: {}\tp: {}'.format(em_pr, em_ll, em_A, em_p))

bf_pr = np.average([row[1] for row in bf_p])
bf_ll = log_likelihood(Psi, bf_A, bf_p)
print('RND  Pr: {:1.3f}\tlog-likelihood: {:3.2f}\tA: {}\tp: {}'.format(bf_pr, bf_ll, bf_A, bf_p))

mcmc_pr = np.average([row[1] for row in mcmc_p])
mcmc_ll = log_likelihood(Psi, mcmc_A, mcmc_p)
print('MCMC Pr: {:1.3f}\tlog-likelihood: {:3.2f}\tA: {}\tp: {}'.format(mcmc_pr, mcmc_ll, mcmc_A, mcmc_p))

(33, 33)
MV   Pr: 0.944
EM   Pr: 1.000	log-likelihood: -2.70	A: [0.83333333336386961, 0.99999999995419697, 0.99999999995419697]	p: [[2.290184421798629e-10, 0.9999999997709815], [9.16013573199207e-12, 0.9999999999908399], [9.16013573199207e-12, 0.9999999999908399], [9.16013573199207e-12, 0.9999999999908399], [9.16013573199207e-12, 0.9999999999908399], [9.16013573199207e-12, 0.9999999999908399]]
RND  Pr: 0.870	log-likelihood: -5.99	A: [ 0.87438904  0.96585112  0.94824855]	p: [[0.38175117395775393, 0.61824882604224607], [0.036643436402551521, 0.96335656359744848], [0.0066079709528127362, 0.99339202904718726], [0.13190427518325654, 0.86809572481674346], [0.13348021537284793, 0.86651978462715207], [0.090469234471765136, 0.90953076552823486]]
MCMC Pr: 0.657	log-likelihood: -15.98	A: [0.6212121212121208, 0.6565656565656561, 0.6565656565656561]	p: [[0.3939393939393938, 0.6060606060606057], [0.33333333333333326, 0.6666666666666663], [0.33333333333333326, 0.6666666666666663], [0.3333333333333332