In [1]:
import numpy as np
import pymc as pm
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the expert votes and the reference (true) class of every item.
trueCls = np.load('../data/preset_MC_80_5_4_reference_classes.npy')
data = np.load('../data/preset_MC_80_5_4.npz.npy')
# Show the dimensions of the vote array.
np.shape(data)

(80, 5, 4)

In [3]:
# Inspect one item: its votes (one row per expert) and its reference class.
idx = 5
(data[idx], trueCls[idx])

(array([[ 1.,  0.,  0.,  0.],
        [ 1.,  0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.],
        [ 0.,  1.,  0.,  0.],
        [ 1.,  0.,  0.,  0.]]), 0)

In [4]:
# Read the problem sizes off the first three axes of the vote array:
# number of items, number of experts per item, number of classes.
numData, numExperts, numClasses = data.shape[:3]

Initializing the Priors for the PyMC model.

In [5]:
# Symmetric Dirichlet prior over the class proportions, one entry per class.
# Sized from numClasses (instead of a hard-coded four-element list) so the
# cell keeps working when the dataset has a different number of classes.
alpha = [1] * numClasses
rho = pm.Dirichlet('rho', theta=alpha, plot=False)
# Completed counterpart of rho, used where the full probability vector is needed.
rho_completedD = pm.CompletedDirichlet('rho_completed', rho, plot=False)
# One latent class indicator per item: a single multinomial draw with probabilities rho.
z = pm.Container([pm.Multinomial('z_%i' % i, n=1, p=rho, plot=False)
                  for i in range(numData)])
# Prior concentration for the confusion matrix: lam + 1 on the diagonal and 1
# everywhere else, i.e. each row favours the expert reporting the true class.
lam = 10
lambda_mat = lam * np.eye(numClasses) + np.ones((numClasses, numClasses))
# One Dirichlet-distributed row per true class ...
confMat = pm.Container([pm.Dirichlet(name='theta_row_' + str(i), theta=lambda_mat[i, :], plot=False)
                        for i in range(numClasses)])
# ... and the completed counterpart of each row, which is what the likelihood reads.
confMatHat = pm.Container([pm.CompletedDirichlet('theta_row_hat' + str(i), confMat[i], plot=False)
                           for i in range(numClasses)])

Likelihood for the Multiclass case.

In [6]:
@pm.observed
def mc_likelihood(value=data, z=z, confMat=confMatHat, rho=rho_completedD):
    """
    Log-likelihood of the expert votes given the current class assignments.

    value - observed votes, indexed as [item, expert, class]
    z - current one-hot class indicator of every item (latent, sampled)
    confMat - rows of the confusion matrix; row c is used for items whose
              current class is c
    rho - class proportions; not used in the computation, kept so that the
          signature (and therefore the node's parents) stays unchanged

    Returns the summed log-likelihood over all items.
    """
    # Score the `value` passed to the function rather than the notebook-level
    # `data`, so the result always refers to the data this node was given.
    votes_per_class = np.sum(value, axis=1)
    # The log of each confusion-matrix row does not depend on the item, so it
    # is computed once per evaluation instead of once per item.
    log_rows = [np.log(confMat[c]) for c in range(value.shape[2])]
    log_like = 0
    for data_idx in range(value.shape[0]):
        # Position of the single 1 in the one-hot indicator = current class.
        class_num = np.where(z[data_idx] == 1)[0][0]
        # Vote counts weighted by the log-probabilities of that class's row.
        log_like += np.sum(np.multiply(votes_per_class[data_idx], log_rows[class_num]))
    return log_like

In [7]:
# Collect every node of the graph into a single PyMC model.
mod = pm.Model([
    rho, rho_completedD,    # class proportions
    z,                      # latent class indicator of each item
    confMat, confMatHat,    # confusion-matrix rows
    mc_likelihood,          # likelihood of the observed votes
])

In [8]:
# Run the sampler (10000 iterations, burn=1000, thin=5) and then draw
# PyMC's plots for the sampled model.
mcmc = pm.MCMC(mod)
mcmc.sample(thin=5, burn=1000, iter=10000)
pm.Matplot.plot(mcmc)

 [-----------------100%-----------------] 10000 of 10000 complete in 464.7 sec

Results obtained from PyMC for different datasets.

#### For the data file preset_MC_80_5_4.npz.npy

In [9]:
# Mean of each confusion-matrix row over its trace, one row per line.
# Single-argument print(...) produces the same output on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("Confusion Matrix")
for idx in range(numClasses):
    print(np.mean(confMatHat[idx].trace()[:], axis=0))

Confusion Matrix
[[ 0.84090251  0.0520346   0.0700579   0.037005  ]]
[[ 0.02860957  0.88412848  0.05101491  0.03624704]]
[[ 0.04659353  0.04335627  0.88466778  0.02538242]]
[[ 0.07966358  0.0268948   0.01804512  0.87539649]]


In [10]:
# For every item, average its class-indicator trace over the samples and take
# the most frequent class as the final label.
final_z = [np.argmax(np.mean(z[i].trace()[:], axis=0)) for i in range(numData)]

Number of Recovered Labels and Final Class Distribution

In [11]:
# Count the items whose inferred label matches the reference class.
np.sum(np.asarray(final_z) == trueCls)

80

In [12]:
# Mean of the class-proportion trace, averaged over the samples.
rho_completedD.trace().mean(axis=0)

array([[ 0.25200085,  0.24972981,  0.24869499,  0.24957436]])

#### For the data file preset_MC_200_5_4.npz.npy

In [18]:
# Mean of each confusion-matrix row over its trace, one row per line.
# Single-argument print(...) produces the same output on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("Confusion Matrix")
for idx in range(numClasses):
    print(np.mean(confMatHat[idx].trace()[:], axis=0))

Confusion Matrix
[[ 0.83215003  0.06998182  0.05680138  0.04106677]]
[[ 0.00614853  0.88403796  0.06243647  0.04737704]]
[[ 0.05332688  0.03417867  0.85687459  0.05561986]]
[[ 0.05389482  0.04569116  0.05720491  0.84320912]]


In [15]:
# For every item, average its class-indicator trace over the samples and take
# the most frequent class as the final label.
final_z = [np.argmax(np.mean(z[i].trace()[:], axis=0)) for i in range(numData)]

Number of Recovered Labels and Final Class Distribution

In [16]:
# Count the items whose inferred label matches the reference class.
np.sum(np.asarray(final_z) == trueCls)

197

In [17]:
# Mean of the class-proportion trace, averaged over the samples.
rho_completedD.trace().mean(axis=0)

array([[ 0.26649719,  0.23522991,  0.25105032,  0.24722259]])

#### For the data file preset_MC_100_5_4.npz.npy

In [9]:
# Mean of each confusion-matrix row over its trace, one row per line.
# Single-argument print(...) produces the same output on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("Confusion Matrix")
for idx in range(numClasses):
    print(np.mean(confMatHat[idx].trace()[:], axis=0))

Confusion Matrix
[[ 0.6930749   0.16420421  0.03823254  0.10448835]]
[[ 0.05658189  0.75923519  0.102547    0.08163592]]
[[ 0.15113202  0.13492457  0.5746348   0.13930861]]
[[ 0.08296775  0.09377541  0.07021663  0.75304021]]


In [10]:
# For every item, average the last 1000 samples of its class-indicator trace
# and take the most frequent class as the final label.
final_z = [np.argmax(np.mean(z[i].trace()[-1000:], axis=0)) for i in range(numData)]

Number of Recovered Labels

In [11]:
# Count the items whose inferred label matches the reference class.
np.sum(np.asarray(final_z) == trueCls)

88

#### For the data file preset_MC_40_5_4.npz.npy

In [9]:
# Mean of each confusion-matrix row over its trace, one row per line.
# Single-argument print(...) produces the same output on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("Confusion Matrix")
for idx in range(numClasses):
    print(np.mean(confMatHat[idx].trace()[:], axis=0))

Confusion Matrix
[[ 0.8819219   0.04241082  0.04085174  0.03481554]]
[[ 0.05651182  0.88355781  0.02841986  0.03151051]]
[[ 0.03010899  0.04463091  0.91061273  0.01464737]]
[[ 0.066242    0.01119483  0.06289027  0.8596729 ]]


In [10]:
# For every item, average its class-indicator trace over the samples and take
# the most frequent class as the final label.
final_z = [np.argmax(np.mean(z[i].trace()[:], axis=0)) for i in range(numData)]

Number of Recovered Labels and Final Class Distribution

In [11]:
# Count the items whose inferred label matches the reference class.
np.sum(np.asarray(final_z) == trueCls)

40

In [12]:
# Mean of the class-proportion trace, averaged over the samples.
rho_completedD.trace().mean(axis=0)

array([[ 0.24664998,  0.2563131 ,  0.24541446,  0.25162246]])