In [487]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.misc as misc
%matplotlib inline

In [962]:
#Reading in the data
data = np.load('MC_50_5_4.npz.npy')
trueCls = np.load('MC_50_5_4_reference_classes.npy')

In [1083]:
#Inferring the number of classes and data length
numData = data.shape[0]
numExperts = data.shape[1]
numClasses = data.shape[2]

In [1092]:
#Initialization
alpha = [1, 1, 1, 1]
z = np.random.dirichlet(alpha, size=numData)
observed_classes = np.argmax(z, axis=1)

pi = np.random.dirichlet(alpha, size=1)[0]
lam = 10
lambda_mat = lam*np.eye( numClasses) + np.ones((numClasses, numClasses))
ConfMatList = []
for exp_index in range(numExperts):
    confMat = []
    for cls_idx in range(numClasses):
        confMat.append(np.random.dirichlet(lambda_mat[cls_idx,:],1)[0])
    ConfMatList.append(np.array(confMat))
confMatArray = np.array(ConfMatList)

In [1126]:
lambda_mat

array([[ 11.,   1.,   1.,   1.],
       [  1.,  11.,   1.,   1.],
       [  1.,   1.,  11.,   1.],
       [  1.,   1.,   1.,  11.]])

In [1093]:
confMatArray = confMatArray/confMatArray.sum(axis=(2))[:,:,np.newaxis]

In [1094]:
def compute_LL(data, confMatArray, z, pi):
    log_like = 0
    numData = data.shape[0]
    numExperts = data.shape[1]
    numClasses = data.shape[2]
    obs_classes = np.argmax(z,axis=1)
    for data_idx in range(numData):
        data_val = data[data_idx]
        z_val = z[data_idx]
        obs_val = np.argmax(data_val,axis=1)
        for cls_idx in range(numClasses):
            log_like += z_val[cls_idx]*(np.log(pi[cls_idx]) + \
                    np.sum(np.multiply(np.log(confMatArray[:, obs_val, cls_idx]), \
                                data_val[:, cls_idx])))
    return log_like

In [1125]:
np.argmax(data,axis=2).shape

(50, 5)

In [1096]:
def E_step(data, confMatArray, pi, obs_classes):
    numData = data.shape[0]
    numExperts = data.shape[1]
    numClasses = data.shape[2]
    z = np.zeros((numData,numClasses))
    for i in range(numData):
        r = np.argmax(data[i],axis)
        for k in range(numClasses):
            z[i, k] = pi[k]*np.sum(confMatArray[:,k,r])
    return z/z.sum(axis=1)[:,np.newaxis]


In [1097]:
def M_Step(data, z):
    numData = data.shape[0]
    numExperts = data.shape[1]
    numClasses = data.shape[2]
    r = np.argmax(data,axis=2)
    newPi = np.mean(z, axis=0)
    newConfMatArray = np.empty(confMatArray.shape)
    for j in range(numExperts):
        for k in range(numClasses):
            dat = data[observed_classes == k]
            for t in range(numClasses):
                newConfMatArray
#                 newConfMatArray[e_idx, cls_idx, row_cls_idx] = 0.00001 + \
#                 np.sum(np.multiply(dat[:,e_idx,row_cls_idx], z_dat[:, row_cls_idx]))
#     newConfMatArray = newConfMatArray/newConfMatArray.sum(axis=(2))[:,:,np.newaxis]
#     return newConfMatArray, newPi

In [1098]:
compute_LL(data, confMatArray, z, pi)

-166.75445668144593

In [1099]:
for i in range(200):
    z = E_step(data, confMatArray, pi, np.argmax(z,axis=1))
    confMatArray, pi = M_Step(data, z)
    print "Likelihood in iteration %f is %f", i+1, compute_LL(data, confMatArray, z, pi)

Likelihood in iteration %f is %f 1 -600.721322227
Likelihood in iteration %f is %f 2 -76.2494896234
Likelihood in iteration %f is %f 3 -101.993466568
Likelihood in iteration %f is %f 4 -76.8515019251
Likelihood in iteration %f is %f 5 -106.90021428
Likelihood in iteration %f is %f 6 -75.6212820455
Likelihood in iteration %f is %f 7 -112.218877592
Likelihood in iteration %f is %f 8 -75.3520014436
Likelihood in iteration %f is %f 9 -118.172483991
Likelihood in iteration %f is %f 10 -75.3400032584
Likelihood in iteration %f is %f 11 -122.56737513
Likelihood in iteration %f is %f 12 -75.4004750697
Likelihood in iteration %f is %f 13 -123.67860513
Likelihood in iteration %f is %f 14 -75.4405286692
Likelihood in iteration %f is %f 15 -123.817470828
Likelihood in iteration %f is %f 16 -75.4565417614
Likelihood in iteration %f is %f 17 -123.834701513
Likelihood in iteration %f is %f 18 -75.4623026001
Likelihood in iteration %f is %f 19 -123.837691447
Likelihood in iteration %f is %f 20 -75.464

In [1100]:
np.mean(z,axis=0)

array([ 0.37047953,  0.15441723,  0.22194622,  0.25315702])

In [1101]:
np.argmax(z,axis=1)

array([2, 0, 0, 0, 1, 0, 0, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 3, 0,
       3, 0, 0, 0, 0, 0, 3, 1, 2, 3, 2, 2, 3, 2, 3, 0, 3, 0, 0, 0, 2, 0, 3,
       0, 3, 3, 3])

In [1102]:
trueCls

array([3, 1, 1, 2, 2, 2, 3, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 3, 2, 0, 2,
       0, 1, 1, 3, 3, 2, 2, 3, 0, 0, 3, 1, 0, 0, 0, 2, 0, 2, 3, 1, 3, 2, 2,
       3, 2, 2, 0])

In [1069]:
for c in confMatArray:
    print "Confusion matrix: "
    print c

Confusion matrix: 
[[  9.99942854e-01   1.90526755e-05   1.90477692e-05   1.90455914e-05]
 [  2.49994137e-01   2.49994137e-01   2.50017588e-01   2.49994137e-01]
 [  2.67434419e-01   2.44188530e-01   2.44188525e-01   2.44188525e-01]
 [  1.22569626e-02   1.22555974e-02   9.63231843e-01   1.22555974e-02]]
Confusion matrix: 
[[  9.99862782e-01   9.34809038e-05   2.03134683e-05   2.34240330e-05]
 [  2.49994137e-01   2.49994137e-01   2.50017588e-01   2.49994137e-01]
 [  9.34846736e-01   4.73364102e-02   8.90663705e-03   8.91021707e-03]
 [  3.14489801e-01   2.65735203e-02   6.55715733e-01   3.22094624e-03]]
Confusion matrix: 
[[  9.57062440e-01   3.98158230e-02   3.10310690e-03   1.86302609e-05]
 [  9.08291046e-03   9.08291046e-03   9.72751269e-01   9.08291046e-03]
 [  9.34720491e-01   4.74350610e-02   8.91768545e-03   8.92676286e-03]
 [  6.40852841e-01   4.81657545e-02   3.03594829e-01   7.38657574e-03]]
Confusion matrix: 
[[  4.67166977e-04   9.98598226e-01   4.67186224e-04   4.67420701e-04

In [1013]:
pi

array([  1.00000000e+00,   4.16535353e-39,   2.50803880e-41,
         8.66487509e-31])

Liu, C.,. 2013. How we taught AM207 and lived to tell the tale. IACS Journal 1:1–3.
The students of AM207. 2013. Everything we know, we learned in AM207. IACS Journal 1:3–5. 

We used two data sets: an easy data set and a hard data set. Both have 5 workers and 4 classes, but the difference is in the confusion matrices. The easy data have confusion matrices close to an identity matrix, while the hard confusion matrices are much more muddled. We compared accuracy (the percentage of predicted labels that match the true labels) for 40, 80, 100, 200, and 500 data points, on both the easy and hard data. For EM, we also looked how having a single confusion matrix shared between all workers impacted our results.




The conclusion that we draw from the results is that the YN paradigm is better suited for scenarios where the confusion matrix for the worker poor; that is, when the diagonal of the confusion matrices aren't as heavily weighted. This aligns with our results from our PyMC simulations. The drawback though comes in the form of computation – the number of questions increases linearly with increasing number of classes. Regardless of the method or difficultly of data set, our accuracy increased as we increased the number of data points. 