In [1]:
import numpy as np

#### Multiclass classification case

In [183]:
# Size of the synthetic problem.
numExperts = 5
numClasses = 4
numData = 80
# Per-expert bias values, keyed by the file-name prefix each set was used for.
# Every list needs one entry per expert (genConfMat asserts this).
biasPresets = {
    'preset': [0.01, 0.15, 0.07, 0.05, 0.03],
    'same_preset': [0.05, 0.05, 0.05, 0.05, 0.05],
    'hard_preset': [0.15, 0.20, 0.12, 0.10, 0.05],
    'extra_hard': [0.02, 0.2, 0.25, 0.15, 0.4],
}
# Pick the preset by name instead of (un)commenting the lists.
# NOTE: the np.save cells further down hard-code the 'same_preset' file prefix;
# update those paths as well when switching to another preset.
presetName = 'same_preset'
biases = biasPresets[presetName]

In [184]:
def genConfMat(numExperts, numClasses, biases):
    """
    Build one row-normalised confusion matrix per expert.

    For a bias b and K = numClasses the matrix is (I + b * ones) / (1 + K * b):
    every diagonal entry equals (1 + b) / (1 + K * b) and every off-diagonal
    entry equals b / (1 + K * b), so each row sums to 1.

    INPUT-
    numExperts - number of Experts
    numClasses - number of Classes
    biases - one bias value per expert; larger values move probability mass
             from the diagonal to the off-diagonal entries

    OUTPUT-
    List of numExperts arrays, each of shape (numClasses, numClasses)
    """
    # The bias list has to line up with the experts one-to-one
    assert len(biases) == numExperts
    identity = np.eye(numClasses, dtype=float)
    allOnes = np.ones((numClasses, numClasses), dtype=float)
    # Unnormalised matrices: identity plus a constant offset of size bias
    unnormalised = [identity + biases[idx] * allOnes for idx in range(numExperts)]
    # Divide every row by its own sum so that rows are probability vectors
    return [mat / mat.sum(axis=1)[:, np.newaxis] for mat in unnormalised]

In [185]:
def genTrueClasses(numData, numClasses, classDist='uniform'):
    """
    Generate the ground-truth class of every data point.

    INPUT
    numData - number of data points to be generated
    numClasses - number of classes
    classDist - type of distribution governing true classes in the data:
        'uniform' - every label is drawn independently and uniformly at random
        'pre-selected-uniform' - every class appears exactly
            numData // numClasses times, in shuffled order; when numData is
            not a multiple of numClasses the remainder is dropped, so fewer
            than numData points are returned

    OUTPUT
    data - one hot vectorized data, shape (number of points, numClasses)
    data_num - data with the classes as integers

    RAISES
    ValueError - if classDist is not one of the supported names
    """
    if classDist == 'uniform':
        data_num = np.random.choice(range(numClasses), numData)
    elif classDist == 'pre-selected-uniform':
        # Floor division keeps the per-class count an integer on Python 2 and 3
        # alike; plain "/" yields a float on Python 3.
        perClass = numData // numClasses
        # np.repeat gives [0,..,0,1,..,1,...] with an integer dtype even when
        # perClass is 0, so the one-hot lookup below stays valid.
        data_num = np.repeat(np.arange(numClasses), perClass)
        np.random.shuffle(data_num)
    else:
        # Fail loudly instead of implicitly returning None
        raise ValueError("Unknown classDist %r; expected 'uniform' or "
                         "'pre-selected-uniform'" % (classDist,))
    # Row k of the identity matrix is the one-hot vector of class k
    data = np.eye(numClasses)[data_num]
    return data, data_num

In [186]:
def genExpertLabels_MC(trueClsNum, confMatList, numExperts, numClasses):
    """
    Simulate one multiclass label per expert for every data point.

    Each expert's label is sampled from the row of that expert's confusion
    matrix selected by the data point's true class.

    INPUT
    trueClsNum - data with the classes as integers
    confMatList - List of confusion matrices, one per expert
    numExperts - number of Experts
    numClasses - number of classes

    OUTPUT
    3d array of one-hot labels, indexed as [data point, expert, class]
    """
    # One confusion matrix per expert is required
    assert len(confMatList) == numExperts
    classIds = range(numClasses)
    samples = []
    for trueCls in trueClsNum:
        votes = np.zeros((numExperts, numClasses))
        for expertIdx, expertMat in enumerate(confMatList):
            # Row trueCls holds this expert's label distribution for the item
            picked = np.random.choice(classIds, p=expertMat[trueCls, :])
            votes[expertIdx, picked] = 1
        samples.append(votes)
    return np.array(samples)

In [187]:
# One row-normalised confusion matrix per expert, built from the configured biases.
confMatList = genConfMat(numExperts=numExperts, numClasses=numClasses, biases=biases)

In [188]:
# Balanced ground truth: one-hot rows plus the matching integer labels.
trueClasses, trueClsNum = genTrueClasses(
    numData=numData,
    numClasses=numClasses,
    classDist='pre-selected-uniform'
)

In [189]:
# Sample every expert's multiclass label for every data point.
expertData = genExpertLabels_MC(
    trueClsNum=trueClsNum,
    confMatList=confMatList,
    numExperts=numExperts,
    numClasses=numClasses
)

In [190]:
# Sanity check: the axes are (data point, expert, class).
expertData.shape

(80, 5, 4)

In [191]:
# NOTE(review): np.save writes the .npy format and appends '.npy' when the file
# name does not already end with it, so this path gains a '.npy' suffix on disk
# despite the '.npz' in the name - confirm that is what the loading code expects.
mcLabelPath = '../data/same_preset_MC_' + '_'.join([str(numData), str(numExperts), str(numClasses)]) + '.npz'
np.save(mcLabelPath, expertData)

In [192]:
# Store the integer ground-truth labels next to the expert labels.
refClassPath = '../data/same_preset_MC_' + '_'.join([str(numData), str(numExperts), str(numClasses)]) + '_reference_classes'
np.save(refClassPath, trueClsNum)

In [193]:
# Show every expert's confusion matrix. The single-argument, parenthesised
# print form runs unchanged on both Python 2 and Python 3.
for i in range(numExperts):
    # Two spaces after "expert" reproduce the output of the original
    # comma-separated Python 2 print statement.
    print("Confusion Matrix for expert  " + str(i+1))
    print(confMatList[i])

Confusion Matrix for expert  1
[[ 0.875       0.04166667  0.04166667  0.04166667]
 [ 0.04166667  0.875       0.04166667  0.04166667]
 [ 0.04166667  0.04166667  0.875       0.04166667]
 [ 0.04166667  0.04166667  0.04166667  0.875     ]]
Confusion Matrix for expert  2
[[ 0.875       0.04166667  0.04166667  0.04166667]
 [ 0.04166667  0.875       0.04166667  0.04166667]
 [ 0.04166667  0.04166667  0.875       0.04166667]
 [ 0.04166667  0.04166667  0.04166667  0.875     ]]
Confusion Matrix for expert  3
[[ 0.875       0.04166667  0.04166667  0.04166667]
 [ 0.04166667  0.875       0.04166667  0.04166667]
 [ 0.04166667  0.04166667  0.875       0.04166667]
 [ 0.04166667  0.04166667  0.04166667  0.875     ]]
Confusion Matrix for expert  4
[[ 0.875       0.04166667  0.04166667  0.04166667]
 [ 0.04166667  0.875       0.04166667  0.04166667]
 [ 0.04166667  0.04166667  0.875       0.04166667]
 [ 0.04166667  0.04166667  0.04166667  0.875     ]]
Confusion Matrix for expert  5
[[ 0.875       0.04166667

#### Yes/No Question data generation

Each expert is asked one yes/no question per class, and the questions are always asked in the same order (by class index). This keeps the generated data uniform across experts and data points.

The first column indicates NO, whereas the second column indicates YES.

In [194]:
def genExpertLabels_YN(trueClsNum, confMatList, numExperts, numClasses):
    """
    Simulate yes/no answers: every expert answers one question per class
    for every data point.

    The probability of a YES for class q is the confusion-matrix entry
    [true class, q] of the answering expert; the answers for different
    classes are drawn independently.

    INPUT
    trueClsNum - data with the classes as integers
    confMatList - List of confusion matrices, one per expert
    numExperts - number of Experts
    numClasses - number of classes

    OUTPUT
    4d array of one-hot answers, indexed as [data point, expert, class, answer]
    where answer index 0 is NO and index 1 is YES
    """
    # One confusion matrix per expert is required
    assert len(confMatList) == numExperts
    answers = []
    for trueCls in trueClsNum:
        votes = np.zeros((numExperts, numClasses, 2))
        for expertIdx, expertMat in enumerate(confMatList):
            for qClass in range(numClasses):
                # A single Bernoulli draw: 1 means YES, 0 means NO
                saidYes = np.random.binomial(1, expertMat[trueCls, qClass])
                votes[expertIdx, qClass, saidYes] = 1
        answers.append(votes)
    return np.array(answers)

In [195]:
# Sample every expert's yes/no answers for every data point and class.
expertData_YN = genExpertLabels_YN(
    trueClsNum=trueClsNum,
    confMatList=confMatList,
    numExperts=numExperts,
    numClasses=numClasses
)

In [196]:
# NOTE(review): np.save writes the .npy format and appends '.npy' when the file
# name does not already end with it, so this path gains a '.npy' suffix on disk
# despite the '.npz' in the name - confirm that is what the loading code expects.
ynLabelPath = '../data/same_preset_YN_' + '_'.join([str(numData), str(numExperts), str(numClasses)]) + '.npz'
np.save(ynLabelPath, expertData_YN)

#### Comparing two classes and asking the expert to pick either

With $k$ classes, the number of questions for every data point is $\frac{k(k-1)}{2}$, one per unordered pair of classes.

In [82]:
import itertools

In [83]:
# Every unordered pair of class indices, in the order the questions are asked.
order = list(itertools.combinations(range(numClasses), 2))

Make sure to save `order` as well, in case it is needed during inference.

In [89]:
def genExpertLabels_CC(trueClsNum, confMatList, numExperts, numClasses, order):
    """
    Simulate pairwise class comparisons: for every data point each expert
    picks one class out of every pair listed in order.

    For a pair (a, b) the expert picks b with probability
    p_b / (p_a + p_b), where p_a and p_b are the entries of the expert's
    confusion-matrix row for the true class. If both entries are zero the
    ratio is 0/0 and therefore undefined, so the matrices should give at
    least one class of every pair a positive probability.

    INPUT
    trueClsNum - data with the classes as integers
    confMatList - List of confusion matrices, one per expert
    numExperts - number of Experts
    numClasses - number of classes (not read by this function; the classes
                 involved are taken from order)
    order - List with tuples of classes being compared (All combinations)
    OUTPUT
    4d array of one-hot answers, indexed as [data point, expert, pair, choice]
    where choice index 0 selects the first class of the pair and index 1
    the second
    """
    # One confusion matrix per expert is required
    assert len(confMatList) == numExperts
    numPairs = len(order)
    answers = []
    for trueCls in trueClsNum:
        votes = np.zeros((numExperts, numPairs, 2))
        for expertIdx, expertMat in enumerate(confMatList):
            for pairIdx, pair in enumerate(order):
                weightFirst = expertMat[trueCls, pair[0]]
                weightSecond = expertMat[trueCls, pair[1]]
                # Renormalise over the two candidates, then flip a biased coin
                pickSecond = np.random.binomial(1, weightSecond/(weightSecond + weightFirst))
                votes[expertIdx, pairIdx, pickSecond] = 1
        answers.append(votes)
    return np.array(answers)

In [108]:
# Sample every expert's choice for every class pair of every data point.
expertData_CC = genExpertLabels_CC(
    trueClsNum=trueClsNum,
    confMatList=confMatList,
    numExperts=numExperts,
    numClasses=numClasses,
    order=order
)

In [113]:
# Sanity check: the axes are (data point, expert, class pair, choice).
# NOTE(review): the recorded output below lists 10 data points although numData
# is set to 80 in the configuration cell; the output looks like it comes from an
# earlier run - re-run the notebook top to bottom to confirm.
expertData_CC.shape

(10, 5, 6, 2)