In [5]:
from sklearn.metrics import accuracy_score
import numpy as np

# Parameters

In [6]:
nWorkers = 5    # Number of Channels
nSamples = 45   # Number of Data
nClasses = 4    # Number of Classes

# Generate Fake Data

In [7]:
from fake_data_generator import RandomConfusionMatrixChannel, DataGenerator

def generate_sample_data(nSamples, nWorkers, nClass, truePredictionRate=.8):
    workers = []
    confusion_matrices = []
    for _ in range(nWorkers):
        worker = RandomConfusionMatrixChannel()
        worker.train(nClass, truePredictionRate)
        workers.append(worker)
        confusion_matrices.append(worker.confusionMatrix)

    samples = {}
    lables = []
    data_generator = DataGenerator(nClass)
    for n in range(nSamples):
        data = data_generator.generate()
        sample = {w:[workers[w].estimate(data)] for w in range(nWorkers)}
        samples[n] = sample
        lables.append(data.label)


    return (samples, lables, confusion_matrices)


def generate_temporal_dependent_data(nSamples, nWorkers, nClass, nActivityChunk, truePredictionRate=.8):
    workers = []
    confusion_matrices = []
    for _ in range(nWorkers):
        worker = RandomConfusionMatrixChannel()
        worker.train(nClass, truePredictionRate)
        workers.append(worker)
        confusion_matrices.append(worker.confusionMatrix)

    samples = {}
    lables = []
    data_generator = DataGenerator(nClass)
    
    chunkSize = nSamples // nActivityChunk

    for n in range(nActivityChunk):
        data = data_generator.generate()
        for m in range(chunkSize):
            index = (n*chunkSize) + m
            sample = {w:[workers[w].estimate(data)] for w in range(nWorkers)}
            samples[index] = sample
            lables.append(data.label)

    data = data_generator.generate()
    for n in range(len(lables), nSamples):
        sample = {w:[workers[w].estimate(data)] for w in range(nWorkers)}
        samples[n] = sample
        lables.append(data.label)

    return (samples, lables, confusion_matrices)

# Test

In [8]:
# samples, labels, confusion_matrices = generate_sample_data(nSamples, nWorkers, nClasses, .8)
samples, labels, confusion_matrices = generate_temporal_dependent_data(nSamples, nWorkers, nClasses, 10, .8)

Run EM algorithm

In [9]:
from dawid_skene import run as run_em

_, _, _, _, class_marginals, error_rates, patient_classes = run_em(samples, verbose=False)

Run HMM algorithm

In [29]:
from hmmlearn import hmm
# Number of possible observations. That can be the same as number of states (for single labeler, or equal to nClasses*nWorkers for multi labelers)
labelers_list = [0] # list of labelers
n_observations = nClasses
# Number of hidden states
n_states = nClasses
estimated_labels = np.argmax(patient_classes, axis=1)
observations = np.array(labels)

# Create a Categorical Hidden Markov Model
model = hmm.CategoricalHMM(n_components=n_states, n_iter=100)

# Fit the model to the observations using the Baum-Welch algorithm
model.fit(observations.reshape(-1,1))


CategoricalHMM(n_components=4, n_features=4, n_iter=100,
               random_state=RandomState(MT19937) at 0x186C7882940)

In [30]:
print(model.emissionprob_)

[[2.57171900e-01 3.69640100e-18 1.30732206e-01 6.12095894e-01]
 [2.43222189e-01 8.36089583e-06 1.20322586e-01 6.36446864e-01]
 [1.82006232e-14 9.99980298e-01 2.90487866e-13 1.97021946e-05]
 [2.53087058e-01 2.47644217e-12 1.25919420e-01 6.20993522e-01]]


# Evaluation

In [None]:
def frobenius_norm(matrix1, matrix2):
    # Convert the input matrices to numpy arrays if they are not already
    matrix1 = np.array(matrix1)
    matrix2 = np.array(matrix2)
    
    # Calculate the Frobenius norm of the difference
    difference = matrix1 - matrix2
    norm = np.linalg.norm(difference, 'fro')
    
    return norm

def one_hot_encode(x, n_classes):
    """
    One hot encode a list of sample labels. Return a one-hot encoded vector for each label.
    : x: List of sample Labels
    : return: Numpy array of one-hot encoded labels
     """
    return np.eye(n_classes)[x]

### Accuracy
Run this cell only if true labels are available

In [None]:

true_labels = labels
estimated_labels = np.argmax(patient_classes, axis=1)

accuracy = int(100 * accuracy_score(true_labels, estimated_labels))

print("Accuracy: %{}".format(accuracy))

### Average Parameter Estimation Error
Run this cell only if true confusion matrices are available

In [None]:
parameter_error_rate = np.mean([frobenius_norm(cm_true, cm_estimate) for cm_true, cm_estimate in zip(confusion_matrices, error_rates)])

print("Average Parameter Estimation Error: {:.2f}".format(parameter_error_rate))