# Parameter Sensitivity Test

Test the algorithm's accuracy under different problem parameters, using synthetic (fake) data.

## Initialization

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from dawid_skene import run as run_em

In [2]:
from fake_data_generator import RandomConfusionMatrixChannel, DataGenerator

def generate_sample_data(nSamples, nWorkers, nClass, truePredictionRate=.8):
    """Generate a synthetic crowd-labelling data set.

    ``nWorkers`` ``RandomConfusionMatrixChannel`` objects are created and each
    is trained with ``(nClass, truePredictionRate)``. A single
    ``DataGenerator(nClass)`` then produces ``nSamples`` items, and every
    worker gives one estimate per item.

    Returns a 3-tuple ``(samples, labels, confusion_matrices)``:

    * ``samples`` -- dict mapping sample index to a dict that maps worker
      index to a one-element list holding that worker's estimate;
    * ``labels`` -- list of the ``label`` attribute of each generated item,
      in sample order;
    * ``confusion_matrices`` -- list of each worker's ``confusionMatrix``,
      in worker order.
    """
    channels, true_matrices = [], []
    for _ in range(nWorkers):
        channel = RandomConfusionMatrixChannel()
        channel.train(nClass, truePredictionRate)
        channels.append(channel)
        true_matrices.append(channel.confusionMatrix)

    generator = DataGenerator(nClass)
    responses, true_labels = {}, []
    for sample_id in range(nSamples):
        item = generator.generate()
        # Each estimate is wrapped in a list, one entry per worker.
        responses[sample_id] = {
            worker_id: [channel.estimate(item)]
            for worker_id, channel in enumerate(channels)
        }
        true_labels.append(item.label)

    return responses, true_labels, true_matrices


def frobenius_norm(matrix1, matrix2):
    """Return the Frobenius norm of ``matrix1 - matrix2``.

    Both arguments are passed through ``np.array`` first, so nested lists
    are accepted as well as arrays. The element-wise difference is handed
    to ``np.linalg.norm`` with the ``'fro'`` order.
    """
    delta = np.array(matrix1) - np.array(matrix2)
    return np.linalg.norm(delta, ord='fro')

## Test 1
Sample size

In [5]:
# Settings for the sample-size experiment.

# Number of tests
nTest = 10
# Workers true prediction rate
truePredictionRate = 0.8

# Number of channels
nWorkers = 5
# Number of classes
nClasses = 4
# Number of data (a list: one value per experiment point)
nSamples = [50, 100, 200, 500, 1000]

In [None]:
# Sample-size sweep: for every entry of nSamples, run nTest independent
# trials (fresh synthetic data + one EM run each) and record
#   * the mean label accuracy, in percent, and
#   * the mean Frobenius distance between the workers' true confusion
#     matrices and the matrices returned by run_em.
n_workers = nWorkers
n_classes = nClasses

average_accurace_list = []
average_cm_distance_list = []
for n_samples in tqdm(nSamples):
    accuracy_list = []
    cm_distance_list = []
    for _ in range(nTest):
        # Keyword arguments on purpose: generate_sample_data takes
        # (nSamples, nWorkers, nClass, ...), three integers that are easy
        # to pass in the wrong order positionally.
        samples, true_labels, confusion_matrices = generate_sample_data(
            nSamples=n_samples,
            nWorkers=n_workers,
            nClass=n_classes,
            truePredictionRate=truePredictionRate,
        )
        _, _, _, _, class_marginals, error_rates, patient_classes = run_em(samples, verbose=False)

        # NOTE(review): assumes the rows of patient_classes follow sample
        # index order and its columns follow class-label order -- confirm
        # against dawid_skene.run.
        estimated_labels = np.argmax(patient_classes, axis=1)
        accuracy = 100 * accuracy_score(true_labels, estimated_labels)
        accuracy_list.append(accuracy)

        # NOTE(review): assumes error_rates is ordered by worker like
        # confusion_matrices and uses the same row/column convention.
        cm_distance = np.mean([
            frobenius_norm(cm_true, cm_estimate)
            for cm_true, cm_estimate in zip(confusion_matrices, error_rates)
        ])
        cm_distance_list.append(cm_distance)

    # One averaged point per sample size.
    average_accurace_list.append(np.mean(accuracy_list))
    average_cm_distance_list.append(np.mean(cm_distance_list))


# Side-by-side plots: accuracy (left) and confusion-matrix distance (right).
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.suptitle('Number Of Samples')

ax1.plot(nSamples, average_accurace_list)
ax1.set_title("Accuracy")
ax1.set_xlabel("Number of samples")
ax1.set_ylabel("Accuracy")

ax2.plot(nSamples, average_cm_distance_list)
ax2.set_title("Confusion Matrix Distance")
ax2.set_xlabel("Number of samples")
ax2.set_ylabel("Distance")

plt.show()

## Test 2

True Prediction rate

In [3]:
# Settings for the true-prediction-rate experiment.

# Number of tests
nTest = 10
# Workers true prediction rates (a list: one value per experiment point)
truePredictionRates = [0.5, 0.6, 0.7, 0.8, 0.9]

# Number of channels
nWorkers = 5
# Number of classes
nClasses = 4
# Number of data
nSamples = 1000

In [None]:
# True-prediction-rate sweep: for every entry of truePredictionRates, run
# nTest independent trials (fresh synthetic data + one EM run each) and
# record
#   * the mean label accuracy, in percent, and
#   * the mean Frobenius distance between the workers' true confusion
#     matrices and the matrices returned by run_em.
n_workers = nWorkers
n_samples = nSamples
n_classes = nClasses

average_accurace_list = []
average_cm_distance_list = []
for truePredictionRate in tqdm(truePredictionRates):
    accuracy_list = []
    cm_distance_list = []
    for _ in range(nTest):
        # Keyword arguments on purpose: generate_sample_data takes
        # (nSamples, nWorkers, nClass, ...), three integers that are easy
        # to pass in the wrong order positionally.
        samples, true_labels, confusion_matrices = generate_sample_data(
            nSamples=n_samples,
            nWorkers=n_workers,
            nClass=n_classes,
            truePredictionRate=truePredictionRate,
        )
        _, _, _, _, class_marginals, error_rates, patient_classes = run_em(samples, verbose=False)

        # NOTE(review): assumes the rows of patient_classes follow sample
        # index order and its columns follow class-label order -- confirm
        # against dawid_skene.run.
        estimated_labels = np.argmax(patient_classes, axis=1)
        accuracy = 100 * accuracy_score(true_labels, estimated_labels)
        accuracy_list.append(accuracy)

        # NOTE(review): assumes error_rates is ordered by worker like
        # confusion_matrices and uses the same row/column convention.
        cm_distance = np.mean([
            frobenius_norm(cm_true, cm_estimate)
            for cm_true, cm_estimate in zip(confusion_matrices, error_rates)
        ])
        cm_distance_list.append(cm_distance)

    # One averaged point per true prediction rate.
    average_accurace_list.append(np.mean(accuracy_list))
    average_cm_distance_list.append(np.mean(cm_distance_list))


# Side-by-side plots: accuracy (left) and confusion-matrix distance (right).
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.suptitle('True Prediction Rate')

ax1.plot(truePredictionRates, average_accurace_list)
ax1.set_title("Accuracy")
ax1.set_xlabel("True prediction rate")
ax1.set_ylabel("Accuracy")

ax2.plot(truePredictionRates, average_cm_distance_list)
ax2.set_title("Confusion Matrix Distance")
ax2.set_xlabel("True prediction rate")
ax2.set_ylabel("Distance")

plt.show()