### New Method ###
Confusion matrix (CM) with more than two modes > many simulated CMs > estimate the probability of each prediction > estimate the true counts

In [None]:
import pandas as pd
import numpy as np
import random

In [None]:
# Sampling and probability-calculation functions, which are needed for both counts and distances.

from numpy.random import default_rng

def sampling(samplingCM, n, rng=None):
    '''
    Sample a CM (as a DF) and create n # of CMs based on it.

    The cell counts of samplingCM, read row by row from left to right, plus
    one are used as the parameters of a Dirichlet distribution. Each of the n
    Dirichlet draws is then used as the cell probabilities of one multinomial
    draw over the total number of trips in samplingCM.

    input:
        samplingCM: the CM to sample (DF of counts, assumed non-negative)
        n: number of times to sample
        rng (optional): numpy.random.Generator to draw from; a fresh unseeded
            one is created when omitted. Pass a seeded one for reproducibility.
    output: a list of n CMs with the same index and columns as samplingCM
    '''
    if rng is None:
        rng = default_rng()

    # Flatten positionally (row-major), so the order is "left to right by row"
    # and reshaping back below restores every cell to its original position.
    cell_counts = samplingCM.to_numpy(dtype=float).ravel()

    # multinomial() needs an integer number of trials, but a CM holding float
    # counts sums to a float; round rather than truncate to absorb float error.
    n_trips = int(np.rint(cell_counts.sum()))

    # dirichlet sampling: one vector of cell probabilities per simulated CM
    dirichlet_samples = rng.dirichlet(cell_counts + 1.0, n)

    # multinomial sampling, reshaped back to the dimensions of samplingCM
    return [
        pd.DataFrame(
            rng.multinomial(n_trips, cell_probs).reshape(samplingCM.shape),
            columns=samplingCM.columns,
            index=samplingCM.index,
        )
        for cell_probs in dirichlet_samples
    ]

In [None]:
# Scratch check of row normalisation: dividing a DF by its row sums along
# axis=0 makes every row add up to 1.
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
print(df)
row_totals = df.sum(axis=1)
print(row_totals)
row_normalized = df.div(row_totals, axis=0)
print(row_normalized)

In [None]:
def latent_class_method(cm, NMC, prior_mode_probs, sample = False):
    '''
    Simulate NMC confusion matrices from cm and, for each one, estimate the
    true number of trips per mode from the simulated predictions.

    Input:
        cm: A CM with ground truth labels as rows and sensed labels as columns (DF).
            It has to be square and invertible once row-normalized.
        NMC: number of times to simulate (int, at least 1)
        prior_mode_probs: assumed mode distributions (dictionary). Not used by
            the estimate below; kept so existing callers keep working.
        sample (optional): whether to simulate CMs by sampling w/ dirichlet (bool).
            When False, predictions are drawn per ground truth mode from the
            row-normalized cm using numpy's global random state.
    Output (printed, and returned as a DF with one row per mode):
        "Actual counts": actual number of each mode (row sums of cm)
        "Predicted counts": predicted number of each mode (by our placeholder
            predictor) - average over all simulations
        "True probs": p(actual = mode) of each mode - avg over all simulations
        "Estimated counts": estimated number of each mode - avg over all simulations
    Raises:
        ValueError: if NMC is smaller than 1
    '''
    if NMC < 1:
        raise ValueError("NMC must be at least 1")

    # P(sensed | actual): every ground truth row scaled to sum to 1.
    p_sensed_given_actual = cm.div(cm.sum(axis=1), axis=0)
    confusion = p_sensed_given_actual.to_numpy(dtype=float)

    if sample:
        simulated_CMs = sampling(cm, NMC)
    else:
        # Round rather than truncate so float counts such as 199.999... keep
        # their intended number of trips.
        n_trips_per_mode = np.rint(cm.sum(axis=1).to_numpy(dtype=float)).astype(int)
        simulated_CMs = []
        for _ in range(NMC):
            # One multinomial draw per ground truth mode gives the number of
            # trips sensed as each mode. Writing the counts straight into
            # cm's layout keeps rows and columns in the same order as
            # `confusion`, which the positional solve below relies on.
            simulated_counts = np.vstack([
                np.random.multinomial(n_mode_trips, sensed_probs)
                for n_mode_trips, sensed_probs in zip(n_trips_per_mode, confusion)
            ])
            simulated_CMs.append(
                pd.DataFrame(simulated_counts, index=cm.index, columns=cm.columns)
            )

    #stuff that we want to keep track of thru simulations
    all_estimated_counts = []
    all_predicted_counts = []
    all_true_probs = []

    for simulated_cm in simulated_CMs:
        total_trips = simulated_cm.to_numpy().sum()

        #find probability of predicting each mode: column sum / total
        predicted_counts = simulated_cm.sum(axis=0)
        prediction_probs = predicted_counts / total_trips

        # P(predicted = j) = sum_i P(actual = i) * P(sensed = j | actual = i),
        # i.e. prediction_probs = confusion.T @ true_probs, so the true mode
        # probabilities come from solving that linear system.
        true_probs = pd.Series(
            np.linalg.solve(confusion.T, prediction_probs.to_numpy(dtype=float)),
            index=cm.index,
        )

        all_estimated_counts.append(true_probs * total_trips)
        all_predicted_counts.append(predicted_counts)
        all_true_probs.append(true_probs)

    predicted_mean = pd.concat(all_predicted_counts, axis='columns').mean(axis='columns')
    probability_mean = pd.concat(all_true_probs, axis='columns').mean(axis='columns')
    estimated_mean = pd.concat(all_estimated_counts, axis='columns').mean(axis='columns')

    summary = pd.DataFrame({"Actual counts":cm.sum(axis=1), "Predicted counts": predicted_mean, "True probs":probability_mean, "Estimated counts":estimated_mean})
    print(summary)
    return summary

# Total number of trips to run the experiment with, one run per entry.
n_trips = [1000, 5000, 10000]

# Share of trips per ground truth mode: scales the CM rows below and is also
# handed to latent_class_method as the prior.
priors = {"ebike": 0.2, "car": 0.7, "train": 0.1}

for n in n_trips:
    # One dict per sensed label (a CM column), keyed by ground truth label
    # (the CM rows). These are rates; they are turned into counts below.
    sensed_ebike = dict(ebike=.6, car=.2, train=.1)
    sensed_car = dict(ebike=.3, car=.65, train=.05)
    sensed_train = dict(ebike=.1, car=.15, train=.85)

    input_cm = pd.DataFrame(
        {"ebike": sensed_ebike, "car": sensed_car, "train": sensed_train}
    )

    # Scale each ground truth row to that mode's share of the n trips.
    for col in input_cm.columns:
        scaled_row = input_cm.loc[col] * n * priors[col]
        input_cm.loc[col] = scaled_row

    print("\n", n, " total trips")
    print(input_cm)
    latent_class_method(input_cm, NMC=2000, prior_mode_probs=priors)