## Distribution sampling

Caelen: each param of a Dirichlet is one of the entries in a CM, each param of multinomial is sampled from Dirichlet.

Steps (don't have to do every single time tho):
1. Start docker container for whatever db u want
2. Run store_expanded_labeled_trips to set the expanded_labeled_trips variable
3. Generate mode maps for user mode labels in ur dataset somehow...
4. Run the "get data" cell in this notebook

Then run:
- sampling(CM to sample, n) 
- actual_given_sensed_CM(list of simulated CMs) 
- count_estimate(CM to test, probabilities) or distance_estimate(CM to test, probabilities, OS)

## CHANGES MADE after talking to Wen :)
1. Made two functions, where  the inputs are just the validation set and the training CM. These are called find_counts and find_distances. Use these!
2. Fixed the estimates functions to take validation set (list of predictions for each mode) instead of a CM.
3. Moved all the necessary cells to run to the top, under "RUN THESE!"
4. Moved all the examples of using the functions under "EXAMPLES"

## RUN THESE!

Sampling and probability fcns for both counts and distances

In [None]:
# sampling and probability calculation fcns which you need to do for both counts and distances.

import pandas as pd
import numpy as np
from numpy.random import default_rng

'''
Sample a CM (as a DF) and create n # of CMs based on it.
input: a CM to sample, number of times to sample
output: a list of n CMs
'''
def sampling(samplingCM, n):
    # dirichlet sampling
    v = [] # from the CM, going left to right by row
    for row in samplingCM.index:
        for col in samplingCM.columns:
            v.append(samplingCM.loc[row][col])
    a = np.ones(samplingCM.size)

    rng = default_rng()
    dirichlet_samples = rng.dirichlet((v+a), n)

    # multinomial sampling
    multinomial_samples = []
    n_trips = 0
    for col in samplingCM:
        n_trips += samplingCM[col].sum()

    for params in dirichlet_samples:
        s = rng.multinomial(n_trips, params)
        multinomial_samples.append(s)

    # put each of these into their own CM, same dimensions as samplingCM (do row by row)
    output_CMs = []
    for samples in multinomial_samples:
        samples2D = np.reshape(samples, (len(samplingCM.index), len(samplingCM.columns)))
        outputCM = pd.DataFrame(samples2D, columns = samplingCM.columns, index = samplingCM.index)
        output_CMs.append(outputCM)
    return output_CMs

'''
Finding P(actual|sensed) by dividing each cell by column sum, for a list of DFs
input: list of DFs of values
output: list of DF of P(actual|sensed)
'''
def actual_given_sensed_CM(list_of_value_CMs):
    actual_given_sensed = []
    for cm in list_of_value_CMs:
        probs = cm.div(cm.sum(axis=0), axis=1)
        actual_given_sensed.append(probs)
    return actual_given_sensed


Distance estimation function

In [56]:
# distance estimation and variance calculation fcn. very similar to the count fcn, not sure whether to combine them or not.

import pandas as pd
import numpy as np

'''
Finding estimated values and variances FOR DISTANCES.
input: 
    predicted_distances: a dictionary {"mode1": # of trips predicted in mode1...}
    actual_given_sensed: list of DFs, which have probabilities in each cell
    os: which OS we're using, either "ios" or "android"
output: a single estimated distance for each mode, and a single estimated variance for each mode. 
    (prints some other interesting stuff out too)
'''
def distance_estimate(predicted_distances, actual_given_sensed, os):
    os_unit_info = pd.read_csv(r'unit_distance_MCS.csv')

    # adjusting using os unit info
    adjusted_predicted_distances = {}
    for mode in predicted_distances.index:
        adjusted_predicted_distances[mode] = predicted_distances[mode] * os_unit_info[os][0]

    # find expected counts based on each actual_given_sensed CM and predicted counts
    expected_counts = [] #list of dfs (one df per cm)
    for df in actual_given_sensed:
        expected_value = df.mul(adjusted_predicted_distances, axis = 'columns') # multiply row by row
        expected_value = expected_value.sum(axis='columns') # sum of each row
        expected_counts.append(expected_value)
    
    # average expected values: concat dfs and find mean
    all_expected = pd.concat(expected_counts, axis='columns')
    average_ev = all_expected.mean(axis='columns')

    #VARIANCES
    df_list = []
    for df in actual_given_sensed:
        df_list.append(df.to_numpy())

    # variance of each cell in prob CMs    
    cell_variance = np.square(pd.DataFrame(np.dstack((df_list)).std(axis=2), columns = actual_given_sensed[0].columns, index = actual_given_sensed[0].index)) 
    
    # multiply each row of cell variances by the row of L_i^2
    adjusted_predicted_distances = pd.DataFrame([adjusted_predicted_distances])
    n_squared = np.square(adjusted_predicted_distances) #row of L^2s
    variance1 = cell_variance.mul(n_squared.values, axis = 'columns') # E(L_mode)^2 V(p)
    
    # extra variance term since distance has its own uncertainty
    avg_actual_given_sensed = pd.DataFrame(np.dstack((df_list)).mean(axis=2), columns = actual_given_sensed[0].columns, index = actual_given_sensed[0].index)
    dist_variance = np.square(pd.Series(predicted_distances)) * os_unit_info[os][1] # row of L_i^2s
    variance2 = dist_variance.mul(np.square(avg_actual_given_sensed)) # E(p)^2*V(L_mode)]

    # sum up rows
    variance = variance1.add(variance2)
    variance = variance.sum(axis='columns')

    # diff = pd.Series(average_ev).T.subtract(pd.Series(predicted_distances)).T #expected value - actual value

    # count_results = pd.DataFrame({"EV": average_ev, 
    #                             "actual": pd.Series(predicted_distances), 
    #                             "diff":diff,
    #                             "variance":variance,
    #                             "stdev from actual":diff.div(np.sqrt(variance), axis='rows')}) #diff/sqrt(variance)
    # print(count_results)
    # print("\nPredicted:\n", predicted_distances.T)
    return (average_ev, variance)


Count estimation function

In [42]:
# count estimate and variance calculation function

import pandas as pd
import numpy as np

'''
Finding estimated values and variances FOR COUNTS.
input:
    predicted_counts: a dictionary {"mode1": # of trips predicted in mode1...}
    actual_given_sensed: list of DFs, which have probabilities in each cell
output: a single estimated count for each mode, and a single estimated variance for each mode count. 
    (prints some other interesting stuff out too)
'''
def count_estimate(predicted_counts, actual_given_sensed):
    # find expected counts based on each actual_given_sensed CM and predicted counts
    expected_counts = [] #list of dfs (one df per cm)
    for df in actual_given_sensed:
        expected_value = df.mul(pd.Series(predicted_counts), axis = 'columns') # multiply row by row
        expected_value = expected_value.sum(axis='columns') # sum of each row
        expected_counts.append(expected_value)

    # average expected values: concat dfs and find mean
    all_expected = pd.concat(expected_counts, axis='columns')
    average_ev = all_expected.mean(axis='columns')

    #VARIANCES
    # variance of each cell
    df_list = []
    for df in actual_given_sensed:
        df_list.append(df.to_numpy())
    cell_variance = np.square(pd.DataFrame(np.dstack((df_list)).std(axis=2), columns = actual_given_sensed[0].columns, index = actual_given_sensed[0].index)) 

    # multiply each row of cell variances by  the row of n_i^2
    predicted_counts = pd.DataFrame([predicted_counts])
    n_squared = np.square(predicted_counts) #row of n^2s
    n2_times_var = cell_variance.mul(n_squared.values, axis = 'columns')

    # sum up rows
    variance = n2_times_var.sum(axis='columns')

    # diff = pd.Series(average_ev).T.subtract(predicted_counts).T #expected value - actual value

    # count_results = pd.DataFrame({"EV": average_ev, 
    #                             "actual": (predicted_counts),
                                # "diff":diff,
                                # "stdev from actual":diff.div(np.sqrt(variance), axis='rows')}) #diff/sqrt(variance)
    # print(count_results)
    # print("\nPredicted:\n", predicted_counts.T)

    return (average_ev, variance)

In [33]:
# find counts
def find_counts(trainingCM, data):
    countSamples = sampling(trainingCM, 2000) # you can change 2000 to anything!
    countProbs = actual_given_sensed_CM(countSamples)
    countEstimates = count_estimate(data, countProbs)
    return countEstimates

In [44]:
# find distances
def find_distances(trainingCM, data, os):
    distanceSamples = sampling(trainingCM, 2000) # you can change 2000 to anything!
    distanceProbs = actual_given_sensed_CM(distanceSamples)
    distanceEstimates = distance_estimate(data, distanceProbs, os)
    return distanceEstimates

## EXAMPLES

CM-creation funtions

In [None]:
# functions for taking raw CBC CO trip data to cleaned CMs
# add the fcns here to confusion_matrix_handling.py!!!

import pandas as pd
import numpy as np
import helper_functions as helper
import confusion_matrix_handling as cm_handling

'''
Loop through a DF of trips and tally up label pairs. Pairs are made of 'primary_mode' and 'mode_confirm' labels.
    (primary_mode column comes from get_primary_modes fcn in helper_functions.py)
    Will match either by counts or by distance.
Input: DF of raw trip data (data with a buncha extra info, one row per trip)
Output: CM of pairs and their counts or distances (as a DF).
'''
def pair_matching(df, by_distances=False):
    # make columns of sensed labels (primary_mode)
    columns = {} # looks like {sensed_label: {user_label: count, user_label: count...}}
    value = 1 # default is by counts, where we add one each time
    for index, row in df.iterrows():
        if by_distances: # if set to true, instead of adding one for each pair, we add distances
            value = row['distance']
        if row['primary_mode'] not in columns: # if sensed label not added as column yet
            columns[row['primary_mode']] = {} # make column
            columns[row['primary_mode']][row['mode_confirm']] = value
        elif row['mode_confirm'] not in columns[row['primary_mode']]: #if user label not in sensed label column
            columns[row['primary_mode']][row['mode_confirm']] = value
        else: # else [sensed label, user label] pair already there
            columns[row['primary_mode']][row['mode_confirm']] += value
        
    pairs = pd.DataFrame(columns)
    return pairs

'''
Consolidate user labels (row labels, aka DF index labels) of a user_label, sensed_label CM that are basically the same by using some mode map dictionary.
    If rows have the same label after mapping, they're combined by addition.
    If rows have no user label, they're dropped.
    If remove_unknown is true, unknown labels are also dropped.
Input: DF, mapping dictionary (with the form {user label: standardized label})
Output: DF with new index labels
'''
def map_labels(df, map, remove_unknown=True):
    # do mapping
    renamed_pairs = df.rename(index=map)
    consolidated_pairs = renamed_pairs.groupby(level=0).aggregate(['sum'])

    # remove the annoying "sum" part of the label that appears after aggregate
    consolidated_pairs.columns=consolidated_pairs.columns.droplevel(1)
    
    # remove rows where user label is not in map if set
    if remove_unknown:
        for index, data in consolidated_pairs.iterrows():
            if index not in map:
                consolidated_pairs = consolidated_pairs.drop(labels = [index], axis = 0)
    return consolidated_pairs

'''
Add missing rows or columns to a DF according to a list of other DFs. New rows/cols filled with all zeroes.
Input: list of model DFs, DF to modify
Output: DF with filled in rows and columns
'''
def fill_in(inputCMs, badCM):
    wanted_columns = []
    wanted_rows = []
    for inputCM in inputCMs:
        for col in inputCM.columns:
            if col not in wanted_columns:
                wanted_columns.append(col)
        for row in inputCM.index:
            if row not in wanted_rows:
                wanted_rows.append(row)
                
    for col in wanted_columns:
        if col not in badCM.columns:
            badCM[col] = np.zeros(len(badCM.index))
    for row in wanted_rows:
        if row not in badCM.index:
            badCM.loc[row] = np.zeros(len(badCM.columns))
    return badCM

Putting CanBikeCO trip data into CMs

In [45]:
# get data
# make df for each half of data (use one half for distribution sampling, the other half for the estimating part)
'''
Split data and get two nice DFs
'''
df_EI = pd.read_csv(r'Public_Dashboard/auxiliary_files/energy_intensity.csv')
energy_dict = cm_handling.get_energy_dict(df_EI, units='MWH')
from confusion_matrix_handling import MODE_MAPPING_DICT

# get result from running store_expanded_labeled_trips.ipynb
%store -r expanded_labeled_trips 
expanded_labeled_trips = helper.get_primary_modes(expanded_labeled_trips[expanded_labeled_trips.section_modes.notna()], energy_dict, MODE_MAPPING_DICT)

# split in half.
from sklearn.utils import shuffle
shuffled_trips = shuffle(expanded_labeled_trips)
shuffled_trips = helper.drop_unwanted_trips(shuffled_trips, drop_not_a_trip=True)
samplingCM = shuffled_trips.iloc[0:int(len(expanded_labeled_trips.index)/2)]
testingCM = shuffled_trips.iloc[int(len(expanded_labeled_trips.index)/2):]

split_by_os = True 

if split_by_os: # for distances
    samplingCMiOS = samplingCM.loc[samplingCM['os'] == "ios"]
    samplingCMandroid = samplingCM.loc[samplingCM['os'] == "android"]
    testingCMiOS = testingCM.loc[testingCM['os'] == "ios"]
    testingCMandroid = testingCM.loc[testingCM['os'] == "android"]

    samplingCMiOS = pair_matching(samplingCMiOS, by_distances=True)
    samplingCMandroid = pair_matching(samplingCMandroid, by_distances=True)
    testingCMiOS = pair_matching(testingCMiOS, by_distances=True)
    testingCMandroid = pair_matching(testingCMandroid, by_distances=True)

    samplingCMiOS = map_labels(samplingCMiOS, MODE_MAPPING_DICT)
    samplingCMandroid = map_labels(samplingCMandroid, MODE_MAPPING_DICT)
    testingCMiOS = map_labels(testingCMiOS, MODE_MAPPING_DICT)
    testingCMandroid = map_labels(testingCMandroid, MODE_MAPPING_DICT)

    # convert from meters to miles
    METERS_TO_MILES = 0.000621371 # 1 meter = 0.000621371 miles
    samplingCMiOS = samplingCMiOS * METERS_TO_MILES
    samplingCMandroid = samplingCMandroid * METERS_TO_MILES
    testingCMiOS = testingCMiOS * METERS_TO_MILES
    testingCMandroid = testingCMandroid * METERS_TO_MILES

    testingCMiOS = fill_in([samplingCMiOS, samplingCMandroid, testingCMandroid], testingCMiOS)
    testingCMandroid = fill_in([testingCMiOS], testingCMandroid)
    samplingCMiOS = fill_in([testingCMiOS], samplingCMiOS)
    samplingCMandroid = fill_in([testingCMiOS], samplingCMandroid)
    
    samplingCMiOS = samplingCMiOS.sort_index()
    samplingCMandroid = samplingCMandroid.sort_index()
    testingCMiOS = testingCMiOS.sort_index()
    testingCMandroid = testingCMandroid.sort_index()

    print(samplingCMiOS)
    print(samplingCMandroid)
    print(testingCMiOS)
    print(testingCMandroid)

elif not split_by_os: # for counts
    # clean up
    samplingCM = pair_matching(samplingCM, by_distances=False)
    samplingCM = map_labels(samplingCM, MODE_MAPPING_DICT)
    
    testingCM = pair_matching(testingCM, by_distances=False)
    testingCM = map_labels(testingCM, MODE_MAPPING_DICT)

    samplingCM = fill_in([testingCM], samplingCM)
    testingCM = fill_in([samplingCM], testingCM)

    # also removing the no_sensed and air_or_hsr sensed labels since they don't have a corresp match in the standardized user labels
    samplingCM = samplingCM.drop(labels=["air_or_hsr", "no_sensed"], axis=1)
    testingCM = testingCM.drop(labels=["air_or_hsr", "no_sensed"], axis=1)

    # making rows and columns the same order
    testingCM = testingCM.loc[:, samplingCM.columns]
    testingCM = testingCM.sort_index()
    samplingCM = samplingCM.sort_index()

    print(samplingCM)
    print(testingCM)

# this takes around 20s for the entire all_ceo dataset

Dropped 93 trips with no sensed sections.
Dropping user labeled AIR trips and trips with no OS.
Also dropping trips labeled as not a trip and trips with mode_confirm of nan.
                               car      walking        bus    bicycling  \
Bikeshare                14.080071    28.123487   0.000000    90.813872   
Bus                    2119.844174   844.631202  50.693043   206.844883   
E-car, drove alone        0.000000     4.050190   0.000000     0.000000   
Free Shuttle             53.033213     8.721774   0.000000     1.891534   
Gas Car, drove alone  26176.607588  2836.043327  25.776840   638.854047   
Gas Car, with others  47759.367236  5406.811993  38.180835  1379.209095   
Pilot ebike            9206.409708  3466.478592   3.856213  7384.268952   
Regular Bike            704.403608   633.217658   8.981032  1292.084770   
Scooter share             0.000000     4.659207   0.000000     0.000000   
Skate board               0.000000     0.000000   0.000000     0.000000   
T

In [55]:
predicted_distances = testingCMandroid.sum(axis=0)
find_distances(samplingCMandroid, predicted_distances, "android")

walking        8516.187595
car           80802.792877
bicycling      7614.808000
bus             374.239010
no_sensed      4920.651546
train           810.268706
air_or_hsr     3692.395252
subway          175.207894
dtype: float64
walking
car
bicycling
bus
no_sensed
train
air_or_hsr
subway


(Bikeshare                  98.355018
 Bus                      4941.308390
 E-car, drove alone          7.711472
 Free Shuttle              129.211763
 Gas Car, drove alone    32785.371016
 Gas Car, with others    57563.947695
 Pilot ebike             12550.420043
 Regular Bike             1707.715750
 Scooter share             109.341396
 Skate board                35.536258
 Taxi/Uber/Lyft           1069.848613
 Train                    1733.393446
 Walk                     5511.605282
 dtype: float64,
 Bikeshare               7.780731e+03
 Bus                     3.977461e+05
 E-car, drove alone      1.905686e+03
 Free Shuttle            4.504362e+03
 Gas Car, drove alone    2.261677e+07
 Gas Car, with others    6.664904e+07
 Pilot ebike             1.744036e+06
 Regular Bike            7.870109e+04
 Scooter share           1.067583e+04
 Skate board             2.797220e+03
 Taxi/Uber/Lyft          3.840188e+04
 Train                   4.257177e+04
 Walk                    6.495045

In [43]:
predicted_counts = testingCM.sum(axis=0)
find_counts(samplingCM, predicted_counts)

(Bikeshare                 65.894072
 Bus                      695.034071
 E-car, drove alone         4.637843
 Free Shuttle              53.544810
 Gas Car, drove alone    9357.277206
 Gas Car, with others    9677.533056
 Pilot ebike             9684.089111
 Regular Bike            1335.176330
 Scooter share             52.363406
 Skate board               24.997295
 Taxi/Uber/Lyft           204.438859
 Train                     84.497270
 Walk                    6840.516670
 dtype: float64,
 Bikeshare                 117.837569
 Bus                      1214.400858
 E-car, drove alone          7.817907
 Free Shuttle               96.335416
 Gas Car, drove alone    10976.570335
 Gas Car, with others    11257.610107
 Pilot ebike             10302.969667
 Regular Bike             2148.062369
 Scooter share              93.664400
 Skate board                43.598504
 Taxi/Uber/Lyft            377.138834
 Train                     121.362239
 Walk                     7239.889105
 dtype: 

Finding distances for CanBikeCo data

In [None]:
# finding distances
'''
iOS
'''
print("ios:\n")
iosCMs = sampling(samplingCMiOS, n = 2000)
iosProbs = actual_given_sensed_CM(iosCMs)
iosEstimates = distance_estimate(testingCMiOS, iosProbs, "ios")

'''
android
'''
print("\nandroid:\n")
androidCMs = sampling(samplingCMandroid, n = 2000)
androidProbs = actual_given_sensed_CM(androidCMs)
androidEstimates = distance_estimate(testingCMandroid, androidProbs, "android")


In [None]:
# combine results for ios and android distances just by adding
total_distance_estimates = androidEstimates[0] + iosEstimates[0]
total_variances = androidEstimates[1] + iosEstimates[1]

actual = (testingCMandroid.add(testingCMiOS)).sum(axis = "columns").T
sd_from_actual = (total_distance_estimates.subtract(actual)).div(np.sqrt(total_variances))

print(pd.DataFrame({"estimates":total_distance_estimates, "actual":actual, "sd from actual": sd_from_actual}))

In [None]:
# combine results for ios and android distances by a weighted sum
total_android_distance = testingCMandroid.sum().sum()
total_ios_distance = testingCMiOS.sum().sum()
ios_weight = total_ios_distance / (total_ios_distance + total_android_distance)
android_weight = 1 - ios_weight
print("ios weight: ", ios_weight, " android weight: ", android_weight, "\n")

total_weighted_estimates = (iosEstimates[0] * ios_weight).add(androidEstimates[0] * android_weight)
total_weighted_variance = (iosEstimates[1] * ios_weight).add(androidEstimates[1] * android_weight)
sd_from_actual = (total_weighted_estimates.subtract(actual)).div(np.sqrt(total_weighted_variance))

print(pd.DataFrame({"weighted estimate":total_weighted_estimates, "actual":actual, "sd from actual": sd_from_actual}))

Finding mode counts for CanBikeCo data

In [None]:
# finding counts
countSamples = sampling(samplingCM, 2000)
countProbs = actual_given_sensed_CM(countSamples)
print(countProbs)
countEstimates = count_estimate(testingCM, countProbs)

Finding distance estimates for canbikeco data after training on mobility net data

In [None]:
def update_probabilities(cm, prior_mode_probs):
    p_predicted_given_actual = cm.div(cm.sum(axis=1), axis='rows')
    likelihood_times_priors = p_predicted_given_actual.multiply(pd.Series(prior_mode_probs), axis='rows')
    normalizing_constants = likelihood_times_priors.sum(axis='rows')
    prob_actual_given_predicted_df = likelihood_times_priors.divide(normalizing_constants, axis='columns').copy()

    return prob_actual_given_predicted_df

In [None]:
import pandas as pd
import confusion_matrix_handling as cm_handling
from confusion_matrix_handling import MODE_MAPPING_DICT
import helper_functions as helper
df_EI = pd.read_csv(r'Public_Dashboard/auxiliary_files/energy_intensity.csv')
energy_dict = cm_handling.get_energy_dict(df_EI, units='MWH')

pd.set_option("display.precision", 8)
# csv files were generated in classification_analysis.ipynb from the MobilityNet repo
training_android = pd.read_csv("myCoolCMs/mobilitynet_android.csv", index_col=0).T
training_ios = pd.read_csv("myCoolCMs/mobilitynet_ios.csv", index_col=0).T

# remove all no_gt and no_sensed
training_android = cm_handling.drop_rows_and_columns(training_android, row_list=["NO_GT_START", "NO_GT_MIDDLE", "NO_GT_END"], column_list=["NO_SENSED_START", "NO_SENSED_MIDDLE", "NO_SENSED_END"])
training_ios =cm_handling.drop_rows_and_columns(training_ios, row_list=["NO_GT_START", "NO_GT_MIDDLE", "NO_GT_END"], column_list=["NO_SENSED_START", "NO_SENSED_MIDDLE", "NO_SENSED_END", "UNKNOWN"])

# get canbikeco data by OS
%store -r expanded_labeled_trips 
expanded_labeled_trips = helper.get_primary_modes(expanded_labeled_trips[expanded_labeled_trips.section_modes.notna()], energy_dict, MODE_MAPPING_DICT)

from sklearn.utils import shuffle
shuffled_trips = shuffle(expanded_labeled_trips)
shuffled_trips = helper.drop_unwanted_trips(shuffled_trips, drop_not_a_trip=True)

testing_android = shuffled_trips.loc[shuffled_trips['os'] == "android"]
testing_ios = shuffled_trips.loc[shuffled_trips['os'] == "ios"]

# make canbikeco trips into CMs
testing_android = pair_matching(testing_android, by_distances=True)
testing_ios = pair_matching(testing_ios, by_distances=True)

from confusion_matrix_handling import MODE_MAPPING_DICT

testing_android = map_labels(testing_android, MODE_MAPPING_DICT)
testing_ios = map_labels(testing_ios, MODE_MAPPING_DICT)

all_ceo_to_MN_label_map = {
    "Regular Bike": "Bicycling",
    "Walk":"Walking"
}

testing_android = map_labels(testing_android, all_ceo_to_MN_label_map, remove_unknown=False)
testing_ios = map_labels(testing_ios, all_ceo_to_MN_label_map, remove_unknown=False)
    
print(testing_android)
print("******************")
print(training_android)

# MN has labels in all caps....
testing_android.columns = map(lambda x: str(x).upper(), testing_android.columns)
testing_android.index = map(lambda x: str(x).upper(), testing_android.index)
testing_ios.columns = map(lambda x: str(x).upper(), testing_ios.columns)
testing_ios.index = map(lambda x: str(x).upper(), testing_ios.index)

testing_android = cm_handling.drop_rows_and_columns(testing_android, row_list=[], column_list=["NO_SENSED"])
testing_ios = cm_handling.drop_rows_and_columns(testing_ios, row_list=[], column_list=["NO_SENSED"])

# make sure all CMs have the same columns and rows
training_android = fill_in([training_ios, testing_android, testing_ios], training_android)
training_ios = fill_in([training_android], training_ios)
testing_android = fill_in([training_android], testing_android)
testing_ios = fill_in([training_android], testing_ios)


# put all CM indexes into the same order
training_ios = training_ios.sort_index()
training_android = training_android.sort_index()
testing_ios = testing_ios.sort_index()
testing_android = testing_android.sort_index()


In [None]:
'''
Do the thing
'''
# find priors of testing set
total_testing = testing_android.add(testing_ios)
priors = total_testing.sum(axis=1) / total_testing.sum().sum()
print(priors)
print("Android results: \n")
android_samples = sampling(training_android, 2000)
android_probabilities = [] 
for cm in android_samples:
    android_probabilities.append(update_probabilities(cm, priors))
android_distance_estimates = distance_estimate(testing_android, android_probabilities, 'android')

print("\niOS results: \n")
ios_samples = sampling(training_ios, 2000)
ios_probabilities = []
for cm in ios_samples:
    ios_probabilities.append(update_probabilities(cm, priors))
ios_distance_estimates = distance_estimate(testing_ios, ios_probabilities, 'ios')

# combine results for ios and android distances by addition
total_distance = android_distance_estimates[0] + ios_distance_estimates[0]
total_variances = android_distance_estimates[1] + ios_distance_estimates[1]

actual = (testing_android.add(testing_ios)).sum(axis = "columns").T
sd_from_actual = (total_distance.subtract(actual)).div(np.sqrt(total_variances))

print("\nCombined results: \n", pd.DataFrame({"estimates":total_distance, "actual":actual, "sd from actual": sd_from_actual}))

In [None]:
# cell entries / row sums of MN, CBCO for michael

df = pd.DataFrame({"a": [1, 2, 3], "b":[4, 5, 6]})
df.div(df.sum(axis=1), axis=0)

# MN distances data
android = pd.read_csv("myCoolCMs/mobilitynet_android.csv", index_col=0).T
ios = pd.read_csv("myCoolCMs/mobilitynet_ios.csv", index_col=0).T
mn_total = android.add(ios)
mn_total = cm_handling.drop_rows_and_columns(mn_total, row_list=[], column_list=["AIR_OR_HSR", "LIGHT_RAIL", "UNKNOWN"])
# mn_total = mn_total.div(mn_total.sum(axis=1), axis=0)
# mn_total.to_csv("MobilityNet_distances_predicted_given_gt.csv")

# all_ceo count data
# cbco_counts = samplingCM.add(testingCM)
# cbco_counts = cbco_counts.div(cbco_counts.sum(axis=1), axis=0)
# cbco_counts.to_csv("All_CEO_counts_predicted_given_gt.csv")

# all_ceo distance data
cbco_distances = samplingCMandroid.add(testingCMandroid).add(samplingCMiOS).add(testingCMiOS)
# cbco_distances = cbco_distances.div(cbco_distances.sum(axis=1), axis=0)
# print(cbco_distances)
# cbco_distances.to_csv("All_CEO_distances_predicted_given_gt.csv")