# Explore clustering algorithm confidence discounting
The proposed confidence discounting algorithm for the clustering algorithm (https://github.com/e-mission/e-mission-docs/issues/663#issuecomment-898994131) has several parameters that may be tuned. This notebook explores what they should be set to and whether the algorithm works well overall.

For now, I will not do the full testing, which would consist of a train/test split, running the clustering on only the training data, then comparing the calculated p-values to how well each cluster actually does at predicting test trips.

First, let's get some users. We select only users who have at least 30 confirmed trips.

In [None]:
# Copypasted from Explore stage before vs after; TODO refactor into a module
import emission.storage.timeseries.abstract_timeseries as esta

# UUID is not brought into scope by any earlier import in this notebook, so
# import it here; otherwise the comprehension below raises NameError on a
# fresh kernel.
from uuid import UUID

# Users to leave out of the analysis, entered interactively.
# str.split() with no argument splits on any run of whitespace and never
# yields empty strings, so repeated spaces, tabs or a blank answer are fine.
EXCLUDE_UUIDS = [UUID(s) for s in input("Enter UUIDs to exclude, separated by spaces: ").split()]
# Minimum number of confirmed trips a user must have to be kept.
REQUIRED_TRIPS_TOTAL = 30

def filter_update(new, old, reason):
    """Print how many users a filtering stage removed and how many remain.

    new: the user collection after the filter was applied
    old: the user collection before the filter was applied
    reason: short description of why users were dropped
    """
    remaining = len(new)
    removed = len(old) - remaining
    print(f"Excluded {removed} users, left with {remaining}: {reason}")

# Start from every user known to the timeseries store.
all_users = esta.TimeSeries.get_uuid_list()
print(f"Working with {len(all_users)} initial users")

# Stage 0: drop the users we were explicitly asked to exclude.
filter0_users = [user for user in all_users if user not in EXCLUDE_UUIDS]
filter_update(filter0_users, all_users, "presence on exclusion list")

# Stage 1: keep only users with enough confirmed trips. The confirmed-trip
# dataframe of every stage-0 user is cached in confirmed_trip_df_map,
# whether or not that user passes the threshold.
confirmed_trip_df_map = {}
filter1_users = []
for u in filter0_users:
    ct_df = esta.TimeSeries.get_time_series(u).get_data_df("analysis/confirmed_trip")
    confirmed_trip_df_map[u] = ct_df
    n_confirmed = ct_df.shape[0]
    if n_confirmed >= REQUIRED_TRIPS_TOTAL:
        filter1_users.append(u)
filter_update(filter1_users, filter0_users, "not enough total trips")

# The output of the last stage is what the rest of the notebook works with.
filtered_users = filter1_users

Now let's get all the cleaned trips for those users and figure out what the naïve predictions would be. Note that this requires the model files to be copied into the working directory.

In [None]:
import emission.storage.decorations.analysis_timeseries_queries as esda
import emission.analysis.classification.inference.labels.inferrers as eacili
import arrow
import emission.storage.timeseries.timequery as estt
import uuid
import emission.analysis.modelling.tour_model.data_preprocessing as preprocess
import emission.analysis.modelling.tour_model_first_only.load_predict as lp

# Accumulators that later cells read.
cleaned_trips = []
findings = []
naive_counts = {}       # confidence value -> number of trips with that value
naive_ps = {0.0, 1.0}   # distinct confidence values seen; seeded with 0.0 and 1.0

# Collect the cleaned trips of every selected user, from 2010-01-01 until now.
for u in filtered_users:
    tq = estt.TimeQuery("data.end_ts", arrow.get("2010-01-01").timestamp, arrow.now().timestamp)
    cleaned_trips.extend(esda.get_entries(esda.CLEANED_TRIP_KEY, u, time_query=tq))

# Run the naive predictor on each trip and record the confidence ("p") of
# its first prediction, or 0 when it returns no prediction at all.
for trip in cleaned_trips:
    prediction = eacili.predict_two_stage_bin_cluster(trip)
    if len(prediction) > 0:
        top_p = prediction[0]["p"]
    else:
        top_p = 0
    finding = {"trip": trip, "naive_prediction": prediction, "naive_mli_p": top_p}
    naive_ps.add(top_p)
    naive_counts[top_p] = naive_counts.get(top_p, 0) + 1
    findings.append(finding)

# Sanity check: both numbers should match.
print(len(cleaned_trips))
print(len(findings))

Now let's compute discounted predictions and test out our graphing functionality.

In [None]:
# Results of the most recent discounting run; rebound on every call to
# compute_discounting_full.
discounted_ps = {0.0, 1.0}
discounted_counts = {}


def compute_discounting_full(max_confidence=None, first_confidence=None, confidence_multiplier=None):
    """Run the confidence-discounting predictor over every entry of ``findings``.

    Each finding gains a "discounted_prediction" entry (the predictor's raw
    result) and a "discounted_mli_p" entry (the "p" of its first prediction,
    or 0 when there is no prediction). The module-level ``discounted_ps`` and
    ``discounted_counts`` are reset and refilled as a side effect.

    The three parameters are forwarded unchanged to
    ``eacili.predict_cluster_confidence_discounting``.

    Returns the tuple ``(discounted_ps, discounted_counts)``: the set of
    distinct confidence values seen (always containing 0.0 and 1.0) and a
    dict mapping each seen confidence value to its number of trips.
    """
    global discounted_ps, discounted_counts
    discounted_ps, discounted_counts = {0.0, 1.0}, {}
    for finding in findings:
        prediction = eacili.predict_cluster_confidence_discounting(
            finding["trip"], max_confidence, first_confidence, confidence_multiplier)
        finding["discounted_prediction"] = prediction
        if len(prediction) > 0:
            top_p = prediction[0]["p"]
        else:
            top_p = 0
        finding["discounted_mli_p"] = top_p
        discounted_ps.add(top_p)
        discounted_counts[top_p] = discounted_counts.get(top_p, 0) + 1
    return discounted_ps, discounted_counts
    
import numpy as np
import matplotlib.pyplot as plt
import copy
import warnings

def bar(labels, a, b, title, figsize):
    """Show a grouped bar chart comparing two sets of trip counts.

    labels: ordered sequence of confidence values; one pair of bars per value
    a: dict of confidence value -> trip count, drawn as the "Naïve" series
    b: dict of confidence value -> trip count, drawn as the "Discounted" series
    title: title of the plot
    figsize: figure size passed to ``plt.subplots``

    Values missing from ``a`` or ``b`` are drawn with height 0.
    """
    bar_width = 0.4
    positions = np.arange(len(labels))
    naive_heights = [a.get(label, 0) for label in labels]
    discounted_heights = [b.get(label, 0) for label in labels]

    fig, ax = plt.subplots(figsize=figsize)
    ax.bar(positions - bar_width / 2, naive_heights, bar_width, label="Naïve")
    ax.bar(positions + bar_width / 2, discounted_heights, bar_width, label="Discounted")

    ax.set_title(title)
    ax.set_ylabel("Number of trips")
    ax.set_xticks(positions)
    ax.set_xticklabels([f"{n:.2f}" for n in labels])
    ax.legend()

    # The first six and last six tick labels are always shown, rotated to
    # vertical. In between, only every fourth label is kept (unrotated); the
    # rest are hidden because they do not fit.
    tail_start = len(labels) - 6
    for index, tick_label in enumerate(ax.xaxis.get_ticklabels()):
        at_an_end = index < 6 or index >= tail_start
        if at_an_end:
            tick_label.set_rotation(90)
        elif index % 4 != 0:
            tick_label.set_visible(False)

    plt.show()

def expand_dict(src, dest):
    """Expand a value -> count mapping into repeated values, appended to ``dest``.

    src: dict whose keys are convertible to float and whose values are
         convertible to int (the number of repetitions)
    dest: list that is extended in place; nothing is returned
    """
    for value, count in src.items():
        dest.extend([float(value)] * int(count))
    
def box(a, b, title, figsize, first_confidence):
    """Draw box plots of the naive and discounted confidence distributions.

    Four boxes are drawn: all naive confidences, all discounted confidences,
    and the same two with the zero-confidence trips removed.

    a: dict of confidence value -> trip count for the naive predictions
    b: dict of confidence value -> trip count for the discounted predictions
    title: title of the plot
    figsize: figure size passed to ``plt.subplots``
    first_confidence: currently unused; kept so existing callers keep working.
        It was meant for a "no extremes" pair of boxes (dropping confidence 1
        from the naive data and first_confidence from the discounted data),
        which was abandoned because stripping out the 1s is not the same as
        stripping out the Bs.
    """
    # Exactly one label per box that is drawn.
    labels = ["Naïve", "Discounted", "Naïve no 0", "Discounted no 0"]

    # Copies with the zero-confidence count forced to 0, so the caller's
    # dictionaries are left untouched.
    a_no_0 = a.copy()
    a_no_0[0] = 0
    b_no_0 = b.copy()
    b_no_0[0] = 0

    # Box plots need the raw samples, so expand each count dict into a list.
    data = []
    for counts in (a, b, a_no_0, b_no_0):
        samples = []
        expand_dict(counts, samples)
        data.append(samples)

    fig, ax = plt.subplots(figsize=figsize)
    ax.set_title(title)

    # Silence the warning from https://github.com/matplotlib/matplotlib/issues/16353
    # only while plotting. catch_warnings restores the previous filters on
    # exit, whereas filterwarnings + resetwarnings would discard every filter
    # configured elsewhere in the process.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ax.boxplot(data)

    # Label the boxes once their ticks exist, so labels and ticks match one
    # to one.
    ax.set_xticklabels(labels)
    
def viz_discounting(max_confidence, first_confidence, confidence_multiplier, bar_figsize=(20,10), box_figsize=(10,5), title_add=""):
    """Recompute the discounted predictions and plot them against the naive ones.

    max_confidence, first_confidence, confidence_multiplier: parameters of the
        discounting algorithm, forwarded to ``compute_discounting_full``. They
        appear in the plot title as A (= 1 - max_confidence), B and C.
    bar_figsize: figure size of the bar chart
    box_figsize: figure size of the box plot
    title_add: text appended to the plot title

    Trips with confidence 0 are left out of the bar chart; their counts are
    printed instead.
    """
    # Use the values the computation returns instead of re-reading globals.
    discounted_ps, discounted_counts = compute_discounting_full(max_confidence, first_confidence, confidence_multiplier)
    # .get(): a count dict only has a 0.0 key if at least one trip got
    # confidence 0, so plain indexing would raise KeyError otherwise.
    naive_zero = naive_counts.get(0.0, 0)
    discounted_zero = discounted_counts.get(0.0, 0)
    print(f"Not shown: {naive_zero}, {discounted_zero} trips with confidence 0")
    labels = sorted((naive_ps | discounted_ps) - {0.0})
    title = f"A={1-max_confidence:.2f}, B={first_confidence:.2f}, C={confidence_multiplier:.2f}"+title_add
    bar(labels, naive_counts, discounted_counts, title, bar_figsize)
    box(naive_counts, discounted_counts, title, box_figsize, first_confidence)


# Try the plots out with one set of parameters.
viz_discounting(0.99, 0.75, 0.25)


And now let's use the graphs to test out a bunch of scenarios.

In [None]:
# Scenario name -> (max_confidence, first_confidence, confidence_multiplier),
# in the positional order expected by viz_discounting.
# NOTE(review): "low_c" also uses first_confidence=0.5 rather than the default
# 0.75, so it changes B as well as C -- confirm this is intended.
scenarios = dict(
    default=(0.99, 0.75, 0.25),
    no_a=(1.00, 0.75, 0.25),
    high_a=(0.95, 0.75, 0.25),
    higher_a=(0.90, 0.75, 0.25),
    low_b=(0.99, 0.6, 0.25),
    lower_b=(0.99, 0.3, 0.25),
    low_c=(0.99, 0.5, 0.10),
    low_b_and_c=(0.99, 0.6, 0.10),
)

# Plot every scenario, tagging each title with the scenario name.
for scenario, params in scenarios.items():
    viz_discounting(*params, title_add=f" ({scenario})")