### Extract test results

This notebook extracts the test results stored under `./ml_training/test_results/`, caches them in a pickle file, and computes evaluation metrics from them.

In [1]:
import numpy as np
import os
from tqdm import tqdm
import pickle
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, roc_auc_score)
import pandas as pd

In [2]:
# Only set to True if you don't have the pickle file (LEAVE THIS FALSE IF YOU HAVE THE PICKLE FILE)
REBUILD_PICKLE = False

if REBUILD_PICKLE:
    results_path = "./ml_training/test_results/"

    # Walk <results_path>/<entry>/<sub_entry>/ and load the first .csv found
    # in each leaf folder into a nested dict: results[entry][sub_entry].
    results = {}
    for entry in tqdm(os.listdir(results_path)):
        entry_path = os.path.join(results_path, entry)
        if not os.path.isdir(entry_path):
            continue
        results[entry] = {}
        for sub_entry in os.listdir(entry_path):
            sub_entry_path = os.path.join(entry_path, sub_entry)
            if not os.path.isdir(sub_entry_path):
                continue
            # Placeholder that stays in place when the folder holds no .csv file.
            results[entry][sub_entry] = {}
            for file_name in os.listdir(sub_entry_path):
                if not file_name.endswith(".csv"):
                    continue
                raw_values = np.genfromtxt(
                    os.path.join(sub_entry_path, file_name), delimiter=",", dtype=float
                )
                # Values >= 0.5 become 1, everything else 0. The comparison is
                # False for NaN, so missing cells become 0 instead of going
                # through an undefined NaN -> int8 cast.
                results[entry][sub_entry] = (raw_values >= 0.5).astype(np.int8)
                # Only the first .csv in the folder is used.
                break

    # Save the dict with pickle
    with open("test_results.pkl", "wb") as f:
        pickle.dump(results, f)

In [3]:
# NOTE: unpickling can execute arbitrary code; only load a test_results.pkl
# that you (or a trusted collaborator) generated.
# The context manager closes the file handle once loading is done.
with open("test_results.pkl", "rb") as f:
    results = pickle.load(f)

# Print keys and subkeys
for key, sub_results in results.items():
    print(key)
    for subkey in sub_results.keys():
        print("\t" + subkey)
    print()

print("The names listed above were used for training the models. Get in touch with Michal to find out what the test set was or you can figure it out from ml_running.py")

cross_animal
	AA034_D1S1
	AA036_D2S1
	AA058_D1S1
	PL010_D1S1

cross_day_cross_session
	AA034_D1S1
	AA036_D2S1
	AA058_D1S1
	PL010_D1S1

cross_day_same_session
	AA034_D1S1
	AA036_D2S1
	AA058_D1S1
	PL010_D1S1

cross_session_same_day
	AA034_D1S1
	AA036_D2S1
	AA058_D1S1
	PL010_D1S1

within_session
	AA034_D1S1
	AA036_D2S1
	AA058_D1S1
	PL010_D1S1

The names listed above were used for training the models. Get in touch with Michal to find out what the test set was or you can figure it out from ml_running.py


In [4]:
# apply_to_results evaluates a metric function over the stored test results of one experiment type.
# If no training name is given, the results of every training name under that experiment type are accumulated.

def apply_to_results(experiment_type, func, training_name=None, average=False):
    """Apply ``func`` to every (ground truth, prediction) row pair of an experiment.

    Reads the module-level ``results`` dict (``results[experiment_type][training_name]``).
    Each stored array is laid out as consecutive row pairs: ground truth first,
    predictions second. The pairs are consumed as 5 repeats for each of the
    sizes 1, 2, 5, 10, 15, 20, i.e. 2 * 5 * 6 = 60 rows per array.

    Args:
        experiment_type: Top-level key of ``results``.
        func: Callable invoked as ``func(ground_truth_row, prediction_row)``.
        training_name: Optional sub-key of ``results[experiment_type]``. When
            omitted, all sub-keys are processed in dict order and their values
            are appended to the same per-size lists.
        average: When True, each per-size list is replaced by ``np.mean`` of it.

    Returns:
        dict mapping each size to the list of ``func`` results (or to their
        mean when ``average`` is True).

    Raises:
        ValueError: If ``experiment_type`` or ``training_name`` is unknown.
    """
    if experiment_type not in results:
        raise ValueError("The experiment type provided is not in the results dict")
    experiment_results = results[experiment_type]

    if training_name is not None:
        if training_name not in experiment_results:
            raise ValueError("The training name provided is not in the results dict for this experiment type")
        training_names = [training_name]
    else:
        training_names = list(experiment_results.keys())

    sizes = (1, 2, 5, 10, 15, 20)
    repeats = 5

    metrics = {}
    for name in training_names:
        arr = experiment_results[name]
        for pair_index in range(len(sizes) * repeats):
            # Pairs are grouped by size: the first `repeats` pairs belong to sizes[0], etc.
            size = sizes[pair_index // repeats]
            ground_truth = arr[pair_index * 2]
            predictions = arr[pair_index * 2 + 1]
            metrics.setdefault(size, []).append(func(ground_truth, predictions))

    if average:
        metrics = {size: np.mean(values) for size, values in metrics.items()}

    return metrics


def true_positives(y_true, y_pred, target=1):
    """Count positions where both the ground truth and the prediction equal ``target``."""
    truth_hits = y_true == target
    pred_hits = y_pred == target
    return np.sum(np.logical_and(truth_hits, pred_hits))

def true_negatives(y_true, y_pred, target=0):
    """Count positions where both the ground truth and the prediction equal ``target`` (0 by default)."""
    truth_hits = y_true == target
    pred_hits = y_pred == target
    return np.sum(np.logical_and(truth_hits, pred_hits))

def false_positives(y_true, y_pred, target=1):
    """Count samples predicted as ``target`` whose ground truth is not ``target``.

    With the default ``target=1`` (the positive class) this is the usual
    false-positive count: predicted positive, actually negative.

    Args:
        y_true: Array of ground-truth labels.
        y_pred: Array of predicted labels, same shape as ``y_true``.
        target: Label treated as the positive class.

    Returns:
        Number of positions where ``y_pred == target`` and ``y_true != target``.
    """
    return np.sum(np.logical_and(y_pred == target, y_true != target))

def false_negatives(y_true, y_pred, target=0):
    """Count samples predicted as ``target`` whose ground truth is not ``target``.

    Here ``target`` is the negative class label (0 by default), so this is the
    usual false-negative count: predicted negative, actually positive.

    Args:
        y_true: Array of ground-truth labels.
        y_pred: Array of predicted labels, same shape as ``y_true``.
        target: Label treated as the negative class.

    Returns:
        Number of positions where ``y_pred == target`` and ``y_true != target``.
    """
    return np.sum(np.logical_and(y_pred == target, y_true != target))

def _extract_indices(indices):
        indices = indices.nonzero()[0]
        if indices.any():
            # Split up the indices into groups
            indices = np.split(indices, np.where(np.diff(indices) != 1)[0]+1)
            # Now Split the indices into pairs of first and last indices
            indices = [(indices_group[0], indices_group[-1]+1) for indices_group in indices]

        return indices

def _overlaps(range1, range2):
    return len(range(max(range1[0], range2[0]), min(range1[1], range2[1]))) != 0

def transient_overlap(y_true, y_pred, threshold=0.5):
    """Count event-level matches between runs in ``y_true`` and ``y_pred``.

    Both inputs are converted by ``_extract_indices`` into runs of consecutive
    nonzero positions, and the two run lists are walked front to back.

    Args:
        y_true: Ground-truth labels (passed to ``_extract_indices``).
        y_pred: Predicted labels (passed to ``_extract_indices``).
        threshold: Minimum fraction of the *predicted* run's length that must
            be covered by the overlap for the pair to count as a match.

    Returns:
        Tuple ``(tp, fp, fn)``: matched pairs, unmatched predicted runs and
        unmatched ground-truth runs.
    """
    # Not sure if this works as necessary; avoid for the time being.
    indices_true = _extract_indices(y_true)
    indices_pred = _extract_indices(y_pred)

    # Now iterate through the indices and check if they overlap and if they do overlap what is the percentage of overlap
    tp = 0
    fp = 0
    fn = 0

    # We'll iterate through both simultaneously and check their ranges, we'll pop from the list anything that is outside the range or already counted
    while indices_true and indices_pred:
        true_range = indices_true[0]
        pred_range = indices_pred[0]

        # Check if there is overlap
        if _overlaps(true_range, pred_range):
            # Check the percentage of overlap
            overlap = len(range(max(true_range[0], pred_range[0]), min(true_range[1], pred_range[1])))
            # The fraction is relative to the predicted run only; the length of the true run is not considered.
            overlap_percentage = overlap / (pred_range[1] - pred_range[0])
            if overlap_percentage >= threshold:
                tp += 1
                # Pop both indices
                indices_true.pop(0)
                indices_pred.pop(0)
            elif true_range[1] < pred_range[1]:
                # Overlap too small and the true run ends first: drop it as a miss.
                indices_true.pop(0)
                fn += 1
            else:
                # Overlap too small and the predicted run ends first (or both end together): drop it as a false alarm.
                indices_pred.pop(0)
                fp += 1

        else:
            # Whichever ends first, pop it
            if true_range[1] < pred_range[1]:
                indices_true.pop(0)
                fn += 1
            else:
                indices_pred.pop(0)
                fp += 1
        
    # Now we need to add the remaining indices
    # (at most one of the two lists is still non-empty at this point).
    fn += len(indices_true)
    fp += len(indices_pred)

    return (tp, fp, fn)

def dict_to_flattened(dict_):
    """Concatenate all values of ``dict_`` (each an iterable) into one list, in dict order."""
    return [item for values in dict_.values() for item in values]

def to_csv(metrics, filename, training_names=("AA034_D1S1", "AA036_D2S1", "AA058_D1S1", "PL010_D1S1")):
    """Write per-run metric values to a CSV file, one row per metric.

    Each metric dict is expected to map the sizes 1, 2, 5, 10, 15, 20 to a list
    of values, as produced by ``apply_to_results(..., average=False)``: either
    5 values per size (a single training name) or 5 values per training name,
    with all runs of the first training name first, then the second, and so on.

    Args:
        metrics: List of ``(row_label, metric_dict)`` tuples.
        filename: Path of the CSV file to write.
        training_names: Training names, in the order their values appear in
            each per-size list. Only used when the lists hold more than one
            training name.

    Raises:
        ValueError: If the per-size list length is neither 5 nor
            ``5 * len(training_names)``.
    """
    sizes = (1, 2, 5, 10, 15, 20)
    runs = range(1, 6)

    # get the first value and check the length
    length = len(metrics[0][1][1])

    if length == len(runs):
        column_names = [f"C{no_cells} R{run}" for no_cells in sizes for run in runs]
    elif length == len(runs) * len(training_names):
        # Labels must follow the data layout: within a size, values are grouped
        # by training name and then by run (not by run and then by name).
        column_names = [
            f"{name} C{no_cells} R{run}"
            for no_cells in sizes
            for name in training_names
            for run in runs
        ]
    else:
        raise ValueError(
            f"Expected {len(runs)} or {len(runs) * len(training_names)} values per size, got {length}"
        )

    # Create pandas dataframe and insert column names
    df = pd.DataFrame(columns=column_names)
    for name, metric in metrics:
        # Look sizes up explicitly so the row does not depend on dict ordering.
        df.loc[name] = [value for size in sizes for value in metric[size]]

    # newline="" stops the text layer from translating the "\n" terminator.
    with open(filename, "w", newline="") as f:
        df.to_csv(f, lineterminator="\n")
        
                


SyntaxError: expected ':' (732285853.py, line 123)

In [None]:
# Two-argument metric wrappers with the (ground_truth, predictions) call shape.

def f1_macro(x, y):
    """F1 score computed with average="macro"."""
    return f1_score(x, y, average="macro")


def f1_standard(x, y):
    """F1 score with the default averaging."""
    return f1_score(x, y)


def tp(x, y):
    """Forward to true_positives with its default target."""
    return true_positives(x, y)


def tn(x, y):
    """Forward to true_negatives with its default target."""
    return true_negatives(x, y)


def fp(x, y):
    """Forward to false_positives with its default target."""
    return false_positives(x, y)


def fn(x, y):
    """Forward to false_negatives with its default target."""
    return false_negatives(x, y)


def precision(x, y):
    """Forward to precision_score."""
    return precision_score(x, y)


def recall(x, y):
    """Forward to recall_score."""
    return recall_score(x, y)


def accuracy(x, y):
    """Forward to accuracy_score."""
    return accuracy_score(x, y)


def trans(x, y):
    """Forward to transient_overlap with its default threshold."""
    return transient_overlap(x, y)

In [None]:
# Tests
# F1 scores for the cross_animal experiment, restricted to the PL010_D1S1 training name.
f1_result_macro = apply_to_results(
    "cross_animal", f1_macro, training_name="PL010_D1S1", average=False
)
f1_result_standard = apply_to_results(
    "cross_animal", f1_standard, training_name="PL010_D1S1", average=False
)

# Per-run metrics for the within_session experiment, accumulated over every
# training name; evaluated in the same order as the names on the left.
(
    tp_result,
    tn_result,
    fp_result,
    fn_result,
    precision_result,
    recall_result,
    accuracy_result,
) = [
    apply_to_results("within_session", metric_func, average=False)
    for metric_func in (tp, tn, fp, fn, precision, recall, accuracy)
]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Show the per-run macro-F1 values, keyed by the sizes 1, 2, 5, 10, 15, 20 (five runs per size).
print(f1_result_macro)

{1: [0.495326928109462, 0.4931821082903461, 0.4962271614521137, 0.49263542437920704, 0.7631078508852389], 2: [0.6203726974340997, 0.8336420110631643, 0.71163797253501, 0.790984696177297, 0.764016418719109], 5: [0.8499461083595601, 0.895305322430282, 0.7993298959296035, 0.7716141709164169, 0.7973670509708289], 10: [0.8933202110429579, 0.8126358084632079, 0.8041508284468021, 0.9304224477038816, 0.8387402312219573], 15: [0.9022295811866685, 0.934016268001163, 0.8888690529129128, 0.8820831353421288, 0.8958310598216916], 20: [0.9190141175894933, 0.8794123082899297, 0.9025038499722022, 0.8937177340512059, 0.9158193842642791]}


In [None]:
# Pair each row label with its per-run metric dict, then write them all to result.csv.
metric_labels = ["TP", "TN", "FP", "FN", "Precision", "Recall", "Accuracy"]
metric_values = [
    tp_result,
    tn_result,
    fp_result,
    fn_result,
    precision_result,
    recall_result,
    accuracy_result,
]
metrics = list(zip(metric_labels, metric_values))

to_csv(metrics, "result.csv")

In [None]:
# Print the two F1 result dicts (per-run values keyed by size).
print("f1 macro")
print(f1_result_macro)
print("f1 standard")
print(f1_result_standard)
# NOTE(review): the triple-quoted block below is a bare string literal used to
# disable the remaining prints. Its labels say "Average", but the *_result
# values were computed with average=False - confirm before re-enabling.
"""
print("True Positives Average")
print(tp_result)
print("True Negatives Average")
print(tn_result)
print("False Positives Average")
print(fp_result)
print("False Negatives Average")
print(fn_result)
print("Precision Average")
print(precision_result)
print("Recall Average")
print(recall_result)
print("Accuracy Average")
print(accuracy_result)
"""