### Extract test results

This notebook extracts the test results stored under ./ml_training/test_results/... so they can be analysed.

In [8]:
import numpy as np
import os
from tqdm import tqdm
import pickle
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, roc_auc_score)

In [2]:
# Only set to True if you don't have the pickle file (LEAVE THIS FALSE IF YOU HAVE THE PICKLE FILE)
REBUILD_RESULTS = False


def load_binarized_csv(folder):
    """Load the first ``.csv`` file found in ``folder`` as a 0/1 ``int8`` array.

    File names are visited in sorted order so the choice is deterministic when a
    folder happens to contain more than one CSV.

    Parameters
    ----------
    folder : str
        Directory expected to hold one comma-separated file of numeric values.

    Returns
    -------
    numpy.ndarray or None
        Array where every value >= 0.5 became 1 and everything else (including
        values that could not be parsed, i.e. NaN) became 0, or ``None`` when the
        folder contains no ``.csv`` file.
    """
    for file_name in sorted(os.listdir(folder)):
        if file_name.endswith(".csv"):
            values = np.genfromtxt(os.path.join(folder, file_name), delimiter=",", dtype=float)
            # Greater than or equal to 0.5 is 1, else 0 (single pass, no in-place edits)
            return (values >= 0.5).astype(np.int8)
    return None


if REBUILD_RESULTS:
    results_path = "./ml_training/test_results/"

    # Walk <results_path>/<experiment>/<training name>/ and load each folder into nested dicts
    results = {}
    for entry in tqdm(os.listdir(results_path)):
        entry_path = os.path.join(results_path, entry)
        if not os.path.isdir(entry_path):
            continue
        results[entry] = {}
        for sub_entry in os.listdir(entry_path):
            sub_entry_path = os.path.join(entry_path, sub_entry)
            if not os.path.isdir(sub_entry_path):
                continue
            binarized = load_binarized_csv(sub_entry_path)
            # A folder without a CSV keeps an empty dict as its placeholder value
            results[entry][sub_entry] = {} if binarized is None else binarized

    # Save the dict with pickle
    with open("test_results.pkl", "wb") as f:
        pickle.dump(results, f)

In [9]:
# NOTE: unpickling can execute arbitrary code, so only load a "test_results.pkl"
# that you generated yourself or received from a trusted source.
# The context manager makes sure the file handle is closed after loading.
with open("test_results.pkl", "rb") as pickle_file:
    results = pickle.load(pickle_file)

# Print keys and subkeys
for key in results:
    print(key)
    for subkey in results[key]:
        print("\t" + subkey)
    print()

print("The names listed above were used for training the models. Get in touch with Michal to find out what the test set was or you can figure it out from ml_running.py")

cross_animal
	AA034_D1S1
	AA036_D2S1
	AA058_D1S1
	PL010_D1S1

cross_day_cross_session
	AA034_D1S1
	AA036_D2S1
	AA058_D1S1
	PL010_D1S1

cross_day_same_session
	AA034_D1S1
	AA036_D2S1
	AA058_D1S1
	PL010_D1S1

cross_session_same_day
	AA034_D1S1
	AA036_D2S1
	AA058_D1S1
	PL010_D1S1

within_session
	AA034_D1S1
	AA036_D2S1
	AA058_D1S1
	PL010_D1S1

The names listed above were used for training the models. Get in touch with Michal to find out what the test set was or you can figure it out from ml_running.py


In [4]:
# The following function takes the experiment type, a function to apply to the data and, optionally, the name of the training data.
# If the training name is omitted, it accumulates the results for all subkeys of that experiment type in the results dict.

def apply_to_results(experiment_type, func, training_name=None, average=False,
                     sizes=(1, 2, 5, 10, 15, 20), repeats=5):
    """Apply ``func`` to every (ground truth, predictions) row pair of an experiment.

    Reads from the module-level ``results`` dict. Each stored array is laid out
    as consecutive row pairs: an even row holds the ground truth and the row
    after it holds the matching predictions. Pairs are grouped by size value,
    with ``repeats`` consecutive pairs per entry of ``sizes`` (by default
    2 * 5 * 6 = 60 rows).

    Parameters
    ----------
    experiment_type : str
        Top-level key of ``results``.
    func : callable
        Called as ``func(ground_truth, predictions)`` for every row pair.
    training_name : str, optional
        Sub-key of ``results[experiment_type]``. When omitted, the values for all
        sub-keys are accumulated together.
    average : bool, default False
        When True, each size maps to the mean of its collected values instead of
        the list of values.
    sizes : sequence, default (1, 2, 5, 10, 15, 20)
        Size values, in the order their runs appear in the array.
    repeats : int, default 5
        Number of consecutive row pairs stored for each size.

    Returns
    -------
    dict
        Maps each size to a list of ``func`` outputs, or to their mean when
        ``average`` is True.

    Raises
    ------
    ValueError
        If ``experiment_type`` or ``training_name`` is not present in ``results``.
    """
    if experiment_type not in results:
        raise ValueError("The experiment type provided is not in the results dict")
    experiment_results = results[experiment_type]

    if training_name is not None:
        if training_name not in experiment_results:
            raise ValueError("The training name provided is not in the results dict for this experiment type")
        training_names = [training_name]
    else:
        training_names = list(experiment_results.keys())

    # One entry per run, e.g. [1, 1, 1, 1, 1, 2, 2, ...]; run i occupies rows 2*i and 2*i + 1.
    run_sizes = [size for size in sizes for _ in range(repeats)]

    metrics = {}
    for name in training_names:
        arr = experiment_results[name]
        for run_index, size in enumerate(run_sizes):
            ground_truth = arr[2 * run_index]
            predictions = arr[2 * run_index + 1]
            metrics.setdefault(size, []).append(func(ground_truth, predictions))

    if average:
        return {size: np.mean(values) for size, values in metrics.items()}

    return metrics


def true_positives(y_true, y_pred, target=1):
    """Count positions where both the truth and the prediction equal ``target``."""
    truth_is_target = y_true == target
    pred_is_target = y_pred == target
    both_match = np.logical_and(truth_is_target, pred_is_target)
    return np.sum(both_match)

def true_negatives(y_true, y_pred, target=0):
    """Count positions where both the truth and the prediction equal ``target``."""
    truth_is_target = y_true == target
    pred_is_target = y_pred == target
    both_match = np.logical_and(truth_is_target, pred_is_target)
    return np.sum(both_match)

def false_positives(y_true, y_pred, target=1):
    """Count false positives: predicted ``target`` where the truth is not ``target``.

    Parameters
    ----------
    y_true, y_pred : numpy.ndarray
        Ground-truth and predicted labels, compared element-wise.
    target : int, default 1
        Label treated as the positive class.

    Returns
    -------
    Number of positions where ``y_pred == target`` but ``y_true != target``.
    """
    # The prediction claims the positive class while the ground truth disagrees.
    return np.sum(np.logical_and(y_true != target, y_pred == target))

def false_negatives(y_true, y_pred, target=0):
    """Count false negatives: predicted ``target`` where the truth is not ``target``.

    Parameters
    ----------
    y_true, y_pred : numpy.ndarray
        Ground-truth and predicted labels, compared element-wise.
    target : int, default 0
        Label treated as the negative class.

    Returns
    -------
    Number of positions where ``y_pred == target`` but ``y_true != target``.
    """
    # The prediction claims the negative class while the ground truth disagrees.
    return np.sum(np.logical_and(y_true != target, y_pred == target))

# Unfinished draft of a transient-overlap metric. It is wrapped in a bare string
# literal, so the function is never defined or executed. The last line of the draft
# indexes with `sum_`, a name the draft never defines, so it cannot run as written.
"""
def transient_overlap(y_true, y_pred):
    # We'll do the following, set ground truth to 2 and predictions to 1.
    sum_arr = y_true + y_pred
    # Find stretches where the result is non-zero
    diff_arr = np.diff(sum_arr[sum_])
"""

        

In [5]:
# Two-argument metric wrappers: each takes the ground truth `x` and the predictions `y`.
# They are written as named functions (rather than lambdas bound to names) so that
# tracebacks and reprs show a meaningful name.

def f1_macro(x, y):
    """Return ``f1_score(x, y, average="macro")``."""
    return f1_score(x, y, average="macro")


def f1_standard(x, y):
    """Return ``f1_score(x, y)`` with its default settings."""
    return f1_score(x, y)


def tp(x, y):
    """Return ``true_positives(x, y)`` with its default target."""
    return true_positives(x, y)


def tn(x, y):
    """Return ``true_negatives(x, y)`` with its default target."""
    return true_negatives(x, y)


def fp(x, y):
    """Return ``false_positives(x, y)`` with its default target."""
    return false_positives(x, y)


def fn(x, y):
    """Return ``false_negatives(x, y)`` with its default target."""
    return false_negatives(x, y)


def precision(x, y):
    """Return ``precision_score(x, y)`` with its default settings."""
    return precision_score(x, y)


def recall(x, y):
    """Return ``recall_score(x, y)`` with its default settings."""
    return recall_score(x, y)


def accuracy(x, y):
    """Return ``accuracy_score(x, y)`` with its default settings."""
    return accuracy_score(x, y)

In [13]:
# Tests: F1 scores (macro and default averaging) for the cross-animal experiment,
# restricted to the PL010_D1S1 training entry and averaged per size.
f1_result_macro = apply_to_results(
    experiment_type="cross_animal",
    func=f1_macro,
    training_name="PL010_D1S1",
    average=True,
)
f1_result_standard = apply_to_results(
    experiment_type="cross_animal",
    func=f1_standard,
    training_name="PL010_D1S1",
    average=True,
)
# The remaining metrics are disabled by keeping them inside a string literal.
"""
tp_result = apply_to_results("within_session", tp, average=True)
tn_result = apply_to_results("within_session", tn, average=True)
fp_result = apply_to_results("within_session", fp, average=True)
fn_result = apply_to_results("within_session", fn, average=True)
precision_result = apply_to_results("within_session", precision, average=True)
recall_result = apply_to_results("within_session", recall, average=True)
accuracy_result = apply_to_results("within_session", accuracy, average=True)
"""


'\ntp_result = apply_to_results("within_session", tp, average=True)\ntn_result = apply_to_results("within_session", tn, average=True)\nfp_result = apply_to_results("within_session", fp, average=True)\nfn_result = apply_to_results("within_session", fn, average=True)\nprecision_result = apply_to_results("within_session", precision, average=True)\nrecall_result = apply_to_results("within_session", recall, average=True)\naccuracy_result = apply_to_results("within_session", accuracy, average=True)\n'

In [12]:
# Show each heading followed by its per-size scores, one item per line.
print(
    "f1 macro",
    f1_result_macro,
    "f1 standard",
    f1_result_standard,
    sep="\n",
)
# Printing of the remaining metrics is disabled by keeping it inside a string literal.
"""
print("True Positives Average")
print(tp_result)
print("True Negatives Average")
print(tn_result)
print("False Positives Average")
print(fp_result)
print("False Negatives Average")
print(fn_result)
print("Precision Average")
print(precision_result)
print("Recall Average")
print(recall_result)
print("Accuracy Average")
print(accuracy_result)
"""

f1 macro
{1: 0.5480958946232736, 2: 0.744130759185736, 5: 0.8227125097213384, 10: 0.8558539053757613, 15: 0.900605819452913, 20: 0.902093478833422}


'\nprint("f1 standard")\nprint(f1_result_standard)\nprint("True Positives Average")\nprint(tp_result)\nprint("True Negatives Average")\nprint(tn_result)\nprint("False Positives Average")\nprint(fp_result)\nprint("False Negatives Average")\nprint(fn_result)\nprint("Precision Average")\nprint(precision_result)\nprint("Recall Average")\nprint(recall_result)\nprint("Accuracy Average")\nprint(accuracy_result)\n'