In [None]:
%load_ext autoreload 
%autoreload 2
%matplotlib notebook
import matplotlib.pyplot as plt
import random
import numpy as np
import sys
import os
# Run everything from the project root so the relative paths used throughout
# the notebook resolve. The location can be overridden with the ECG_ROOT
# environment variable; without it the original hard-coded path is used.
os.chdir(os.environ.get("ECG_ROOT", "/Users/rajpurkar/Documents/Code/ecg"))
# Make the project's own modules importable (path is relative to the root).
sys.path.append('./ecg')

## Loading Data

In [None]:
# Folder holding one saved prediction run (relative to the working directory).
prediction_folder = 'saved/predictions/1503712028'

import json
import evaluate
import predict

# Inputs, ground truth, predicted probabilities and the label processor.
x, gt, probs, processor = predict.load_predictions(prediction_folder)
# Read the run parameters through a context manager so the file handle is
# closed immediately instead of being left to the garbage collector.
with open(prediction_folder + '/params.json', 'r') as params_file:
    params = json.load(params_file)

## Count Patients

In [None]:
import fnmatch
from collections import defaultdict
from tqdm import tqdm

# Directory to scan, taken from the "data_path" entry of the run parameters.
path = params["data_path"]
# fnmatch-style pattern that file names must match to be picked up by get_files.
ext = '*_grp*.episodes.json'

def get_files(path, pattern=None):
    """Yield the path of every file under ``path`` whose name matches a glob.

    Args:
        path: Directory that is walked recursively.
        pattern: ``fnmatch``-style pattern applied to the bare file names.
            When omitted, the notebook-level ``ext`` pattern is used, which
            is the behaviour this function always had.

    Yields:
        The containing directory joined with the file name, one per match.
    """
    if pattern is None:
        # Looked up at call time so later changes to ``ext`` are honoured.
        pattern = ext
    for root, _dirnames, filenames in os.walk(path):
        for filename in fnmatch.filter(filenames, pattern):
            yield os.path.join(root, filename)

def patient_id(record):
    """Return the part of a record's file name that precedes the first underscore.

    Only the base name of ``record`` is considered; any directory components
    are ignored. If the base name has no underscore it is returned whole.
    """
    file_name = os.path.basename(record)
    identifier, _sep, _rest = file_name.partition("_")
    return identifier

# Map every rhythm name to the set of patient ids that have at least one
# episode with that rhythm.
class_patients = defaultdict(set)
for f in tqdm(get_files(path)):
    # Use a context manager so each annotation file is closed as soon as it
    # has been parsed; the scan may touch a large number of files.
    with open(f, 'r') as episode_file:
        jfile = json.load(episode_file)
    for episode in jfile['episodes']:
        rhythm_name = episode['rhythm_name']
        class_patients[rhythm_name].add(patient_id(f))

# Number of distinct patients seen for each rhythm.
for (k, v) in class_patients.items():
    print(k, len(v))

In [None]:
# Per-class patient counts in sorted order, followed by the sum of those
# counts. A patient that appears under several classes contributes once to
# each of them, so the sum is not a count of unique patients.
patient_counts = [(name, len(ids)) for name, ids in sorted(class_patients.items())]
for name, count in patient_counts:
    print(name, count)
total = sum(count for _name, count in patient_counts)
print(total)

## Count Distribution

In [None]:
# Tally how much annotated signal carries each class label.
truths = gt
# NOTE(review): the name implies that truths[0] holds the labels of a single
# 30-second record — confirm against what predict.load_predictions returns.
num_outputs_for_thirty_seconds = len(truths[0])
truths_flat = truths.flatten()
# Distinct label ids that occur and how many label slots carry each one.
classes_unique, counts = np.unique(truths_flat, return_counts=True)
# Translate the label ids into class names.
classes_u = np.array(processor.classes)[classes_unique]
# Label slots -> seconds (30 s per num_outputs_for_thirty_seconds slots) -> hours.
num_hours = (counts * 30 / num_outputs_for_thirty_seconds) / 3600.0
# One (class name, hours) pair per line.
for pair in zip(classes_u, num_hours):
    print(pair)

In [None]:
# Self-assignment: has no effect other than requiring that `x` already exists.
x = x
# Labels the sampling helpers in this cell index into: the first entry of gt.
# NOTE(review): presumably gt[0] is (examples, labels per example) — confirm.
y = gt[0]

def from_one_hot_to_int(label):
    """Collapse the last axis of ``label`` to the position of its largest value.

    For a one-hot vector this is the integer id of the encoded class.
    """
    encoded = np.asarray(label)
    return encoded.argmax(axis=-1)

def get_x_y_predictions_at_index(index, probs=None):
    """Look up one example in the notebook-level ``x`` and ``y`` arrays.

    Args:
        index: Position of the example in ``x``, ``y`` and ``probs``.
        probs: Optional per-example class scores. When given, the predicted
            labels are the argmax over the last axis of ``probs[index]``.

    Returns:
        ``(x_sample, y_sample, y_prediction)``; ``y_prediction`` is ``None``
        when ``probs`` is not supplied.
    """
    signal, labels = x[index], y[index]
    if probs is None:
        return signal, labels, None
    return signal, labels, np.argmax(probs[index], axis=-1)

def get_sample_from_classes(
        categories, min_mistakes = 20, num_tries = 1000, only_classes=False, probs=None):
    """Randomly pick an example whose labels include every class in ``categories``.

    Candidates are the rows of the notebook-level ``y`` that contain all the
    requested classes. Up to ``num_tries`` random draws are made from them.

    Args:
        categories: Class names; each must be present in ``processor.classes``.
        min_mistakes: Only used when ``probs`` is given. A draw is accepted
            when strictly more than this many label positions differ between
            the ground truth and the prediction.
        num_tries: Maximum number of random draws (at least 1).
        only_classes: If true, draws whose labels contain any class other
            than ``categories`` are skipped.
        probs: Optional per-example class scores. Without them the first draw
            that passes the ``only_classes`` filter is accepted and no
            prediction is returned.

    Returns:
        ``(x_sample, y_sample, y_prediction, index)`` for the accepted draw.
        If no draw is accepted within ``num_tries`` the last draw is returned
        anyway (as before) and a notice is printed so this is not silent.

    Raises:
        ValueError: If ``num_tries`` is below 1, or (from ``list.index``) if a
            category is not a known class.
        IndexError: If no example contains all of the requested classes.
    """
    if num_tries < 1:
        raise ValueError("num_tries must be at least 1")
    class_nums = np.array([processor.classes.index(c) for c in categories])
    indices = np.where(np.array([np.isin(class_nums, row).all() for row in y]))[0]
    if len(indices) == 0:
        raise IndexError(
            "No example contains all of the classes " + str(list(categories)))
    wanted = set(np.unique(class_nums))
    accepted = False
    for _ in range(num_tries):
        # Index the array directly instead of handing it to random.choice:
        # some Python releases test the sequence's truth value there, which is
        # ambiguous for a NumPy array with more than one element.
        index = indices[random.randrange(len(indices))]
        x_sample, y_sample, y_prediction = get_x_y_predictions_at_index(index, probs=probs)
        if only_classes and set(np.unique(y_sample)) != wanted:
            continue
        if y_prediction is None:
            # No predictions to compare against: any candidate will do.
            accepted = True
            break
        num_wrong = np.sum(y_sample != y_prediction)
        if (num_wrong > min_mistakes):
            print("Prediction got wrong " +  str(num_wrong * 1.0 / len(y_sample)))
            accepted = True
            break
    if not accepted:
        print("No sample met the criteria after " + str(num_tries)
              + " tries; returning the last draw")
    return x_sample, y_sample, y_prediction, index

# Ask for one example that contains SINUS; with min_mistakes=0 a draw is only
# accepted when the prediction differs from the labels in at least one position.
# The two commented lines are alternatives: looping over every class, or
# re-reading a known index without predictions.
#for class_indiv in processor.classes:
x_sample, y_sample, y_prediction, index = get_sample_from_classes([u'SINUS'], only_classes=False, probs=probs, min_mistakes=0)
#x_sample, y_sample, y_prediction = get_x_y_predictions_at_index(index)

## Visualize Data

In [None]:
%matplotlib inline
import matplotlib.cm as cm
from itertools import groupby
# Default matplotlib figure size as (width, height).
plt.rcParams["figure.figsize"] = (9, 6)

def draw_sample(x_sample, y_sample, y_prediction, step, show_label=True, save=False, small_frame=False):
    """Plot one signal with its ground-truth labels shaded behind it.

    The class names of ``y_sample`` (and of ``y_prediction`` when given) are
    also printed.

    Args:
        x_sample: Signal values to plot.
        y_sample: Integer class ids; each one is drawn as covering ``step``
            consecutive positions of ``x_sample``.
        y_prediction: Optional predicted class ids. They are printed only,
            never drawn.
        step: Width, in signal positions, of the span shaded per label.
        show_label: If ``True``, shade every run of identical labels and give
            the first run of each class a legend entry.
        save: If ``True``, also write the figure to
            ``<class name>-<index>.pdf``, where the class name is the first
            (in sorted order) of those present in ``y_sample`` and ``index``
            is the notebook-level sample index.
        small_frame: If ``True``, limit the x-axis to positions 1050-3050.
    """
    colors = cm.Pastel2(np.linspace(0, 1, 20))
    if show_label is True:
        acc = 0  # number of labels already drawn
        seen = set()  # classes that already have a legend entry
        for label, run in groupby(y_sample):
            number = sum(1 for _ in run)
            p = {
                "color": colors[label],
                "alpha": 0.5,
                "lw": 0
            }
            if label not in seen:
                # Label only the first span per class to avoid duplicate
                # legend entries.
                p["label"] = processor.int_to_class[label]
                seen.add(label)
            plt.axvspan(
                acc * step,
                (acc + number) * step, **p)
            acc += number
    print(np.array(processor.classes)[y_sample])
    if y_prediction is not None:
        print(np.array(processor.classes)[y_prediction])
    plt.plot(x_sample, color='#000000', alpha=1)
    plt.legend(loc="best", prop={'size':14})
    plt.yticks([])
    plt.xticks([])
    plt.tight_layout()
    if small_frame is True:
        plt.xlim([1050, 3050])
    if save is True:
        # The class list lives on the processor; the bare name `classes` that
        # was used here is not defined and made every save attempt fail.
        class_names_present = np.unique(np.array(processor.classes)[y_sample])
        plt.savefig(str(class_names_present[0]) + "-" + str(index) + '.pdf', dpi=400, format='pdf',bbox_inches='tight',pad_inches=0)
    plt.show()
    plt.close()

# Width of one label in signal positions; 256 when the run parameters have no
# "step" entry.
step = params.get("step", 256)
draw_sample(x_sample, y_sample, y_prediction, step, show_label=True, save=False, small_frame=False)

## Evaluating Models

In [None]:
import evaluate
import human_performance
from tabulate import tabulate
import load

# Collect, for each metric, the model's scores followed by the humans' scores.
# aggregate_data receives the last row of each parsed report and f1_data the
# third column of all the other rows, so both end up ordered as
# model/seq, human/seq, model/set, human/set.
aggregate_data = []
f1_data = []
for metric in ['seq', 'set']:
    # Model: score the saved predictions against the ground truth.
    evaluator = evaluate.evaluate_multiclass(
        gt, probs, processor.classes, metric, ', '.join(params["model_paths"]), display_scores=False)
    model_plotMat, model_support, class_names = evaluate.parse_classification_report(evaluator.scorer.report)
    # Every row but the last; column 2 is used as F1 in the table built afterwards.
    model_f1 = model_plotMat[:-1, 2]
    f1_data.append(model_f1)
    aggregate_data.append(model_plotMat[-1, :])

    # Humans: same evaluation on the human annotations.
    # A copy of params is passed to the helper.
    # NOTE(review): presumably because human_gt_and_probs modifies it — confirm.
    params_copy = params.copy()
    human_ground_truths, human_probs = human_performance.human_gt_and_probs(params_copy, x, gt, processor)
    evaluator = evaluate.evaluate_multiclass(
        human_ground_truths, human_probs, processor.classes, metric, ', '.join(params["model_paths"]), display_scores=False)
    # class_names from the final iteration is what labels the rows of the table.
    human_plotMat, human_support, class_names = evaluate.parse_classification_report(evaluator.scorer.report)
    human_f1 = human_plotMat[:-1, 2]
    f1_data.append(human_f1)
    aggregate_data.append(human_plotMat[-1, :])

cell_text = []

# One row per class holding the F1 of every (metric, rater) combination.
# The comprehension variable is deliberately not called `x`: that is the name
# of the notebook's signal array, and Python 2 list comprehensions leak their
# loop variable into the enclosing scope.
f1_data = np.array(f1_data).T
for row, class_name in zip(f1_data, class_names):
    cell_text.append([class_name] + ['%1.3f' % value for value in row])

# Three summary rows taken from the last line of each classification report.
aggregate_data = np.array(aggregate_data).T
for row, class_name in zip(aggregate_data, ['Precision', 'Recall', 'F1']):
    cell_text.append([class_name] + ['%1.3f' % value for value in row])

# Four headers for five columns: tabulate assigns them to the last columns,
# leaving the name column without a header.
table = tabulate(
    cell_text, tablefmt="latex", floatfmt=".3f",
    headers=["Model seq", "Human seq", "Model set", "Human set"])

# Post-process the LaTeX table: within each metric, wrap the larger of the
# model and human score in \textbf{}. On a tie the human score is bolded.
rows = []
import re
for row in table.split('\n'):
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    elems = re.split(r'\s+', row)
    # Skip single-token lines (\hline, \begin, \end) and the header row.
    if len(elems) > 2 and "seq" not in elems:
        # Tokens 3/5 are model/human for the first metric, 7/9 for the second.
        # This relies on each data row starting with whitespace and on the
        # row name being a single token.
        for start in [3, 7]:
            end = start+2
            winner = start if float(elems[start]) > float(elems[end]) else end
            elems[winner] = "\\textbf{" + elems[winner] + "}"
    row = " ".join(elems)
    rows.append(row)
table = "\n".join(rows)
print(table)

In [None]:
# Establish that the scores differ, not necessarily show that one is better
import human_performance
from sklearn import preprocessing

# Binarizer over all class ids. `classes` is passed by keyword because current
# scikit-learn releases accept it as a keyword-only argument.
lb = preprocessing.MultiLabelBinarizer(classes=range(len(processor.classes)))

def get_preds_from_probs(probs):
    """Return the index of the highest score along the last axis of ``probs``."""
    scores = np.asarray(probs)
    return scores.argmax(axis=-1)

# Alternative encoding, in case a binary (multi-label) view is wanted.
def get_binary_preds_from_probs(probs):
    """Turn class scores into the multi-label encoding produced by ``lb``.

    The scores are first reduced to predicted class ids with
    ``get_preds_from_probs``; the notebook-level binarizer ``lb`` is then
    fitted on and applied to those ids in a single ``fit_transform`` call.
    """
    return lb.fit_transform(get_preds_from_probs(probs))

# Load the annotations stored under each of the six "_rev<i>" episode
# extensions. They are kept both as a stack (one entry per extension) and as
# a running sum across extensions.
test_params_copy = params.copy()
human_probs_all = None
human_probs_concat = []

for i in range(6):
    test_params_copy["epi_ext"] = "_rev" + str(i) + ".episodes.json"
    _, human_probs, _ = load.load_x_y_with_processor(test_params_copy, processor)
    human_probs_concat.append(human_probs)
    # The first load initialises the sum; later loads are added onto it.
    human_probs_all = (
        human_probs if human_probs_all is None else human_probs + human_probs_all)
human_probs_concat = np.array(human_probs_concat)

In [None]:
from scipy.stats import ttest_rel
import evaluate

def get_accs(preds, gt):
    """Return the fraction of positions at which ``preds`` equals ``gt``.

    Matches are counted along the last axis, the first entry of that count is
    taken, and it is divided by ``preds.shape[1]``.
    NOTE(review): the ``[0]`` presumably strips a leading singleton axis that
    ``gt`` carries and ``preds`` does not — confirm against the caller.
    """
    agreement = (preds == gt)
    hits = np.count_nonzero(agreement, axis=-1)
    return hits[0] * 1.0 / preds.shape[1]

def get_ttest_from_accs(hb_accs, mb_accs, title):
    """Run a paired t-test of model accuracies against human accuracies.

    Args:
        hb_accs: Human accuracies.
        mb_accs: Model accuracies, paired element-wise with ``hb_accs``.
        title: Accepted for the callers' benefit; not used here.

    Returns:
        ``(score, mean_hb_acc, hb_accs, mb_accs)`` where ``score`` is the
        result of ``ttest_rel(mb_accs, hb_accs)`` and ``mean_hb_acc`` is the
        mean of the human accuracies. Both inputs are passed back unchanged.
    """
    paired_result = ttest_rel(mb_accs, hb_accs)
    return paired_result, np.mean(hb_accs), hb_accs, mb_accs

def get_ttest_from_probs(h_probs, m_probs, title='human'):
    """Compare human and model class scores with a paired t-test on accuracy.

    Both score arrays are reduced to predictions, scored against the
    notebook-level ``gt`` with ``get_accs``, and handed to
    ``get_ttest_from_accs``, whose result is returned as is.
    """
    model_accs = get_accs(get_preds_from_probs(m_probs), gt)
    human_accs = get_accs(get_preds_from_probs(h_probs), gt)
    return get_ttest_from_accs(human_accs, model_accs, title=title)
   

cell_text = []

# Model accuracies, computed once up front. They were previously only
# available as a variable left behind by the reviewer loop, which does not
# exist when there are no reviewers.
mb_accs = get_accs(get_preds_from_probs(probs), gt)

# Accuracies of every individual reviewer. The loop no longer binds a variable
# called `index`, which would overwrite the notebook-level sample index that
# draw_sample uses for its file name.
hb_accs_concat = []
for human_probs in human_probs_concat:
    hb_accs = get_accs(get_preds_from_probs(human_probs), gt)
    hb_accs_concat.append(hb_accs)

# get average accuracy
hb_accs_concat = np.array(hb_accs_concat)
avg_hb_accs = np.mean(hb_accs_concat, axis = 0)
score, mean, _, _ = get_ttest_from_accs(avg_hb_accs, mb_accs, title='avg human')
cell_text.append(['Human Acc. Averaged', mean, score.pvalue])

# majority vote: argmax over the summed reviewer annotations
maj_score, mean, _, _ = get_ttest_from_probs(human_probs_all, probs, title='maj vote')
cell_text.append(['Majority Vote Human', mean, maj_score.pvalue])

# model: report its mean accuracy only. A paired t-test of the model against
# itself has all-zero differences and yields NaN, so the p-value cell is left
# empty (tabulate renders None as a blank).
cell_text.append(['Model', np.mean(mb_accs), None])

table = tabulate(
    cell_text, floatfmt=".7f",
    headers=["Class", "Mean Acc", "Paired-t-test p-value"])

print(table)