In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [4]:
import sklearn
from sklearn.neighbors import KDTree

In [5]:
dire = 'mixnn_em_data/'
train_feat_zeros = np.loadtxt(dire + 'X_train_hist_zeros.txt')
train_labels_zeros = np.loadtxt(dire + 'Y_train_hist_zeros.txt')
print ("Loaded train_zeros")

val_feat_zeros = np.loadtxt(dire + 'X_val_hist_zeros.txt')
val_labels_zeros = np.loadtxt(dire + 'Y_val_hist_zeros.txt')
print ("Loaded val_zeros")

test_feat_zeros = np.loadtxt(dire + 'X_test_hist_zeros.txt')
test_labels_zeros = np.loadtxt(dire + 'Y_test_hist_zeros.txt')
print ("Loaded test_zeros")

Loaded train_zeros
Loaded val_zeros
Loaded test_zeros


In [6]:
print(val_feat_zeros.shape, val_labels_zeros.shape)

(24338, 198) (24338, 25)


In [7]:
inv_action_map = {}
for iv in range(5):
    for vaso in range(5):
        inv_action_map[5*iv+vaso] = [iv,vaso]    

In [8]:
# define an action mapping - how to get an id representing the action from the (iv,vaso) tuple
action_map = {}
count = 0
for iv in range(5):
    for vaso in range(5):
        action_map[(iv,vaso)] = count
        count += 1

In [9]:
df = pd.read_csv('../data/rl_train_data_final_cont.csv')
val_df = pd.read_csv('../data/rl_val_data_final_cont.csv')
test_df = pd.read_csv('../data/rl_test_data_final_cont.csv')

In [10]:
train_feat = train_feat_zeros
train_labels = train_labels_zeros

val_feat = val_feat_zeros
val_labels = val_labels_zeros

test_feat = test_feat_zeros
test_labels = test_labels_zeros

In [11]:
# the features we want to use for evaluating distance: Arterial_lactate, output_4h, meanbp, diabp,
# chloride, paO2 fiO2, hb, adm_weight, age, sofa.
# the ones loaded in the numpy arrays are scaled to zero mean, unit variance (so they're at the same scale)
# we have data for t, t-1, t-2, t-3. have a time weighting: geometric series with ratio r_time
# Weight the features according to this mapping: Arterial_lactate: 1, sofa:1, output4h: 1, *bp:1, chloride: 0.7, 
# pa02fi02: 0.7, hb: 0.7, adm_weight: 0.7, age: 0.7 
# TGet the indices into the array and construct the multiplication factor for the euclidean distance
# extract the rel components. Get pairwise squared distance. Multiply by fixed weighting array

In [12]:
num_features_for_dist = 10
hist = 3
time_weight_arr = np.ones((hist+1) * num_features_for_dist)
r_time = 0.5
time_weight_arr[num_features_for_dist:] *= 0.5
time_weight_arr[2*num_features_for_dist:] *= 0.5
time_weight_arr[3*num_features_for_dist:] *= 0.5

In [13]:
# indices: [lactate, sofa, output4h, meanbp, diabp, chloreide, pa02fi02, hb, weight, age]
rel_feat_indices  = [152, 179, 192, 169, 159, 157, 172, 165, 187, 188]
# feat_weighting = np.array([10, 0.7, 10, 5, 5, 0.7, 0.7, 0.7, 0.7, 0.7]*4)
feat_weighting = np.array([0, 10, 0, 0, 0, 0, 0, 0, 0, 0]*4)

In [14]:
rel_f_copy = rel_feat_indices.copy()
for i in range(1,4):
    new_arr = [j - 50*i for j in rel_f_copy]
    rel_feat_indices += new_arr 

In [15]:
rel_feat_indices = np.array(rel_feat_indices)

In [16]:
def physio_distance(arr1, arr2):
    rel_feat_distance = (arr1[rel_feat_indices] - arr2[rel_feat_indices])**2
    dist = np.sum(time_weight_arr*feat_weighting*rel_feat_distance)
    return dist

In [58]:
def physio_distance2(arr1, arr2):
    weights = np.ones(len(arr1))
    # weight sofa more
#     weights[179] = 5
    return np.sum((weights*(arr1 - arr2))**2)

In [80]:
# kdt = KDTree(train_feat[:100], leaf_size=30, metric='pyfunc', metric_params={"func":physio_distance})
# bt = sklearn.neighbors.BallTree(train_feat,leaf_size=40, metric=physio_distance)
# bt1 = sklearn.neighbors.BallTree(train_feat,leaf_size=10000, metric=physio_distance)
bt2 = sklearn.neighbors.BallTree(train_feat,leaf_size=100, metric=physio_distance2)

In [69]:
sampled_ids = np.random.choice(len(val_feat), 300)

In [73]:
# to assess the metric, iterate over the val set and get accuracy of labels
val_labels_argmaxed = np.argmax(val_labels, axis=1)
tot = len(val_labels_argmaxed)
correct = 0
dists, inds = bt1.query(val_feat[sampled_ids], k=100)
for i, sampled_id in enumerate(sampled_ids):
#     dist, ind = bt.query(val_x_.reshape(1,-1), k=100)
    dist = dists[i]
    ind = inds[i]
#     val_x_actions = np.argmax(train_labels[np.squeeze(ind)], axis=1)
    val_x_actions = np.argmax(train_labels[ind], axis=1)
    acs, counts = np.unique(val_x_actions, return_counts=True)
    emp_prob = 0.1*np.ones(25)
    emp_prob[acs] += counts
    pred = np.argmax(emp_prob)
    if pred == val_labels_argmaxed[sampled_id]:
        correct += 1
    if i % 1000 == 0 and i > 0:
        print("Count %d" % i)
print(correct)

53


In [81]:
# to assess the metric, iterate over the val set and get accuracy of labels
val_labels_argmaxed = np.argmax(val_labels, axis=1)
tot = len(val_labels_argmaxed)
correct = 0
# sampled_ids = np.random.choice(len(val_feat), 100)
dists, inds = bt2.query(val_feat[sampled_ids], k=50)
for i, sampled_id in enumerate(sampled_ids):
#     dist, ind = bt.query(val_x_.reshape(1,-1), k=100)
    dist = dists[i]
    ind = inds[i]
#     val_x_actions = np.argmax(train_labels[np.squeeze(ind)], axis=1)
    val_x_actions = np.argmax(train_labels[ind], axis=1)
    acs, counts = np.unique(val_x_actions, return_counts=True)
    emp_prob = 0.1*np.ones(25)
    emp_prob[acs] += counts
    pred = np.argmax(emp_prob)
    if pred == val_labels_argmaxed[sampled_id]:
        correct += 1
    if i % 1000 == 0 and i > 0:
        print("Count %d" % i)
print(correct)

164


In [85]:
def knn_run(leaf_size, k, dist, tot_ids,seed):
    rng = np.random.RandomState(seed)
    val_labels_argmaxed = np.argmax(val_labels, axis=1)
    correct = 0
    
    sampled_ids = rng.choice(len(val_feat), tot_ids)
    
    tree = sklearn.neighbors.BallTree(train_feat,leaf_size=leaf_size, metric=dist)
    
    dists, inds = tree.query(val_feat[sampled_ids], k=k)
    
    for i, sampled_id in enumerate(sampled_ids):
        dist = dists[i]
        ind = inds[i]
        val_x_actions = np.argmax(train_labels[ind], axis=1)
        acs, counts = np.unique(val_x_actions, return_counts=True)
        emp_prob = 0.1*np.ones(25)
        emp_prob[acs] += counts
        pred = np.argmax(emp_prob)
        if pred == val_labels_argmaxed[sampled_id]:
            correct += 1
    return correct

In [None]:
# investigate varying leaf size and k with euclidean distance
settings = [[50, 100],[100,100],[1000,100],[2000,100], [5000,100], [10000,100],
           [50, 200],[100,200],[1000,200],[2000,200], [5000,200], [10000,200],
           [50, 500],[100,500],[1000,500],[2000,500], [5000,500], [10000,500]]
seeds = [0,5,10]

tot_ids = 200

for setting in settings:
    tot_cor = 0
    for seed in seeds:
        leaf_size, k = setting
        dist = physio_distance2
        correct = knn_run(leaf_size, k, dist, tot_ids,seed)
        print ("Correct: %d"% (correct,))
        tot_cor += correct
    frac_correct = tot_cor / (len(seeds)*tot_ids)
    print("Leaf size %d , k %d, average correct %f" % (leaf_size, k, frac_correct))

Correct: 113
Correct: 107
Correct: 107
Leaf size 50 , k 100, average correct 0.545000
Correct: 113
Correct: 107
Correct: 107
Leaf size 100 , k 100, average correct 0.545000
