In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
import os
import plotly
#from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.offline as py 
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

from builder import load_surv_samples

import theano
import theano.tensor as T

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import itemfreq

from sklearn.model_selection import train_test_split


 This call to matplotlib.use() has no effect
because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.


Using TensorFlow backend.


## Data:

In [4]:
data_x, data_y = load_surv_samples('simulated_data/linear_5K.pkl', sort=False)

train_x, test_x, train_y, test_y = train_test_split(
        data_x, data_y, test_size=0.2, random_state=75 )

train_x, val_x, train_y, val_y = train_test_split(
        train_x, train_y, test_size=0.25, random_state=75 )

print("Train split: %d; Val split: %d; Test split: %d." % (train_x.shape[0],val_x.shape[0],test_x.shape[0]))

Loading data from: simulated_data/linear_5K.pkl
Train split: 3000; Val split: 1000; Test split: 1000.


In [5]:
print( test_x.shape )
print( test_y.shape )

(1000, 5)
(1000, 4)


### Partial likelihood [ Theano ]

In [6]:
def partial_likelihood_th(y_true, y_pred):
    sort_idx = np.argsort( y_true[:,1] )[::-1]

    risk    = y_pred[sort_idx]
    events  = y_true[:,2][sort_idx]

    hazard_ratio = T.exp(risk)
    log_cum_risk = T.log(T.extra_ops.cumsum(hazard_ratio))
    uncencored_likelihood = risk.T - log_cum_risk
    censored_likelihood = uncencored_likelihood * events
    neg_likelihood = -T.sum( censored_likelihood )

    return neg_likelihood

### Partial likelihood [ NumPy ]

In [7]:
def partial_likelihood_np(y_true, y_pred):
    # Sort by time: longest -> shortest  i.g. [4,3,2,1]
    sort_idx = np.argsort( y_true[:,1] )[::-1]

    risk    = y_pred[sort_idx]
    events  = y_true[:,2][sort_idx]

    hazard_ratio = np.exp(risk)
    log_cum_risk = np.log(np.cumsum(hazard_ratio))
    uncencored_likelihood = risk - log_cum_risk
    censored_likelihood = uncencored_likelihood * events
    neg_likelihood = -np.sum( censored_likelihood )

    return neg_likelihood

### Efron [ NumPy ]

In [257]:
def efron_estimator_np_vec(y_true, y_pred):
    sort_idx = np.argsort( y_true[:,1] )[::-1]
    
    risk          = y_pred[sort_idx]
    risk_exp      = np.exp(risk)
    events        = y_true[:,2][sort_idx]
    ftimes        = y_true[:,1][sort_idx]
    ftimes_cens   = ftimes * events
    
    unique        = np.unique(ftimes_cens, return_index=True, return_counts=True)
    unique_ftimes = np.trim_zeros( unique[0][::-1] )
    m = np.count_nonzero(unique_ftimes)
    
    E_ti     = np.zeros(m, dtype='int32')
    risk_phi = np.zeros(m, dtype='float32')
    cum_risk = np.zeros(m, dtype='float32')
    tie_phi  = np.zeros(m, dtype='float32')
    
    cum_sum = np.cumsum(risk_exp)
    
    for j in range(m):
        idx = np.logical_and(ftimes == unique_ftimes[j], events)
        E_ti[j]      = idx.sum()
        
        risk_phi[j]  = risk[idx].sum()
        tie_phi[j]   = risk_exp[idx].sum()
        
        cum_risk[j]  = cum_sum[ ftimes == unique_ftimes[j] ][-1]
        
    likelihood = 0.
    for j in range(m):
        J = np.linspace(start=0, stop=E_ti[j]-1, num=E_ti[j]) / np.float(E_ti[j])
        D_m = cum_risk[j] - J*tie_phi[j]
        likelihood += risk_phi[j] - np.log(D_m).sum()
        
    return np.negative(likelihood)

In [258]:
def efron_estimator_np(y_true, y_pred):
    sort_idx = np.argsort( y_true[:,1] )[::-1]
    
    # Sort & prepare:
    risk          = y_pred[sort_idx]
    risk_exp      = np.exp(risk)
    events        = y_true[:,2][sort_idx]
    ftimes        = y_true[:,1][sort_idx]

    # Initis:
    tie_count    = 0
    likelihood   = 0.
    cum_risk, risk_phi, tie_phi = 0., 0., 0. 
    
    # Iterate over samples in inverse-time-order:
    for i, (ti, ei) in list(enumerate(zip(ftimes, events))):
        cum_risk += risk_exp[i]
        
        if ei:
            risk_phi   += risk[i]
            tie_phi    += risk_exp[i]
            tie_count  += 1
            
        do_sum = (i == (ftimes.size - 1) and tie_count > 0) or \
                 (ftimes[i + 1] < ti and tie_count > 0)
        if  do_sum:
            # Diagnostic print #1
            #print_function('%f - '%(risk_phi))
            
            for l in range(tie_count):
                c    = l / float(tie_count)
                dm   = np.log(cum_risk - c * tie_phi)
                likelihood -= dm
                
                # Diagnostic print #2
                #print('  log(%f - %f * %f) ' % (cum_risk, c, tie_phi))

            likelihood += risk_phi
            
            # Diagnostic print #3
            #print( '%i) %f %f %f, %i' % (i, risk_phi, cum_risk, tie_phi, tie_count) )

            # Reset:
            tie_phi   = 0.
            risk_phi  = 0.
            tie_count = 0
        
    return np.negative(likelihood)

## Testing

In [259]:
# dim order in Y: 
# 0  1  2  3
# h, t, e, id
hs     = np.array([0]*4)
ts     = np.array([1,3,4,5])
es     = np.array([1,1,1,1])
id     = np.array([0]*4)
preds  = np.array([0.1,0.18,0.3,0.4])

ys_uncens = np.column_stack((hs,ts,es,id))

print( partial_likelihood_th(ys_uncens, preds).eval() )
print( partial_likelihood_np(ys_uncens, preds) )
print( efron_estimator_np_vec(ys_uncens, preds) )
print( efron_estimator_np(ys_uncens, preds) )

3.49821419494
3.49821419494
3.49821411527
3.49821419494


In [260]:
# dim order in Y: 
# 0  1  2  3
# h, t, e, id
hs     = np.array([.1]*9)
ts     = np.array([1,2,3,4,5,5,7,1,1])
es     = np.array([1,1,0,1,1,1,1,0,1])
id     = np.array([0]*9)
preds  = np.array([-0.4, -0.3, -0.2, -0.1,  0.0,  0.1,  0.2,  0.3,  0.4])

ys_uncens = np.column_stack((hs,ts,es,id))

print( partial_likelihood_th(ys_uncens, preds).eval() )
print( partial_likelihood_np(ys_uncens, preds) )
print( efron_estimator_np_vec(ys_uncens, preds) )
print( efron_estimator_np(ys_uncens, preds) )

9.76101946041
9.76101946041
9.85944477328
9.85944496367


In [None]:
N = 5
#s_idx = np.random.choice(1000, N)
y_true = test_y[s_idx,...]
y_pred = np.random.uniform(0.0, 0.1, N)

print( partial_likelihood_th(y_true, y_pred).eval() )
print( partial_likelihood_np(y_true, y_pred) )
print( efron_estimator_np_vec(y_true, y_pred) )