In [1]:
import argparse
import importlib
import json
import logging
import os
import pickle
import sys
import uuid
from logging import handlers

# Make the local DeepSurv checkout importable before `import deepsurv` below.
sys.path.append('/home/arash/VRdataCleaning/DeepSurv/')

import lasagne
import numpy as np
import optunity
from statsmodels.stats.outliers_influence import variance_inflation_factor

import deepsurv


importlib.reload(deepsurv)

from deepsurv import deep_surv, utils, viz

from deepsurv.deepsurv_logger import DeepSurvLogger, TensorboardLogger



import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
# Placeholder strings found in the 'Wait Time (s)' column for failed trials; those rows are dropped.
ERROR_CODES = ['Err1','Err2','Err3','Err4','Err5']

# Outcome column ('Wait Time (s)') plus every covariate kept for modelling.
# numwalk and VRexpnum are left out because of some false inputs in the data (should be fixed later).
MODEL_COLUMNS = ['Wait Time (s)','PET (s)','Distace to Collision Point','Speed Limit', 'Lane Width', 'Minimum Gap', 'Mean Arrival Rate', 'AV',
                 'Full Braking Before Impact_-1.0', 'Full Braking Before Impact_1',
                 'Full Braking Before Impact_2', 'Full Braking Before Impact_3', 'Snowy',
                 'One way', 'two way', 'Two way with median', 'Night', 'numcars',
                 'Age_9-12', 'Age_15-18', 'Age_12-15', 'Age_18 - 24', 'Age_25 - 29', 'Age_30 - 39', 'Age_40 - 49', 'Age_50 - 59', 'Age_60+', 'Gender_Female', 'Occupation_Employed', 'Occupation_Student', 'Occupation_Unemployed',
                 'Occupation_kid', 'Education_Bachelors degree', 'Education_College/University student',
                 'Education_Doctorate degree', 'Education_High school diploma', 'Education_Masters degree',
                 'Education_Professional degree', 'driving license_Yes', 'mode_Bike', 'mode_Car',
                 'mode_Public Transit', 'mode_Walking', 'workwalk_No', 'workwalk_Sometimes', 'workwalk_Yes',
                 'shopwalk_No', 'shopwalk_Sometimes', 'shopwalk_Yes', 'shopwalk_kid', 'Vrexp_Yes', 'Heart_Currently',
                 'Heart_Over the years', 'vision_Currently',
                 'vision_Over the years', 'anxiety_Currently', 'anxiety_Over the years', 'Headaches_Currently',
                 'Headaches_Over the years', 'dizziness_Over the years']

data = pd.read_pickle('ALLDATA.pkl')

# `~` is the boolean-mask negation operator; unary minus on a boolean mask is not.
data = data.loc[~data['Wait Time (s)'].isin(ERROR_CODES)]

# Keep only rows whose 'Age_9-12' value is 0 or 1.
data = data.loc[data['Age_9-12'].isin([0,1])]

data = data.loc[:, MODEL_COLUMNS]

# Anything that cannot be parsed as a number becomes NaN.
data = data.apply(pd.to_numeric, errors='coerce')

data['E'] = 1   # all pedestrians cross, so no right censored data

In [3]:
def dataframe_to_deepsurv_ds(df, event_col = 'E', time_col = 'Wait Time (s)'):
    """Split a DataFrame into the ``{'x', 'e', 't'}`` dict of numpy arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``event_col`` and ``time_col``; every other column is
        treated as a covariate.
    event_col : str
        Name of the event-indicator column.
    time_col : str
        Name of the observed-time column.

    Returns
    -------
    dict
        ``'x'``: float32 array of the covariates, in ``df`` column order with
        the event and time columns removed;
        ``'e'``: int32 array taken from ``event_col``;
        ``'t'``: float32 array taken from ``time_col``.

    Raises
    ------
    KeyError
        If ``event_col`` or ``time_col`` is not a column of ``df``.
    """
    # numpy is imported as `np` in this notebook; the bare name `numpy` is never bound.
    e = df[event_col].values.astype(np.int32)
    t = df[time_col].values.astype(np.float32)

    # Everything that is neither the event nor the time column is a covariate
    x_df = df.drop([event_col, time_col], axis = 1)
    x = x_df.values.astype(np.float32)

    return {
        'x' : x,
        'e' : e,
        't' : t
    }



In [None]:
#Model with specific hyper parameters

# DeepSurv.train consumes a dict of numpy arrays ('x' covariates, 'e' events, 't' times),
# not a DataFrame, so split `data` into that shape first.
train_data = {
    'x': data.drop(['E', 'Wait Time (s)'], axis=1).values.astype(np.float32),
    'e': data['E'].values.astype(np.int32),
    't': data['Wait Time (s)'].values.astype(np.float32)
}

hyperparams = {
    'L2_reg': 3.0,
    'batch_norm': False,
    'dropout': 0.02,
    'hidden_layers_sizes': [90,90,90],
    'learning_rate': 1e-05,
    'lr_decay': 0.001,
    'momentum': 0.9,
    # The input layer must be as wide as the covariate matrix it is fed
    'n_in': train_data['x'].shape[1],
    'standardize': True
}

# Create an instance of DeepSurv using the hyperparams defined above
model = deep_surv.DeepSurv(**hyperparams)

experiment_name = 'Wait Time analysis'
logdir = './logs/tensorboard/'
logger = TensorboardLogger(experiment_name, logdir=logdir)

# Now we train the model
# The type of optimizer to use.
# Check out http://lasagne.readthedocs.io/en/latest/modules/updates.html
# for other optimizers to use
update_fn = lasagne.updates.nesterov_momentum
n_epochs = 2000

# If you have validation data, you can add it as the second parameter to the function
metrics = model.train(train_data, n_epochs=n_epochs, logger=logger, update_fn=update_fn)

# Print the final metrics
print('Train C-Index:', metrics['c-index'][-1])
# print('Valid C-Index: ',metrics['valid_c-index'][-1])

# Plot the training / validation curves
viz.plot_log(metrics)

In [4]:
def load_logger(logdir):
    """Configure and return this module's logger.

    The logger is set to DEBUG and writes each record both to stdout and to a
    new file named ``log_<uuid4>`` inside ``logdir``.

    Calling the function again (for example by re-running the notebook cell)
    replaces the handlers installed earlier instead of stacking new ones, so
    each message is emitted once per destination.

    Parameters
    ----------
    logdir : str
        Directory for the log file; created if it does not exist.

    Returns
    -------
    logging.Logger
        The configured logger for ``__name__``.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    # getLogger returns the same object on every call; drop handlers left over
    # from a previous call so records are not duplicated.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # FileHandler opens the file immediately and fails if the directory is missing
    os.makedirs(logdir, exist_ok=True)

    # Print to Stdout
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # Print to Log file
    file_handler = logging.FileHandler(os.path.join(logdir, 'log_' + str(uuid.uuid4())))
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    return logger

In [5]:
def load_box_constraints(file):
    """Return the object decoded from the JSON document stored at ``file``."""
    with open(file, 'rb') as handle:
        return json.load(handle)

def save_call_log(file, call_log):
    """Pickle ``call_log`` into the file at path ``file``, replacing its contents."""
    with open(file, 'wb') as sink:
        pickle.Pickler(sink).dump(call_log)

In [6]:
def get_objective_function(num_epochs, logdir, update_fn = lasagne.updates.sgd):
    '''
    Returns the function for Optunity to optimize. The function returned by get_objective_function
    takes the parameters x_train and x_test (pandas DataFrames that hold the covariates together
    with the 'Wait Time (s)' and 'E' columns) and any additional kwargs to use as hyper-parameters.

    The objective function runs a DeepSurv model on the training data and evaluates it against the
    test set for validation. The result of the function call is the validation concordance index
    (which Optunity tries to optimize)

    Parameters:
        num_epochs: number of epochs every DeepSurv model is trained for.
        logdir: directory under which the Tensorboard logs are written
            (tensor_logs/<model_id>/<run_id>).
        update_fn: optimizer update function handed to DeepSurv.train.

    Notes:
        - 'n_in' must be one of the hyper-parameters. It is the number of top-ranked covariates
          (by the 'Importance' column of ReliefAvg.pkl) that are fed to the network.
        - The returned function reports every run through the module-level name `main_logger`,
          which therefore has to exist by the time the objective is called.
    '''
    relief_path = '/home/arash/VRdataCleaning/DeepSurv/deepsurv/ReliefAvg.pkl'

    def get_hyperparams(params):
        hyperparams = {
            'batch_norm': False,
            'activation': 'selu',
            'standardize': True
        }
        # @TODO add default parameters and only take necessary args from params
        # protect from params including some other key

        if 'num_layers' in params and 'num_nodes' in params:
            params['hidden_layers_sizes'] = [int(params['num_nodes'])] * int(params['num_layers'])
            del params['num_layers']
            del params['num_nodes']

        if 'learning_rate' in params:
            # The learning rate is searched on a log10 scale
            params['learning_rate'] = 10 ** params['learning_rate']

        if 'n_in' in params:
            # The solver proposes floats; n_in is used as a slice bound and a layer width
            params['n_in'] = int(params['n_in'])

        hyperparams.update(params)
        return hyperparams

    def train_deepsurv(x_train, x_test,
        **kwargs):
        hyperparams = get_hyperparams(kwargs)
        if 'n_in' not in hyperparams:
            raise KeyError("'n_in' (number of top-ranked covariates to use) must be "
                           "passed as a hyper-parameter")
        n_in = hyperparams['n_in']

        #select number of features
        ReliefAvg = pd.read_pickle(relief_path)
        top_covariates = (ReliefAvg.sort_values(['Importance'],ascending=False)
                          ['Covariate'][0:n_in].values)
        keep_cols = np.append(top_covariates, np.array(['Wait Time (s)','E']))

        # Keep the dataset dicts under their own names so 'x', 'e' and 't' all stay reachable
        train_ds = dataframe_to_deepsurv_ds(x_train[keep_cols], event_col = 'E', time_col= 'Wait Time (s)')
        test_ds = dataframe_to_deepsurv_ds(x_test[keep_cols], event_col = 'E', time_col= 'Wait Time (s)')

        # The ranking may hold fewer than n_in covariates; size the network to what is really fed in
        hyperparams['n_in'] = train_ds['x'].shape[1]

        # Standardize the datasets using the training statistics only
        train_mean = train_ds['x'].mean(axis = 0)
        train_std = train_ds['x'].std(axis = 0)
        # A covariate that is constant within the training fold has zero spread;
        # dividing by it would yield NaN/inf, so leave such columns merely centred.
        train_std[train_std == 0] = 1.0

        train_data = {
            'x': (train_ds['x'] - train_mean) / train_std,
            'e': train_ds['e'],
            't': train_ds['t']
        }
        valid_data = {
            'x': (test_ds['x'] - train_mean) / train_std,
            'e': test_ds['e'],
            't': test_ds['t']
        }

        # Set up Tensorboard loggers
        # TODO improve the model_id for Tensorboard to better partition runs
        model_id = str(hash(str(hyperparams)))
        run_id = model_id + '_' + str(uuid.uuid4())
        logger = TensorboardLogger('hyperparam_search',
            os.path.join(logdir,"tensor_logs", model_id, run_id))

        network = deep_surv.DeepSurv(**hyperparams)
        metrics = network.train(train_data, n_epochs = num_epochs, logger=logger,
            update_fn = update_fn, verbose = False)

        result = network.get_concordance_index(**valid_data)
        main_logger.info('Run id: %s | %s | C-Index: %f | Train Loss %f' % (run_id, str(hyperparams), result, metrics['loss'][-1][1]))
        return result

    return train_deepsurv

In [None]:
# Budget of the hyper-parameter search: epochs per trained model and cross-validation folds
NUM_EPOCHS, NUM_FOLDS = 100, 3
# Directory handed to the logger, the objective function and the call-log dump
logdir = '/home/arash/VRdataCleaning/logs'


In [None]:
# Module-level logger; the objective function looks this name up when it reports a run
main_logger = load_logger(logdir=logdir)


In [None]:
# Search ranges read from JSON; later unpacked as keyword arguments into optunity.maximize
box_constraints = load_box_constraints(
    file='/home/arash/VRdataCleaning/box_constraints.0.json'
)


In [None]:
# Build the objective: NUM_EPOCHS epochs per model, trained with the 'adam' update rule
opt_fxn = get_objective_function(
    num_epochs=NUM_EPOCHS,
    logdir=logdir,
    update_fn=utils.get_optimizer_from_str('adam'),
)

In [None]:
# Wrap the objective with optunity.cross_validated over `data` using NUM_FOLDS folds.
# NOTE: this rebinds opt_fxn to the wrapped function, so re-running this cell without first
# re-running the cell that creates opt_fxn wraps the already-wrapped function a second time.
# NOTE(review): `data` is a pandas DataFrame and the objective indexes x_train / x_test by
# column name - confirm that optunity's fold partitioning hands DataFrame slices to it.
opt_fxn = optunity.cross_validated(x=data, num_folds=NUM_FOLDS)(opt_fxn)

In [None]:
# Maximise the objective with the 'sobol' solver for 10 evaluations inside the box constraints
opt_params, call_log, _ = optunity.maximize(
    opt_fxn,
    num_evals=10,
    solver_name='sobol',
    **box_constraints
)

In [None]:
# Print the call log returned by optunity.maximize, converted to a dict via _asdict()
print(call_log._asdict())

In [None]:
# Persist the call log under logdir; the uuid4 suffix gives each run its own file name
save_call_log(
    os.path.join(logdir, 'optunity_log_%s.pkl' % uuid.uuid4()),
    call_log._asdict(),
)