In [1]:
import numpy as np
import pandas as pd
import pickle
import psutil
import random as rn
import tensorflow as tf
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [2]:
import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, History 
from keras import backend as K

Using TensorFlow backend.


In [3]:
# Load the pickled object that holds the train/validation/test splits read below.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load files that come from a trusted source.
with open('lime_analysis_data/complete_data.pkl', 'rb') as handle:
    complete_data = pickle.load(handle)

In [4]:
# Load the pickled results object; its 'intensity_df' entry is analysed below.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load files that come from a trusted source.
with open('lime_analysis_data/results.pkl', 'rb') as handle:
    lime_dict = pickle.load(handle)

In [5]:
# Pull the feature (x_*) and target (y_*) splits out of the loaded data object.
x_train, x_test, x_val = (complete_data[name] for name in ('xtrain', 'xtest', 'xval'))
y_train, y_test, y_val = (complete_data[name] for name in ('ytrain', 'ytest', 'yval'))
# param_dict = complete_data['params_dict']

In [6]:
# Work on a copy so the frame stored inside lime_dict is not modified.
intensity_df = lime_dict['intensity_df'].copy()

# Analysis of Intensity df

In [7]:
# Promote the first row of the stored frame to column labels and keep the
# remaining rows as data. Reading from lime_dict (instead of the already
# processed `intensity_df`) keeps this cell idempotent: re-running it can no
# longer strip an additional data row and turn it into the header.
intensity_raw = lime_dict['intensity_df']
header = intensity_raw.iloc[0]
intensity_df = intensity_raw.iloc[1:].copy()
intensity_df.columns = header

In [8]:
# Transpose so that the former column labels become the row index.
intensity_df_trans = intensity_df.transpose()

In [9]:
# Row-wise total of absolute intensities. An existing `sum_of_intensities`
# column is excluded first, so re-running this cell does not fold the previous
# total into the new one.
intensity_df_trans['sum_of_intensities'] = (
    intensity_df_trans
    .drop('sum_of_intensities', axis=1, errors='ignore')
    .abs()
    .sum(axis=1)
)

In [10]:
# Order rows from the largest to the smallest summed absolute intensity.
intensity_df_trans = intensity_df_trans.sort_values(
    by=['sum_of_intensities'],
    ascending=False,
)

# Dictionary of column lists (one feature subset per experiment)

In [11]:
# Maps an experiment name to the list of feature columns used for that run.
list_of_cols = {}

In [12]:
# Baseline experiment: every column of x_train is kept.
list_of_cols['drop_none'] = x_train.columns.tolist()

# Drop features whose intensities are all zero

In [13]:
# Separate copy for the zero-intensity filter; intensity_df_trans stays as is.
intensity_df_trans_0 = intensity_df_trans.copy()

In [14]:
# Keep only the rows that contain at least one non-zero value.
has_nonzero = intensity_df_trans_0.ne(0).any(axis=1)
intensity_df_trans_0 = intensity_df_trans_0.loc[has_nonzero]

In [15]:
# Experiment that keeps only the rows which survived the non-zero filter.
list_of_cols['drop_0_columns'] = intensity_df_trans_0.index.tolist()

# Drop the lowest-ranked n% of features (ranked by summed absolute intensity)

In [16]:
# Separate copy for the percentage-based selection; intensity_df_trans stays as is.
intensity_df_trans_n = intensity_df_trans.copy()

In [17]:
# The frame is ordered by descending `sum_of_intensities`, so keeping the first
# `keep_count` index labels drops the lowest-ranked n% of rows.
total_features = intensity_df_trans_n.shape[0]
for n in range(5, 100, 5):
    # Integer ceiling of total_features * (100 - n) / 100. The float form
    # np.ceil(total * (1 - n / 100)) can keep one label too many, because e.g.
    # 1 - 70 / 100 evaluates to 0.30000000000000004.
    keep_count = -((-total_features * (100 - n)) // 100)
    list_of_cols['drop_{}_pct_columns'.format(n)] = list(intensity_df_trans_n.index[:keep_count])

# Training with selected columns

In [18]:
# Model Training
def get_search_space():
    """Return the hyperopt search space for the feed-forward regressor.

    The returned dict covers the number of hidden layers, the width and
    dropout rate of each hidden layer, batch size, optimizer, activation and
    early-stopping patience. ``nb_epochs`` is a fixed value, not a sampled one.
    """
    layer_widths = [32, 64, 128, 256, 512]
    dropout_low, dropout_high = .25, .75

    space = dict(
        num_layers=hp.choice('num_layers', ['one_hidden', 'two_hidden']),
        units1=hp.choice('units1', list(layer_widths)),
        units2=hp.choice('units2', list(layer_widths)),
        dropout1=hp.uniform('dropout1', dropout_low, dropout_high),
        dropout2=hp.uniform('dropout2', dropout_low, dropout_high),
        batch_size=hp.choice('batch_size', [16, 32, 64, 128]),
        nb_epochs=500,
        optimizer=hp.choice('optimizer', ['rmsprop', 'adam', 'nadam', 'sgd']),
        activation=hp.choice('activation', ['relu', 'sigmoid']),
        early_stop_rounds=hp.choice('early_stop_rounds', [10, 20, 30, 40, 50]),
    )
    return space

In [19]:
def create_model(params):
    """Hyperopt objective: build, train and score one feed-forward regressor.

    Training and validation data are read from the module-level globals
    ``x_train_temp``, ``y_train_temp``, ``x_valid_temp`` and ``y_valid_temp``.
    The global trial counter ``num`` is incremented on every call.

    Parameters
    ----------
    params : dict
        One sample of the search space. Keys read: ``units1``, ``units2``,
        ``dropout1``, ``dropout2``, ``num_layers``, ``activation``,
        ``optimizer``, ``batch_size``, ``early_stop_rounds`` and, optionally,
        ``nb_epochs`` (defaults to 500 when absent).

    Returns
    -------
    dict
        ``loss`` (validation RMSE, or ``np.inf`` when the validation MSE is
        NaN), ``status`` (always ``STATUS_OK``), the fitted ``model`` and the
        ``params`` that produced it.
    """
    global num

    model = Sequential()
    model.add(Dense(params['units1'], input_shape=(x_train_temp.shape[1],)))
    model.add(Activation(params['activation']))
    model.add(Dropout(params['dropout1']))
    if params['num_layers'] == 'two_hidden':
        model.add(Dense(params['units2']))
        model.add(Activation(params['activation']))
        model.add(Dropout(params['dropout2']))
    # Single linear output unit: the network is trained as a regressor.
    model.add(Dense(1))
    model.add(Activation('linear'))
    model.compile(loss='mse', metrics=['mse'],
                  optimizer=params['optimizer'])

    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=params['early_stop_rounds'])
    history = History()
    model.fit(x_train_temp, y_train_temp,
              batch_size=params['batch_size'],
              # Honour the epoch budget from the search space instead of a
              # second hard-coded copy of the same number.
              epochs=params.get('nb_epochs', 500),
              callbacks=[early_stop, history],
              verbose=0,
              validation_data=(x_valid_temp, y_valid_temp))
    loss, mse = model.evaluate(x_valid_temp, y_valid_temp, verbose=0)

    available_mb = (psutil.virtual_memory().available / 1024) / 1024
    trial_no = num
    num = num + 1

    if np.isnan(mse):
        # A diverged run is reported with infinite loss so it can never be
        # selected as the best trial.
        print("{}) Validation set root mean sq. error: NaN".format(trial_no), "\tAvailable Mem:", available_mb, "mb")
        return {'loss': np.inf, 'status': STATUS_OK, 'model': model, 'params': params}

    print("{}) Validation set root mean sq. error: {:7.2f}".format(trial_no, mse ** 0.5), "\tAvailable Mem:", available_mb, "mb")
    return {'loss': loss ** 0.5, 'status': STATUS_OK, 'model': model, 'params': params}

In [20]:
def get_best_model(label_col, max_evals=50):
    """Run a TPE hyperparameter search and return the best fitted model.

    Parameters
    ----------
    label_col : str
        Not used by this function; kept so existing callers keep working.
    max_evals : int, optional
        Number of configurations evaluated by ``fmin`` (default 50).

    Returns
    -------
    tuple
        ``(best_model, trials)``: the model stored in the result of the
        best trial, and the ``Trials`` object holding every evaluation.
    """
    trials = Trials()
    space = get_search_space()
    print("Selecting the best network architecture specifically for your data...")
    # fmin's return value is not needed: the fitted model is taken from the
    # result dict that create_model stored in `trials`.
    fmin(create_model, space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
    best_model = trials.best_trial['result']['model']
    return best_model, trials

In [21]:
# Passed to get_best_model() below; presumably the name of the target column — TODO confirm.
label_col = 'SalePrice'

In [None]:
import os
# NOTE(review): hash randomization is fixed when the interpreter starts, so
# setting PYTHONHASHSEED here only reaches child processes, not this running
# kernel — confirm whether the kernel is launched with it already set.
os.environ['PYTHONHASHSEED'] = '0'

# Seed NumPy's global random number generator so NumPy-generated random
# numbers start from a well-defined state.

np.random.seed(42)

# Seed Python's built-in `random` module (imported as `rn`) so core Python
# random numbers start from a well-defined state.

rn.seed(12345)

# Force TensorFlow to use a single thread (one intra-op, one inter-op).
# Multiple threads are a potential source of
# non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res

session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)

# tf.set_random_seed() gives random number generation in the TensorFlow
# backend a well-defined initial state.
# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(1234)

# Create a session on the default graph with the single-threaded config and
# register it as the session used by the Keras backend.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Validation/test RMSE of the best model found for each feature subset.
resultant_dict = {}
# Trial counter that create_model() reads and increments through `global num`.
num = 1
for n, (key, cols) in enumerate(list_of_cols.items()):
    print(n, key, len(cols))
    # create_model() reads these module-level names, so they are rebound for
    # every feature subset. (A `global` statement is a no-op at module level
    # and is therefore not needed here.)
    x_train_temp = x_train[cols].copy()
    y_train_temp = y_train.copy()
    x_valid_temp = x_val[cols].copy()
    y_valid_temp = y_val.copy()
    x_test_temp = x_test[cols].copy()
    y_test_temp = y_test.copy()

    best_model, output_trials = get_best_model(label_col)
    # evaluate() returns [loss, mse] with loss == MSE; the square root gives RMSE.
    val_rmse = best_model.evaluate(x_valid_temp, y_valid_temp, verbose=0)[0] ** 0.5
    test_rmse = best_model.evaluate(x_test_temp, y_test_temp, verbose=0)[0] ** 0.5
    resultant_dict[key] = [val_rmse, test_rmse]

    # The CSV is rewritten after every subset, so the results gathered so far
    # are always on disk.
    result_lime_analysis_1_param_opt = pd.DataFrame(resultant_dict).T
    result_lime_analysis_1_param_opt.columns = ['valid_rmse', 'test_rmse']
    result_lime_analysis_1_param_opt.to_csv('lime_analysis_data/result_lime_analysis_1_param_opt.csv')

0 drop_none 396
Selecting the best network architecture specifically for your data...
1) Validation set root mean sq. error: 42634.62 	Available Mem: 24241.828125 mb
2) Validation set root mean sq. error: NaN 	Available Mem: 24231.6484375 mb
3) Validation set root mean sq. error: NaN 	Available Mem: 24222.3515625 mb
4) Validation set root mean sq. error: 42504.20 	Available Mem: 24211.82421875 mb
5) Validation set root mean sq. error: NaN 	Available Mem: 24205.16796875 mb
6) Validation set root mean sq. error: NaN 	Available Mem: 24194.73828125 mb
7) Validation set root mean sq. error: NaN 	Available Mem: 24187.8984375 mb
8) Validation set root mean sq. error: 200034.77 	Available Mem: 24178.15234375 mb
9) Validation set root mean sq. error: 203549.09 	Available Mem: 24167.8671875 mb
10) Validation set root mean sq. error: 202024.36 	Available Mem: 24158.73046875 mb
11) Validation set root mean sq. error: 203512.22 	Available Mem: 24150.18359375 mb
12) Validation set root mean sq. erro