In [1]:
# coding: utf-8

import numpy as np
import pandas as pd
import sys
import psutil

from sklearn.model_selection import train_test_split
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, History 

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Data fetch
def data_fetch(csv_name):
    """Load ``datasets/<csv_name>.csv`` into a DataFrame.

    Side effect: the raw column index is stored in the module-level
    ``train_columns`` so that ``save`` can pickle it later.

    Args:
        csv_name: file name without the ``.csv`` extension.

    Returns:
        The loaded DataFrame. On any read failure a message is printed and
        the process is terminated through ``flush_memory``.
    """
    global train_columns
    print("Reading the data...")
    csv_path = 'datasets/{}.csv'.format(csv_name)
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print("The csv is not present in the datasets folder. \nExiting the script...")
        flush_memory()
    except Exception as e:
        # Parse/encoding/permission problems used to be reported as a missing
        # file; surface the real cause before terminating.
        print("Could not read {}: {}\nExiting the script...".format(csv_path, e))
        flush_memory()
    else:
        train_columns = df.columns
        return df

# Pre Processing
def get_string_cols(df):
    """Return, as a list, the names of the object- and category-typed columns of ``df``."""
    text_like = df.select_dtypes(include=['object','category'])
    return [name for name in text_like.columns]

def get_num_cols(df):
    """Return, as a list, the names of the signed-int and float columns of ``df``.

    Only the widths listed below are matched; other numeric dtypes
    (for example unsigned integers) are not selected.
    """
    wanted_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    return df.select_dtypes(include=wanted_dtypes).columns.tolist()

def pre_process(df,label_col):
    """One-hot encode the categorical features of ``df`` and re-attach the label.

    Categorical features are the object/category columns plus any numeric
    column whose name contains ``':string'``. After encoding, the
    ``':string'`` marker is removed from the column names.

    Side effect: the encoded column index, captured *before* the marker is
    stripped, is stored in the module-level ``training_dummy_columns``.

    Args:
        df: input frame that includes ``label_col``.
        label_col: name of the target column.

    Returns:
        The encoded feature frame with ``label_col`` appended as last column.
    """
    global training_dummy_columns
    print("Pre-Processing the data...")
    target = df[label_col]
    features = df.drop(label_col,axis=1)
    text_cols = get_string_cols(features)
    marker = ':string'
    # Numeric columns tagged with the marker are treated as categories.
    tagged_numeric = [name for name in get_num_cols(features) if marker in name]
    for name in tagged_numeric:
        features[name] = features[name].astype('object')
    features = pd.get_dummies(features,columns=text_cols + tagged_numeric)
    training_dummy_columns = features.columns
    features.columns = features.columns.str.replace(':string','')
    features[label_col] = target
    return features

# Train Test Split
def split_train_test(df,label_col,test_size=0.2):
    """Split ``df`` into train/test features and labels with a fixed seed (42).

    Returns:
        ``(train_x, test_x, train_y, test_y)``.
    """
    is_feature = df.columns != label_col
    features = df.loc[:,is_feature]
    target = df[label_col]
    parts = train_test_split(features, target, test_size=test_size, random_state=42)
    return tuple(parts)

# Model Training
def get_search_space():
    """Build the hyperopt search space for the dense regression network.

    ``nb_epochs`` is a plain constant; every other entry is a hyperopt
    expression. ``params_of_space`` lists the same options and is used to
    decode the indices hyperopt records for the ``hp.choice`` entries.
    """
    layer_widths = [32, 64, 128, 256,512]
    space = dict()
    space['num_layers'] = hp.choice('num_layers',['one_hidden', 'two_hidden'])
    space['units1'] = hp.choice('units1', list(layer_widths))
    space['units2'] = hp.choice('units2', list(layer_widths))
    space['dropout1'] = hp.uniform('dropout1', .25,.75)
    space['dropout2'] = hp.uniform('dropout2',  .25,.75)
    space['batch_size'] = hp.choice('batch_size', [16,32,64,128])
    space['nb_epochs'] = 500
    space['optimizer'] = hp.choice('optimizer',['rmsprop', 'adam', 'nadam','sgd'])
    space['activation'] = hp.choice('activation',['relu','sigmoid'])
    space['early_stop_rounds'] = hp.choice('early_stop_rounds',[10,20,30,40,50])
    return space

def data(csv_name,label_col,num_features):
    """Load, pre-process and split the dataset into train/validation/test.

    The first split holds out 20% for testing; the remainder is split again
    with ``test_size=0.25`` to obtain the validation set. Both splits use
    ``random_state=42``.

    ``num_features`` is currently unused: it belonged to an RFE
    feature-selection step (``rfe_feat_selection``) that is disabled.

    Returns:
        ``(raw_df, x_train, x_test, x_valid, y_train, y_test, y_valid)``
        where ``raw_df`` is the frame exactly as read from disk.
    """
    raw_df = data_fetch(csv_name)
    processed_df = pre_process(df=raw_df,label_col=label_col)
    x_fit, x_test, y_fit, y_test = split_train_test(df=processed_df,label_col=label_col)
    x_train, x_valid, y_train, y_valid = train_test_split(x_fit, y_fit, test_size=0.25, random_state=42)
    return raw_df, x_train, x_test, x_valid, y_train, y_test, y_valid

def create_model(params):
    """hyperopt objective: build, train and score one candidate network.

    Reads the module-level ``x_train``, ``y_train``, ``x_valid`` and
    ``y_valid`` set by ``get_best_model`` and increments the module-level
    trial counter ``num``.

    Args:
        params: one sample from ``get_search_space()``. ``nb_epochs`` is
            honoured when present and falls back to 500 otherwise.

    Returns:
        A hyperopt result dict with ``loss`` (validation RMSE, or ``np.inf``
        when the metric is NaN), ``status`` and the trained ``model``.
    """
    global num
    model = Sequential()
    model.add(Dense(params['units1'], input_shape=(x_train.shape[1],)))
    model.add(Activation(params['activation']))
    model.add(Dropout(params['dropout1']))
    if params['num_layers'] == 'two_hidden':
        model.add(Dense(params['units2']))
        model.add(Activation(params['activation']))
        model.add(Dropout(params['dropout2']))
    model.add(Dense(1))
    model.add(Activation('linear'))
    model.compile(loss='mse', metrics=['mse'],
                  optimizer=params['optimizer'])
    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=params['early_stop_rounds'])
    model.fit(x_train, y_train,
              batch_size=params['batch_size'],
              # Previously hard-coded to 500, silently ignoring the value
              # carried in the search space.
              epochs=params.get('nb_epochs', 500),
              callbacks=[early_stop],
              verbose=0,
              validation_data=(x_valid,y_valid))
    [loss, mse] = model.evaluate(x_valid,y_valid, verbose=0)
    mem = psutil.virtual_memory()
    if np.isnan(mse):
        print("{}) Validation set root mean sq. error: NaN".format(num),"\tAvailable Mem:",(mem.available/1024)/1024,"mb")
        num = num + 1
        # A diverged run is reported with an infinite loss so it is never
        # picked as the best trial.
        return {'loss': np.inf, 'status': STATUS_OK, 'model': model}
    print("{}) Validation set root mean sq. error: {:7.2f}".format(num,mse**0.5),"\tAvailable Mem:",(mem.available/1024)/1024,"mb")
    num = num + 1
    return {'loss': loss**0.5, 'status': STATUS_OK, 'model': model}

def get_best_model(csv_name,label_col,num_features,max_evals=1):
    """Run the hyperopt search and return the best model with its predictions.

    Side effect: publishes the data splits as module-level globals
    (``x_train``, ``x_test``, ``x_valid``, ``y_train``, ``y_test``,
    ``y_valid``) because ``create_model`` and ``display_results`` read them.

    Args:
        csv_name: dataset name passed to ``data``.
        label_col: target column name.
        num_features: forwarded to ``data`` (unused there at present).
        max_evals: number of hyperopt trials. Defaults to 1, the value that
            used to be hard-coded; raise it to actually search the space.

    Returns:
        ``(best_model, output_df, trials)``. ``output_df`` is the raw input
        joined on index with a ``predictions`` column.
    """
    global x_train, x_test, x_valid, y_train, y_test, y_valid
    input_df, x_train, x_test, x_valid, y_train, y_test, y_valid = data(csv_name=csv_name,label_col=label_col,num_features=num_features)
    trials = Trials()
    space = get_search_space()
    print("Selecting the best network architecture specifically for your data...")
    fmin(create_model, space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
    best_model = trials.best_trial['result']['model']
    # NOTE(review): only train and test rows are scored; validation rows are
    # dropped by the inner join below. Confirm this is intended.
    scaled_feature_df = pd.concat([x_train,x_test])
    pred_df = make_predictions(model=best_model,df=scaled_feature_df)
    output_df = pd.merge(input_df,pred_df['predictions'].to_frame(),left_index=True,right_index=True)
    return best_model, output_df, trials

# Make Predictions
def make_predictions(model,df):
    """Return a copy of ``df`` with the model output in a ``predictions`` column.

    The input frame is left untouched. Previously the column was written into
    the caller's frame, so passing a feature frame such as ``x_test`` would
    have added a column to it and changed its shape for later calls.

    Args:
        model: object exposing ``predict(df)`` that returns an array which
            flattens to one value per row.
        df: feature frame to score.

    Returns:
        A new DataFrame: ``df`` plus the ``predictions`` column.
    """
    predictions = model.predict(df).flatten()
    scored = df.copy()
    scored['predictions'] = predictions
    return scored

# Displaying Result
def display_results(best_model,output_df,label_col):
    """Print split sizes and test-set error, and return ``(rmse, pct_error)``.

    Reads the module-level ``x_train``, ``x_test`` and ``y_test``. The model
    is compiled with an MSE loss, so the square root of the evaluated loss is
    the RMSE. ``pct_error`` is that RMSE relative to the mean of
    ``output_df[label_col]``, expressed in percent.
    """
    banner = "#####################################################"
    print(banner)
    print("Results:")
    print("Training Size: {} rows".format(x_train.shape[0]))
    print("Testing Size: {} rows".format(x_test.shape[0]))
    test_loss, _test_metric = best_model.evaluate(x_test,y_test,verbose=0)
    rmse = test_loss**0.5
    label_mean = output_df[label_col].mean()
    pct_error = (rmse / label_mean)*100
    print("RMSE on the test data: ",rmse)
    print("Percent error on the test data: ", pct_error, "%")
    print(banner)
    return rmse, pct_error

# Save the model and update the config db
def update_config_db(csv_name,label_col,rmse,pct_error,model_type):
    """Insert one row describing the trained model into ``config_table``.

    Reads the module-level ``selected_cols`` for the feature list.

    Connection settings default to the values that used to be hard-coded and
    can be overridden through the environment variables ``CONFIG_DB_HOST``,
    ``CONFIG_DB_USER``, ``CONFIG_DB_PASSWORD`` and ``CONFIG_DB_NAME`` so that
    real credentials need not live in the notebook.

    The cursor and the connection are always closed, also when the insert
    fails.
    """
    import mysql.connector
    class NumpyMySQLConverter(mysql.connector.conversion.MySQLConverter):
        """ A mysql.connector Converter that handles Numpy types """
        def _float32_to_mysql(self, value):
            return float(value)
        def _float64_to_mysql(self, value):
            return float(value)
        def _int32_to_mysql(self, value):
            return int(value)
        def _int64_to_mysql(self, value):
            return int(value)
    mydb = mysql.connector.connect(
        host=os.environ.get("CONFIG_DB_HOST", "localhost"),
        user=os.environ.get("CONFIG_DB_USER", "root"),
        passwd=os.environ.get("CONFIG_DB_PASSWORD", "root"),
        database=os.environ.get("CONFIG_DB_NAME", "configdb"))
    try:
        mydb.set_converter_class(NumpyMySQLConverter)
        mycursor = mydb.cursor()
        try:
            insert_query = "INSERT INTO config_table (model_name, model_type,list_of_features,target_column,RMSE,Percent_Error) VALUES (%s, %s, %s, %s, %s, %s)"
            values = (csv_name,model_type,str(selected_cols),label_col,rmse,pct_error)
            mycursor.execute(insert_query, values)
            mydb.commit()
            print(mycursor.rowcount, "record inserted for", csv_name, "in the config database.")
        finally:
            mycursor.close()
    finally:
        mydb.close()

def check_duplicacy(csv_name):
    """Terminate the script if a model named ``csv_name`` is already registered.

    The lookup uses a bound parameter instead of string formatting, so a
    name containing quotes can neither break nor alter the query. The cursor
    and the connection are closed before the result is acted upon.

    Connection settings default to the values that used to be hard-coded and
    can be overridden through ``CONFIG_DB_HOST``, ``CONFIG_DB_USER``,
    ``CONFIG_DB_PASSWORD`` and ``CONFIG_DB_NAME``.
    """
    import mysql.connector
    mydb = mysql.connector.connect(
        host=os.environ.get("CONFIG_DB_HOST", "localhost"),
        user=os.environ.get("CONFIG_DB_USER", "root"),
        passwd=os.environ.get("CONFIG_DB_PASSWORD", "root"),
        database=os.environ.get("CONFIG_DB_NAME", "configdb"))
    try:
        mycursor = mydb.cursor()
        try:
            duplicate_query = "SELECT model_name FROM config_table WHERE model_name = %s"
            mycursor.execute(duplicate_query, (csv_name,))
            myresult = mycursor.fetchall()
        finally:
            mycursor.close()
    finally:
        mydb.close()
    if myresult:
        print("Model for that csv already exists. Please change the csv name.")
        print("Exiting the script...")
        flush_memory()
    
def save(csv_name,label_col,best_model,output_df,rmse, pct_error,model_type):
    """Persist the model, its predictions and the column metadata, then register it.

    Writes ``models/<csv_name>.h5``, the predictions CSV under
    ``training_predictions/`` and three pickles under ``models/`` holding the
    module-level ``train_columns``, ``training_dummy_columns`` and
    ``selected_cols``. Finally a row is inserted through
    ``update_config_db``. The target directories must already exist.
    """
    import pickle

    def _pickle_to(template, payload):
        # Serialise one object next to the model file.
        with open(template.format(csv_name), 'wb') as f:
            pickle.dump(payload, f)

    print("Saving the model...")
    model_path = "models/{}.h5".format(csv_name)
    best_model.save(model_path)

    print("Saving the output predictions...")
    predictions_path = "training_predictions/{}_predictions.csv".format(csv_name)
    output_df.to_csv(predictions_path)

    print("Pickling necessary data...")
    _pickle_to('models/{}_columns.pkl', train_columns)
    _pickle_to('models/{}_dummy_columns.pkl', training_dummy_columns)
    _pickle_to('models/{}_selected_columns.pkl', selected_cols)

    print("Updating the config database...")
    update_config_db(csv_name=csv_name,label_col=label_col,rmse=rmse,pct_error=pct_error,model_type=model_type)
    print("Model and the config information has been saved.")
    
def monitor_ram(threshold, poll_interval=1.0):
    """Watch available RAM and kill the process once it drops to ``threshold``.

    Args:
        threshold: minimum number of available bytes tolerated.
        poll_interval: seconds to sleep between two readings. The loop used
            to spin without pausing, keeping one CPU core busy for the whole
            training run.

    The function only returns by terminating the process via
    ``flush_memory``.
    """
    import time

    print("Monitoring RAM...")
    mem = psutil.virtual_memory()
    print("Initial RAM available",(mem.available/1024)/1024,"mb")
    if mem.available <= threshold:
        # Already below the limit at start-up: terminate without a message,
        # as before.
        flush_memory()
        return
    while psutil.virtual_memory().available > threshold:
        time.sleep(poll_interval)
    print("Overflow...")
    print("RAM is full. Please upgrade your machine.")
    print("Exiting the script...")
    flush_memory()

        
def flush_memory():
    """Terminate the current process immediately.

    Despite the name, nothing is flushed: the process is killed so that the
    operating system reclaims its memory. ``psutil.Process.kill`` replaces
    the former ``kill -KILL`` shell command, which needed a shell and a
    ``kill`` binary on the machine.
    """
    psutil.Process(os.getpid()).kill()


def start_thread(threshold):
    """Start ``monitor_ram(threshold)`` on a background thread.

    The thread is deliberately left non-daemonic (the daemon flag was
    disabled by the author), so the process only ends when
    ``flush_memory`` kills it.
    """
    import threading
    watcher = threading.Thread(target=monitor_ram, args=(threshold,))
#     watcher.daemon = True
    watcher.start()

# Driver function
def driver(csv_name,label_col,num_features,model_type):
    """Script entry point: train, report, optionally save, then terminate.

    Starts the RAM watchdog (150 MiB threshold), runs the model search,
    prints the test metrics and asks whether the model should be saved.
    The process is always ended through ``flush_memory``.
    """
    threshold = 150 * 1024 * 1024
    start_thread(threshold)
    global num, selected_cols
    num = 1
#     check_duplicacy(csv_name)
    # get_best_model returns three values; unpacking only two raised
    # "too many values to unpack" before any result could be shown.
    best_model, output_df, _trials = get_best_model(csv_name=csv_name,label_col=label_col,num_features=num_features)
    # save() and update_config_db() read the module-level selected_cols,
    # which was only ever assigned interactively in the notebook.
    selected_cols = x_train.columns
    rmse, pct_error = display_results(best_model=best_model, output_df=output_df,label_col=label_col)
    while True:
        save_model = input("Do you want to save the model? y/n - ").strip(" ")
        if save_model in ['Y','y','N','n']:
            break
        print("Please enter a valid input.")
    if save_model in ['Y','y']:
        save(csv_name=csv_name,label_col=label_col,best_model=best_model, output_df=output_df,rmse=rmse, pct_error=pct_error,model_type=model_type)
    else:
        print("Exiting the script...")
    flush_memory()

Using TensorFlow backend.


In [2]:
# Reset the trial counter printed by create_model, then run the search.
# (A `global` statement at module level has no effect, so it is omitted.)
num = 1
best_model, output_df, trials = get_best_model(
    csv_name='train_v21',
    label_col='SalePrice',
    num_features=None,
)

Reading the data...
Pre-Processing the data...
Selecting the best network architecture specifically for your data...
1) Validation set root mean sq. error: 195322.77 	Available Mem: 24781.9296875 mb


In [3]:
# Dataset name (without extension) and target column, reused by later cells.
csv_name='train_v21'
label_col='SalePrice'

In [11]:
# Store the best model alongside the other LIME analysis artefacts.
best_model.save('lime_analysis_data/{}.h5'.format('train_v21'))

# Get parameters for best model from trials

In [4]:
# Raw values recorded by hyperopt for the best trial. Each entry is a
# one-element list; the cells below read element [0] and, for the hp.choice
# parameters, use it as an index into the option lists.
param_dict = trials.best_trial['misc']['vals']

In [5]:
def params_of_space():
    """Return the option lists behind every ``hp.choice`` of the search space.

    The lists must stay in the same order as in ``get_search_space``, because
    the indices recorded by hyperopt are decoded by position.

    Returns:
        ``(num_layers, dense1, dense2, batch_size, optimizer, activation,
        early_stop_rounds)``, each a list of options.
    """
    layer_widths = (32, 64, 128, 256,512)
    return (
        ['one_hidden', 'two_hidden'],
        list(layer_widths),
        list(layer_widths),
        [16,32,64,128],
        ['rmsprop', 'adam', 'nadam','sgd'],
        ['relu','sigmoid'],
        [10,20,30,40,50],
    )

In [6]:
# Option lists used to turn hyperopt's choice indices back into values.
num_layers_1,dense1_1,dense2_1,batch_size_1,optimizer_1,activation_1,early_stop_rounds_1 = params_of_space()

In [7]:
# Work on a shallow copy so the raw hyperopt record stays available.
param_dict_final = param_dict.copy()

In [8]:
# Decode the best trial: hp.choice entries hold an index into the matching
# option list, while the dropout entries hold the sampled value itself.
param_dict_final['activation'] = activation_1[param_dict['activation'][0]]
param_dict_final['batch_size'] = batch_size_1[param_dict['batch_size'][0]]
param_dict_final['dropout1'] = param_dict['dropout1'][0]
param_dict_final['dropout2'] = param_dict['dropout2'][0]
param_dict_final['early_stop_rounds'] = early_stop_rounds_1[param_dict['early_stop_rounds'][0]]
param_dict_final['num_layers'] = num_layers_1[param_dict['num_layers'][0]]
param_dict_final['optimizer'] = optimizer_1[param_dict['optimizer'][0]]
param_dict_final['units1'] = dense1_1[param_dict['units1'][0]]
param_dict_final['units2'] = dense2_1[param_dict['units2'][0]]

In [9]:
param_dict_final

{'activation': 'sigmoid',
 'batch_size': 32,
 'dropout1': 0.270892159560297,
 'dropout2': 0.7429652047426645,
 'early_stop_rounds': 20,
 'num_layers': 'one_hidden',
 'optimizer': 'nadam',
 'units1': 512,
 'units2': 32}

In [10]:
# Bundle the data splits, names and decoded hyper-parameters into one dict
# so that they can be pickled together.
# NOTE(review): the key 'cvsname' looks like a typo for 'csvname' -- confirm
# with whatever reads the pickle before renaming it.
Info_dic = {}
Info_dic['xtest'] = x_test
Info_dic['xtrain'] = x_train
Info_dic['xval'] = x_valid
Info_dic['cvsname'] = csv_name
Info_dic['ytest'] = y_test
Info_dic['ytrain'] = y_train
Info_dic['yval'] = y_valid
Info_dic['classname'] = label_col
Info_dic['params_dict'] = param_dict_final

In [11]:
Info_dic.keys()

dict_keys(['xtest', 'xtrain', 'xval', 'cvsname', 'ytest', 'ytrain', 'yval', 'classname', 'params_dict'])

In [145]:
# Persist the bundle built above for the LIME analysis.
import pickle
with open("lime_analysis_data/complete_data.pkl", 'wb') as handle:
    pickle.dump(Info_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)

# LIME

In [12]:
# Feature names handed to LIME; save() and update_config_db() also read this
# module-level name.
selected_cols = x_train.columns

In [15]:
# Renumber the validation rows 0..n-1 in place (y_valid keeps its original
# index).
x_valid.reset_index(drop=True, inplace=True)

In [19]:
# Build the LIME explainer on the training features, in regression mode.
# The commented-out lines are earlier experiments; the last one refers to a
# name `model` that is not defined in this notebook.
import lime
import lime.lime_tabular
import numpy as np
#explainer = lime.lime_tabular.LimeTabularExplainer(np.array(x_train), feature_names=selected_cols[:383])
explainer = lime.lime_tabular.LimeTabularExplainer(np.array(x_train), feature_names=selected_cols, class_names=[], verbose=True, mode='regression')
# i = np.random.randint(0, np.array(x_valid).shape[0])
# exp = explainer.explain_instance(x_valid.loc[10,:], predict, num_features=383)
# exp = explainer.explain_instance(x_valid.loc[0].values, model.predict, num_features=100)

NameError: name 'model' is not defined

In [40]:
def _parse_lime_condition(description):
    """Split one LIME feature description into ``(feature_name, bounds)``.

    Recognised shapes are ``"low < name <= high"``, ``"low <= name < high"``
    and ``"name OP value"`` with OP one of ``<``, ``<=``, ``>``, ``>=``.
    ``bounds`` maps ``'grt'``, ``'grt_and_eql'``, ``'less'`` or
    ``'less_and_eql'`` to the parsed float.

    Returns ``None`` when no comparison operator is present and raises
    ``ValueError`` when a bound is not a number.
    """
    lt = description.find(' < ')
    le = description.find(' <= ')
    if lt != -1 and le != -1:
        if lt < le:
            # "low < name <= high"
            return description[lt + 3:le], {
                'grt': float(description[:lt]),
                'less_and_eql': float(description[le + 4:]),
            }
        # "low <= name < high" (the original branch for this was unreachable)
        return description[le + 4:lt], {
            'grt_and_eql': float(description[:le]),
            'less': float(description[lt + 3:]),
        }
    single_operators = (
        (' < ', 'less'),
        (' <= ', 'less_and_eql'),
        (' > ', 'grt'),
        (' >= ', 'grt_and_eql'),
    )
    for token, key in single_operators:
        at = description.find(token)
        if at != -1:
            return description[:at], {key: float(description[at + len(token):])}
    return None


def featur_im(explainer, x_valid, predict_fn=None, num_features=383):
    """Explain every row of ``x_valid`` with LIME and tabulate the results.

    Args:
        explainer: object exposing ``explain_instance(row, predict_fn,
            num_features=...)`` whose result offers ``predicted_value``,
            ``local_pred`` and ``as_list()``.
        x_valid: feature frame; rows are taken by position, so the index
            does not have to be reset beforehand.
        predict_fn: prediction callable handed to the explainer. Defaults to
            the module-level ``predict``.
        num_features: number of features requested per explanation
            (default 383, the value that used to be hard-coded).

    Returns:
        Three transposed DataFrames ``(df_int, df_lcl_right, df_val)``:
        * ``df_int``: row ``feature_name`` plus one ``intensity<i>`` row per
          instance holding the LIME weight of each feature (0 if absent).
        * ``df_lcl_right``: rows ``prediction local`` and ``right``.
        * ``df_val``: per instance the rows ``<f<i>`` (strict lower bound),
          ``<=f<i>`` (inclusive lower bound), ``<i>f<`` (strict upper bound)
          and ``<i>f<=`` (inclusive upper bound); ``feature_name`` follows
          the rows of instance 0.

    A description that cannot be parsed, or that names an unknown feature,
    is skipped on its own; previously the first such entry silently dropped
    every remaining feature of that instance.
    """
    if predict_fn is None:
        predict_fn = predict
    name_pos = list(x_valid.columns)
    position = {}
    for pos, name in enumerate(name_pos):
        # Keep the first occurrence, like list.index did.
        position.setdefault(name, pos)
    width = len(name_pos)

    right_vl = []
    predict_local = []
    intensity_cols = {'feature_name': name_pos}
    value_cols = {}
    for im in range(x_valid.shape[0]):
        print('---------------',im,'---------------')
        exp = explainer.explain_instance(x_valid.iloc[im], predict_fn, num_features=num_features)
        right_vl.append(float(exp.predicted_value))
        predict_local.append(float(exp.local_pred[0]))
        intensity = [0]*width
        bounds = {key: [0]*width for key in ('grt', 'grt_and_eql', 'less', 'less_and_eql')}
        for description, weight in exp.as_list():
            try:
                parsed = _parse_lime_condition(description)
            except ValueError:
                continue
            if parsed is None:
                continue
            feature, limits = parsed
            idx = position.get(feature)
            if idx is None:
                continue
            for key, value in limits.items():
                bounds[key][idx] = value
            intensity[idx] = weight
        intensity_cols['intensity'+str(im)] = intensity
        value_cols['<f'+str(im)] = bounds['grt']
        value_cols['<=f'+str(im)] = bounds['grt_and_eql']
        value_cols[str(im)+"f<"] = bounds['less']
        value_cols[str(im)+"f<="] = bounds['less_and_eql']
        if im == 0:
            value_cols['feature_name'] = name_pos
    # With no rows at all, still return well-formed (empty) tables.
    value_cols.setdefault('feature_name', name_pos)

    # Build each table once instead of inserting columns one by one.
    df_int = pd.DataFrame(intensity_cols)
    df_val = pd.DataFrame(value_cols)
    local_and_Rpredict = {'prediction local':predict_local,'right':right_vl}
    df_lcl_right = pd.DataFrame(local_and_Rpredict)
    return df_int.T,df_lcl_right.T,df_val.T

In [41]:
def predict(qc):
    """Prediction function handed to LIME: returns one value per input row.

    Runs the module-level ``best_model`` on ``qc`` and reshapes its output
    to a one-dimensional array whose length is the number of rows.
    """
    raw_output = best_model.predict(qc)
    n_rows = raw_output.shape[0]
    return raw_output.reshape(n_rows)

In [42]:
# Explain every validation row; a freshly re-indexed copy is passed so the
# rows are labelled 0..n-1.
df_int1,df_lcl_right,df_val = featur_im(explainer,x_valid=x_valid.reset_index(drop=True))

--------------- 0 ---------------
Intercept 342263.5056873229
Prediction_local [226266.88126175]
Right: 229647.48
--------------- 1 ---------------
Intercept -734.5956990241248
Prediction_local [222019.40482527]
Right: 199378.86
--------------- 2 ---------------
Intercept 71282.52029652814
Prediction_local [210807.18230684]
Right: 219710.56
--------------- 3 ---------------
Intercept 176547.1430182996
Prediction_local [192286.85155272]
Right: 193297.86
--------------- 4 ---------------
Intercept 179544.440478126
Prediction_local [146074.14177829]
Right: 156162.72
--------------- 5 ---------------
Intercept 345768.41995646374
Prediction_local [142381.29263007]
Right: 114518.49
--------------- 6 ---------------
Intercept 174312.97446556974
Prediction_local [213791.58317463]
Right: 225652.39
--------------- 7 ---------------
Intercept 188146.68941291625
Prediction_local [162328.34287617]
Right: 160729.67
--------------- 8 ---------------
Intercept 60717.12424938021
Prediction_local [33206

Intercept 351646.1937591959
Prediction_local [243153.89092061]
Right: 227932.28
--------------- 72 ---------------
Intercept 134000.30325201477
Prediction_local [137162.96025351]
Right: 109209.44
--------------- 73 ---------------
Intercept 69139.03599229397
Prediction_local [132433.13919792]
Right: 137124.27
--------------- 74 ---------------
Intercept 153636.34812641615
Prediction_local [228356.84695377]
Right: 201098.34
--------------- 75 ---------------
Intercept 102203.12219598284
Prediction_local [101493.86806629]
Right: 109452.266
--------------- 76 ---------------
Intercept 91201.69949470035
Prediction_local [133278.42205375]
Right: 135821.38
--------------- 77 ---------------
Intercept 39258.744124240126
Prediction_local [212472.74076812]
Right: 217067.33
--------------- 78 ---------------
Intercept 149538.5720869796
Prediction_local [263333.17116337]
Right: 235164.69
--------------- 79 ---------------
Intercept 231966.69600458952
Prediction_local [176799.37960864]
Right: 1871

Intercept 322345.3709104968
Prediction_local [194054.28705851]
Right: 175658.77
--------------- 143 ---------------
Intercept 350862.0864176149
Prediction_local [110473.08390663]
Right: 121816.47
--------------- 144 ---------------
Intercept 338837.87714914884
Prediction_local [82733.69430381]
Right: 74014.79
--------------- 145 ---------------
Intercept 280238.4504068827
Prediction_local [181122.82592189]
Right: 175010.48
--------------- 146 ---------------
Intercept 275363.22832001536
Prediction_local [120766.42769503]
Right: 110514.58
--------------- 147 ---------------
Intercept 44450.383097232814
Prediction_local [208067.36756814]
Right: 215534.83
--------------- 148 ---------------
Intercept -104777.92362925151
Prediction_local [193826.52326064]
Right: 174008.33
--------------- 149 ---------------
Intercept 206677.0486921756
Prediction_local [153838.6281358]
Right: 129590.0
--------------- 150 ---------------
Intercept 70635.4610807149
Prediction_local [208229.25204474]
Right: 20

Intercept 218818.80552022002
Prediction_local [211977.11060952]
Right: 198142.75
--------------- 214 ---------------
Intercept 60583.00606602545
Prediction_local [236838.24131377]
Right: 237776.53
--------------- 215 ---------------
Intercept 23179.24031341236
Prediction_local [248611.87083859]
Right: 268879.75
--------------- 216 ---------------
Intercept 66093.98879179783
Prediction_local [166554.45855155]
Right: 162919.23
--------------- 217 ---------------
Intercept 176952.54875605597
Prediction_local [141084.6064969]
Right: 118764.125
--------------- 218 ---------------
Intercept 275595.6253308348
Prediction_local [149695.42355731]
Right: 138285.06
--------------- 219 ---------------
Intercept 294347.2499950399
Prediction_local [152979.84817791]
Right: 144159.42
--------------- 220 ---------------
Intercept 126299.95745819011
Prediction_local [213322.78155261]
Right: 210456.72
--------------- 221 ---------------
Intercept 303799.9581684445
Prediction_local [109525.02503238]
Right:

Intercept 399181.17154678307
Prediction_local [174062.46774485]
Right: 181384.78
--------------- 285 ---------------
Intercept 173032.49682208546
Prediction_local [276606.41660397]
Right: 329613.12
--------------- 286 ---------------
Intercept 264013.35565642285
Prediction_local [85675.11789784]
Right: 91515.6
--------------- 287 ---------------
Intercept 97627.81007667362
Prediction_local [154088.75543606]
Right: 141341.42
--------------- 288 ---------------
Intercept 72792.84530588241
Prediction_local [165064.79231462]
Right: 156620.77
--------------- 289 ---------------
Intercept 177840.7033572353
Prediction_local [149934.95324762]
Right: 138217.7


# Saving results

In [141]:
# Collect the three tables returned by featur_im under descriptive keys.
results = {}
results['intensity_df'] = df_int1
results['right_local_pred_df'] = df_lcl_right
results['feature_value_df'] = df_val

In [142]:
# Persist the LIME tables; relies on `pickle` imported in an earlier cell.
with open("lime_analysis_data/results.pkl", 'wb') as handle1:
    pickle.dump(results, handle1, protocol=pickle.HIGHEST_PROTOCOL)

# Losses of the best model on the validation and test data

In [153]:
# The model is compiled with an MSE loss, so the square root of the
# evaluated loss is the validation RMSE.
loss_valid, mse_valid = best_model.evaluate(x_valid,y_valid)

rmse_valid = loss_valid ** 0.5

rmse_valid



34498.26341896267

In [152]:
# Same computation on the held-out test split.
loss_test, mse_test = best_model.evaluate(x_test,y_test)

rmse_test = loss_test ** 0.5

rmse_test



30371.172441377985