In [1]:
import numpy as np
import pandas as pd
import sys
import psutil

from sklearn.model_selection import train_test_split
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, History 

Using TensorFlow backend.


In [2]:
# Read iris.csv from the working directory; header=None means no row of the
# file is treated as column names (they are assigned in the next cell).
df = pd.read_csv('iris.csv',header=None)

In [3]:
# Name the five columns: four numeric measurements followed by the label column.
attributes = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
df.columns = attributes

# Detecting the number of classes

In [157]:
# Count of distinct labels in the 'class' column.
# NOTE(review): this cell's execution count (157) is out of order and the name
# is not used by any other visible cell — confirm it is still needed.
no_of_classes = df['class'].nunique()

In [5]:
# Class balance check: number of rows per label.
df['class'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: class, dtype: int64

In [6]:
# Distinct labels as a list; later passed as `label_cols` to split_train_test,
# where they select the one-hot label columns.
names_of_classes = list(df['class'].unique())

# Data Preprocessing

In [7]:
# One-hot encode the label column: one indicator column per class value.
# NOTE(review): sparse=True requests sparse-backed columns; for a 150-row frame
# this presumably brings no benefit — confirm downstream code accepts them.
Y = df['class']
dummy_Y = pd.get_dummies(Y,sparse=True)

In [8]:
# Attach the one-hot label columns to the original frame, aligning on the row index.
pre_processed_df = pd.merge(df,dummy_Y,left_index=True,right_index=True)

In [9]:
# Remove the original string label now that the one-hot columns are present.
# NOTE(review): in-place drop makes this cell non-idempotent — running it twice
# raises because 'class' is already gone.
pre_processed_df.drop(['class'],axis=1,inplace=True)

# Data Splitting

In [10]:
def split_train_test(df,label_cols,test_size=0.2):
    """Split a frame into train/test feature and label frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the label columns.
    label_cols : list
        Names of the label columns; every other column is used as a feature.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as the test-set proportion.

    Returns
    -------
    tuple
        ``(train_x, test_x, train_y, test_y)``; the split uses a fixed
        ``random_state=42``.
    """
    # Index.difference keeps every column that is not a label column.
    feature_cols = df.columns.difference(label_cols)
    features = df[feature_cols]
    labels = df[label_cols]
    return tuple(train_test_split(features, labels, test_size=test_size, random_state=42))

In [11]:
# Hold out 20% of the rows as the test set, then 25% of the remaining rows as
# the validation set (90/30/30 rows, as the shape cells below show).
x_train, x_test, y_train, y_test = split_train_test(df=pre_processed_df,label_cols=names_of_classes)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.25, random_state=42)

In [12]:
# Sanity check: feature matrix shapes for train / validation / test.
x_train.shape, x_valid.shape, x_test.shape

((90, 4), (30, 4), (30, 4))

In [13]:
# Sanity check: one-hot label shapes for train / validation / test.
y_train.shape, y_valid.shape, y_test.shape

((90, 3), (30, 3), (30, 3))

# Implementing Keras Model

In [14]:
def get_search_space():
    """Build the hyperopt search space for the dense network.

    Returns
    -------
    dict
        Maps hyper-parameter names to hyperopt expressions (``hp.choice`` /
        ``hp.uniform``), plus the fixed entry ``'nb_epochs': 500``.
    """
    layer_widths = [32, 64, 128, 256,512]
    space = {}
    # Architecture: one or two hidden layers, each with its own width and dropout rate.
    space['num_layers'] = hp.choice('num_layers',['one_hidden', 'two_hidden'])
    space['units1'] = hp.choice('units1', list(layer_widths))
    space['units2'] = hp.choice('units2', list(layer_widths))
    space['dropout1'] = hp.uniform('dropout1', .25,.75)
    space['dropout2'] = hp.uniform('dropout2',  .25,.75)
    # Training settings.
    space['batch_size'] = hp.choice('batch_size', [16,32,64,128])
    space['nb_epochs'] = 500
    space['optimizer'] = hp.choice('optimizer',['rmsprop', 'adam', 'nadam','sgd'])
    space['activation'] = hp.choice('activation',['relu','sigmoid'])
    space['early_stop_rounds'] = hp.choice('early_stop_rounds',[10,20,30,40,50])
    return space

In [81]:
def create_model(params):
    """Hyperopt objective: train one candidate network and score it on the validation set.

    Builds a dense softmax classifier from ``params``, fits it on the
    module-level ``x_train`` / ``y_train`` with early stopping on ``val_loss``,
    and evaluates it on ``x_valid`` / ``y_valid``.

    Parameters
    ----------
    params : dict
        One sample from the search space. Reads ``units1``, ``units2``,
        ``dropout1``, ``dropout2``, ``num_layers``, ``activation``,
        ``optimizer``, ``batch_size``, ``early_stop_rounds`` and, when present,
        ``nb_epochs`` (falls back to 500).

    Returns
    -------
    dict
        ``{'loss': ..., 'status': STATUS_OK, 'params': params}``. ``loss`` is
        the negated validation accuracy (so minimising it maximises accuracy),
        or ``np.inf`` when the accuracy is NaN.

    Notes
    -----
    Reads and increments the module-level trial counter ``num``, which must be
    defined before the search starts.
    """
    global num

    model = Sequential()
    model.add(Dense(params['units1'], input_shape=(x_train.shape[1],)))
    model.add(Activation(params['activation']))
    model.add(Dropout(params['dropout1']))
    if params['num_layers'] == 'two_hidden':
        model.add(Dense(params['units2']))
        model.add(Activation(params['activation']))
        model.add(Dropout(params['dropout2']))
    # Output layer: one softmax unit per one-hot label column.
    model.add(Dense(y_train.shape[1]))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'],
                  optimizer=params['optimizer'])

    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=params['early_stop_rounds'])
    terminate_nan = keras.callbacks.TerminateOnNaN()
    model.fit(x_train, y_train,
              batch_size=params['batch_size'],
              # Honour the epoch budget declared in the search space instead of
              # a second hard-coded copy of the same number.
              epochs=params.get('nb_epochs', 500),
              callbacks=[early_stop, terminate_nan],
              verbose=0,
              validation_data=(x_valid,y_valid))
    _, acc = model.evaluate(x_valid,y_valid, verbose=0)

    mem = psutil.virtual_memory()
    available_mb = (mem.available/1024)/1024
    if np.isnan(acc):
        print("{}) Validation set Accuracy: NaN".format(num),"\tAvailable Mem:",available_mb,"mb")
        # A NaN score must never be selected as the best trial.
        trial_loss = np.inf
    else:
        print("{}) Validation set Accuracy: {:7.2f}".format(num,acc*100),"\tAvailable Mem:",available_mb,"mb")
        trial_loss = -acc
    num = num + 1
    return {'loss': trial_loss, 'status': STATUS_OK, 'params': params}

In [80]:
def train_best_model(best_params):
    """Retrain the network described by ``best_params`` and return the fitted model.

    Uses the module-level ``x_train`` / ``y_train`` for fitting and
    ``x_valid`` / ``y_valid`` for early stopping on ``val_loss``.

    Parameters
    ----------
    best_params : dict
        Concrete hyper-parameter values. Reads ``units1``, ``units2``,
        ``dropout1``, ``dropout2``, ``num_layers``, ``activation``,
        ``optimizer``, ``batch_size``, ``early_stop_rounds`` and, when present,
        ``nb_epochs`` (falls back to 500).

    Returns
    -------
    keras.models.Sequential
        The compiled and fitted model.
    """
    print('Training the best selected model...')
    model = Sequential()
    model.add(Dense(best_params['units1'], input_shape=(x_train.shape[1],)))
    model.add(Activation(best_params['activation']))
    model.add(Dropout(best_params['dropout1']))
    if best_params['num_layers'] == 'two_hidden':
        model.add(Dense(best_params['units2']))
        model.add(Activation(best_params['activation']))
        model.add(Dropout(best_params['dropout2']))
    # Output layer: one softmax unit per one-hot label column.
    model.add(Dense(y_train.shape[1]))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'],
                  optimizer=best_params['optimizer'])

    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=best_params['early_stop_rounds'])
    model.fit(x_train, y_train,
              batch_size=best_params['batch_size'],
              # Use the epoch budget carried in the parameters rather than a
              # separate hard-coded value.
              epochs=best_params.get('nb_epochs', 500),
              callbacks=[early_stop],
              verbose=0,
              validation_data=(x_valid,y_valid))

    return model

In [144]:
# Lime functions
def imp_features_lime():
    """Run the LIME-based feature selection and return the chosen feature names.

    Builds a ``LimeTabularExplainer`` over the module-level ``x_train``,
    collects per-feature intensities for rows of ``x_valid`` via
    ``get_intensity_dfs`` and hands them to the interactive
    ``get_best_cols_lime`` prompt.

    Returns
    -------
    list
        Whatever ``get_best_cols_lime`` returns for the collected intensities.
    """
    import lime
    import lime.lime_tabular
    lime_explainer = lime.lime_tabular.LimeTabularExplainer(
        np.array(x_train),
        feature_names=x_train.columns,
        class_names=[],
        verbose=True,
        mode='classification',
    )
    intensities = get_intensity_dfs(lime_explainer,x_valid)
    return get_best_cols_lime(intensities)

def predict1(ip):
    """Return the class probabilities of the module-level ``best_model`` for ``ip``.

    Passed to LIME as the prediction function. ``predict`` is used instead of
    ``predict_proba`` because the latter was removed from ``Sequential`` in
    later Keras releases; for a softmax output layer both give the same
    probabilities. ``verbose=0`` keeps the call silent, as ``predict_proba``
    was by default.
    """
    return best_model.predict(ip, verbose=0)

def _feature_from_lime_condition(condition):
    """Extract the feature name from a LIME condition string, or None if it has no comparison.

    Handles ``'feat <op> value'`` (name on the left) and
    ``'low <op> feat <op> high'`` (name in the middle), where ``<op>`` is one
    of ``<``, ``<=``, ``>``, ``>=`` surrounded by single spaces.
    """
    import re
    parts = re.split(r' (?:<=|>=|<|>) ', condition)
    if len(parts) == 2:
        return parts[0]
    if len(parts) == 3:
        return parts[1]
    return None


def get_intensity_dfs(explainer,x_valid,num_instances=3):
    """Collect LIME feature weights for the first rows of ``x_valid``.

    Parameters
    ----------
    explainer : object
        Must provide ``explain_instance(row, predict_fn, num_features=...)``
        returning an object whose ``as_list()`` yields ``(condition, weight)``
        pairs.
    x_valid : pandas.DataFrame
        Rows to explain; only the first ``num_instances`` are used.
    num_instances : int, default 3
        How many leading rows of ``x_valid`` to explain.

    Returns
    -------
    pandas.DataFrame
        Transposed frame whose first row is ``'feature_name'`` (the column
        names of ``x_valid``) followed by one row ``'intensity<i>'`` per
        explained instance. Features that LIME did not report keep weight 0.
    """
    print('Generating intensity values for training data...')
    x_valid_copy = x_valid.head(num_instances).reset_index(drop=True)
    print('The LIME iterations will run {} times...'.format(x_valid_copy.shape[0]))

    name_pos = list(x_valid_copy.columns)
    intensity_dic = {'feature_name': name_pos}
    column_order = ['feature_name']
    for im in range(x_valid_copy.shape[0]):
        print('-'*25,im+1,'-'*25)
        exp = explainer.explain_instance(x_valid_copy.loc[im], predict1, num_features=x_valid_copy.shape[1])
        intensity = [0]*len(name_pos)
        for condition, weight in exp.as_list():
            feature = _feature_from_lime_condition(condition)
            if feature is None:
                # No comparison operator in the condition: leave the weight at 0.
                continue
            try:
                position = name_pos.index(feature)
            except ValueError:
                # Report instead of silently dropping the rest of the explanation.
                print('Skipping LIME condition with unknown feature: {!r}'.format(condition))
                continue
            intensity[position] = weight
        key = 'intensity'+str(im)
        intensity_dic[key] = intensity
        column_order.append(key)

    # Explicit column order keeps 'feature_name' as the first row after the transpose.
    return pd.DataFrame(intensity_dic, columns=column_order).T

def get_best_cols_lime(intensity_df):
    """Interactively pick features whose scaled LIME intensity meets a threshold.

    Parameters
    ----------
    intensity_df : pandas.DataFrame
        Frame whose first row holds the feature names and whose remaining rows
        hold one set of per-feature intensities each (the layout returned by
        ``get_intensity_dfs``).

    Returns
    -------
    list
        The feature names accepted by the user, or ``[]`` when the user enters
        ``-1`` at either prompt.

    Notes
    -----
    The absolute intensities are summed per feature and min-max scaled to
    0..100. The user is then asked for a threshold between 1 and 100 and for a
    confirmation; invalid input of any kind (including non-numeric text) prints
    a message and repeats the prompt.
    """
    header = intensity_df.iloc[0]
    intensity_df = intensity_df[1:].copy()
    intensity_df.columns = header
    intensity_df_trans = intensity_df.T
    intensity_df_trans['sum_of_intensities'] = intensity_df_trans.abs().sum(axis=1)
    intensity_df_trans.sort_values(by=['sum_of_intensities'],ascending=False,inplace=True)
    from sklearn import preprocessing
    min_max_scaler = preprocessing.MinMaxScaler()
    intensity_df_trans['sum_of_intensities'] = min_max_scaler.fit_transform(intensity_df_trans['sum_of_intensities'].values.reshape(-1,1)) * 100

    while True:
        raw_threshold = input("Enter the threshold to select features between 1 to 100. Enter -1 to exit feature selection. - ")
        # Parse inside the try so non-numeric input re-prompts instead of crashing.
        try:
            thresh_lime = int(raw_threshold)
        except ValueError:
            print("Please enter a valid input.")
            continue
        if thresh_lime == -1:
            return []
        if not 1 <= thresh_lime <= 100:
            print("Please enter a valid input.")
            continue

        features_subset = list(intensity_df_trans[intensity_df_trans['sum_of_intensities'] >= thresh_lime].index)
        no_of_features = len(features_subset)
        print('The {} features selected are:\n'.format(no_of_features))
        print(features_subset)

        # Ask until the answer is one of the accepted tokens.
        while True:
            what_to_do = input('Do you want to train with selected features? - y/n Enter -1 to exit feature selection. - ')
            what_to_do = what_to_do.strip(" ")
            if what_to_do in ['Y','y','N','n','-1']:
                break
            print("Please enter a valid input.")

        if what_to_do == '-1':
            return []
        if what_to_do in ['n','N']:
            # Rejected: go back and ask for another threshold.
            continue
        return features_subset

In [82]:
# Trial counter that create_model reads and increments through `global num`.
num= 1
# Trials records every evaluation so the best trial's result dict can be read back below.
trials=Trials()
space = get_search_space()
print("Selecting the best network architecture specifically for your data...")
# Run 10 TPE-guided evaluations of create_model over the search space.
best = fmin(create_model, space, algo=tpe.suggest, max_evals=10, trials=trials)
# create_model stores the evaluated hyper-parameters under 'params' in its
# result dict; those values are used to retrain the winning configuration.
best_trials_temp = trials.best_trial['result'] 
best_model = train_best_model(best_trials_temp['params']) 
# NOTE(review): the commented-out lines below reference names that no visible
# cell defines (make_predictions, best_model_temp, input_df, test_arg_processed)
# and contain a top-level `return`; presumably leftovers from a function
# version of this cell — consider deleting them.
# scaled_feature_df = pd.concat([x_train,x_valid,x_test])
# label_df = pd.concat([y_train,y_valid,y_test])
# pred_df = make_predictions(model=best_model_temp,df=scaled_feature_df)
# output_df = pd.merge(input_df,pred_df['predictions'].to_frame(),left_index=True,right_index=True)
# return best_model_temp, output_df, test_arg_processed

Selecting the best network architecture specifically for your data...
1) Validation set Accuracy:   93.33 	Available Mem: 2816.0703125 mb
2) Validation set Accuracy:   93.33 	Available Mem: 2802.0390625 mb
3) Validation set Accuracy:   60.00 	Available Mem: 2792.59375 mb
4) Validation set Accuracy:   93.33 	Available Mem: 2785.1015625 mb
5) Validation set Accuracy:   93.33 	Available Mem: 2773.9375 mb
6) Validation set Accuracy:   93.33 	Available Mem: 2768.6875 mb
7) Validation set Accuracy:   90.00 	Available Mem: 2758.60546875 mb
8) Validation set Accuracy:   93.33 	Available Mem: 2745.44140625 mb
9) Validation set Accuracy:   93.33 	Available Mem: 2727.9453125 mb
10) Validation set Accuracy:   93.33 	Available Mem: 2712.953125 mb
Training the best selected model...


In [145]:
# Interactive step: prompts on stdin for a threshold and a confirmation, so the
# result depends on what the user types.
features_subset = imp_features_lime()

Generating intensity values for training data...
The LIME iterations will run 3 times...
------------------------- 1 -------------------------
Intercept 0.32704507420634554
Prediction_local [0.46663434]
Right: 0.8611476
------------------------- 2 -------------------------
Intercept 0.33924448749397307
Prediction_local [0.46960798]
Right: 0.74101204
------------------------- 3 -------------------------
Intercept 0.4765230690246954
Prediction_local [0.09706961]
Right: 0.01518198
Enter the threshold to select features between 1 to 100. Enter -1 to exit feature selection. - 56
The 1 features selected are:

['petal_length']
Do you want to train with selected features? - y/n Enter -1 to exit feature selection. - n
Enter the threshold to select features between 1 to 100. Enter -1 to exit feature selection. - 78
The 1 features selected are:

['petal_length']
Do you want to train with selected features? - y/n Enter -1 to exit feature selection. - n
Enter the threshold to select features betwee

In [83]:
import lime
import lime.lime_tabular
# Module-level LIME explainer over the training features.
# NOTE(review): imp_features_lime builds an identical explainer internally;
# this copy is the one the single-instance explanation cell below uses.
selected_features = x_train.columns
explainer = lime.lime_tabular.LimeTabularExplainer(np.array(x_train), feature_names=selected_features, class_names=[], verbose=True, mode='classification')

In [91]:
def predict1(ip):
    """Return the class probabilities of the module-level ``best_model`` for ``ip``.

    Passed to LIME as the prediction function. ``predict`` is used instead of
    ``predict_proba`` because the latter was removed from ``Sequential`` in
    later Keras releases; for a softmax output layer both give the same
    probabilities. ``verbose=0`` keeps the call silent, as ``predict_proba``
    was by default.

    NOTE(review): a function with this name is also defined in the LIME helper
    functions cell; keep a single definition.
    """
    return best_model.predict(ip, verbose=0)

In [139]:
# Explain a single validation row: after resetting the index, .loc[2] is the
# third row of x_valid. All features are requested in the explanation.
x_valid_copy = x_valid.copy()
x_valid_copy.reset_index(drop=True,inplace=True)
exp = explainer.explain_instance(x_valid_copy.loc[2], predict1, num_features=x_valid_copy.shape[1])

Intercept 0.4578051763988422
Prediction_local [0.1111323]
Right: 0.01518198


In [143]:
# (condition, weight) pairs of the explanation; get_intensity_dfs parses this format.
exp.as_list()

[('petal_length <= 1.50', -0.42463830101396405),
 ('petal_width <= 0.30', 0.07370078133002056),
 ('3.00 < sepal_width <= 3.40', 0.01492378485936213),
 ('sepal_length <= 5.10', -0.010659141989413291)]

In [148]:
# Softmax outputs of the retrained model for every test row.
predicted = best_model.predict(x_test)

In [149]:
# Position of the highest score in each row, i.e. the predicted class index.
predicted1 = np.argmax(predicted, axis=1)

In [150]:
# One-hot encode the predicted class indices. The categories are pinned to
# every output unit of the model so a class that is never predicted still gets
# an (all-zero) column; otherwise the frame would have fewer columns than
# y_test and the later `y_pred.columns = y_test.columns` would fail.
y_pred = pd.get_dummies(pd.Categorical(predicted1, categories=list(range(predicted.shape[1]))))

In [152]:
# Test-set [loss, accuracy], following the loss and metrics given to compile().
best_model.evaluate(x_test,y_test)



[0.17454342544078827, 0.9666666388511658]

In [153]:
# Relabel the index-named prediction columns with the class names of y_test.
# Requires y_pred to have exactly as many columns as y_test.
y_pred.columns = y_test.columns

In [154]:
def reverse_dummies(df,label_col):
    """Collapse one-hot columns back into a single label column.

    Parameters
    ----------
    df : pandas.DataFrame
        One-hot (or score) frame with one column per class.
    label_col : str
        Base name of the label; the result column is ``label_col + '_prediction'``.

    Returns
    -------
    pandas.DataFrame
        Single-column frame holding, per row, the name of the column with the
        largest value.
    """
    column_name = label_col + '_prediction'
    winning_labels = df.idxmax(axis=1)
    return pd.DataFrame(winning_labels,columns=[column_name])

In [155]:
# Turn the one-hot predictions back into a single 'class_prediction' column of class names.
y_pred_reversed = reverse_dummies(y_pred,'class')