In [1]:
import pandas as pd
import numpy as np

In [2]:
import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, History 

Using TensorFlow backend.


In [3]:
import tensorflow as tf
import random as rn
from keras import backend as K

In [4]:
from keras.models import load_model
model = load_model('models/iris.h5')

# Load Train, Valid and Test data

In [5]:
import pickle

In [6]:
global x_train, x_test, x_valid, y_train, y_test, y_valid

In [7]:
with open('x_train.pkl', 'rb') as fp:
    x_train = pickle.load(fp)
with open('x_test.pkl', 'rb') as fp:
    x_test = pickle.load(fp)
with open('x_valid.pkl', 'rb') as fp:
    x_valid = pickle.load(fp)
with open('y_train.pkl', 'rb') as fp:
    y_train = pickle.load(fp)
with open('y_test.pkl', 'rb') as fp:
    y_test = pickle.load(fp)
with open('y_valid.pkl', 'rb') as fp:
    y_valid = pickle.load(fp)

# Load Best Params

In [8]:
global best_params

In [9]:
with open('best_params.pkl', 'rb') as fp:
    best_params = pickle.load(fp)

In [10]:
best_params

{'activation': 'relu',
 'batch_size': 128,
 'dropout1': 0.5888517927868363,
 'dropout2': 0.4037961236257436,
 'early_stop_rounds': 50,
 'num_layers': 'two_hidden',
 'optimizer': 'adam',
 'units1': 32,
 'units2': 512}

# Results on all data

### Test data

In [11]:
loss,acc = model.evaluate(x_test,y_test,verbose=0)
acc

0.8999999761581421

### Valid data

In [12]:
loss_v,acc_v = model.evaluate(x_valid,y_valid,verbose=0)

In [13]:
acc_v

0.9333333373069763

# Intensity DF 

In [14]:
df = pd.read_csv('classify_intensity_df.csv')

In [15]:
df.reset_index(inplace=True)

In [16]:
t = y_train.isin([1]).stack().reset_index()
t = t[t[0]==True]
t = t.set_index('level_0').drop(0,axis=1)
t.index.name = ''
t.columns = ['class']

In [17]:
global class_weights
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced',np.unique(t),t['class'])

In [18]:
def train_best_model(best_params,feature_subset):   
#     print('Training the best selected model...') 


    import os
    os.environ['PYTHONHASHSEED'] = '0'

    # The below is necessary for starting Numpy generated random numbers
    # in a well-defined initial state.

    np.random.seed(42)

    # The below is necessary for starting core Python generated random numbers
    # in a well-defined state.

    rn.seed(12345)

    # Force TensorFlow to use single thread.
    # Multiple threads are a potential source of
    # non-reproducible results.
    # For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res

    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)

    # The below tf.set_random_seed() will make random number generation
    # in the TensorFlow backend have a well-defined initial state.
    # For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed

    tf.set_random_seed(1234)

    sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
    K.set_session(sess)

    # ---------------------------------------------------
    
    x_train_temp = x_train.copy() 
    y_train_temp = y_train.copy()
    
    x_valid_temp = x_valid.copy() 
    y_valid_temp = y_valid.copy()
    
    x_train_temp = x_train_temp[feature_subset]
    x_valid_temp = x_valid_temp[feature_subset]
    
    
    
    model = Sequential()
    model.add(Dense(best_params['units1'], input_shape=(x_train_temp.shape[1],)))
    model.add(Activation(best_params['activation']))
    model.add(Dropout(best_params['dropout1']))
    if(best_params['num_layers'] == 'two_hidden'):
        model.add(Dense(best_params['units2']))
        model.add(Activation(best_params['activation']))
        model.add(Dropout(best_params['dropout2']))
    model.add(Dense(y_train_temp.shape[1]))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'],
                  optimizer=best_params['optimizer'])
    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=best_params['early_stop_rounds'])
    history = History()
    model.fit(x_train_temp, y_train_temp,
              batch_size=best_params['batch_size'],
              epochs=500,
              callbacks=[early_stop, history],
              verbose=0,
              validation_data=(x_valid_temp,y_valid_temp),
              class_weight=class_weights) 
    return model

In [19]:
def get_results(best_params,feature_subset):
    model = train_best_model(best_params,feature_subset)
    x_valid_temp = x_valid.copy() 
    y_valid_temp = y_valid.copy()
    
    x_test_temp = x_test.copy() 
    y_test_temp = y_test.copy()
    
    x_valid_temp = x_valid_temp[feature_subset]
    x_test_temp = x_test_temp[feature_subset]
    
    loss,acc = model.evaluate(x_test_temp,y_test_temp,verbose=0)
    
    
    loss_v,acc_v = model.evaluate(x_valid_temp,y_valid_temp,verbose=0)
   
    
    return acc,acc_v

# Dropping Zero Intensities

In [20]:
df_non_zero = df[df['sum_of_intensities'] != 0]

In [21]:
non_zero_features = list(df_non_zero['feature_name'])

In [23]:
acc, acc_v = get_results(best_params,non_zero_features)

In [24]:
acc, acc_v

(0.9333333373069763, 0.9333333373069763)

In [29]:
df_non_zero_sorted = df_non_zero.sort_values(by=['sum_of_intensities'],ascending=False)


# Simulation

In [25]:
def simulation_of_features(df):
    feature_dict = {}
    if(len(df)<=100):
        for i in range(1,100):
            if(i < len(df)):
                drop_value = i
                print('No of features dropped:',drop_value)
                subset_df = df.head(len(df) - drop_value)
                subset_df.sort_values(by=['index'],inplace=True)
                feature_subset = list(subset_df['feature_name'])

                acc, acc_v = get_results(best_params,feature_subset)

                feature_dict[drop_value] = acc, acc_v
        return feature_dict
        
    else:
        for i in range(1,100):
            drop_value = int(np.floor((i / 100) * len(df)))
            print('No of features dropped:',drop_value)
            subset_df = df.head(len(df) - drop_value)
            print(list(subset_df['feature_name']))
            subset_df.sort_values(by=['index'],inplace=True)
            print(list(subset_df['feature_name']))
            feature_subset = list(subset_df['feature_name'])
            
            acc, acc_v = get_results(best_params,feature_subset)
            
            feature_dict[drop_value] = acc, acc_v
        return feature_dict

In [31]:
resultant_dict = simulation_of_features(df_non_zero_sorted)

No of features dropped: 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


No of features dropped: 2
No of features dropped: 3


In [32]:
resultant_dict

{1: (0.8666666746139526, 0.8999999761581421),
 2: (0.6333333253860474, 0.6666666865348816),
 3: (0.3333333432674408, 0.30000001192092896)}

In [33]:
resultant_df = pd.DataFrame(resultant_dict).T

In [34]:
resultant_df.reset_index(inplace=True)

In [35]:
resultant_df.columns = ['no_of_dropped_features','acc_test', 'acc_valid']

In [36]:
temp_df = pd.DataFrame({'no_of_dropped_features':[0],'acc_test':[0.9333333373069763],'acc_valid':[0.9333333373069763]})

In [37]:
resultant_df = resultant_df.append(temp_df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [38]:
resultant_df.sort_values(by=['no_of_dropped_features'],inplace=True)
resultant_df.reset_index(drop=True,inplace=True)

In [39]:
resultant_df.to_csv('iris_classify_lime_feature_selection_results.csv')

In [40]:
resultant_df

Unnamed: 0,acc_test,acc_valid,no_of_dropped_features
0,0.933333,0.933333,0
1,0.866667,0.9,1
2,0.633333,0.666667,2
3,0.333333,0.3,3
