In [1]:
import pandas as pd
import numpy as np

In [2]:
import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, History 

Using TensorFlow backend.


In [3]:
import tensorflow as tf
import random as rn
from keras import backend as K

In [4]:
from keras.models import load_model
# Load the saved Keras model from disk; it is scored on the test and
# validation splits further down.
model = load_model('models/abalone.h5')

# Load train, validation and test data

In [5]:
import pickle

In [6]:
# NOTE: at module (cell) level a `global` statement has no effect; the names
# are bound by the loading cell that follows.
global x_train, x_test, x_valid, y_train, y_test, y_valid

In [7]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Model inputs (x_*) and targets (y_*) for the train, test and validation splits.
x_train = _read_pickle('x_train.pkl')
x_test = _read_pickle('x_test.pkl')
x_valid = _read_pickle('x_valid.pkl')
y_train = _read_pickle('y_train.pkl')
y_test = _read_pickle('y_test.pkl')
y_valid = _read_pickle('y_valid.pkl')

# Load Best Params

In [8]:
# NOTE: `global` at module (cell) level has no effect; best_params is bound
# by the loading cell that follows.
global best_params

In [9]:
# Load the pickled hyper-parameter dict (displayed in the next cell).
with open('best_params.pkl', 'rb') as params_file:
    best_params = pickle.load(params_file)

In [10]:
best_params

{'activation': 'relu',
 'batch_size': 32,
 'dropout1': 0.39327924382539026,
 'dropout2': 0.43077080171461557,
 'early_stop_rounds': 40,
 'nb_epochs': 500,
 'num_layers': 'two_hidden',
 'optimizer': 'rmsprop',
 'units1': 32,
 'units2': 512}

# Results on all data

### Test data

In [11]:
# Score the loaded model on the test split. evaluate() is unpacked as
# (loss, metric). NOTE(review): both are presumably MSE - the saved model's
# compile settings are not visible in this notebook.
loss,mse = model.evaluate(x_test,y_test,verbose=0)

In [12]:
# Square root of the test loss (an RMSE if the loss is MSE - TODO confirm).
rmse = loss ** 0.5
rmse

2.1113178751406

In [13]:
# Root of the test loss expressed as a percentage of the mean test target.
pct_error = (loss ** 0.5) / y_test.mean()*100
pct_error

21.348110106646608

### Validation data

In [14]:
# Score the loaded model on the validation split; unpacked as (loss, metric).
loss_v,mse_v = model.evaluate(x_valid,y_valid,verbose=0)

In [15]:
# Square root of the validation loss (an RMSE if the loss is MSE - TODO confirm).
rmse_v = loss_v ** 0.5
rmse_v

2.1426875863273063

In [16]:
# Root of the validation loss as a percentage of the mean validation target.
pct_error_v = (loss_v ** 0.5) / y_valid.mean()*100
pct_error_v

21.86095706821611

# Intensity DF 

In [17]:
# Per-feature intensity table; the columns used below are 'feature_name'
# and 'sum_of_intensities'.
df = pd.read_csv('abalone_intensity_df.csv')

In [19]:
# Expose the original row position as an 'index' column; the simulation
# sorts on it to restore the original feature order. Guarded so that
# re-running this cell does not stack an extra 'level_0' column onto df.
if 'index' not in df.columns:
    df.reset_index(inplace=True)

In [20]:
def train_best_model(best_params,feature_subset):
    """Build and fit a fresh Sequential regressor on a subset of the features.

    Reads the module-level ``x_train``, ``y_train``, ``x_valid`` and
    ``y_valid`` and re-seeds Python, NumPy and TensorFlow before building
    the network.

    Parameters
    ----------
    best_params : dict
        Hyper-parameters. Keys read: 'units1', 'activation', 'dropout1',
        'num_layers', 'optimizer', 'early_stop_rounds', 'batch_size', and
        'units2' / 'dropout2' when 'num_layers' is 'two_hidden'. 'nb_epochs'
        caps the number of epochs; 500 is used when the key is absent.
    feature_subset : list
        Labels used to index ``x_train`` and ``x_valid``
        (``x[feature_subset]``).

    Returns
    -------
    The fitted Keras ``Sequential`` model.
    """
    import os
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting
    # it here presumably only affects child processes - confirm intent.
    os.environ['PYTHONHASHSEED'] = '0'

    # The below is necessary for starting Numpy generated random numbers
    # in a well-defined initial state.
    np.random.seed(42)

    # The below is necessary for starting core Python generated random numbers
    # in a well-defined state.
    rn.seed(12345)

    # Force TensorFlow to use single thread.
    # Multiple threads are a potential source of
    # non-reproducible results.
    # For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)

    # The below tf.set_random_seed() will make random number generation
    # in the TensorFlow backend have a well-defined initial state.
    # For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed
    tf.set_random_seed(1234)

    # NOTE(review): a new Session is created on every call and the previous
    # one is not closed here.
    sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
    K.set_session(sess)

    # ---------------------------------------------------

    # Selecting columns already yields new objects, so the shared splits are
    # left untouched without an explicit copy.
    x_train_temp = x_train[feature_subset]
    x_valid_temp = x_valid[feature_subset]

    model = Sequential()
    model.add(Dense(best_params['units1'], input_shape=(x_train_temp.shape[1],)))
    model.add(Activation(best_params['activation']))
    model.add(Dropout(best_params['dropout1']))
    if best_params['num_layers'] == 'two_hidden':
        model.add(Dense(best_params['units2']))
        model.add(Activation(best_params['activation']))
        model.add(Dropout(best_params['dropout2']))
    # Single linear output unit: the network is a regressor.
    model.add(Dense(1))
    model.add(Activation('linear'))
    model.compile(loss='mse', metrics=['mse'],
                  optimizer=best_params['optimizer'])

    # Stop once val_loss has not improved for 'early_stop_rounds' epochs.
    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=best_params['early_stop_rounds'])
    model.fit(x_train_temp, y_train,
              batch_size=best_params['batch_size'],
              # Honour the tuned epoch budget instead of a hard-coded 500.
              epochs=best_params.get('nb_epochs', 500),
              callbacks=[early_stop],
              verbose=0,
              validation_data=(x_valid_temp, y_valid))

    return model

In [21]:
def get_results(best_params,feature_subset):
    """Train on `feature_subset` and score the model on test and validation data.

    Uses the module-level x_test / y_test / x_valid / y_valid splits.

    Returns
    -------
    tuple
        (rmse, pct_error, rmse_v, pct_error_v): the square root of the test
        loss, that value as a percentage of the mean test target, and the
        same two figures for the validation split.
    """
    fitted = train_best_model(best_params, feature_subset)

    def _score(features, target):
        # Work on copies restricted to the requested columns.
        inputs = features.copy()[feature_subset]
        expected = target.copy()
        split_loss, _metric = fitted.evaluate(inputs, expected, verbose=0)
        root = split_loss ** 0.5
        return root, root / expected.mean() * 100

    # Test split first, then validation, as in the reported tuple.
    rmse, pct_error = _score(x_test, y_test)
    rmse_v, pct_error_v = _score(x_valid, y_valid)

    return rmse, pct_error, rmse_v, pct_error_v

# Dropping Zero Intensities

In [22]:
# Keep only the features whose summed intensity is non-zero.
df_non_zero = df[df['sum_of_intensities'] != 0]

In [23]:
# Names of the retained features, in df_non_zero row order.
non_zero_features = list(df_non_zero['feature_name'])

In [25]:
# Baseline: retrain and score with every non-zero-intensity feature
# (nothing dropped yet). This rebinds the rmse / pct_error names set above.
rmse, pct_error, rmse_v, pct_error_v = get_results(best_params,non_zero_features)

In [26]:
rmse, pct_error, rmse_v, pct_error_v

(2.1643321645635916, 21.884152026792, 2.1450984150750565, 21.885553758881464)

In [27]:
# Highest summed intensity first, so that head() in the simulation keeps the
# strongest features and drops from the weakest end.
df_non_zero_sorted = df_non_zero.sort_values(by=['sum_of_intensities'],ascending=False)

# Simulation

In [32]:
def simulation_of_features(df):
    """Retrain and score the model while dropping more and more trailing rows of `df`.

    `df` must have 'feature_name' and 'index' columns. At every step the
    first ``len(df) - drop_value`` rows are kept, re-sorted by 'index', and
    their feature names are passed to ``get_results`` together with the
    module-level ``best_params``.

    With at most 100 rows, features are dropped one at a time (1, 2, ...,
    always keeping at least one, and at most 99 steps). With more than 100
    rows, step ``i`` in 1..99 drops ``floor(i / 100 * len(df))`` features.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per feature; rows at the end are dropped first.

    Returns
    -------
    dict
        Maps the number of dropped features to the tuple returned by
        ``get_results``: (rmse, pct_error, rmse_v, pct_error_v).
    """
    n_features = len(df)
    if n_features <= 100:
        # (step, number dropped) pairs; never drop every feature.
        steps = [(i, i) for i in range(1, min(100, n_features))]
    else:
        steps = [(i, int(np.floor((i / 100) * n_features))) for i in range(1, 100)]

    feature_dict = {}
    for i, drop_value in steps:
        print(i,'No of features dropped:',drop_value)
        # sort_values returns a new frame; sorting the head() slice in place
        # is what raised SettingWithCopyWarning.
        subset_df = df.head(n_features - drop_value).sort_values(by=['index'])
        feature_subset = list(subset_df['feature_name'])

        feature_dict[drop_value] = get_results(best_params, feature_subset)
    return feature_dict

In [33]:
# Run the drop-features sweep on the intensity-sorted features; a model is
# retrained at every step.
resultant_dict = simulation_of_features(df_non_zero_sorted)

1 No of features dropped: 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


2 No of features dropped: 2
3 No of features dropped: 3
4 No of features dropped: 4
5 No of features dropped: 5
6 No of features dropped: 6
7 No of features dropped: 7
8 No of features dropped: 8
9 No of features dropped: 9


In [34]:
resultant_dict

{1: (2.142774743537629,
  21.666179071086813,
  2.1652376703955256,
  22.091026268619228),
 2: (2.116159224571491,
  21.397062309406945,
  2.182589119627752,
  22.268055943480604),
 3: (2.1316155661421488,
  21.553345588955448,
  2.1554717332563067,
  21.99138844279073),
 4: (2.116225664356815,
  21.397734100172922,
  2.100405575057783,
  21.429571158754047),
 5: (2.1693940703053753,
  21.935334334485894,
  2.1440706279585298,
  21.87506767114146),
 6: (2.1522599933741917,
  21.76208701573324,
  2.1501087024986743,
  21.936671653513447),
 7: (2.1707548424181633,
  21.949093471959173,
  2.1376810541374733,
  21.8098774866845),
 8: (2.232102192465942,
  22.569393237802704,
  2.1979924410949576,
  22.425209674827734),
 9: (2.8336154477389237,
  28.651457599295355,
  2.7325219488553447,
  27.878793620247354)}

In [35]:
# Dict keys (number of dropped features) become columns; transposing turns
# them into rows, one column per metric of the result tuple.
resultant_df = pd.DataFrame(resultant_dict).T

In [36]:
# Move the dropped-feature counts out of the index into a regular column.
resultant_df.reset_index(inplace=True)

In [37]:
# Former index first, then the four metrics in the order get_results returns them.
resultant_df.columns = ['no_of_dropped_features', 'rmse_test', 'pct_error_test', 'rmse_valid', 'pct_error_valid']

In [38]:
# Baseline row (no feature dropped). The values come from the variables set by
# the get_results(best_params, non_zero_features) cell rather than from numbers
# pasted out of an earlier output, so the row stays correct after a re-run.
temp_df = pd.DataFrame({
    'no_of_dropped_features': [0],
    'rmse_test': [rmse],
    'pct_error_test': [pct_error],
    'rmse_valid': [rmse_v],
    'pct_error_valid': [pct_error_v],
})

In [39]:
# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat is the
# supported equivalent. sort=True keeps the alphabetical column order that the
# original append call produced and silences the sort FutureWarning.
resultant_df = pd.concat([resultant_df, temp_df], sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [40]:
# Order rows by the number of dropped features and renumber them from 0.
resultant_df = (
    resultant_df
    .sort_values(by=['no_of_dropped_features'])
    .reset_index(drop=True)
)

In [41]:
# Persist the sweep results. The row index is written as the first, unnamed
# CSV column; the read-back cell below consumes it via index_col=0.
resultant_df.to_csv('abalone_lime_feature_selection_results.csv')

In [43]:
# Read the saved results back (first CSV column is the row index) and display them.
a = pd.read_csv('abalone_lime_feature_selection_results.csv',index_col=0)
a

Unnamed: 0,no_of_dropped_features,pct_error_test,pct_error_valid,rmse_test,rmse_valid
0,0,21.884152,21.885554,2.164332,2.145098
1,1,21.666179,22.091026,2.142775,2.165238
2,2,21.397062,22.268056,2.116159,2.182589
3,3,21.553346,21.991388,2.131616,2.155472
4,4,21.397734,21.429571,2.116226,2.100406
5,5,21.935334,21.875068,2.169394,2.144071
6,6,21.762087,21.936672,2.15226,2.150109
7,7,21.949093,21.809877,2.170755,2.137681
8,8,22.569393,22.42521,2.232102,2.197992
9,9,28.651458,27.878794,2.833615,2.732522
