In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer

In [2]:
import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, History 
from keras.models import load_model

Using TensorFlow backend.


# Data fetch

In [3]:
def data_fetch(csv_name):
    """Load ``datasets/<csv_name>.csv`` into a DataFrame.

    Parameters
    ----------
    csv_name : str
        File name without the ``.csv`` extension; the file is looked up in
        the relative ``datasets/`` directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    print("Reading the data...")
    csv_path = 'datasets/{}.csv'.format(csv_name)
    return pd.read_csv(csv_path)

# Pre Processing

In [4]:
def get_string_cols(df):
    """Return the names of the ``object``/``category`` dtype columns of ``df``.

    The names are returned as a plain list, in the order they appear in the
    frame.
    """
    textual = df.select_dtypes(include=['object','category'])
    return [name for name in textual.columns]

In [5]:
def get_num_cols(df):
    """Return the names of the integer/float dtype columns of ``df``.

    Every fixed-width signed and unsigned integer dtype and every float
    dtype is accepted, so down-cast columns (``int8``, ``uint8``, ...) are
    recognised as numeric too. Boolean and object columns are not included.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list
        Column names in frame order.
    """
    numerics = ['int8', 'int16', 'int32', 'int64',
                'uint8', 'uint16', 'uint32', 'uint64',
                'float16', 'float32', 'float64']
    num_cols = list(df.select_dtypes(include=numerics).columns)

    return num_cols

In [6]:
# def get_bool_cols(df):
    
#     bool_cols = list(df.select_dtypes(include='bool').columns)
    
#     return bool_cols

In [7]:
def pre_process(df,label_col):
    """One-hot encode the categorical features of ``df``, leaving the label as is.

    Columns are treated as categorical when they have an object/category
    dtype, or when they are numeric and their name contains ``':string'``.
    After encoding, the ``':string'`` marker is stripped from every column
    name and the label column is appended back as the last column.

    Side effect: the module-level ``training_dummy_columns`` is set to the
    encoded feature columns as they are named *before* the ``':string'``
    marker is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data including the label column.
    label_col : str
        Name of the label column.

    Returns
    -------
    pandas.DataFrame
        Encoded features plus the label column.
    """
    global training_dummy_columns

    print("Pre-Processing the data...")

    # Set the label aside so it is never encoded.
    target = df[label_col]
    features = df.drop(columns=label_col)

    text_cols = get_string_cols(features)
    numeric_cols = get_num_cols(features)

    # Numeric columns flagged by name are really categorical codes.
    marker = ':string'
    flagged_numeric = [name for name in numeric_cols if marker in name]
    for name in flagged_numeric:
        features[name] = features[name].astype('object')

    # One-hot encode every categorical column.
    features = pd.get_dummies(features, columns=text_cols + flagged_numeric)
    training_dummy_columns = features.columns

    # Strip the marker from the column names, then put the label back.
    features.columns = features.columns.str.replace(':string','')
    features[label_col] = target

    return features

# Train Test Split

In [8]:
def split_train_test(df,label_col,test_size=0.2):
    """Split ``df`` into train/test features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and the label column.
    label_col : str
        Name of the label column; every other column is a feature.
    test_size : float, default 0.2
        Passed through to ``train_test_split``.

    Returns
    -------
    tuple
        ``(train_x, test_x, train_y, test_y)``. The split uses the fixed
        ``random_state=42``.
    """
    target = df[label_col]
    feature_mask = df.columns != label_col
    predictors = df.loc[:, feature_mask]

    parts = train_test_split(predictors, target, test_size=test_size, random_state=42)
    return tuple(parts)

# Feature Selection

In [9]:
def rfe_feat_selection(train_x,train_y,label_col,num_features):
    """Pick ``num_features`` columns using recursive feature elimination.

    A ``RandomForestRegressor`` is used as the ranking estimator. The number
    of features removed per elimination round is ``ceil(n_columns / 100)``.

    Parameters
    ----------
    train_x : pandas.DataFrame
        Training features.
    train_y : array-like
        Training labels.
    label_col : str
        Not used by this function; kept in the signature for its callers.
    num_features : int
        Number of features to keep.

    Returns
    -------
    list
        Names of the selected columns, in ``train_x`` column order.
    """
    from sklearn.feature_selection import RFE
    from sklearn.ensemble import RandomForestRegressor

    print("Selecting the best features for training...")
    n_columns = train_x.shape[1]
    per_round = int(np.ceil(n_columns / 100))

    forest = RandomForestRegressor(warm_start=True, random_state=42)
    rfe = RFE(forest, n_features_to_select=num_features, step=per_round, verbose=0)
    rfe = rfe.fit(train_x, train_y)

    print('No of selected features:',rfe.n_features_)

    # support_ is a boolean mask aligned with the columns of train_x.
    return [col for keep, col in zip(rfe.support_, train_x.columns) if keep]

# Model Training

In [10]:
def get_search_space():
    """Build the hyperopt search space for the network hyper-parameters.

    Returns
    -------
    dict
        Maps each hyper-parameter name to a hyperopt expression. The only
        exception is ``'nb_epochs'``, which is the constant 500.
    """
    def pick(label, options):
        # Categorical choice among ``options`` under the hyperopt label ``label``.
        return hp.choice(label, options)

    def between(label, low, high):
        # Uniform draw between ``low`` and ``high``.
        return hp.uniform(label, low, high)

    space = {}

    # Architecture
    space['num_layers'] = pick('num_layers', ['one_hidden', 'two_hidden'])
    space['units1'] = pick('units1', [32, 64, 128, 256,512])
    space['units2'] = pick('units2', [32, 64, 128, 256,512])

    # Regularisation
    space['dropout1'] = between('dropout1', .25, .75)
    space['dropout2'] = between('dropout2', .25, .75)

    # Training
    space['batch_size'] = pick('batch_size', [16,32,64,128])
    space['nb_epochs'] = 500
    space['optimizer'] = pick('optimizer', ['rmsprop', 'adam', 'nadam','sgd'])
    space['activation'] = pick('activation', ['relu','sigmoid'])
    space['early_stop_rounds'] = pick('early_stop_rounds', [10,20,30,40,50])

    return space

In [11]:
def data(csv_name,label_col,num_features):
    """Load, encode, feature-select and split the dataset.

    Steps: read the CSV, one-hot encode it, split once so that feature
    selection is fitted on training rows only, keep the selected features
    plus the label, then split the reduced frame again.

    Parameters
    ----------
    csv_name : str
        CSV file name, forwarded to ``data_fetch``.
    label_col : str
        Name of the label column.
    num_features : int
        Number of features to keep.

    Returns
    -------
    tuple
        ``(raw_df, x_train, x_test, y_train, y_test)``, where ``raw_df`` is
        the frame returned by ``data_fetch``.
    """
    raw_df = data_fetch(csv_name)
    encoded_df = pre_process(df=raw_df,label_col=label_col)

    # First split: only its training part is used, for feature selection.
    train_x, _, train_y, _ = split_train_test(df=encoded_df,label_col=label_col)
    chosen = rfe_feat_selection(train_x,train_y,label_col=label_col,num_features=num_features)

    # Keep the label next to the selected features so the frame can be re-split.
    columns_to_keep = chosen + [label_col]
    reduced_df = encoded_df[columns_to_keep]

    x_train, x_test, y_train, y_test = split_train_test(df=reduced_df,label_col=label_col)

    return raw_df, x_train, x_test, y_train, y_test

In [12]:
def create_model(params):
    """Hyperopt objective: build, train and score one candidate network.

    Reads the module-level ``x_train``, ``x_test``, ``y_train`` and
    ``y_test`` (set by ``get_best_model``) and works on copies of them.

    Parameters
    ----------
    params : dict
        One sample from the search space. Keys read here: ``units1``,
        ``units2``, ``dropout1``, ``dropout2``, ``activation``,
        ``num_layers``, ``optimizer``, ``batch_size``,
        ``early_stop_rounds`` and, optionally, ``nb_epochs`` (500 is used
        when it is absent).

    Returns
    -------
    dict
        ``{'loss': ..., 'status': STATUS_OK, 'model': ...}``. ``loss`` is
        the MSE on the test data, or ``np.inf`` when the evaluation
        produced NaN.
    """
    x_train_temp = x_train.copy()
    x_test_temp = x_test.copy()
    y_train_temp = y_train.copy()
    y_test_temp = y_test.copy()

    model = Sequential()
    model.add(Dense(params['units1'], input_shape=(x_train_temp.shape[1],)))
    model.add(Activation(params['activation']))
    model.add(Dropout(params['dropout1']))

    # If we choose 'two_hidden', add an additional layer
    if params['num_layers'] == 'two_hidden':
        model.add(Dense(params['units2']))
        model.add(Activation(params['activation']))
        model.add(Dropout(params['dropout2']))

    # Single linear output unit for regression.
    model.add(Dense(1))
    model.add(Activation('linear'))

    model.compile(loss='mse', metrics=['mae'],
                  optimizer=params['optimizer'])

    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=params['early_stop_rounds'])
    history = History()

    # Honour the epoch budget carried by the search space instead of a
    # second hard-coded copy of the same number.
    model.fit(x_train_temp, y_train_temp,
              batch_size=params['batch_size'],
              epochs=params.get('nb_epochs', 500),
              callbacks=[early_stop, history],
              verbose=0,
              validation_split=0.2)

    # NOTE(review): the objective is scored on the test data, so the selected
    # hyper-parameters are tuned against it - confirm this is intended.
    loss, mae = model.evaluate(x_test_temp, y_test_temp, verbose=0)

    # hyperopt raises an assertion error on a NaN loss (e.g. from a diverging
    # network). The value handed back is `loss`, so it has to be checked as
    # well as `mae`; such trials are reported with an infinite loss instead.
    if np.isnan(loss) or np.isnan(mae):
        print("Testing set Mean Abs Error: NaN")
        return {'loss': np.inf, 'status': STATUS_OK, 'model': model}

    print("Testing set Mean Abs Error: {:7.2f}".format(mae))

    return {'loss': loss, 'status': STATUS_OK, 'model': model}

In [13]:
def get_best_model(csv_name,label_col,num_features,max_evals=10):
    """Run the hyper-parameter search and return the best model with predictions.

    Side effect: sets the module-level ``x_train``, ``x_test``, ``y_train``
    and ``y_test``, which ``create_model`` and ``display_results`` read.

    Parameters
    ----------
    csv_name : str
        CSV file name, forwarded to ``data``.
    label_col : str
        Name of the label column.
    num_features : int
        Number of features to keep during feature selection.
    max_evals : int, default 10
        Number of hyperopt trials to run.

    Returns
    -------
    tuple
        ``(best_model, output_df)`` where ``output_df`` is the raw input
        frame joined on its index with a ``predictions`` column.
    """
    global x_train,x_test,y_train,y_test
    input_df, x_train, x_test, y_train, y_test = data(csv_name=csv_name,label_col=label_col,num_features=num_features)

    trials = Trials()
    space = get_search_space()
    print("Moulding the network architecture specifically for your data...")
    print("Running evaluatins with various architectures and hyper-parameters...")
    # fmin's return value is not needed: the fitted model is taken from the
    # best trial's result dict.
    fmin(create_model, space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
    best_model = trials.best_trial['result']['model']

    # Predict on every row (train and test), then attach the predictions to
    # the raw input by index.
    all_features = pd.concat([x_train,x_test])
    pred_df = make_predictions(model=best_model,df=all_features)

    output_df = pd.merge(input_df,pred_df['predictions'].to_frame(),left_index=True,right_index=True)

    return best_model, output_df

# Make Predictions

In [14]:
def make_predictions(model,df):
    """Predict on every row of ``df`` and return the frame with the result.

    The input frame is not modified. A new frame is returned, so calling
    this again on the same ``df`` never feeds an earlier ``predictions``
    column back into the model as a feature.

    Parameters
    ----------
    model : object
        Anything with a ``predict(df)`` method whose result can be flattened
        to one value per row.
    df : pandas.DataFrame
        Feature frame to predict on.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an extra ``predictions`` column.
    """
    # Predicting on the whole df
    predictions = model.predict(df).flatten()

    return df.assign(predictions=predictions)

# Displaying Result

In [24]:
def display_results(best_model,output_df,label_col=None):
    """Print the train/test sizes and the test-set error of ``best_model``.

    Reads the module-level ``x_train``, ``x_test`` and ``y_test``.

    Parameters
    ----------
    best_model : object
        Model whose ``evaluate(x_test, y_test, verbose=0)`` returns
        ``(loss, mae)``.
    output_df : pandas.DataFrame
        Frame containing the label column; its mean is used to express the
        RMSE as a percentage.
    label_col : str, optional
        Name of the label column in ``output_df``. When omitted, the name of
        the ``y_test`` series is used if it is a column of ``output_df``;
        otherwise it falls back to ``'SalePrice'``.
    """
    if label_col is None:
        inferred = getattr(y_test, 'name', None)
        label_col = inferred if inferred in output_df.columns else 'SalePrice'

    print("##################################################")
    print("Results:")
    print("Training Size: {} rows".format(x_train.shape[0]))
    print("Testing Size: {} rows".format(x_test.shape[0]))

    # Evaluation on test data; the loss is MSE, so its square root is the RMSE.
    loss,mae = best_model.evaluate(x_test,y_test,verbose=0)
    rmse = loss ** 0.5

    print("RMSE on the test data: ",rmse)
    print("Percent error on the test data: ", (rmse / output_df[label_col].mean())*100, "%")
    print("##################################################")

# Save the model and update the config db

In [16]:
def save(best_model,output_df,training_dummy_columns):
    """Placeholder for persisting the model: currently it only prints.

    Prints, one per line, the model object, the shape of ``output_df`` and
    the training dummy columns. Nothing is written to disk.
    """
    report = (best_model, output_df.shape, training_dummy_columns)
    for item in report:
        print(item)

# Driver function

In [17]:
def driver(csv_name,label_col,num_features):
    """Run the whole pipeline: search for a model, report it, then hand it to ``save``.

    Parameters
    ----------
    csv_name : str
        CSV file name, forwarded to ``get_best_model``.
    label_col : str
        Name of the label column.
    num_features : int
        Number of features to keep during feature selection.
    """
    search_result = get_best_model(csv_name, label_col, num_features)
    best_model, output_df = search_result

    display_results(best_model, output_df)

    # training_dummy_columns is the module-level value set during pre-processing.
    save(best_model, output_df, training_dummy_columns)
    

# Main function

In [18]:
if __name__ == '__main__':

    # Run configuration: dataset file name (without .csv), label column and
    # the number of features to keep.
    csv_name, label_col, num_features = 'train_no_null', 'SalePrice', 10

    driver(csv_name, label_col, num_features)

Reading the data...
Pre-Processing the data...
Selecting the best features for training...
No of selected features: 10
Moulding the network architecture specifically for your data...
Running evaluatins with various architectures and hyper-parameters...
Testing set Mean Abs Error: 178150.82
Testing set Mean Abs Error: 56339.97
Testing set Mean Abs Error: 30781.71
Testing set Mean Abs Error: 165743.17
Testing set Mean Abs Error: 28308.02
Testing set Mean Abs Error: 54410.58
Testing set Mean Abs Error: 29250.03
Testing set Mean Abs Error: 177557.11
Testing set Mean Abs Error: 179356.46
"Testing set Mean Abs Error: NaN
##################################################
Results:
Training Size: 1160 rows
Testing Size: 291 rows
RMSE on the test data:  40501.29110427718
Percent error on the test data:  22.422971605741882 %
##################################################
<keras.engine.sequential.Sequential object at 0x7fa1a2aaaf60>
(1451, 77)
Index(['LotFrontage', 'LotArea', 'OverallQual', '

In [19]:
# # No of epochs used
# len(best_model.history.history['loss'])

# Displaying graphical results

In [23]:
# import matplotlib.pyplot as plt

In [22]:
# %matplotlib inline
# fig, ax = plt.subplots(figsize=(10, 10))
# plt.scatter(output_df.SalePrice,output_df.predictions)
# fig.suptitle('Prediction Analysis', fontsize=20)
# ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c="r")
# plt.xlabel('Actual', fontsize=14)
# plt.ylabel('Predicted', fontsize=14)