# T81-558: Applications of Deep Neural Networks
**Fall 2017 Kaggle Solution**
* Instructor: [Jeff Heaton](https://sites.wustl.edu/jeffheaton/), School of Engineering and Applied Science, [Washington University in St. Louis](https://engineering.wustl.edu/Programs/Pages/default.aspx)
* For more information visit the [class website](https://sites.wustl.edu/jeffheaton/t81-558/).

# Useful Functions

Same functions as in class.

In [1]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """One-hot encode column `name` of `df`, modifying `df` in place.

    A new column called "<name>-<value>" is added for every column that
    pd.get_dummies produces for df[name]; the original column is then
    dropped.  Nothing is returned.
    """
    one_hot = pd.get_dummies(df[name])
    for value, indicator in one_hot.items():
        df["{}-{}".format(name, value)] = indicator
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    """Add one 0/1 indicator column per requested value of column `name`.

    For each entry `tv` of `target_values`, a column "<name>-<tv>" is added
    to `df` holding 1 where the string form of df[name] equals str(tv) and 0
    everywhere else.  The source column is left in place.
    """
    for wanted in target_values:
        as_text = df[name].astype(str)
        flags = [int(str(cell) == str(wanted)) for cell in as_text]
        df["{}-{}".format(name, wanted)] = flags


# Encode text values to indexes (i.e. [0],[1],[2] for blue,green,red; LabelEncoder numbers the sorted classes from 0).
def encode_text_index(df, name):
    """Replace column `name` of `df` with integer class codes, in place.

    The codes come from a freshly fitted sklearn LabelEncoder.  Returns the
    encoder's `classes_` attribute (the original labels it learned).
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardize column `name` of `df` in place as (value - mean) / sd.

    `mean` and `sd` default to the column's own mean and standard deviation
    (pandas `Series.std`) when they are not supplied.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    """Fill the missing entries of column `name` with that column's median, in place."""
    df[name] = df[name].fillna(df[name].median())


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    """Fill the missing entries of column `name` with `default_value`, in place."""
    filled = df[name].fillna(default_value)
    df[name] = filled


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    """Split a DataFrame into the (x, y) float32 arrays used for training.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target: Name of the target column; every other column is a feature.

    Returns:
        A tuple (x, y) of float32 NumPy arrays.  When the target column's
        dtype is int64 or int32 the task is treated as classification and y
        is the dummy (one-hot) encoding of the target; otherwise it is
        treated as regression and y is the target as a single column.
    """
    feature_names = [column for column in df.columns if column != target]

    # df[target] is normally a Series with a single dtype.  If the label is
    # duplicated it is a DataFrame and .dtypes is a Series of dtypes; in that
    # case the first one decides.
    target_type = df[target].dtypes
    if isinstance(target_type, pd.Series):
        target_type = target_type.iloc[0]

    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
    # same NumPy array on both old and new pandas.  TensorFlow likes 32 bits.
    x = df[feature_names].values.astype(np.float32)

    if target_type in (np.int64, np.int32):
        # Classification
        y = pd.get_dummies(df[target]).values.astype(np.float32)
    else:
        # Regression
        y = df[[target]].values.astype(np.float32)
    return x, y

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration given in seconds as "H:MM:SS.ss"."""
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)


# Regression chart.
def chart_regression(pred,y,sort=True):
    """Plot expected and predicted regression outputs on one line chart.

    `pred` and the flattened `y` are paired row by row.  When `sort` is true
    the pairs are ordered by the expected value before plotting.  The chart
    is shown with plt.show(); nothing is returned.
    """
    frame = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        frame.sort_values(by=['y'],inplace=True)
    # Expected values are drawn first, then the predictions.
    for column, caption in (('y', 'expected'), ('pred', 'prediction')):
        plt.plot(frame[column].tolist(), label=caption)
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is sd or more standard deviations away from the column mean
def remove_outliers(df, name, sd):
    """Drop, in place, the rows whose `name` value is an outlier.

    A row is dropped when the absolute distance between its value and the
    column mean is greater than or equal to `sd` times the column's standard
    deviation.
    """
    column = df[name]
    distance = (column - column.mean()).abs()
    too_far = distance >= (sd * column.std())
    df.drop(df.index[too_far], axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Rescale column `name` of `df` in place to a target range.

    Values are mapped linearly so that `data_low` lands on `normalized_low`
    and `data_high` lands on `normalized_high`.

    Args:
        df: DataFrame to modify.
        name: Column to rescale.
        normalized_low: Lower end of the output range.
        normalized_high: Upper end of the output range.
        data_low: Input value mapped to `normalized_low`; defaults to the
            column minimum.
        data_high: Input value mapped to `normalized_high`; defaults to the
            column maximum.
    """
    # Default each bound independently.  Previously data_high was only
    # computed when data_low was None, so passing just data_low left
    # data_high as None (TypeError) and passing just data_high was ignored.
    # Series.min()/max() are used so missing values do not poison the bounds.
    if data_low is None:
        data_low = df[name].min()
    if data_high is None:
        data_high = df[name].max()

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

# Encode and Setup Network

These functions define the neural network I will use and how I will encode the feature vector. BATCH_SIZE and MAX_EPOCHS are also defined here.  I tune this section to improve my score.

In [27]:
import keras

BATCH_SIZE = 16
MAX_EPOCHS = 158

def encode(df):
    """Engineer, encode and select the model's features, modifying `df` in place.

    Steps, in order:
      1. Add the engineered columns density, dcomp, wealth, tcomp and junk.
      2. Z-score the numeric columns and dummy-encode 'type_name'.
      3. Drop 'type', 'id' and the features that were selected out.
      4. If a 'sales' column is present, drop the rows whose sales are 0.

    Nothing is returned.
    """
    # Feature engineering
    df['density'] = df['population'] / df['sqmiles']

    # dcomp: for each row, the "<type>_type_count" value of the row's own type.
    df['dcomp'] = 0
    for type_id in range(5):
        of_type = df['type'] == type_id
        df.loc[of_type, 'dcomp'] = df.loc[of_type, '{}_type_count'.format(type_id)]

    df['wealth'] = (df['income'] * df['population'])/1000

    # tcomp: sum of the five per-type counts.
    total_count = df['0_type_count']
    for type_id in range(1, 5):
        total_count = total_count + df['{}_type_count'.format(type_id)]
    df['tcomp'] = total_count

    # Uniform random column (dropped again below).
    df['junk'] = np.random.uniform(size=len(df))

    # Encoding
    zscore_columns = (
        'age', 'sqft', 'income', 'lot_size', 'pets', 'population',
        'sqmiles', 'urban', 'density', 'wealth',
        'zip',  # bad idea, but I want to show where it ranks
    )
    for column in zscore_columns:
        encode_numeric_zscore(df, column)
    encode_text_dummy(df, 'type_name')
    encode_numeric_zscore(df, 'dcomp')

    # Feature selection
    # Currently kept, but previously tried as drop candidates:
    # sqft, wealth, density, sqmiles, population, dcomp, urban.
    # NOTE(review): '4_type_count' is not dropped although the other four
    # type counts are; confirm whether that is intentional.
    dropped_columns = (
        'type', 'id',
        'income', 'tcomp',
        '1_type_count', '0_type_count', '3_type_count', '2_type_count',
        'lot_size', 'zip', 'pets', 'age', 'junk',
    )
    for column in dropped_columns:
        df.drop(column, axis=1, inplace=True)

    # Only data that carries the target has a 'sales' column.
    if 'sales' in df.columns:
        zero_sales = df.index[df.sales == 0]
        df.drop(zero_sales, inplace=True)
    
def build_network(x):
    """Build and compile the regression network for inputs shaped like `x`.

    Only x.shape[1] is read; it sets the input width of the first layer.
    The stack is Dense(100) -> Dense(50) -> Dropout(0.1) -> Dense(25) ->
    Dropout(0.1) -> Dense(1), with ReLU on the hidden Dense layers, compiled
    with mean squared error loss and a default Adam optimizer.

    Returns the compiled, untrained model.
    """
    layers = [
        Dense(100, input_dim=x.shape[1], activation='relu'),
        Dense(50, activation='relu'),
        keras.layers.Dropout(0.1),
        Dense(25, activation='relu'),
        keras.layers.Dropout(0.1),
        Dense(1),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(loss='mean_squared_error', optimizer=keras.optimizers.Adam())
    return model

# Feature Importance

I evaluate feature importance several times, usually over 100 epochs.  Each time I select features (remove some) or engineer new ones, I run at least one fold of the CV section to see what my validation error is.  This gives me a rough idea of whether the change is helping or hurting.

In [3]:
from sklearn import metrics
import scipy as sp
import numpy as np
import math
from sklearn import metrics

def perturbation_rank(model, x, y, names, regression):
    """Rank input features by the error the model makes when each is shuffled.

    Each column of `x` is shuffled in place in turn, the model is scored on
    the perturbed data, and the column is then restored from a saved copy.

    Args:
        model: Object exposing predict (regression) or predict_proba
            (classification).
        x: 2-D array of inputs; temporarily modified, restored on return.
        y: Expected outputs passed to the error metric.
        names: Feature names, one per column of `x`.
        regression: True to score with mean squared error, False to score
            with log loss.

    Returns:
        DataFrame with columns name, error and importance (error divided by
        the largest error), sorted by importance in descending order.
    """
    errors = []

    for col in range(x.shape[1]):
        saved = x[:, col].copy()
        np.random.shuffle(x[:, col])

        if regression:
            error = metrics.mean_squared_error(y, model.predict(x))
        else:
            error = metrics.log_loss(y, model.predict_proba(x))

        errors.append(error)
        # Put the original column back before perturbing the next one.
        x[:, col] = saved

    worst = np.max(errors)
    table = pd.DataFrame(
        {
            'name': names,
            'error': errors,
            'importance': [e/worst for e in errors],
        },
        columns=['name', 'error', 'importance'],
    )
    table.sort_values(by=['importance'], ascending=False, inplace=True)
    table.reset_index(inplace=True, drop=True)
    return table

In [3]:
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping

# Directory that holds the preprocessed training file.
path = "./"


filename_train = os.path.join(path,"preprocess_train.csv")
# The literal string 'NA' is parsed as a missing value.
df_train = pd.read_csv(filename_train,na_values=['NA'])

# NOTE(review): everything below is commented out, so this cell does not
# define `model`, `x_train` or `y_train`.  The ranking cells call
# perturbation_rank(model, x_train, y_train, ...), so they presumably rely on
# values left over from another cell; confirm before running top to bottom.

# Preprocess
# encode(df_train)


# # Encode to a 2D matrix for training
# x_train,y_train = to_xy(df_train,'sales')

# # Fit on entire training
# model = build_network(x_train)
# model.fit(x_train,y_train,batch_size=BATCH_SIZE,verbose=2,epochs=MAX_EPOCHS)

# # Predict
# pred = model.predict(x_train)

ImportError: cannot import name np_utils

In [30]:
# This is my orig ranking, with no feature selection (removing low ranked columns)
from IPython.display import display, HTML

# Feature names are every column of df_train except the target.
# NOTE(review): assumes the remaining columns are in the same order as the
# columns of x_train; confirm against how x_train was built.
names = list(df_train.columns.values) # x column names
names.remove('sales') # Remove the target
# The final True selects perturbation_rank's regression (mean squared error) branch.
rank = perturbation_rank(model, x_train, y_train, names, True)
display(rank)

Unnamed: 0,name,error,importance
0,type_name-grocery,2.884399,1.0
1,sqft,1.832996,0.635486
2,wealth,0.533841,0.185079
3,type_name-electronics,0.532955,0.184772
4,type_name-outdoors,0.462659,0.1604
5,type_name-farm,0.455983,0.158086
6,density,0.334284,0.115894
7,type_name-hardware,0.333903,0.115762
8,sqmiles,0.232331,0.080548
9,population,0.171833,0.059573


In [22]:
# This is my new ranking, with feature selection (removing low ranked columns)
from IPython.display import display, HTML

# Feature names are every column of df_train except the target.
# NOTE(review): assumes the remaining columns are in the same order as the
# columns of x_train; confirm against how x_train was built.
names = list(df_train.columns.values) # x column names
names.remove('sales') # Remove the target
# The final True selects perturbation_rank's regression (mean squared error) branch.
rank = perturbation_rank(model, x_train, y_train, names, True)
display(rank)

Unnamed: 0,name,error,importance
0,type_name-grocery,2.814895,1.0
1,sqft,2.04513,0.726539
2,wealth,0.654666,0.232572
3,density,0.534584,0.189913
4,sqmiles,0.52001,0.184735
5,type_name-hardware,0.374401,0.133007
6,type_name-electronics,0.311244,0.11057
7,type_name-outdoors,0.29062,0.103244
8,type_name-farm,0.199498,0.070872
9,population,0.155126,0.055109


# Crossvalidate

This section has two purposes:

* I run this initially to determine if cutting features or engineering others is helping or hurting.
* I run this again to get an idea of what my CV score is (estimate of my Kaggle position) and to see how many epochs I might want.

In [18]:
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping

path = "./data/"
# Upper bound on epochs per fold; early stopping normally ends training sooner.
# This is a different constant from MAX_EPOCHS, which is set next to build_network.
MAX_EPOCH = 1000

filename_read = os.path.join(path,"train.csv")
# 'NA' and '?' are parsed as missing values.
df = pd.read_csv(filename_read,na_values=['NA','?'])

# Shuffle
# Fixed seed so the row permutation is the same on every run.
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

# Preprocess
# encode() modifies df in place.
encode(df)

# Encode to a 2D matrix for training
x,y = to_xy(df,'sales')

# Per-fold expected values, predictions and stopping epochs.
oos_y = []
oos_pred = []
epochs = []

# Cross validate
# 5 folds; KFold is not asked to shuffle because the rows were shuffled above.
kf = KFold(5)

fold = 0
for train, test in kf.split(x):
    fold+=1
    print("Fold #{}".format(fold))

    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]

    # build_network only reads x.shape[1], so passing the full x is equivalent
    # to passing x_train.  A fresh model is built for every fold.
    model = build_network(x)
    # Stop once val_loss has not improved by at least 1e-3 for 50 epochs.
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=50, verbose=1, mode='auto')
    # NOTE(review): the held-out fold is used both as validation_data for early
    # stopping and for the fold score below, so the score may be optimistic.
    model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],batch_size=BATCH_SIZE,
              verbose=0,epochs=MAX_EPOCH)

    # A stopped_epoch of 0 is treated as "early stopping never fired", so the
    # full epoch budget is recorded instead.
    if monitor.stopped_epoch == 0:
        epochs.append(MAX_EPOCH)
    else:
        epochs.append(monitor.stopped_epoch)

    pred = model.predict(x_test)

    oos_y.append(y_test)
    oos_pred.append(pred)

    # Measure this fold's RMSE
    score = np.sqrt(metrics.mean_squared_error(pred,y_test))
    print("Fold score (RMSE): {}".format(score))


# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = np.sqrt(metrics.mean_squared_error(oos_pred,oos_y))
print("Final, out of sample score (RMSE): {}".format(score))

print("Fold early stop epochs: {}".format(epochs))
# The truncated mean of the recorded epochs is printed as the suggested epoch count.
print("Mean epochs: {}".format(int(np.mean(epochs))))

# Out-of-sample RMSE recorded from earlier runs, by number of features:
# 0.2790030837059021 - 9 features
# 0.2442721426486969 - 7 features
# 0.2635799050331116 - 5 features

Fold #1
Epoch 00147: early stopping
Fold score (RMSE): 0.26310887932777405
Fold #2
Epoch 00184: early stopping
Fold score (RMSE): 0.24052371084690094
Fold #3
Epoch 00147: early stopping
Fold score (RMSE): 0.2813667356967926
Fold #4
Epoch 00184: early stopping
Fold score (RMSE): 0.2582513093948364
Fold #5
Epoch 00132: early stopping
Fold score (RMSE): 0.269981324672699
Final, out of sample score (RMSE): 0.26299387216567993
Fold early stop epochs: [147, 184, 147, 184, 132]
Mean epochs: 158


# Submit

Finally, generate a submission.  Train on the entire training set for the number of epochs that cross-validation indicated.

In [20]:
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping

path = "./data/"

filename_train = os.path.join(path,"train.csv")
filename_test = os.path.join(path,"test.csv")
filename_write = os.path.join(path,"submit.csv")


# 'NA' and '?' are parsed as missing values.
df_train = pd.read_csv(filename_train,na_values=['NA','?'])
df_test = pd.read_csv(filename_test,na_values=['NA','?'])

# Keep the ids now: encode() drops the 'id' column.
test_ids = df_test['id']

# Preprocess
# encode() modifies each frame in place.
# NOTE(review): each call z-scores with the statistics of the frame it is
# given, so the test set is scaled by its own mean/sd rather than the
# training set's; confirm this is intended.
encode(df_train)
encode(df_test)


# Encode to a 2D matrix for training
x_train,y_train = to_xy(df_train,'sales')
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# NumPy array on old and new pandas.
x_test = df_test.values.astype(np.float32)

# Fit on entire training
# MAX_EPOCHS is the fixed epoch count defined next to build_network; no
# validation data or early stopping is used here.
model = build_network(x_train)
model.fit(x_train,y_train,batch_size=BATCH_SIZE,verbose=2,epochs=MAX_EPOCHS)

# Predict
pred = model.predict(x_test)

# Create submission data set
# The prediction column is paired with the saved ids and the two columns are
# named 'id' and 'sales'.
df_submit = pd.DataFrame(pred)
df_submit.insert(0,'id',test_ids)
df_submit.columns = ['id','sales']

df_submit.to_csv(filename_write, index=False)

Epoch 1/158
7s - loss: 0.4290
Epoch 2/158
7s - loss: 0.2051
Epoch 3/158
7s - loss: 0.1629
Epoch 4/158
7s - loss: 0.1473
Epoch 5/158
7s - loss: 0.1376
Epoch 6/158
7s - loss: 0.1285
Epoch 7/158
7s - loss: 0.1267
Epoch 8/158
7s - loss: 0.1219
Epoch 9/158
7s - loss: 0.1188
Epoch 10/158
7s - loss: 0.1185
Epoch 11/158
7s - loss: 0.1179
Epoch 12/158
7s - loss: 0.1120
Epoch 13/158
7s - loss: 0.1124
Epoch 14/158
7s - loss: 0.1108
Epoch 15/158
7s - loss: 0.1145
Epoch 16/158
7s - loss: 0.1126
Epoch 17/158
7s - loss: 0.1094
Epoch 18/158
7s - loss: 0.1092
Epoch 19/158
7s - loss: 0.1074
Epoch 20/158
7s - loss: 0.1070
Epoch 21/158
7s - loss: 0.1046
Epoch 22/158
7s - loss: 0.1068
Epoch 23/158
7s - loss: 0.1030
Epoch 24/158
7s - loss: 0.1035
Epoch 25/158
7s - loss: 0.1040
Epoch 26/158
7s - loss: 0.1055
Epoch 27/158
7s - loss: 0.1012
Epoch 28/158
7s - loss: 0.1007
Epoch 29/158
7s - loss: 0.1026
Epoch 30/158
7s - loss: 0.1018
Epoch 31/158
7s - loss: 0.1012
Epoch 32/158
7s - loss: 0.1014
Epoch 33/158
7s -