In [1]:
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
path = "./Data/"

filename_read = os.path.join(path,"mpg.csv")
df = pd.read_csv(filename_read,na_values=['NA','?'])

In [3]:
# Shuffle
np.random.seed(12)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

In [4]:
# Preprocess
cars = df['name']
df.drop('name',1,inplace=True)

def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)



In [5]:
# 2D matrix for training

def missing_median(df,name):
    med = df[name].median()
    df[name] = df[name].fillna(med)
missing_median(df, 'horsepower')
x,y = to_xy(df,'mpg')



In [6]:
# Cross-validate

kf = KFold(5)

oos_y = []
oos_pred = []
fold = 0

for train, test in kf.split(x):
    fold +=1
    print("fold #{}".format(fold))
    
    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]
    
    model = Sequential()
    model.add(Dense(20, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)
    
    pred = model.predict(x_test)
    
    oos_y.append(y_test)
    oos_pred.append(pred)        

    # Measure this fold's RMSE
    score = np.sqrt(metrics.mean_squared_error(pred,y_test))
    print("Fold score (RMSE): {}".format(score))


oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = np.sqrt(metrics.mean_squared_error(oos_pred,oos_y))
print("Final, out of sample score (RMSE): {}".format(score))    


fold #1
Epoch 00379: early stopping
Fold score (RMSE): 3.2287333011627197
fold #2
Epoch 00239: early stopping
Fold score (RMSE): 4.114338397979736
fold #3
Epoch 00363: early stopping
Fold score (RMSE): 5.652881622314453
fold #4
Epoch 00241: early stopping
Fold score (RMSE): 6.445863723754883
fold #5
Epoch 00332: early stopping
Fold score (RMSE): 5.2656145095825195
Final, out of sample score (RMSE): 5.066740036010742
