In [1]:
import numpy as np
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [2]:
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [3]:
test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [4]:
data = pd.concat([train, test], axis=0)
is_train = ~data.y.isnull()

print('Size of the data set: {} rows and {} columns'.format(*data.shape))

Size of the data set: 8418 rows and 378 columns


In [5]:
cat_feats = data.select_dtypes(include=['object']).columns
dummies = pd.get_dummies(data[cat_feats], drop_first=False)

print('oh-encoded shape: {} rows and {} columns'.format(*dummies.shape))

oh-encoded shape: 8418 rows and 211 columns


In [6]:
data = pd.concat([data.drop(cat_feats, axis=1), dummies], axis=1)

print('Size of the data set: {} rows and {} columns'.format(*data.shape))

Size of the data set: 8418 rows and 581 columns


In [7]:
train, test = data[is_train], data[~is_train]

X_train = train.drop(['ID', 'y'], axis=1).values
X_test = test.drop(['ID', 'y'], axis=1).values
y_train = train.y.values

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(4209, 579)
(4209,)
(4209, 579)


In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

model = SelectFromModel(rf, prefit=True)
X_train = model.transform(X_train)
X_test = model.transform(X_test)

print(X_train.shape) 
print(X_test.shape)

(4209, 82)
(4209, 82)


In [9]:
from keras import backend as K

def r2_keras(y_true, y_pred):
    SS_res =  K.sum(K.square(y_true - y_pred)) 
    SS_tot = K.sum(K.square(y_true - K.mean(y_true))) 
    return (1 - SS_res/(SS_tot + K.epsilon()))

Using TensorFlow backend.


In [10]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import EarlyStopping, ModelCheckpoint

model = Sequential()

model.add(Dense(X_train.shape[1], input_dim=X_train.shape[1]))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(X_train.shape[1]))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))

model.add(Dense(X_train.shape[1]//2))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))

model.add(Dense(X_train.shape[1]//4))
model.add(Activation('sigmoid'))

model.add(Dense(1, activation='linear'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=[r2_keras])

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import os

model_path = 'nn_model.h5'

X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train, test_size=0.2)

callbacks = [EarlyStopping(monitor='val_loss', patience=10, verbose=0),
             ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, verbose=0)]

model.fit(X_tr, y_tr, batch_size=20, nb_epoch=100, validation_data=(X_te, y_te), 
              verbose=0, callbacks=callbacks, shuffle=True)

if os.path.isfile(model_path):
    model = load_model(model_path, custom_objects={'r2_keras': r2_keras})

In [12]:
y_pr = model.predict(X_tr)**0.5

print('MSE train: {:.2f}, R^2 train: {:.2f}'.format(
    mean_squared_error(y_tr, y_pr), r2_score(y_tr, y_pr)))

MSE train: 8393.93, R^2 train: -49.84


In [13]:
y_pr = model.predict(X_te)**0.5

print('MSE test: {:.2f}, R^2 test: {:.2f}'.format(
    mean_squared_error(y_te, y_pr), r2_score(y_te, y_pr)))

MSE test: 8247.52, R^2 test: -56.74


In [14]:
test_id = test['ID']
y_pred = model.predict(X_test).ravel()

output = pd.DataFrame({'id': test_id, 'y': y_pred})
output.to_csv('submission_nn.csv', index=False)