In [1]:
import numpy as np
import pandas as pd
import pprint

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import mean_squared_error as MSE, make_scorer

from functools import partial
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import ReLU, PReLU
from keras.optimizers import SGD, Adam

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import pickle
import datetime

In [2]:
train = pd.read_csv('../data/input/train.csv')
test = pd.read_csv('../data/input/test.csv')
sample_sub = pd.read_csv('../data/input/sample_submit.csv', header=None, names=['id', 'mpg'])

train_test = pd.concat([train, test], ignore_index=True, sort=False)

# Preprocessing

## car name, car brand

In [3]:
train_test['car name'] = train_test['car name'].str.replace('vw', 'volkswagen')
train_test['car name'] = train_test['car name'].str.replace('vokswagen', 'volkswagen')
train_test['car name'] = train_test['car name'].str.replace('toyouta', 'toyota')
train_test['car name'] = train_test['car name'].str.replace('chevy', 'chevrolet')
train_test['car name'] = train_test['car name'].str.replace('datsun 200-sx', 'datsun 200sx')

train_test['car name'] = train_test['car name'].str.replace('datsun 210 mpg', 'datsun 210')
train_test['car name'] = train_test['car name'].str.replace('ford gran torino (sw)', 'ford gran torino')

In [4]:
train_test['car_brand'] = train_test['car name'].apply(lambda x: x.strip().split(' ')[0])

## horse power

In [5]:
train_test['horsepower'].replace('?', np.nan, inplace=True)
train_test['horsepower'] = train_test['horsepower'].astype(float)

In [6]:
# car name毎のhorse powerの平均値を計算(nanは計算外)
name_hp_mean = train_test.groupby('car name').horsepower.mean()
df_hp_isnull = train_test[train_test['horsepower'].isnull()]

# nan位置のtrainデータのhorse powerを置換する
for i in df_hp_isnull.index:
    train_test.loc[i, 'horsepower'] = name_hp_mean[df_hp_isnull.loc[i, 'car name']]
    
del df_hp_isnull, name_hp_mean

# Encoding

In [7]:
cat_cols = ['car name', 'car_brand']
for cols in cat_cols:
    train_test[cols] = LabelEncoder().fit_transform(train_test[cols])

del cat_cols

# Create Model 

In [8]:
test_treated = train_test[train_test['mpg'].isnull()].copy()
train_treated = train_test.dropna(subset=['mpg'], axis=0).copy()

train_x = train_treated.drop(['id', 'mpg'], axis=1)
train_y = train_treated['mpg']

In [9]:
kf = KFold(n_splits=4, shuffle=True, random_state=1)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

In [10]:
print("tr_x:", tr_x.shape)
print("tr_y:", tr_y.shape)

print("va_x:", va_x.shape)
print("va_y:", va_y.shape)

tr_x: (375, 9)
tr_y: (375,)
va_x: (125, 9)
va_y: (125,)


In [57]:
class MLP:

    def __init__(self, params):
        self.params = params
        self.scaler = None
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):

        # パラメータ
        input_dropout = self.params['input_dropout']
        hidden_layers = int(self.params['hidden_layers'])
        hidden_units = int(self.params['hidden_units'])
        hidden_activation = self.params['hidden_activation']
        hidden_dropout = self.params['hidden_dropout']
        batch_norm = self.params['batch_norm']
        optimizer_type = self.params['optimizer']['type']
        optimizer_lr = self.params['optimizer']['lr']
        batch_size = int(self.params['batch_size'])

        # 標準化
        self.scaler = StandardScaler()
        tr_x = self.scaler.fit_transform(tr_x)
        va_x = self.scaler.transform(va_x)

        self.model = Sequential()

        # 入力層
        self.model.add(Dropout(input_dropout, input_shape=(tr_x.shape[1],)))

        # 中間層
        for i in range(hidden_layers):
            self.model.add(Dense(hidden_units))
            if batch_norm == 'before_act':
                self.model.add(BatchNormalization())
            if hidden_activation == 'prelu':
                self.model.add(PReLU())
            elif hidden_activation == 'relu':
                self.model.add(ReLU())
            else:
                raise NotImplementedError
            self.model.add(Dropout(hidden_dropout))

        # 出力層
        self.model.add(Dense(1))

        # オプティマイザ
        if optimizer_type == 'sgd':
            optimizer = SGD(lr=optimizer_lr, decay=1e-6, momentum=0.9, nesterov=True)
        elif optimizer_type == 'adam':
            optimizer = Adam(lr=optimizer_lr, beta_1=0.9, beta_2=0.999, decay=0.)
        else:
            raise NotImplementedError

        # 目的関数、評価指標などの設定
        self.model.compile(loss='binary_crossentropy',
                           optimizer=optimizer, metrics=['mse'])

        # エポック数、アーリーストッピング
        # あまりepochを大きくすると、小さい学習率のときに終わらないことがあるので注意
        nb_epoch = 200
        patience = 20
        early_stopping = EarlyStopping(patience=patience, restore_best_weights=True)

        # 学習の実行
        history = self.model.fit(tr_x, tr_y,
                                 epochs=nb_epoch,
                                 batch_size=batch_size, verbose=0,
                                 validation_data=(va_x, va_y),
                                 callbacks=[early_stopping])

    def predict(self, x):
        # 予測
        x = self.scaler.transform(x)
        y_pred = self.model.predict(x)
        y_pred = y_pred.flatten()
        return y_pred


In [54]:
def RMSE(y_true, y_pred):
    return(np.sqrt(MSE(y_true, y_pred)))

def objective(params):    
    model = MLP(params)
    model.fit(tr_x, tr_y, va_x, va_y)
    va_pred = model.predict(va_x)
    score = RMSE(va_y, va_pred)
    print(f'params:{params}, RMSE:{score:.4f}')
    
    history.append((params, score))
    
    return {
        'loss':score,
        'status':STATUS_OK,
        'params':params
    }

In [13]:
base_param = {
    'input_dropout': 0.0,
    'hidden_layers': 3,
    'hidden_units': 96,
    'hidden_activation': 'relu',
    'hidden_dropout': 0.2,
    'batch_norm': 'before_act',
    'optimizer': {'type': 'adam', 'lr': 0.001},
    'batch_size': 64,
}

In [14]:
param_space ={
    'input_dropout': hp.quniform('input_dropout', 0, 0.2, 0.05),
    'hidden_layers': hp.quniform('hidden_layers', 2, 4, 1),
    'hidden_units': hp.quniform('hidden_units', 32, 256, 32),
    'hidden_activation': hp.choice('hidden_activation', ['prelu', 'relu']),
    'hidden_dropout': hp.quniform('hidden_dropout', 0, 0.3, 0.05),
    'batch_norm': hp.choice('batch_norm', ['before_act', 'no']),
    'optimizer': hp.choice('optimizer',
                           [{'type': 'adam',
                             'lr': hp.loguniform('adam_lr', np.log(0.00001), np.log(0.01))},
                            {'type': 'sgd',
                             'lr': hp.loguniform('sgd_lr', np.log(0.00001), np.log(0.01))}]),
    'batch_size': hp.quniform('batch_size', 32, 128, 32),
}

In [64]:
max_evals = 10
nn_trials = Trials()
history = []
best = fmin(objective, param_space, algo=tpe.suggest, trials=nn_trials, max_evals=max_evals)

params:{'batch_norm': 'before_act', 'batch_size': 96.0, 'hidden_activation': 'prelu', 'hidden_dropout': 0.25, 'hidden_layers': 3.0, 'hidden_units': 96.0, 'input_dropout': 0.15000000000000002, 'optimizer': {'lr': 0.0016497304579874068, 'type': 'adam'}}, RMSE:26.8290
params:{'batch_norm': 'before_act', 'batch_size': 96.0, 'hidden_activation': 'relu', 'hidden_dropout': 0.1, 'hidden_layers': 3.0, 'hidden_units': 192.0, 'input_dropout': 0.15000000000000002, 'optimizer': {'lr': 0.0012792902485279035, 'type': 'adam'}}, RMSE:27.0367
params:{'batch_norm': 'before_act', 'batch_size': 64.0, 'hidden_activation': 'prelu', 'hidden_dropout': 0.0, 'hidden_layers': 3.0, 'hidden_units': 224.0, 'input_dropout': 0.15000000000000002, 'optimizer': {'lr': 0.0008111102288841103, 'type': 'sgd'}}, RMSE:53.5599
params:{'batch_norm': 'before_act', 'batch_size': 96.0, 'hidden_activation': 'prelu', 'hidden_dropout': 0.05, 'hidden_layers': 3.0, 'hidden_units': 64.0, 'input_dropout': 0.0, 'optimizer': {'lr': 0.007983

In [66]:
history

[({'batch_norm': 'before_act',
   'batch_size': 96.0,
   'hidden_activation': 'prelu',
   'hidden_dropout': 0.25,
   'hidden_layers': 3.0,
   'hidden_units': 96.0,
   'input_dropout': 0.15000000000000002,
   'optimizer': {'lr': 0.0016497304579874068, 'type': 'adam'}},
  26.829003796482798),
 ({'batch_norm': 'before_act',
   'batch_size': 96.0,
   'hidden_activation': 'relu',
   'hidden_dropout': 0.1,
   'hidden_layers': 3.0,
   'hidden_units': 192.0,
   'input_dropout': 0.15000000000000002,
   'optimizer': {'lr': 0.0012792902485279035, 'type': 'adam'}},
  27.036676561857483),
 ({'batch_norm': 'before_act',
   'batch_size': 64.0,
   'hidden_activation': 'prelu',
   'hidden_dropout': 0.0,
   'hidden_layers': 3.0,
   'hidden_units': 224.0,
   'input_dropout': 0.15000000000000002,
   'optimizer': {'lr': 0.0008111102288841103, 'type': 'sgd'}},
  53.55986256347308),
 ({'batch_norm': 'before_act',
   'batch_size': 96.0,
   'hidden_activation': 'prelu',
   'hidden_dropout': 0.05,
   'hidden_la

In [68]:
history[8]

({'batch_norm': 'no',
  'batch_size': 64.0,
  'hidden_activation': 'prelu',
  'hidden_dropout': 0.15000000000000002,
  'hidden_layers': 3.0,
  'hidden_units': 224.0,
  'input_dropout': 0.2,
  'optimizer': {'lr': 0.007221454504585187, 'type': 'adam'}},
 12.247204493711498)

# Create Submission

In [90]:
best_params = history[8][0]
best_model = MLP(best_params)
best_model.fit(tr_x, tr_y, va_x, va_y)
score = RMSE(va_y, best_model.predict(va_x))
pprint.pprint(best_params)
print(f'RMSE:{score:.4f}')

pred = best_model.predict(test_treated.drop(['id', 'mpg'], axis=1))

{'batch_norm': 'no',
 'batch_size': 64.0,
 'hidden_activation': 'prelu',
 'hidden_dropout': 0.15000000000000002,
 'hidden_layers': 3.0,
 'hidden_units': 224.0,
 'input_dropout': 0.2,
 'optimizer': {'lr': 0.007221454504585187, 'type': 'adam'}}
RMSE:10.2538


In [91]:
dt = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
submission = pd.DataFrame({'id':sample_sub.id})

submission = pd.concat(
    [submission, pd.Series(pred, name='pred')],
    axis=1
)

display(submission)

Unnamed: 0,id,pred
0,1,22.808289
1,2,15.977661
2,5,17.584614
3,6,25.347910
4,8,23.612181
...,...,...
495,992,17.968962
496,993,24.335495
497,996,18.278454
498,998,17.542641


# Save Files

In [93]:
# features
train_treated.to_pickle('../features/feature_train_' + dt + '_treated.pkl')
test_treated.to_pickle('../features/feature_test_' + dt + '_treated.pkl')

# submission
submission[['id', 'pred']].to_csv('../data/output/sub_' + dt + '_nn.csv', header=False, index=False)
# model
pickle.dump(best_model, open('../models/model_' + dt + '_nn.pickle', 'wb'))
# best parameter
pickle.dump(best_params, open('../logs/params_' + dt + '_nn.pickle', 'wb'))
# best train score
pickle.dump(score, open('../logs/train_score' + dt + '_nn.pickle', 'wb'))