In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from sklearn.utils import resample
from hyperopt import STATUS_OK, hp, fmin, tpe, Trials, space_eval

from time import time
import operator

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.ERROR)

### Start training

In [2]:
# Import data
def load_data(x_path="X.csv", y_path="ytr.csv", split=98000):
    """Load the instance features and attach the bag target to the training rows.

    Parameters
    ----------
    x_path : str
        CSV holding 13 columns, renamed here to 7 reflectance columns,
        5 solar columns and the bag ``id`` (in that order).
    y_path : str
        CSV with the columns ``Id`` and ``y`` (one target per bag id).
    split : int
        Number of leading rows of ``x_path`` that form the training set;
        the remaining rows form the test set.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Independent copies of the first ``split`` rows and of the remaining
        rows. ``y`` is filled for training rows whose id appears in
        ``y_path`` and is NaN everywhere else (including every test row).
    """
    full_data = pd.read_csv(x_path)
    train_y = pd.read_csv(y_path)
    # Rename columns to something more interpretable
    columns = (["reflectance_" + str(i) for i in range(7)]
               + ["solar_" + str(i) for i in range(5)] + ["id"])
    full_data.columns = columns

    # Add y to the data frame. `map` is a plain lookup: an id without a
    # target becomes NaN, whereas `replace` would keep the raw id as target.
    y_id_dict = train_y.set_index("Id")["y"].to_dict()
    labels = full_data["id"].map(y_id_dict)
    # Only the first `split` rows are labelled; the test rows stay NaN.
    is_train_row = np.arange(len(full_data)) < split
    full_data["y"] = labels.where(is_train_row)

    # Copies, so that later in-place edits of train/test do not write into
    # a slice of full_data (SettingWithCopyWarning).
    train = full_data.iloc[:split].copy()
    test = full_data.iloc[split:].copy()
    return (train, test)

#columns = (["id"] + ["reflectance_" + str(i) for i in range(7)]
#           + ["solar_" + str(i) for i in range(5)] + ["y"])
#full_data = pd.read_csv("MODIS.csv", header=None, names=columns)
#split = 98000
#train, test = full_data[:split].copy(), full_data[split:].copy()
#train_copy, test_copy = load_data()
# Load the labelled training rows and the test rows (whose y is left empty).
train, test = load_data()

# Parameters
# NOTE(review): n_threads is not referenced by any other cell visible here.
n_threads = -1
# Base seed used for the bootstrap resampling of the training data.
random_seed = 22

In [3]:
train.head()

Unnamed: 0,reflectance_0,reflectance_1,reflectance_2,reflectance_3,reflectance_4,reflectance_5,reflectance_6,solar_0,solar_1,solar_2,solar_3,solar_4,id,y
0,1.580642,2.482233,5.887092,4.732722,4.408482,3.830171,4.388508,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
1,2.338455,3.627796,4.723716,3.324726,2.743442,4.727652,2.810193,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
2,2.224569,3.522241,6.188831,4.389783,4.177616,4.945918,4.122848,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
3,1.717218,2.712012,5.024211,3.944907,3.393424,3.931973,3.489578,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
4,2.378857,3.644976,4.515292,3.223825,2.739952,4.599662,2.781574,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082


In [3]:
#train, test = train_copy[19600:].copy(), train_copy[:19600].copy()

### Preprocessing

In [4]:
cols_excl = ["id", "y"]
cols_orig = [c for c in train.columns if c not in cols_excl]

# Work on independent copies so the assignments below never write into a
# slice of another frame.
train, test = train.copy(), test.copy()

# Standardise the features with statistics estimated on the training set
# only, and apply the very same transformation to the test set: both sets
# must live on the same scale for the fitted models to be applicable.
# ddof=0 matches sklearn's `scale`; constant columns are left unscaled.
feature_mean = train[cols_orig].mean()
feature_std = train[cols_orig].std(ddof=0).replace(0, 1.0)

train[cols_orig] = (train[cols_orig] - feature_mean) / feature_std
test[cols_orig] = (test[cols_orig] - feature_mean) / feature_std

### Neural net

In [6]:
#feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_orig]

#regressor = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                         # hidden_units=[9, 9],
                                         #)#model_dir="./temp_log")

#train, test = train_copy[19600:].copy(), train_copy[:19600].copy()

#def input_fn(data_set):
 #   feature_cols = {k: tf.constant(data_set[k].values) for k in cols_orig}
  #  labels = tf.constant(data_set["y"].values)
    
   # return feature_cols, labels

#validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
 #   input_fn=lambda: input_fn(test),
  #  early_stopping_rounds=100)

#regressor.fit(input_fn=lambda: input_fn(train), steps=10)
              #monitors=[validation_monitor])

#ev = regressor.evaluate(input_fn=lambda: input_fn(train), steps=1)
#loss_score = ev["loss"]
#print("Loss: {0:f}".format(loss_score))

#y = regressor.predict(input_fn=lambda: input_fn(train))
# .predict() returns an iterator; convert to a list and print predictions
#predictions = np.array(list(itertools.islice(y, 0, None)))

### Cross-validation

In [6]:
# Every network of the ensemble is fed the same feature columns.
cols_dnn = cols_orig

# Identifiers of the five networks: dnn_1 ... dnn_5.
dnn_names = ["dnn_" + str(k) for k in range(1, 6)]

# Equal-weight blend of the five networks.
models_weights = dict.fromkeys(dnn_names, 0.2)
#models_cols = {"dnn_1": cols_dnn}
models_cols = {name: cols_dnn for name in dnn_names}

def _dnn_input_fn(data_set):
    """Convert a data frame into the (features, labels) tensors fed to the DNN."""
    features = {k: tf.constant(data_set[k].values) for k in cols_dnn}
    labels = tf.constant(data_set["y"].values)
    return features, labels


def _fit_dnn(data_set, parameters, feature_cols, model_dir=None):
    """Create a one-hidden-layer DNNRegressor and fit it on `data_set`."""
    model = tf.contrib.learn.DNNRegressor(
        feature_columns=feature_cols,
        hidden_units=[parameters["nb_neurons_1"]],
        model_dir=model_dir)
    model.fit(input_fn=lambda: _dnn_input_fn(data_set), steps=parameters["steps"])
    return model


def _predict_dnn(model, data_set):
    """Return the instance-level predictions of `model` as a numpy array."""
    # .predict() returns an iterator; convert to an array
    return np.array(list(model.predict(input_fn=lambda: _dnn_input_fn(data_set))))


def _bag_median_predictions(model, data_set):
    """Predict every instance and attach the median prediction of its bag.

    Returns a frame indexed like `data_set` with columns id, y_hat, y_med.
    """
    pred = data_set[["id"]].assign(y_hat=_predict_dnn(model, data_set))
    pred["y_med"] = pred.groupby("id")["y_hat"].transform("median")
    return pred


def _prune_outliers(train_fold, parameters, feature_cols, model_dir):
    """Fit a DNN, then repeatedly drop per-bag outliers and refit.

    The loop goes on only while the training RMSE improves by more than
    parameters["gain_threshold"]. Returns the last fitted model and the
    pruned training frame.
    """
    model = _fit_dnn(train_fold, parameters, feature_cols, model_dir)
    train_pred = _bag_median_predictions(model, train_fold)
    RMSE = np.sqrt(mean_squared_error(train_pred["y_med"].values, train_fold["y"]))

    count = 0
    while True:
        count += 1
        # Squared distance from the bag median, ranked inside each bag and
        # divided by the bag size so that ranks are comparable across bags.
        train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"]) ** 2
        score_by_bag = train_pred.groupby("id")["score"]
        relative_rank = score_by_bag.rank() / score_by_bag.transform("count")

        # Remove outliers
        outliers_index = relative_rank > (1 - parameters["outliers_threshold"])
        train_fold = train_fold.loc[~outliers_index, :].reset_index(drop=True)

        # NOTE(review): the same model_dir is reused on purpose in the
        # original code; tf.contrib.learn estimators are expected to resume
        # from the checkpoint stored there - confirm against the TF version.
        model = _fit_dnn(train_fold, parameters, feature_cols, model_dir)

        # Compute new RMSE
        train_pred = _bag_median_predictions(model, train_fold)
        new_RMSE = np.sqrt(mean_squared_error(train_pred["y_med"].values, train_fold["y"]))
        print("Pruning {0} RMSE: {1}".format(count, new_RMSE))

        # Keep pruning only on a real improvement. The former abs() test
        # also continued when the RMSE got worse by more than the threshold.
        if RMSE - new_RMSE > parameters["gain_threshold"]:
            RMSE = new_RMSE
        else:
            break

    return model, train_fold


# Scoring function in the hyperopt hyperparameters tuning.
def scoring_function(parameters):
    """Cross-validated RMSE of the pruned + bagged DNN ensemble.

    Parameters
    ----------
    parameters : dict
        Must provide "steps", "nb_neurons_1", "outliers_threshold" and
        "gain_threshold".

    Returns
    -------
    dict
        {"loss": mean validation RMSE over the folds, "status": STATUS_OK}.
    """
    print("Training the model with parameters: ")
    print(parameters)
    n_splits = 5

    # Generate random integer for model_dir
    # NOTE(review): unseeded and limited to 1000 values, so a directory left
    # by an earlier run may be picked up again.
    random_int = np.random.randint(1000)

    feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]
    # Every weighted model except dnn_1 is trained on its own bootstrap sample.
    bootstrap_names = sorted(name for name in models_weights if name != "dnn_1")

    fold_scores = []
    kf = KFold(n_splits=n_splits)
    for nb_fold, (train_index, validation_index) in enumerate(kf.split(train), 1):
        # KFold yields positions, hence iloc.
        train_fold = train.iloc[train_index]
        validation_fold = train.iloc[validation_index]

        model_dir = "./log_{0}_{1}_{2}_{3}".format(
            parameters["steps"], parameters["nb_neurons_1"], nb_fold, random_int)

        # Prune outliers; the pruning model itself is kept as dnn_1.
        model_dnn, train_fold = _prune_outliers(
            train_fold, parameters, feature_cols, model_dir)

        # Bagging of DNN
        models = {"dnn_1": model_dnn}
        for offset, name in enumerate(bootstrap_names):
            bootstrap = (resample(train_fold, random_state=(random_seed + offset))
                         .sort_values(by=["id"])
                         .reset_index(drop=True))
            models[name] = _fit_dnn(bootstrap, parameters, feature_cols)

        # Compute RMSE on validation set
        validation_pred = validation_fold[["id"]].assign(y_hat=0.0).reset_index(drop=True)
        for name, model in models.items():
            validation_pred["y_hat"] += models_weights[name] * _predict_dnn(model, validation_fold)

        # Use median value by id
        y_med = validation_pred.groupby("id")["y_hat"].transform("median")

        RMSE = np.sqrt(mean_squared_error(y_med.values, validation_fold["y"]))
        fold_scores.append(RMSE)
        print("Validation fold {0} RMSE: {1}".format(nb_fold, RMSE))

    average_RMSE = sum(fold_scores) / n_splits

    print("Cross-validation score: {0}\n".format(average_RMSE))

    return {"loss": average_RMSE, "status": STATUS_OK}

In [7]:
t0 = time()

# Candidate values for the discrete hyperparameters.
steps_candidates = np.arange(3000, 5000, 200, dtype=int)
neurons_candidates = np.arange(9, 12, 1, dtype=int)

# Grid to pick parameters from.
parameters_grid = {
    "steps": hp.choice("steps", steps_candidates),
    "nb_neurons_1": hp.choice("nb_neurons_1", neurons_candidates),
    "outliers_threshold": hp.quniform("outliers_threshold", 0.03, 0.06, 0.01),
    "gain_threshold": hp.quniform("gain_threshold", 0.005, 0.02, 0.005),
    #"nb_neurons_2": hp.choice("nb_neurons_2", np.arange(5, 10, 1, dtype=int))
}

# Record the information about the cross-validation.
trials = Trials()

# Run the TPE search; `best` holds what fmin returns for the lowest loss.
best = fmin(fn=scoring_function,
            space=parameters_grid,
            algo=tpe.suggest,
            max_evals=2,
            trials=trials)

# Wall-clock duration of the whole search, in seconds.
computing_time = time() - t0

Training the model with parameters: 
{'gain_threshold': 0.015, 'steps': 3800, 'nb_neurons_1': 10, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.674073882641
Pruning 2 RMSE: 0.662346176042
Validation fold 1 RMSE: 0.641617979853
Pruning 1 RMSE: 0.635676000166
Pruning 2 RMSE: 0.619615432596
Pruning 3 RMSE: 0.610423043833
Validation fold 2 RMSE: 0.73246715849
Pruning 1 RMSE: 0.643307566169
Pruning 2 RMSE: 0.62938122591
Validation fold 3 RMSE: 0.710291994342
Pruning 1 RMSE: 0.659927208941
Pruning 2 RMSE: 0.647666579446
Validation fold 4 RMSE: 0.670328953596
Pruning 1 RMSE: 0.632532403066
Pruning 2 RMSE: 0.614925033513
Pruning 3 RMSE: 0.605334750968
Validation fold 5 RMSE: 0.753588096326
Cross-validation score: 0.701658836521

Training the model with parameters: 
{'gain_threshold': 0.015, 'steps': 4800, 'nb_neurons_1': 10, 'outliers_threshold': 0.05}


KeyboardInterrupt: 

In [None]:
5 DNN and use the pruning DNN
Training the model with parameters: 
{'gain_threshold': 0.015, 'steps': 3800, 'nb_neurons_1': 10, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.674073882641
Pruning 2 RMSE: 0.662346176042
Validation fold 1 RMSE: 0.641617979853
Pruning 1 RMSE: 0.635676000166
Pruning 2 RMSE: 0.619615432596
Pruning 3 RMSE: 0.610423043833
Validation fold 2 RMSE: 0.73246715849
Pruning 1 RMSE: 0.643307566169
Pruning 2 RMSE: 0.62938122591
Validation fold 3 RMSE: 0.710291994342
Pruning 1 RMSE: 0.659927208941
Pruning 2 RMSE: 0.647666579446
Validation fold 4 RMSE: 0.670328953596
Pruning 1 RMSE: 0.632532403066
Pruning 2 RMSE: 0.614925033513
Pruning 3 RMSE: 0.605334750968
Validation fold 5 RMSE: 0.753588096326
Cross-validation score: 0.701658836521

In [None]:
5 DNNs: prune with a first DNN, then train 5 DNNs, each weighted 0.2
Training the model with parameters: 
{'gain_threshold': 0.015, 'steps': 2200, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.694068399998
Pruning 2 RMSE: 0.68319691192
Validation fold 1 RMSE: 0.657984789376
Pruning 1 RMSE: 0.665720056564
Pruning 2 RMSE: 0.652525762923
Validation fold 2 RMSE: 0.746126089816
Pruning 1 RMSE: 0.670865372689
Pruning 2 RMSE: 0.657049715495
Validation fold 3 RMSE: 0.715898563369
Pruning 1 RMSE: 0.674144585081
Pruning 2 RMSE: 0.655254059805
Pruning 3 RMSE: 0.645841747765
Validation fold 4 RMSE: 0.686994886197
Pruning 1 RMSE: 0.640782640919
Pruning 2 RMSE: 0.62576629908
Pruning 3 RMSE: 0.6191962246
Validation fold 5 RMSE: 0.757705397434
Cross-validation score: 0.712941945239

Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 3400, 'nb_neurons_1': 11, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.672635757823
Pruning 2 RMSE: 0.661982328413
Pruning 3 RMSE: 0.655412024265
Validation fold 1 RMSE: 0.632837032519
Pruning 1 RMSE: 0.642811068713
Pruning 2 RMSE: 0.632067871794
Pruning 3 RMSE: 0.624793548264
Validation fold 2 RMSE: 0.727289064772
Pruning 1 RMSE: 0.658106411403
Pruning 2 RMSE: 0.635442263112
Pruning 3 RMSE: 0.618591227467
Pruning 4 RMSE: 0.609851324476
Validation fold 3 RMSE: 0.717622664993
Pruning 1 RMSE: 0.655587115002
Pruning 2 RMSE: 0.64177101986
Pruning 3 RMSE: 0.634889111319
Validation fold 4 RMSE: 0.67796187099
Pruning 1 RMSE: 0.635166511006
Pruning 2 RMSE: 0.612866670615
Pruning 3 RMSE: 0.602643711575
Pruning 4 RMSE: 0.590786436001
Pruning 5 RMSE: 0.585929889116
Validation fold 5 RMSE: 0.745885863766
Cross-validation score: 0.700319299408

Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 3200, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.673878243224
Pruning 2 RMSE: 0.657360188279
Pruning 3 RMSE: 0.642604850865
Pruning 4 RMSE: 0.635441991907
Validation fold 1 RMSE: 0.639493686463
Pruning 1 RMSE: 0.650316328335
Pruning 2 RMSE: 0.635764740628
Pruning 3 RMSE: 0.625281288299
Pruning 4 RMSE: 0.618857759816
Validation fold 2 RMSE: 0.732526425149
Pruning 1 RMSE: 0.663290232475
Pruning 2 RMSE: 0.651664810483
Pruning 3 RMSE: 0.638306582562
Pruning 4 RMSE: 0.625389960277
Pruning 5 RMSE: 0.619638207743
Validation fold 3 RMSE: 0.709389594885
Pruning 1 RMSE: 0.665109073757
Pruning 2 RMSE: 0.655310627332
Validation fold 4 RMSE: 0.684705184081
Pruning 1 RMSE: 0.639195545405
Pruning 2 RMSE: 0.624107169148
Pruning 3 RMSE: 0.614627025796
Validation fold 5 RMSE: 0.751663827286
Cross-validation score: 0.703555743573

Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 2400, 'nb_neurons_1': 9, 'outliers_threshold': 0.06}
Pruning 1 RMSE: 0.672027783277
Pruning 2 RMSE: 0.658416156325
Pruning 3 RMSE: 0.650714100529
Validation fold 1 RMSE: 0.64726858528
Pruning 1 RMSE: 0.67959968961
Pruning 2 RMSE: 0.663599458586
Pruning 3 RMSE: 0.653446512383
Pruning 4 RMSE: 0.647451596973
Validation fold 2 RMSE: 0.73745400766
Pruning 1 RMSE: 0.671306058337
Pruning 2 RMSE: 0.652840696921
Pruning 3 RMSE: 0.641901681627
Pruning 4 RMSE: 0.634831914775
Validation fold 3 RMSE: 0.711275356156
Pruning 1 RMSE: 0.672597980531
Pruning 2 RMSE: 0.660307287653
Pruning 3 RMSE: 0.655366412873
Validation fold 4 RMSE: 0.682952884798
Pruning 1 RMSE: 0.652781515031
Pruning 2 RMSE: 0.638197522441
Pruning 3 RMSE: 0.628958394346
Validation fold 5 RMSE: 0.768046027259
Cross-validation score: 0.70939937223
    
    
Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 2400, 'nb_neurons_1': 10, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.686858771359
Pruning 2 RMSE: 0.672878467855
Pruning 3 RMSE: 0.665149234767
Validation fold 1 RMSE: 0.642083994677
Pruning 1 RMSE: 0.652773718198
Pruning 2 RMSE: 0.641075258377
Pruning 3 RMSE: 0.63371016034
Validation fold 2 RMSE: 0.738007531296
Pruning 1 RMSE: 0.663196334567
Pruning 2 RMSE: 0.643597311854
Pruning 3 RMSE: 0.630428357205
Pruning 4 RMSE: 0.623077216084
Validation fold 3 RMSE: 0.71170883265
Pruning 1 RMSE: 0.689810676757
Pruning 2 RMSE: 0.677621845341
Pruning 3 RMSE: 0.669599938078
Validation fold 4 RMSE: 0.690884317575
Pruning 1 RMSE: 0.651690464316
Pruning 2 RMSE: 0.639165758066
Pruning 3 RMSE: 0.630737742502
Validation fold 5 RMSE: 0.753819804982
Cross-validation score: 0.707300896236

In [None]:
5 DNN model_dir for pruning {"dnn_1": 0.4, "dnn_2": 0.15, "dnn_3": 0.15,
                  "dnn_4": 0.15, "dnn_5": 0.15}
Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 2100, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.676416816474
Pruning 2 RMSE: 0.663520827885
Pruning 3 RMSE: 0.655253952782
Validation fold 1 RMSE: 0.638520567696
Pruning 1 RMSE: 0.664297149376
Pruning 2 RMSE: 0.651764716202
Pruning 3 RMSE: 0.643706111477
Validation fold 2 RMSE: 0.739466649119
Pruning 1 RMSE: 0.664528195881
Pruning 2 RMSE: 0.651617243289
Pruning 3 RMSE: 0.644582271931
Validation fold 3 RMSE: 0.718805738469
Pruning 1 RMSE: 0.687431994827
Pruning 2 RMSE: 0.673373446494
Pruning 3 RMSE: 0.665013005853
Validation fold 4 RMSE: 0.67264303545
Pruning 1 RMSE: 0.659725291472
Pruning 2 RMSE: 0.647505977336
Pruning 3 RMSE: 0.640364412581
Validation fold 5 RMSE: 0.756788708091
Cross-validation score: 0.705244939765

    
Training the model with parameters: (same coefs)
{'gain_threshold': 0.01, 'steps': 2100, 'nb_neurons_1': 11, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.671783958403
Pruning 2 RMSE: 0.659132942345
Pruning 3 RMSE: 0.65087802989
Validation fold 1 RMSE: 0.642550227753
Pruning 1 RMSE: 0.653374572234
Pruning 2 RMSE: 0.63645978204
Pruning 3 RMSE: 0.624386509142
Pruning 4 RMSE: 0.6140196058
Pruning 5 RMSE: 0.609660735924
Validation fold 2 RMSE: 0.725755962849
Pruning 1 RMSE: 0.664299289878
Pruning 2 RMSE: 0.648366505728
Pruning 3 RMSE: 0.640081573857
Validation fold 3 RMSE: 0.714825658397
Pruning 1 RMSE: 0.66744880357
Pruning 2 RMSE: 0.651428349357
Pruning 3 RMSE: 0.639399741055
Pruning 4 RMSE: 0.631543676175
Validation fold 4 RMSE: 0.670998448018
Pruning 1 RMSE: 0.659382834695
Pruning 2 RMSE: 0.639853733358
Pruning 3 RMSE: 0.630634884058
Validation fold 5 RMSE: 0.757291856953
Cross-validation score: 0.702284430794
    
Training the model with parameters: (same coefs)
{'gain_threshold': 0.01, 'steps': 1100, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.687293470623
Pruning 2 RMSE: 0.673543089191
Pruning 3 RMSE: 0.665733359442
Validation fold 1 RMSE: 0.649368530503
Pruning 1 RMSE: 0.679928297676
Pruning 2 RMSE: 0.668609243864
Pruning 3 RMSE: 0.662262044876
Validation fold 2 RMSE: 0.744633907485
Pruning 1 RMSE: 0.669438977141
Pruning 2 RMSE: 0.658915490238
Pruning 3 RMSE: 0.649607975009
Validation fold 3 RMSE: 0.710778483402
Pruning 1 RMSE: 0.708289420438
Pruning 2 RMSE: 0.696308353637
Pruning 3 RMSE: 0.689003339174
Validation fold 4 RMSE: 0.710196884293
Pruning 1 RMSE: 0.682813446172
Pruning 2 RMSE: 0.670210695852
Pruning 3 RMSE: 0.661776509643
Validation fold 5 RMSE: 0.76190366844
Cross-validation score: 0.715376294825

In [None]:
# 1 DNN with model_dir corrected
Training the model with parameters: 
{'gain_threshold': 0.005, 'steps': 1000, 'nb_neurons_1': 11, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.708254362329
Pruning 2 RMSE: 0.690809364036
Pruning 3 RMSE: 0.681259286244
Pruning 4 RMSE: 0.675679762642
Pruning 5 RMSE: 0.670965511179
Validation fold 1 RMSE: 0.64861798817
Pruning 1 RMSE: 0.658292571898
Pruning 2 RMSE: 0.645737414652
Pruning 3 RMSE: 0.63797501753
Pruning 4 RMSE: 0.632583378549
Pruning 5 RMSE: 0.627584029004
Validation fold 2 RMSE: 0.723643911448
Pruning 1 RMSE: 0.687961545764
Pruning 2 RMSE: 0.672401033846
Pruning 3 RMSE: 0.660611288466
Pruning 4 RMSE: 0.650237174155
Pruning 5 RMSE: 0.64198570573
Pruning 6 RMSE: 0.636986731147
Validation fold 3 RMSE: 0.719981056486
Pruning 1 RMSE: 0.697781938911
Pruning 2 RMSE: 0.681268268398
Pruning 3 RMSE: 0.671483140797
Pruning 4 RMSE: 0.666088232133
Pruning 5 RMSE: 0.661858777702
Validation fold 4 RMSE: 0.715854578144
Pruning 1 RMSE: 0.66224449426
Pruning 2 RMSE: 0.644538081642
Pruning 3 RMSE: 0.634079022739
Pruning 4 RMSE: 0.624630852786
Pruning 5 RMSE: 0.619603795183
Pruning 6 RMSE: 0.61598970139
Validation fold 5 RMSE: 0.762927716346
Cross-validation score: 0.714205050119

Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 1100, 'nb_neurons_1': 10, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.713960264442
Pruning 2 RMSE: 0.701637487701
Pruning 3 RMSE: 0.69375349128
Validation fold 1 RMSE: 0.641879797964
Pruning 1 RMSE: 0.671532366166
Pruning 2 RMSE: 0.656442799709
Pruning 3 RMSE: 0.645330625403
Pruning 4 RMSE: 0.638014896705
Validation fold 2 RMSE: 0.741045867638
Pruning 1 RMSE: 0.690110283558
Pruning 2 RMSE: 0.678068607072
Pruning 3 RMSE: 0.662923307388
Pruning 4 RMSE: 0.654881538105
Validation fold 3 RMSE: 0.687377107282
Pruning 1 RMSE: 0.70616839345
Pruning 2 RMSE: 0.692895897509
Pruning 3 RMSE: 0.686021953655
Validation fold 4 RMSE: 0.688327537634
Pruning 1 RMSE: 0.66943420746
Pruning 2 RMSE: 0.65286815522
Pruning 3 RMSE: 0.638691155949
Pruning 4 RMSE: 0.627600556022
Pruning 5 RMSE: 0.622316478
Validation fold 5 RMSE: 0.765910014041
Cross-validation score: 0.704908064912

In [None]:
# Ensemble of 3 DNN
Training the model with parameters: 
{'gain_threshold': 0.005, 'steps': 1000, 'nb_neurons_1': 11, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.71116941007
Pruning 2 RMSE: 0.710134244747
Validation fold 1 RMSE: 0.67016269301
Pruning 1 RMSE: 0.693788730369
Pruning 2 RMSE: 0.681699022039
Pruning 3 RMSE: 0.703264811229
Pruning 4 RMSE: 0.671334288662
Pruning 5 RMSE: 0.672755595177
Validation fold 2 RMSE: 0.742960274892
Pruning 1 RMSE: 0.693855159996
Pruning 2 RMSE: 0.691917725258
Validation fold 3 RMSE: 0.716453069035
Pruning 1 RMSE: 0.698029552278
Pruning 2 RMSE: 0.698402913616
Validation fold 4 RMSE: 0.702381816743
Pruning 1 RMSE: 0.669835541061
Pruning 2 RMSE: 0.670934743074
Validation fold 5 RMSE: 0.772065210064
Cross-validation score: 0.720804612749
    
Training the model with parameters: 
{'gain_threshold': 0.005, 'steps': 1100, 'nb_neurons_1': 11, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.710970210115
Pruning 2 RMSE: 0.723884803499
Pruning 3 RMSE: 0.705558405266
Pruning 4 RMSE: 0.69082150458
Pruning 5 RMSE: 0.709012637324
Pruning 6 RMSE: 0.689840083409
Pruning 7 RMSE: 0.682960915979
Pruning 8 RMSE: 0.689320580892
Pruning 9 RMSE: 0.733669462986
Pruning 10 RMSE: 0.702594016806
Pruning 11 RMSE: 0.700092614618
Validation fold 1 RMSE: 0.639888747346
Pruning 1 RMSE: 0.702693117724
Validation fold 2 RMSE: 0.756274354279
Pruning 1 RMSE: 0.691091309155
Pruning 2 RMSE: 0.691357233141
Validation fold 3 RMSE: 0.73679429174
Pruning 1 RMSE: 0.714865007836
Pruning 2 RMSE: 0.689779608571
Pruning 3 RMSE: 0.688207703398
Validation fold 4 RMSE: 0.686380994175
Pruning 1 RMSE: 0.684824585281
Pruning 2 RMSE: 0.669877290062
Pruning 3 RMSE: 0.686328240571
Pruning 4 RMSE: 0.663275388897
Pruning 5 RMSE: 0.668140496708
Validation fold 5 RMSE: 0.768409407507
Cross-validation score: 0.717549559009
    
Training the model with parameters: 
{'gain_threshold': 0.005, 'steps': 1100, 'nb_neurons_1': 11, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.705035094312
Pruning 2 RMSE: 0.714659640929
Pruning 3 RMSE: 0.712357459314
Validation fold 1 RMSE: 0.661191916714
Pruning 1 RMSE: 0.686965456513
Validation fold 2 RMSE: 0.749879491243
Pruning 1 RMSE: 0.689762043097
Pruning 2 RMSE: 0.69232873535
Validation fold 3 RMSE: 0.724016986696
Pruning 1 RMSE: 0.690028220234
Pruning 2 RMSE: 0.69337827918
Validation fold 4 RMSE: 0.703435739334
Pruning 1 RMSE: 0.681859928879
Pruning 2 RMSE: 0.67091896457
Pruning 3 RMSE: 0.677659388273
Pruning 4 RMSE: 0.664039872409
Pruning 5 RMSE: 0.667718583335
Validation fold 5 RMSE: 0.757494009563
Cross-validation score: 0.71920362871

In [8]:
min(trials.losses())

In [None]:
# Save the best parameters as a csv.
# space_eval maps the result of fmin back onto the search space.
best_values = space_eval(parameters_grid, best)
best_parameters = pd.DataFrame(
    {name: [value] for name, value in best_values.items()})

# Add the corresponding score.
best_parameters["score"] = min(trials.losses())
best_parameters.to_csv("best_parameters_6.csv", index=False, encoding="utf-8")

best_parameters

### Training models

In [8]:
cols_dnn = cols_orig

models_weights = {"dnn_1": 0.2, "dnn_2": 0.2, "dnn_3": 0.2,
                  "dnn_4": 0.2, "dnn_5": 0.2}

models_cols = {"dnn_1": cols_dnn, "dnn_2": cols_dnn, "dnn_3": cols_dnn,
               "dnn_4": cols_dnn, "dnn_5": cols_dnn}

best_parameters = pd.read_csv("best_parameters_5.csv", encoding="utf-8")
#best_parameters = dict(zip(best_parameters.columns[:-1], best_parameters.iloc[0].values[:-1]))

# Read the tuned values once (first row of the csv); counts must be integers.
best_nb_neurons = int(best_parameters["nb_neurons_1"][0])
best_steps = int(best_parameters["steps"][0])
best_outliers_threshold = best_parameters["outliers_threshold"][0]
best_gain_threshold = best_parameters["gain_threshold"][0]

feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]

model_dir = "./log_submit_5"


def input_fn(data_set):
    """Convert a data frame into the (features, labels) tensors fed to the DNN."""
    features = {k: tf.constant(data_set[k].values) for k in cols_dnn}
    labels = tf.constant(data_set["y"].values)
    return features, labels


def _fit_final_dnn(data_set, model_dir=None):
    """Create a one-hidden-layer DNNRegressor with the tuned size and fit it."""
    model = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                          hidden_units=[best_nb_neurons],
                                          model_dir=model_dir)
    model.fit(input_fn=lambda: input_fn(data_set), steps=best_steps)
    return model


def _final_bag_predictions(model, data_set):
    """Predict every instance and attach the median prediction of its bag."""
    # .predict() returns an iterator; convert to an array
    y_hat = np.array(list(model.predict(input_fn=lambda: input_fn(data_set))))
    pred = data_set[["id"]].assign(y_hat=y_hat)
    # Use median value by id
    pred["y_med"] = pred.groupby("id")["y_hat"].transform("median")
    return pred


model_dnn = _fit_final_dnn(train, model_dir)
train_pred = _final_bag_predictions(model_dnn, train)
RMSE = np.sqrt(mean_squared_error(train_pred["y_med"].values, train["y"]))

# Prune outliers
# NOTE: `train` is overwritten by its pruned version, as in the original
# cell, so re-running this cell prunes already-pruned data.
count = 0
while True:
    count += 1
    # Squared distance from the bag median, ranked inside each bag and
    # divided by the bag size so that ranks are comparable across bags.
    train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"]) ** 2
    score_by_bag = train_pred.groupby("id")["score"]
    relative_rank = score_by_bag.rank() / score_by_bag.transform("count")

    # Remove outliers
    outliers_index = relative_rank > (1 - best_outliers_threshold)
    train = train.loc[~outliers_index, :].reset_index(drop=True)

    # NOTE(review): model_dir is reused on purpose in the original code;
    # the estimator is expected to resume from its checkpoint - confirm.
    model_dnn = _fit_final_dnn(train, model_dir)

    # Compute new RMSE
    train_pred = _final_bag_predictions(model_dnn, train)
    new_RMSE = np.sqrt(mean_squared_error(train_pred["y_med"].values, train["y"]))
    print("Pruning {0} RMSE: {1}".format(count, new_RMSE))

    # Keep pruning only on a real improvement. The former abs() test also
    # continued when the RMSE got worse by more than the threshold.
    if RMSE - new_RMSE > best_gain_threshold:
        RMSE = new_RMSE
    else:
        break

# Bagging of DNN

# dnn_1 is a fresh network fitted on the whole pruned training set.
models = {"dnn_1": _fit_final_dnn(train)}

# dnn_2 ... dnn_5 are each fitted on their own bootstrap sample, seeded
# with random_seed, random_seed + 1, ...
for offset in range(4):
    bootstrap = (resample(train, random_state=(random_seed + offset))
                 .sort_values(by=["id"])
                 .reset_index(drop=True))
    models["dnn_{0}".format(offset + 2)] = _fit_final_dnn(bootstrap)

Pruning 1 RMSE: 0.682372839094
Pruning 2 RMSE: 0.668013398984
Pruning 3 RMSE: 0.659537802041


### Predicting on test set

In [9]:
def _test_inputs():
    """Input function handing the test frame to every estimator."""
    return input_fn(test)


# One row per test instance; y_hat accumulates the weighted ensemble output.
test_pred = test[["id"]].reset_index(drop=True)
test_pred["y_hat"] = 0

for model_name in models:
    raw_predictions = models[model_name].predict(input_fn=_test_inputs)
    # .predict() returns an iterator; convert to an array
    test_pred["y_hat"] += models_weights[model_name] * np.array(list(raw_predictions))

# Use median value by id
y_hat_med = test_pred.groupby("id")["y_hat"].median()

In [10]:
#RMSE = np.sqrt(mean_squared_error(test_pred["id"].replace(y_hat_med).values, test["y"]))
#RMSE
#0.65725435012348465 for Pred 4
#0.65362864377856866 for Pred 5

In [11]:
# One row per bag: its id and the median ensemble prediction.
kaggle_pred = y_hat_med.reset_index()
kaggle_pred.columns = ["Id", "y"]
kaggle_pred.to_csv("Prediction_5.csv", index=False, encoding="utf-8")

### Benchmark:
* Submit 1 (ensemble of xgboost + 2 ridge with instances model)
eta	eval_metric	gamma	lambda	max_depth	min_child_weight	nthread	objective	seed	silent	subsample	score
0.91834 Public LB 300 trees 0.09	rmse	0.2	0.8	4	4.0	-1	reg:linear	22	0	0.7	0.883339 (cross-val)
    
* Submit 2

Pruning with linear regression
then add contributions of aggregated xgboost + linear model

0.78181 Public LB  0.779345 CV

* Submit 3
DNN pruning
(wrong CV)
0.73270 Public LB 0.663128 CV with DNN 10 neurons 900 steps gain_threshold = 0.01 outliers_threshold = 0.05

* Prediction 4
Ensemble of 5 DNN
{'gain_threshold': 0.015, 'steps': 3800, 'nb_neurons_1': 10, 'outliers_threshold': 0.05} CV 0.701658836521
using first pruning DNN
* Prediction 5
Ensemble of 5 DNN
{'gain_threshold': 0.01, 'steps': 2400, 'nb_neurons_1': 10, 'outliers_threshold': 0.05} CV 0.707300896236

In [None]:
Dropout
Regression
Validation set instead of cross val