In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from sklearn.utils import resample
from hyperopt import STATUS_OK, hp, fmin, tpe, Trials, space_eval

from time import time
import operator

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import tensorflow as tf

from scipy.stats import norm

tf.logging.set_verbosity(tf.logging.ERROR)

### Start training

In [2]:
# Import data
def load_data():
    full_data = pd.read_csv("X.csv")
    train_y = pd.read_csv("ytr.csv")
    # Rename columns to something more interpretable
    columns = (["reflectance_" + str(i) for i in range(7)]
               + ["solar_" + str(i) for i in range(5)] + ["id"])
    full_data.columns = columns
    # Add y to the data frame
    split = 98000
    y_id_dict = train_y.set_index("Id")["y"].to_dict()
    full_data.loc[:(split-1), "y"] = full_data.loc[:(split-1), "id"].map(y_id_dict)

    train, test = full_data[:split], full_data[split:]
    return (train, test)

columns = (["id"] + ["reflectance_" + str(i) for i in range(7)]
           + ["solar_" + str(i) for i in range(5)] + ["y"])
full_data = pd.read_csv("MODIS.csv", header=None, names=columns)
split = 98000
train, test = full_data[:split].copy(), full_data[split:].copy()

#train_copy, test_copy = load_data()

#train, test = load_data()

# Parameters
n_threads = -1
random_seed = 22

In [3]:
train.head()

Unnamed: 0,id,reflectance_0,reflectance_1,reflectance_2,reflectance_3,reflectance_4,reflectance_5,reflectance_6,solar_0,solar_1,solar_2,solar_3,solar_4,y
0,1,0.026993,0.012067,0.088535,0.050097,0.007748,0.004051,0.002929,28.420588,146.782941,20.686471,100.594706,159.884706,0.075627
1,1,0.029457,0.019613,0.087705,0.05213,0.01693,0.010574,0.003654,28.420588,146.782941,20.686471,100.594706,159.884706,0.075627
2,1,0.038491,0.150211,0.091345,0.062856,0.140568,0.076832,0.032414,28.420588,146.782941,20.686471,100.594706,159.884706,0.075627
3,1,0.041447,0.276798,0.089301,0.072769,0.23795,0.109721,0.03696,28.420588,146.782941,20.686471,100.594706,159.884706,0.075627
4,1,0.029073,0.027024,0.08895,0.052317,0.021162,0.011535,0.005997,28.420588,146.782941,20.686471,100.594706,159.884706,0.075627


In [4]:
#train, test = train_copy[19600:].copy(), train_copy[:19600].copy()

### Preprocessing

In [5]:
cols_excl = ["id", "y"]
cols_orig = [c for c in train.columns if c not in cols_excl]

# Standardise data for LR
train[cols_orig] = scale(train[cols_orig])
test[cols_orig] = scale(test[cols_orig])

### Neural net

In [6]:
#feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_orig]

#regressor = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                         # hidden_units=[9, 9],
                                         #)#model_dir="./temp_log")

#train, test = train_copy[19600:].copy(), train_copy[:19600].copy()

#def input_fn(data_set):
 #   feature_cols = {k: tf.constant(data_set[k].values) for k in cols_orig}
  #  labels = tf.constant(data_set["y"].values)
    
   # return feature_cols, labels

#validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
 #   input_fn=lambda: input_fn(test),
  #  early_stopping_rounds=100)

#regressor.fit(input_fn=lambda: input_fn(train), steps=10)
              #monitors=[validation_monitor])

#ev = regressor.evaluate(input_fn=lambda: input_fn(train), steps=1)
#loss_score = ev["loss"]
#print("Loss: {0:f}".format(loss_score))

#y = regressor.predict(input_fn=lambda: input_fn(train))
# .predict() returns an iterator; convert to a list and print predictions
#predictions = np.array(list(itertools.islice(y, 0, None)))

### Cross-validation

In [9]:
cols_dnn = cols_orig

models_weights = {"dnn_1": 1}#, "dnn_2": 0.2, "dnn_3": 0.2,
                  #"dnn_4": 0.2, "dnn_5": 0.2}
models_cols = {"dnn_1": cols_dnn}
#models_cols = {"dnn_1": cols_dnn, "dnn_2": cols_dnn, "dnn_3": cols_dnn,
 #              "dnn_4": cols_dnn, "dnn_5": cols_dnn}
    
learning_rate = 0.1

# Scoring function in the hyperopt hyperparameters tuning.
def scoring_function(parameters):
    print("Training the model with parameters: ")
    print(parameters)
    average_RMSE = 0.0
    n_splits = 5
    
    # Generate random integer for model_dir
    random_int = np.random.randint(1000)
    
    kf = KFold(n_splits=n_splits)
    nb_fold = 0
    for train_index, validation_index in kf.split(train):
        nb_fold += 1
        train_fold, validation_fold = train.loc[train_index], train.loc[validation_index] 

        feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]
        
        model_dir = ("./log_"
                     + str(parameters["steps"]) + "_"
                     + str(parameters["nb_neurons_1"]) + "_"
                     #+ str(parameters["nb_neurons_2"])
                     + str(nb_fold) + "_"
                     + str(random_int)
                    )
        
        # Tune number of layers
        model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                                  hidden_units=[parameters["nb_neurons_1"]],
                                                                #parameters["nb_neurons_2"]],
                                                  optimizer=tf.train.ProximalAdagradOptimizer(
                                                      learning_rate=learning_rate,
                                                      l2_regularization_strength=parameters["l2_reg"]),
                                                  dropout=parameters["dropout"],
                                                  model_dir=model_dir)

        def input_fn(data_set):
            feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
            labels = tf.constant(data_set["y"].values)
            return feature_cols, labels
        
        model_dnn.fit(input_fn=lambda: input_fn(train_fold), steps=parameters["steps"])

        train_pred = train_fold[["id"]].assign(y_hat=0)
        #for i, m in models.items():
        temp = model_dnn.predict(input_fn=lambda: input_fn(train_fold))
        # .predict() returns an iterator; convert to an array
        y_hat = np.array(list(itertools.islice(temp, 0, None)))
        train_pred["y_hat"] = y_hat

        # Use median value by id
        y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

        RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med).values, train_fold["y"]))
        
        # Prune outliers
        RMSE_decreasing = True
        count = 0
        while (RMSE_decreasing):
            count +=1
            train_pred["y_med"] = train_pred["id"].map(y_hat_med)

            # Distance from the median for each bag
            train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"])**2
            # Rank of each instance by bag
            train_pred["rank"] = train_pred.groupby("id")["score"].rank()
            bag_size_dict = train_pred.groupby("id")["score"].count().to_dict()
            train_pred["bag_size"] = train_pred["id"].map(bag_size_dict)
            train_pred["rank"] = train_pred["rank"] / train_pred["bag_size"]

            # Remove outliers
            outliers_index = train_pred["rank"] > (1 - parameters["outliers_threshold"])
            train_fold = train_fold.loc[~outliers_index, :].reset_index(drop=True)

            model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                                      hidden_units=[parameters["nb_neurons_1"]],
                                                                    #parameters["nb_neurons_2"]],
                                                      optimizer=tf.train.ProximalAdagradOptimizer(
                                                          learning_rate=learning_rate,
                                                          l2_regularization_strength=parameters["l2_reg"]),
                                                      dropout=parameters["dropout"],
                                                      model_dir=model_dir)

            model_dnn.fit(input_fn=lambda: input_fn(train_fold), steps=parameters["steps"])

            # Compute new RMSE
            train_pred = train_fold[["id"]].assign(y_hat=0)
            
            #for i, m in models.items():
            temp = model_dnn.predict(input_fn=lambda: input_fn(train_fold))
            # .predict() returns an iterator; convert to an array
            y_hat = np.array(list(itertools.islice(temp, 0, None)))
            train_pred["y_hat"] = y_hat

            # Use median value by id
            y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

            new_RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med), train_fold["y"]))
            print("Pruning {0} RMSE: {1}".format(count, new_RMSE))
                
            # 5 iterations of pruning
            #if (abs(new_RMSE - RMSE) > parameters["gain_threshold"]):
            if (count < 5):
                RMSE = new_RMSE
            else:
                RMSE_decreasing = False
        
        # Bagging of RNN
        # Bootstrap 1
        train_fold_1 = train_fold
        model_dnn_1 = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                                    hidden_units=[parameters["nb_neurons_1"]],
                                                    optimizer=tf.train.ProximalAdagradOptimizer(
                                                        learning_rate=learning_rate,
                                                        l2_regularization_strength=parameters["l2_reg"]),
                                                    dropout=parameters["dropout"])
        model_dnn_1.fit(input_fn=lambda: input_fn(train_fold_1), steps=parameters["steps"])
        
        # Boostrap 2
        #train_fold_2 = resample(train_fold, random_state=random_seed).sort_values(by=["id"]).reset_index(drop=True)
        #model_dnn_2 = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
         #                                           hidden_units=[parameters["nb_neurons_1"]])
                                                    #model_dir=model_dir)
        #model_dnn_2.fit(input_fn=lambda: input_fn(train_fold_2), steps=parameters["steps"])
            
        # Bootstrap 3
        #train_fold_3 = resample(train_fold, random_state=(random_seed+1)).sort_values(by=["id"]).reset_index(drop=True)
        #model_dnn_3 = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
         #                                           hidden_units=[parameters["nb_neurons_1"]])
                                                    #model_dir=model_dir)
        #model_dnn_3.fit(input_fn=lambda: input_fn(train_fold_3), steps=parameters["steps"])
        
        # Bootstrap 4
        #train_fold_4 = resample(train_fold, random_state=(random_seed+2)).sort_values(by=["id"]).reset_index(drop=True)
        #model_dnn_4 = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
         #                                           hidden_units=[parameters["nb_neurons_1"]])
                                                    #model_dir=model_dir)
        #model_dnn_4.fit(input_fn=lambda: input_fn(train_fold_4), steps=parameters["steps"])
        
        # Bootstrap 5
        #train_fold_5 = resample(train_fold, random_state=(random_seed+3)).sort_values(by=["id"]).reset_index(drop=True)
        #model_dnn_5 = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
         #                                           hidden_units=[parameters["nb_neurons_1"]])
                                                    #model_dir=model_dir)
        #model_dnn_5.fit(input_fn=lambda: input_fn(train_fold_5), steps=parameters["steps"])
        
        # Changed to model_dnn instead of model_dnn_1
        models = {"dnn_1": model_dnn_1}#, "dnn_2": model_dnn_2, "dnn_3": model_dnn_3,
                  #"dnn_4": model_dnn_4, "dnn_5": model_dnn_5}
        
        # Compute RMSE on validation set
        validation_pred = validation_fold[["id"]].assign(y_hat=0).reset_index(drop=True)
        for i, m in models.items():
            temp = m.predict(input_fn=lambda: input_fn(validation_fold))
            # .predict() returns an iterator; convert to an array
            y_hat = np.array(list(itertools.islice(temp, 0, None)))
            validation_pred["y_hat"] += models_weights[i] * y_hat
            
        # Use median value by id
        y_hat_med = validation_pred.groupby("id").median()["y_hat"].to_dict()
        
        RMSE = np.sqrt(mean_squared_error(validation_pred["id"].map(y_hat_med).values, validation_fold["y"]))
        average_RMSE += RMSE
        print("Validation fold {0} RMSE: {1}".format(nb_fold, RMSE))

    average_RMSE /= n_splits

    print("Cross-validation score: {0}\n".format(average_RMSE))
    
    return {"loss": average_RMSE, "status": STATUS_OK}

In [10]:
t0 = time()

# Grid to pick parameters from.
parameters_grid = {"steps"             : hp.choice("steps", np.arange(600, 1000, 100, dtype=int)),
                   "nb_neurons_1"      : hp.choice("nb_neurons_1", np.arange(9, 12, 1, dtype=int)),
                   "outliers_threshold": hp.quniform("outliers_threshold", 0.05, 0.051, 0.01),
                   "gain_threshold"    : hp.quniform("gain_threshold", 0.005, 0.02, 0.005),
                   "dropout": hp.quniform("dropout", 0.0, 0.015, 0.1),
                   "l2_reg": hp.quniform("l2_reg", 0.00, 0.005, 0.01)
                   #"nb_neurons_2": hp.choice("nb_neurons_2", np.arange(5, 10, 1, dtype=int))
                  }
# Record the information about the cross-validation.
trials = Trials()

best = fmin(scoring_function, parameters_grid, algo=tpe.suggest, max_evals=1, 
            trials=trials)

computing_time = time() - t0

Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 800, 'dropout': 0.0, 'l2_reg': 0.0, 'outliers_threshold': 0.05, 'nb_neurons_1': 10}
Pruning 1 RMSE: 0.101383996718
Pruning 2 RMSE: 0.0978095948641
Pruning 3 RMSE: 0.0953234314593
Pruning 4 RMSE: 0.0935930969832
Pruning 5 RMSE: 0.0917166903942
Validation fold 1 RMSE: 0.103266539533
Pruning 1 RMSE: 0.103283954457
Pruning 2 RMSE: 0.0978579181647
Pruning 3 RMSE: 0.0943521111809
Pruning 4 RMSE: 0.0918541919196
Pruning 5 RMSE: 0.0899483757189
Validation fold 2 RMSE: 0.126386504768
Pruning 1 RMSE: 0.106117454357
Pruning 2 RMSE: 0.098471412368
Pruning 3 RMSE: 0.0941309863349
Pruning 4 RMSE: 0.0913804097672
Pruning 5 RMSE: 0.0895612757944
Validation fold 3 RMSE: 0.124222866517
Pruning 1 RMSE: 0.0995391028902
Pruning 2 RMSE: 0.0951715062324
Pruning 3 RMSE: 0.0921694052346
Pruning 4 RMSE: 0.0903328714767
Pruning 5 RMSE: 0.0891820281494
Validation fold 4 RMSE: 0.109456731591
Pruning 1 RMSE: 0.0952387198159
Pruning 2 RMSE: 0.091

In [None]:
5 DNN and use the pruning DNN
Training the model with parameters: 
{'gain_threshold': 0.015, 'steps': 3800, 'nb_neurons_1': 10, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.674073882641
Pruning 2 RMSE: 0.662346176042
Validation fold 1 RMSE: 0.641617979853
Pruning 1 RMSE: 0.635676000166
Pruning 2 RMSE: 0.619615432596
Pruning 3 RMSE: 0.610423043833
Validation fold 2 RMSE: 0.73246715849
Pruning 1 RMSE: 0.643307566169
Pruning 2 RMSE: 0.62938122591
Validation fold 3 RMSE: 0.710291994342
Pruning 1 RMSE: 0.659927208941
Pruning 2 RMSE: 0.647666579446
Validation fold 4 RMSE: 0.670328953596
Pruning 1 RMSE: 0.632532403066
Pruning 2 RMSE: 0.614925033513
Pruning 3 RMSE: 0.605334750968
Validation fold 5 RMSE: 0.753588096326
Cross-validation score: 0.701658836521

In [None]:
# 1 DNN with model_dir corrected
Training the model with parameters: 
{'gain_threshold': 0.005, 'steps': 1000, 'nb_neurons_1': 11, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.708254362329
Pruning 2 RMSE: 0.690809364036
Pruning 3 RMSE: 0.681259286244
Pruning 4 RMSE: 0.675679762642
Pruning 5 RMSE: 0.670965511179
Validation fold 1 RMSE: 0.64861798817
Pruning 1 RMSE: 0.658292571898
Pruning 2 RMSE: 0.645737414652
Pruning 3 RMSE: 0.63797501753
Pruning 4 RMSE: 0.632583378549
Pruning 5 RMSE: 0.627584029004
Validation fold 2 RMSE: 0.723643911448
Pruning 1 RMSE: 0.687961545764
Pruning 2 RMSE: 0.672401033846
Pruning 3 RMSE: 0.660611288466
Pruning 4 RMSE: 0.650237174155
Pruning 5 RMSE: 0.64198570573
Pruning 6 RMSE: 0.636986731147
Validation fold 3 RMSE: 0.719981056486
Pruning 1 RMSE: 0.697781938911
Pruning 2 RMSE: 0.681268268398
Pruning 3 RMSE: 0.671483140797
Pruning 4 RMSE: 0.666088232133
Pruning 5 RMSE: 0.661858777702
Validation fold 4 RMSE: 0.715854578144
Pruning 1 RMSE: 0.66224449426
Pruning 2 RMSE: 0.644538081642
Pruning 3 RMSE: 0.634079022739
Pruning 4 RMSE: 0.624630852786
Pruning 5 RMSE: 0.619603795183
Pruning 6 RMSE: 0.61598970139
Validation fold 5 RMSE: 0.762927716346
Cross-validation score: 0.714205050119

Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 1100, 'nb_neurons_1': 10, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.713960264442
Pruning 2 RMSE: 0.701637487701
Pruning 3 RMSE: 0.69375349128
Validation fold 1 RMSE: 0.641879797964
Pruning 1 RMSE: 0.671532366166
Pruning 2 RMSE: 0.656442799709
Pruning 3 RMSE: 0.645330625403
Pruning 4 RMSE: 0.638014896705
Validation fold 2 RMSE: 0.741045867638
Pruning 1 RMSE: 0.690110283558
Pruning 2 RMSE: 0.678068607072
Pruning 3 RMSE: 0.662923307388
Pruning 4 RMSE: 0.654881538105
Validation fold 3 RMSE: 0.687377107282
Pruning 1 RMSE: 0.70616839345
Pruning 2 RMSE: 0.692895897509
Pruning 3 RMSE: 0.686021953655
Validation fold 4 RMSE: 0.688327537634
Pruning 1 RMSE: 0.66943420746
Pruning 2 RMSE: 0.65286815522
Pruning 3 RMSE: 0.638691155949
Pruning 4 RMSE: 0.627600556022
Pruning 5 RMSE: 0.622316478
Validation fold 5 RMSE: 0.765910014041
Cross-validation score: 0.704908064912

In [8]:
min(trials.losses())

In [None]:
# Save the best parameters as a csv.
best_parameters = pd.DataFrame({key: [value] for (key, value) in 
                                zip(space_eval(parameters_grid, best).keys(),
                                    space_eval(parameters_grid, best).values())})
# Add the corresponding score.
best_parameters["score"] = min(trials.losses())
best_parameters.to_csv("best_parameters_6.csv", encoding="utf-8", index=False)

best_parameters

### Training models

In [None]:
def gamma_i_j(j, pi_i, X_i, y_i, delta):
    out = 0.0
    out = pi_i[j] * norm.pdf(y_i, X_i[j], delta)
    out /= sum(pi_i * norm.pdf(y_i, X_i, delta))
    return(out)

def EM_Q(pi, X, y, delta):
    
    
    

In [6]:
cols_dnn = cols_orig

best_parameters = pd.read_csv("best_parameters_6.csv", encoding="utf-8")

feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]

model_dir_f = "./f_log_EM"

# Fit DNN regressor
model_dnn_f = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                            hidden_units=[best_parameters["nb_neurons_f"][0]],
                                            model_dir=model_dir_f)

def input_fn(data_set):
    feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
    labels = tf.constant(data_set["y"].values)
    return feature_cols, labels

model_dnn_f.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps_f"][0])

# Train prediction
train_pred = train[["id"]].assign(y_hat=0, pi_hat=0)
temp = model_dnn_f.predict(input_fn=lambda: input_fn(train))
y_hat = np.array(list(itertools.islice(temp, 0, None)))
train_pred["y_hat"] = y_hat

# Fit DNN softmax
model_dir_g = "./g_log_EM"
model_dnn_g = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                            hidden_units=[best_parameters["nb_neurons_g"][0]],
                                            model_dir=model_dir_g)

model_dnn_g.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps_g"][0])
temp = model_dnn_g.predict(input_fn=lambda: input_fn(train))
pi_hat = np.array(list(itertools.islice(temp, 0, None)))

# Compute softmax
train_pred["pi_hat"] = np.exp(pi_hat)
pi_hat_sum_dict = train_pred.groupby("id")["pi_hat"].sum().to_dict()
# "map" is actually much faster than "replace"
train_pred["pi_hat_sum"] = train_pred["id"].map(pi_hat_sum_dict)
train_pred["pi_hat"] /= train_pred["pi_hat_sum"]

In [None]:
# EM algorithm
# Change to Q
EM_decreasing = True
nb_iteration = 0
while (EM_decreasing):
    nb_iteration +=1

    model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                              hidden_units=[best_parameters["nb_neurons_f"][0]],
                                              model_dir=model_dir)

    model_dnn.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps_f"][0])

    # Compute new RMSE
    train_pred = train[["id"]].assign(y_hat=0)
            
    temp = model_dnn.predict(input_fn=lambda: input_fn(train))
    # .predict() returns an iterator; convert to an array
    y_hat = np.array(list(itertools.islice(temp, 0, None)))
    train_pred["y_hat"] = y_hat


    print("Iteration {0} EM: {1}".format(nb_iteration, new_RMSE))

    if (abs(new_RMSE - RMSE) > 0.1):
        RMSE = new_RMSE
    else:
        EM_decreasing = False

models = {"dnn_1": model_dnn}

### Predicting on test set

In [7]:
test_pred = test[["id"]].assign(y_hat=0).reset_index(drop=True)
for i, m in models.items():
    temp = m.predict(input_fn=lambda: input_fn(test))
    # .predict() returns an iterator; convert to an array
    y_hat = np.array(list(itertools.islice(temp, 0, None)))
    test_pred["y_hat"] += models_weights[i] * y_hat

# Use median value by id
y_hat_med = test_pred.groupby("id").median()["y_hat"]

In [8]:
RMSE = np.sqrt(mean_squared_error(test_pred["id"].map(y_hat_med).values, test["y"]))
RMSE
#0.65725435012348465 for Pred 4
#0.65362864377856866 for Pred 5

0.69482286537295845

In [11]:
kaggle_pred = pd.DataFrame({"Id": y_hat_med.index, "y": y_hat_med.values})
kaggle_pred.to_csv("Prediction_6.csv", encoding="utf-8", index=False)

### Benchmark:
* Submit 1 (ensemble of xgboost + 2 ridge with instances model)
eta	eval_metric	gamma	lambda	max_depth	min_child_weight	nthread	objective	seed	silent	subsample	score
0.91834 Public LB 300 trees 0.09	rmse	0.2	0.8	4	4.0	-1	reg:linear	22	0	0.7	0.883339 (cross-val)
    
* Submit 2

Pruning with linear regression
then add contributions of aggregated xgboost + linear model

0.78181 Public LB  0.779345 CV

* Submit 3
DNN pruning
(wrong CV)
0.73270 Public LB 0.663128 CV with DNN 10 neurons 900 steps gain_threshold = 0.01 outliers_threshold = 0.05

* Prediction 4
LB 0.74713
Ensemble of 5 DNN
{'gain_threshold': 0.015, 'steps': 3800, 'nb_neurons_1': 10, 'outliers_threshold': 0.05} CV 0.701658836521
using fist pruning DNN
* Prediction 5
LB 0.74453
Ensemble of 5 DNN
{'gain_threshold': 0.01, 'steps': 2400, 'nb_neurons_1': 10, 'outliers_threshold': 0.05} CV 0.707300896236

In [None]:
Dropout
Regression
Validation set instead of cross val