In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from sklearn.utils import resample
from hyperopt import STATUS_OK, hp, fmin, tpe, Trials, space_eval

from time import time
import operator

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.ERROR)

### Start training

In [11]:
# Import data
def load_data():
    full_data = pd.read_csv("X.csv")
    train_y = pd.read_csv("ytr.csv")
    # Rename columns to something more interpretable
    columns = (["reflectance_" + str(i) for i in range(7)]
               + ["solar_" + str(i) for i in range(5)] + ["id"])
    full_data.columns = columns
    # Add y to the data frame
    split = 98000
    y_id_dict = train_y.set_index("Id")["y"].to_dict()
    full_data.loc[:(split-1), "y"] = full_data.loc[:(split-1), "id"].replace(y_id_dict)

    train, test = full_data[:split], full_data[split:]
    return (train, test)

#columns = (["id"] + ["reflectance_" + str(i) for i in range(7)]
#           + ["solar_" + str(i) for i in range(5)] + ["y"])
#full_data = pd.read_csv("MODIS.csv", header=None, names=columns)
#split = 98000
#train, test = full_data[:split].copy(), full_data[split:].copy()
train, test = load_data()

# Parameters
n_threads = -1
random_seed = 22

In [12]:
train.head()

Unnamed: 0,reflectance_0,reflectance_1,reflectance_2,reflectance_3,reflectance_4,reflectance_5,reflectance_6,solar_0,solar_1,solar_2,solar_3,solar_4,id,y
0,1.580642,2.482233,5.887092,4.732722,4.408482,3.830171,4.388508,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
1,2.338455,3.627796,4.723716,3.324726,2.743442,4.727652,2.810193,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
2,2.224569,3.522241,6.188831,4.389783,4.177616,4.945918,4.122848,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
3,1.717218,2.712012,5.024211,3.944907,3.393424,3.931973,3.489578,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
4,2.378857,3.644976,4.515292,3.223825,2.739952,4.599662,2.781574,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082


In [13]:
#train, test = train_copy[19600:].copy(), train_copy[:19600].copy()

### Preprocessing

In [14]:
cols_excl = ["id", "y"]
cols_orig = [c for c in train.columns if c not in cols_excl]

# Standardise data for LR
train[cols_orig] = scale(train[cols_orig])
test[cols_orig] = scale(test[cols_orig])

### Neural net

In [6]:
#feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_orig]

#regressor = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                         # hidden_units=[9, 9],
                                         #)#model_dir="./temp_log")

#train, test = train_copy[19600:].copy(), train_copy[:19600].copy()

#def input_fn(data_set):
 #   feature_cols = {k: tf.constant(data_set[k].values) for k in cols_orig}
  #  labels = tf.constant(data_set["y"].values)
    
   # return feature_cols, labels

#validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
 #   input_fn=lambda: input_fn(test),
  #  early_stopping_rounds=100)

#regressor.fit(input_fn=lambda: input_fn(train), steps=10)
              #monitors=[validation_monitor])

#ev = regressor.evaluate(input_fn=lambda: input_fn(train), steps=1)
#loss_score = ev["loss"]
#print("Loss: {0:f}".format(loss_score))

#y = regressor.predict(input_fn=lambda: input_fn(train))
# .predict() returns an iterator; convert to a list and print predictions
#predictions = np.array(list(itertools.islice(y, 0, None)))

### Cross-validation

In [19]:
cols_dnn = cols_orig

models_weights = {"dnn_1": 1.0}#, "dnn_2": 0., "dnn_3": 0.}
models_cols = {"dnn_1": cols_dnn}
#models_cols = {"dnn_1": cols_dnn, "dnn_2": cols_dnn, "dnn_3": cols_dnn}

# Scoring function in the hyperopt hyperparameters tuning.
def scoring_function(parameters):
    print("Training the model with parameters: ")
    print(parameters)
    average_RMSE = 0.0
    n_splits = 5

    kf = KFold(n_splits=n_splits)
    nb_fold = 0
    for train_index, validation_index in kf.split(train):
        nb_fold += 1
        train_fold, validation_fold = train.loc[train_index], train.loc[validation_index] 

        feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]
        
        model_dir = ("./log_"
                     + str(parameters["steps"]) + "_"
                     + str(parameters["nb_neurons_1"]) + "_"
                     #+ str(parameters["nb_neurons_2"])
                     + str(nb_fold)
                    )
        
        # Tune number of layers
        model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                                    hidden_units=[parameters["nb_neurons_1"]],
                                                                #parameters["nb_neurons_2"]],
                                                  #optimizer=tf.train.ProximalAdagradOptimizer(
                                                   #   learning_rate=0.1,
                                                    #  l1_regularization_strength=0.001),
                                                  model_dir=model_dir)

        def input_fn(data_set):
            feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
            labels = tf.constant(data_set["y"].values)
            return feature_cols, labels
        
        model_dnn.fit(input_fn=lambda: input_fn(train_fold), steps=parameters["steps"])

        train_pred = train_fold[["id"]].assign(y_hat=0)
        #for i, m in models.items():
        temp = model_dnn.predict(input_fn=lambda: input_fn(train_fold))
        # .predict() returns an iterator; convert to an array
        y_hat = np.array(list(itertools.islice(temp, 0, None)))
        train_pred["y_hat"] = y_hat

        # Use median value by id
        y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

        RMSE = np.sqrt(mean_squared_error(train_pred["id"].replace(y_hat_med).values, train_fold["y"]))
        
        # Prune outliers
        RMSE_decreasing = True
        count = 0
        while (RMSE_decreasing):
            count +=1
            train_pred["y_med"] = train_pred["id"].replace(y_hat_med)

            # Distance from the median for each bag
            train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"])**2
            # Rank of each instance by bag
            train_pred["rank"] = train_pred.groupby("id")["score"].rank()
            bag_size_dict = train_pred.groupby("id")["score"].count().to_dict()
            train_pred["bag_size"] = train_pred["id"].replace(bag_size_dict)
            train_pred["rank"] = train_pred["rank"] / train_pred["bag_size"]

            # Remove outliers
            outliers_index = train_pred["rank"] > (1 - parameters["outliers_threshold"])
            train_fold = train_fold.loc[~outliers_index, :].reset_index(drop=True)

            model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                                      hidden_units=[parameters["nb_neurons_1"]],
                                                                    #parameters["nb_neurons_2"]],
                                                      #optimizer=tf.train.ProximalAdagradOptimizer(
                                                       #   learning_rate=0.1,
                                                        #  l1_regularization_strength=0.001),
                                                      model_dir=model_dir)

            model_dnn.fit(input_fn=lambda: input_fn(train_fold), steps=parameters["steps"])

            # Compute new RMSE
            train_pred = train_fold[["id"]].assign(y_hat=0)
            
            #for i, m in models.items():
            temp = model_dnn.predict(input_fn=lambda: input_fn(train_fold))
            # .predict() returns an iterator; convert to an array
            y_hat = np.array(list(itertools.islice(temp, 0, None)))
            train_pred["y_hat"] = y_hat

            # Use median value by id
            y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

            new_RMSE = np.sqrt(mean_squared_error(train_pred["id"].replace(y_hat_med), train_fold["y"]))
            print("Pruning {0} RMSE: {1}".format(count, new_RMSE))

            if (abs(new_RMSE - RMSE) > parameters["gain_threshold"]):
                RMSE = new_RMSE
            else:
                RMSE_decreasing = False
        
        # Bagging of RNN
        #train_fold_2 = train_fold
        #resample(train_fold, random_state=random_seed).sort_values(by=["id"]).reset_index(drop=True)
        #model_dnn_2 = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
         #                                           hidden_units=[parameters["nb_neurons_1"]])
        #model_dnn_2.fit(input_fn=lambda: input_fn(train_fold_2), steps=parameters["steps"])
            
        #train_fold_3 = train_fold
        #resample(train_fold, random_state=(random_seed+1)).sort_values(by=["id"]).reset_index(drop=True)
        #model_dnn_3 = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
         #                                           hidden_units=[parameters["nb_neurons_1"]])
        #model_dnn_3.fit(input_fn=lambda: input_fn(train_fold_3), steps=parameters["steps"])
        
        models = {"dnn_1": model_dnn}#, "dnn_2": model_dnn_2, "dnn_3": model_dnn_3}
        
        # Compute RMSE on validation set
        validation_pred = validation_fold[["id"]].assign(y_hat=0).reset_index(drop=True)
        for i, m in models.items():
            temp = m.predict(input_fn=lambda: input_fn(validation_fold))
            # .predict() returns an iterator; convert to an array
            y_hat = np.array(list(itertools.islice(temp, 0, None)))
            validation_pred["y_hat"] += models_weights[i] * y_hat
            
        # Use median value by id
        y_hat_med = validation_pred.groupby("id").median()["y_hat"].to_dict()
        
        RMSE = np.sqrt(mean_squared_error(validation_pred["id"].replace(y_hat_med).values, validation_fold["y"]))
        average_RMSE += RMSE
        print("Validation fold {0} RMSE: {1}".format(nb_fold, RMSE))

    average_RMSE /= n_splits

    print("Cross-validation score: {0}\n".format(average_RMSE))
    
    return {"loss": average_RMSE, "status": STATUS_OK}

In [20]:
t0 = time()

# Grid to pick parameters from.
parameters_grid = {"steps"             : hp.choice("steps", np.arange(1000, 1200, 100, dtype=int)),
                   "nb_neurons_1"      : hp.choice("nb_neurons_1", np.arange(9, 12, 1, dtype=int)),
                   "outliers_threshold": hp.quniform("outliers_threshold", 0.05, 0.051, 0.01),
                   "gain_threshold"    : hp.quniform("gain_threshold", 0.005, 0.01, 0.005)
                   #"nb_neurons_2": hp.choice("nb_neurons_2", np.arange(5, 10, 1, dtype=int))
                  }
# Record the information about the cross-validation.
trials = Trials()

best = fmin(scoring_function, parameters_grid, algo=tpe.suggest, max_evals=2, 
            trials=trials)

computing_time = time() - t0

Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 1100, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}


KeyboardInterrupt: 

In [None]:
# 1 DNN

Training the model with parameters: 
{'gain_threshold': 0.025, 'steps': 1100, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.700342336831
Pruning 2 RMSE: 0.683664524531
Validation fold 1 RMSE: 0.663971921362
Pruning 1 RMSE: 0.660612635699
Validation fold 2 RMSE: 0.732783980385
Pruning 1 RMSE: 0.660259614581
Validation fold 3 RMSE: 0.708127090467
Pruning 1 RMSE: 0.664479795953
Validation fold 4 RMSE: 0.665647830521
Pruning 1 RMSE: 0.636416118638
Validation fold 5 RMSE: 0.721588699982
Cross-validation score: 0.698423904543

Training the model with parameters: 
{'gain_threshold': 0.03, 'steps': 800, 'nb_neurons_1': 9, 'outliers_threshold': 0.03}
Pruning 1 RMSE: 0.703224845125
Validation fold 1 RMSE: 0.651244395037
Pruning 1 RMSE: 0.663640955349
Validation fold 2 RMSE: 0.722869083504
Pruning 1 RMSE: 0.65823296086
Validation fold 3 RMSE: 0.715028888531
Pruning 1 RMSE: 0.667840098194
Validation fold 4 RMSE: 0.66820045969
Pruning 1 RMSE: 0.643973678981
Validation fold 5 RMSE: 0.723758625768
Cross-validation score: 0.696220290506

Training the model with parameters: 
{'gain_threshold': 0.015, 'steps': 1100, 'nb_neurons_1': 10, 'outliers_threshold': 0.06}
Pruning 1 RMSE: 0.696402262934
Pruning 2 RMSE: 0.684233689822
Validation fold 1 RMSE: 0.662438155738
Pruning 1 RMSE: 0.655915610952
Pruning 2 RMSE: 0.640449669189
Pruning 3 RMSE: 0.630974914112
Validation fold 2 RMSE: 0.72024219129
Pruning 1 RMSE: 0.644386455452
Validation fold 3 RMSE: 0.68446664115
Pruning 1 RMSE: 0.648451319918
Validation fold 4 RMSE: 0.663538573658
Pruning 1 RMSE: 0.624842257454
Validation fold 5 RMSE: 0.725033742805
Cross-validation score: 0.691143860928

Training the model with parameters: 
{'gain_threshold': 0.015, 'steps': 800, 'nb_neurons_1': 10, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.701936043686
Pruning 2 RMSE: 0.689333809624
Validation fold 1 RMSE: 0.662374324032
Pruning 1 RMSE: 0.663232629294
Pruning 2 RMSE: 0.652688952585
Validation fold 2 RMSE: 0.712087465413
Pruning 1 RMSE: 0.657387693419
Validation fold 3 RMSE: 0.695908105366
Pruning 1 RMSE: 0.6626187413
Validation fold 4 RMSE: 0.671406531478
Pruning 1 RMSE: 0.634658387406
Validation fold 5 RMSE: 0.730619559026
Cross-validation score: 0.694479197063

Training the model with parameters: 
{'gain_threshold': 0.015, 'steps': 900, 'nb_neurons_1': 9, 'outliers_threshold': 0.02}
Pruning 1 RMSE: 0.712312120081
Pruning 2 RMSE: 0.700133610941
Validation fold 1 RMSE: 0.652112646975
Pruning 1 RMSE: 0.669131328166
Validation fold 2 RMSE: 0.730146340241
Pruning 1 RMSE: 0.658446172822
Validation fold 3 RMSE: 0.715088217896
Pruning 1 RMSE: 0.661766796728
Validation fold 4 RMSE: 0.670220241847
Pruning 1 RMSE: 0.638323674242
Validation fold 5 RMSE: 0.720681799097
Cross-validation score: 0.697649849211
    
Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 1200, 'nb_neurons_1': 10, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.681984923569
Pruning 2 RMSE: 0.671898839038
Pruning 3 RMSE: 0.664809771556
Validation fold 1 RMSE: 0.646249838724
Pruning 1 RMSE: 0.641771731527
Pruning 2 RMSE: 0.63034966952
Pruning 3 RMSE: 0.622956268666
Validation fold 2 RMSE: 0.720135473218
Pruning 1 RMSE: 0.629556959358
Pruning 2 RMSE: 0.620030839437
Validation fold 3 RMSE: 0.712835930051
Pruning 1 RMSE: 0.644140036894
Validation fold 4 RMSE: 0.65774900689
Pruning 1 RMSE: 0.620800446764
Pruning 2 RMSE: 0.612234982067
Validation fold 5 RMSE: 0.719291596371
Cross-validation score: 0.691252369051

Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 1100, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.66243735956
Pruning 2 RMSE: 0.655361539751
Validation fold 1 RMSE: 0.606231781715
Pruning 1 RMSE: 0.639680967378
Validation fold 2 RMSE: 0.688452392698
Pruning 1 RMSE: 0.63857148314
Pruning 2 RMSE: 0.632223577865
Validation fold 3 RMSE: 0.683210872718
Pruning 1 RMSE: 0.647718960343
Validation fold 4 RMSE: 0.648524153738
Pruning 1 RMSE: 0.620070519044
Pruning 2 RMSE: 0.61311807415
Validation fold 5 RMSE: 0.714036547973
Cross-validation score: 0.668091149768

Training the model with parameters: 
{'gain_threshold': 0.005, 'steps': 1000, 'nb_neurons_1': 8, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.71663405381
Pruning 2 RMSE: 0.70585493758
Pruning 3 RMSE: 0.697783895958
Pruning 4 RMSE: 0.691414098683
Pruning 5 RMSE: 0.685042809559
Pruning 6 RMSE: 0.681459908161
Validation fold 1 RMSE: 0.654525188219
Pruning 1 RMSE: 0.657971019653
Pruning 2 RMSE: 0.651200844353
Pruning 3 RMSE: 0.647123131296
Validation fold 2 RMSE: 0.729519695627
Pruning 1 RMSE: 0.656954301982
Pruning 2 RMSE: 0.648846091759
Pruning 3 RMSE: 0.6455255593
Validation fold 3 RMSE: 0.701263760765
Pruning 1 RMSE: 0.660935643932
Pruning 2 RMSE: 0.655368945165
Pruning 3 RMSE: 0.651542971748
Validation fold 4 RMSE: 0.660783788318
Pruning 1 RMSE: 0.634539135894
Pruning 2 RMSE: 0.627608705883
Pruning 3 RMSE: 0.62264754794
Validation fold 5 RMSE: 0.723802887759
Cross-validation score: 0.693979064138

Training the model with parameters: 
{'gain_threshold': 0.005, 'steps': 1200, 'nb_neurons_1': 11, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.677056777292
Pruning 2 RMSE: 0.663781964117
Pruning 3 RMSE: 0.654660031868
Pruning 4 RMSE: 0.648608423614
Pruning 5 RMSE: 0.644466297196
Validation fold 1 RMSE: 0.61313175237
Pruning 1 RMSE: 0.63183821921
Pruning 2 RMSE: 0.621615691038
Pruning 3 RMSE: 0.614506177558
Pruning 4 RMSE: 0.607921613778
Pruning 5 RMSE: 0.603495134396
Validation fold 2 RMSE: 0.697307123977
Pruning 1 RMSE: 0.622905122933
Pruning 2 RMSE: 0.614568338842
Pruning 3 RMSE: 0.608182075849
Pruning 4 RMSE: 0.603313122073
Validation fold 3 RMSE: 0.678467300754
Pruning 1 RMSE: 0.627963748226
Pruning 2 RMSE: 0.620024138614
Pruning 3 RMSE: 0.614310327957
Pruning 4 RMSE: 0.610923717848
Validation fold 4 RMSE: 0.64372418181
Pruning 1 RMSE: 0.604920983479
Pruning 2 RMSE: 0.597286581163
Pruning 3 RMSE: 0.591850681886
Pruning 4 RMSE: 0.587721979882
Validation fold 5 RMSE: 0.703312550726
Cross-validation score: 0.667188581927

In [None]:
# Ensemble of 3 DNN
Training the model with parameters: 
{'gain_threshold': 0.005, 'steps': 1000, 'nb_neurons_1': 11, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.71116941007
Pruning 2 RMSE: 0.710134244747
Validation fold 1 RMSE: 0.67016269301
Pruning 1 RMSE: 0.693788730369
Pruning 2 RMSE: 0.681699022039
Pruning 3 RMSE: 0.703264811229
Pruning 4 RMSE: 0.671334288662
Pruning 5 RMSE: 0.672755595177
Validation fold 2 RMSE: 0.742960274892
Pruning 1 RMSE: 0.693855159996
Pruning 2 RMSE: 0.691917725258
Validation fold 3 RMSE: 0.716453069035
Pruning 1 RMSE: 0.698029552278
Pruning 2 RMSE: 0.698402913616
Validation fold 4 RMSE: 0.702381816743
Pruning 1 RMSE: 0.669835541061
Pruning 2 RMSE: 0.670934743074
Validation fold 5 RMSE: 0.772065210064
Cross-validation score: 0.720804612749
    
Training the model with parameters: 
{'gain_threshold': 0.005, 'steps': 1100, 'nb_neurons_1': 11, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.710970210115
Pruning 2 RMSE: 0.723884803499
Pruning 3 RMSE: 0.705558405266
Pruning 4 RMSE: 0.69082150458
Pruning 5 RMSE: 0.709012637324
Pruning 6 RMSE: 0.689840083409
Pruning 7 RMSE: 0.682960915979
Pruning 8 RMSE: 0.689320580892
Pruning 9 RMSE: 0.733669462986
Pruning 10 RMSE: 0.702594016806
Pruning 11 RMSE: 0.700092614618
Validation fold 1 RMSE: 0.639888747346
Pruning 1 RMSE: 0.702693117724
Validation fold 2 RMSE: 0.756274354279
Pruning 1 RMSE: 0.691091309155
Pruning 2 RMSE: 0.691357233141
Validation fold 3 RMSE: 0.73679429174
Pruning 1 RMSE: 0.714865007836
Pruning 2 RMSE: 0.689779608571
Pruning 3 RMSE: 0.688207703398
Validation fold 4 RMSE: 0.686380994175
Pruning 1 RMSE: 0.684824585281
Pruning 2 RMSE: 0.669877290062
Pruning 3 RMSE: 0.686328240571
Pruning 4 RMSE: 0.663275388897
Pruning 5 RMSE: 0.668140496708
Validation fold 5 RMSE: 0.768409407507
Cross-validation score: 0.717549559009
    
Training the model with parameters: 
{'gain_threshold': 0.005, 'steps': 1100, 'nb_neurons_1': 11, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.705035094312
Pruning 2 RMSE: 0.714659640929
Pruning 3 RMSE: 0.712357459314
Validation fold 1 RMSE: 0.661191916714
Pruning 1 RMSE: 0.686965456513
Validation fold 2 RMSE: 0.749879491243
Pruning 1 RMSE: 0.689762043097
Pruning 2 RMSE: 0.69232873535
Validation fold 3 RMSE: 0.724016986696
Pruning 1 RMSE: 0.690028220234
Pruning 2 RMSE: 0.69337827918
Validation fold 4 RMSE: 0.703435739334
Pruning 1 RMSE: 0.681859928879
Pruning 2 RMSE: 0.67091896457
Pruning 3 RMSE: 0.677659388273
Pruning 4 RMSE: 0.664039872409
Pruning 5 RMSE: 0.667718583335
Validation fold 5 RMSE: 0.757494009563
Cross-validation score: 0.71920362871

In [8]:
min(trials.losses())

0.6911438609280351

In [10]:
# Save the best parameters as a csv.
best_parameters = pd.DataFrame({key: [value] for (key, value) in 
                                zip(space_eval(parameters_grid, best).keys(),
                                    space_eval(parameters_grid, best).values())})
# Add the corresponding score.
best_parameters["score"] = min(trials.losses())
best_parameters.to_csv("best_parameters_4.csv", encoding="utf-8", index=False)

best_parameters

Unnamed: 0,nb_neurons_1,steps,score
0,10,900,0.663128


### Training models

In [6]:
cols_dnn = cols_orig

models_weights = {"dnn_1": 0.33, "dnn_2": 0.33, "dnn_3": 0.34}
models_cols = {"dnn_1": cols_dnn, "dnn_2": cols_dnn, "dnn_3": cols_dnn}

best_parameters = pd.read_csv("best_parameters_4.csv", encoding="utf-8")
parameters = dict(zip(best_parameters.columns[:-1], best_parameters.iloc[0].values[:-1]))

feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]
        
# Tune number of layers
model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                          hidden_units=[best_parameters["nb_neurons_1"][0]])
                                          #model_dir=model_dir)

def input_fn(data_set):
    feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
    labels = tf.constant(data_set["y"].values)
    return feature_cols, labels

model_dnn.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps"][0])
        

train_pred = train[["id"]].assign(y_hat=0)
temp = model_dnn.predict(input_fn=lambda: input_fn(train))
# .predict() returns an iterator; convert to an array
y_hat = np.array(list(itertools.islice(temp, 0, None)))
train_pred["y_hat"] = y_hat

# Use median value by id
y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

RMSE = np.sqrt(mean_squared_error(train_pred["id"].replace(y_hat_med).values, train["y"]))
        
# Prune outliers
RMSE_decreasing = True
count = 0
while (RMSE_decreasing):
    count +=1
    train_pred["y_med"] = train_pred["id"].replace(y_hat_med)

    # Distance from the median for each bag
    train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"])**2
    # Rank of each instance by bag
    train_pred["rank"] = train_pred.groupby("id")["score"].rank()
    bag_size_dict = train_pred.groupby("id")["score"].count().to_dict()
    train_pred["bag_size"] = train_pred["id"].replace(bag_size_dict)
    train_pred["rank"] = train_pred["rank"] / train_pred["bag_size"]

    # Remove outliers
    outliers_index = train_pred["rank"] > (1 - best_parameters["outliers_threshold"])
    train = train.loc[~outliers_index, :].reset_index(drop=True)

    model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                              hidden_units=[best_parameters["nb_neurons_1"][0]])
                                              #model_dir=model_dir)

    model_dnn.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps"][0])

    # Compute new RMSE
    train_pred = train[["id"]].assign(y_hat=0)
            
    temp = model_dnn.predict(input_fn=lambda: input_fn(train))
    # .predict() returns an iterator; convert to an array
    y_hat = np.array(list(itertools.islice(temp, 0, None)))
    train_pred["y_hat"] = y_hat

    # Use median value by id
    y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

    new_RMSE = np.sqrt(mean_squared_error(train_pred["id"].replace(y_hat_med), train["y"]))
    print("Pruning {0} RMSE: {1}".format(count, new_RMSE))

    if (abs(new_RMSE - RMSE) > best_parameters["gain_threshold"]):
        RMSE = new_RMSE
    else:
        RMSE_decreasing = False
        
# Bagging of RNN
train_2 = train
#resample(train, random_state=random_seed).sort_values(by=["id"]).reset_index(drop=True)
model_dnn_2 = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                            hidden_units=[best_parameters["nb_neurons_1"]])
model_dnn_2.fit(input_fn=lambda: input_fn(train_2), steps=best_parameters["steps"])
            
train_3 = train
#resample(train, random_state=(random_seed+1)).sort_values(by=["id"]).reset_index(drop=True)
model_dnn_3 = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                            hidden_units=[best_parameters["nb_neurons_1"]])
model_dnn_3.fit(input_fn=lambda: input_fn(train_3), steps=best_parameters["steps"])
        
models = {"dnn_1": model_dnn, "dnn_2": model_dnn_2, "dnn_3": model_dnn_3}

# Compute RMSE on validation set
test_pred = test[["id"]].assign(y_hat=0).reset_index(drop=True)
for i, m in models.items():
    temp = m.predict(input_fn=lambda: input_fn(test))
    # .predict() returns an iterator; convert to an array
    y_hat = np.array(list(itertools.islice(temp, 0, None)))
    test_pred["y_hat"] += models_weights[i] * y_hat
            
# Use median value by id
y_hat_med = test_pred.groupby("id").median()["y_hat"]

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_tf_random_seed': None, '_task_type': None, '_environment': 'local', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1110fe890>, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_task_id': 0, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_evaluation_master': '', '_keep_checkpoint_every_n_hours': 10000, '_master': ''}
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into ./temp_log_submit_3/mod

In [12]:
y_hat_med = test_pred.groupby("id").median()["y_hat"]

In [15]:
#RMSE = np.sqrt(mean_squared_error(test_pred["id"].replace(y_hat_med).values, test["y"]))
#RMSE

0.67988427854320532

### Predicting

In [13]:
kaggle_pred = pd.DataFrame({"Id": y_hat_med.index, "y": y_hat_med.values})
kaggle_pred.to_csv("Prediction_3.csv", encoding="utf-8", index=False)

### Benchmark:
* Submit 1 (ensemble of xgboost + 2 ridge with instances model)
eta	eval_metric	gamma	lambda	max_depth	min_child_weight	nthread	objective	seed	silent	subsample	score
0.91834 Public LB 300 trees 0.09	rmse	0.2	0.8	4	4.0	-1	reg:linear	22	0	0.7	0.883339 (cross-val)
    
* Submit 2

Pruning with linear regression
then add contributions of aggregated xgboost + linear model

0.78181 Public LB  0.779345 CV

* Submit 3
DNN pruning

0.73270 Public LB 0.663128 CV with DNN 10 neurons 900 steps gain_threshold = 0.01 outliers_threshold = 0.05


In [None]:
Dropout
Regression
Validation set instead of cross val