In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from sklearn.utils import resample
from hyperopt import STATUS_OK, hp, fmin, tpe, Trials, space_eval

from time import time
import operator

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import tensorflow as tf

from scipy.stats import norm

tf.logging.set_verbosity(tf.logging.ERROR)

### Start training

In [2]:
# Import data
def load_data():
    full_data = pd.read_csv("X.csv")
    train_y = pd.read_csv("ytr.csv")
    # Rename columns to something more interpretable
    columns = (["reflectance_" + str(i) for i in range(7)]
               + ["solar_" + str(i) for i in range(5)] + ["id"])
    full_data.columns = columns
    # Add y to the data frame
    split = 98000
    y_id_dict = train_y.set_index("Id")["y"].to_dict()
    full_data.loc[:(split-1), "y"] = full_data.loc[:(split-1), "id"].map(y_id_dict)

    train, test = full_data[:split], full_data[split:]
    return (train, test)

#columns = (["id"] + ["reflectance_" + str(i) for i in range(7)]
 #          + ["solar_" + str(i) for i in range(5)] + ["y"])
#full_data = pd.read_csv("MODIS.csv", header=None, names=columns)
#split = 98000
#train, test = full_data[:split].copy(), full_data[split:].copy()
#train = full_data.copy()

#train_copy, test_copy = load_data()

train, test = load_data()

# Parameters
n_threads = -1
random_seed = 22

In [3]:
train.head()

Unnamed: 0,reflectance_0,reflectance_1,reflectance_2,reflectance_3,reflectance_4,reflectance_5,reflectance_6,solar_0,solar_1,solar_2,solar_3,solar_4,id,y
0,1.580642,2.482233,5.887092,4.732722,4.408482,3.830171,4.388508,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
1,2.338455,3.627796,4.723716,3.324726,2.743442,4.727652,2.810193,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
2,2.224569,3.522241,6.188831,4.389783,4.177616,4.945918,4.122848,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
3,1.717218,2.712012,5.024211,3.944907,3.393424,3.931973,3.489578,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
4,2.378857,3.644976,4.515292,3.223825,2.739952,4.599662,2.781574,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082


In [3]:
#train, test = train_copy[19600:].copy(), train_copy[:19600].copy()

### Preprocessing

In [3]:
cols_excl = ["id", "y"]
cols_orig = [c for c in train.columns if c not in cols_excl]

# Remove outliers
#outliers_id = [21, 31, 72, 85, 135, 154, 165,
 #              199, 232, 252, 255, 262, 289, 387,
  #             393, 404, 408, 434, 488, 516, 578,
   #            615, 617, 624, 633, 647, 683, 778,
    #           785, 792, 817, 828, 917, 946, 960]

#train = train[~train["id"].isin(outliers_id)].reset_index(drop=True)

# Standardise data for LR
train[cols_orig] = scale(train[cols_orig])
test[cols_orig] = scale(test[cols_orig])

### Neural net

In [26]:
#feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_orig]

#regressor = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                         # hidden_units=[9, 9],
                                         #)#model_dir="./temp_log")

#train, test = train_copy[19600:].copy(), train_copy[:19600].copy()

#def input_fn(data_set):
 #   feature_cols = {k: tf.constant(data_set[k].values) for k in cols_orig}
  #  labels = tf.constant(data_set["y"].values)
    
   # return feature_cols, labels

#validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
 #   input_fn=lambda: input_fn(test),
  #  early_stopping_rounds=100)

#regressor.fit(input_fn=lambda: input_fn(train), steps=10)
              #monitors=[validation_monitor])

#ev = regressor.evaluate(input_fn=lambda: input_fn(train), steps=1)
#loss_score = ev["loss"]
#print("Loss: {0:f}".format(loss_score))

#y = regressor.predict(input_fn=lambda: input_fn(train))
# .predict() returns an iterator; convert to a list and print predictions
#predictions = np.array(list(itertools.islice(y, 0, None)))

### Cross-validation

In [4]:
def pdf_weight(data):
    cols = list(data.columns)
    X = data.copy()
    
    y_mean_dict = X.groupby("id")["y_hat"].mean().to_dict()
    y_std_dict = X.groupby("id")["y_hat"].std().to_dict()
    X["y_hat_mean"] = X["id"].map(y_mean_dict)
    X["y_hat_std"] = X["id"].map(y_std_dict)
    X["pdf"] = norm.pdf(X["y_hat"], X["y_hat_mean"], 
                        X["y_hat_std"])
    y_pdf_sum_dict = X.groupby("id")["pdf"].sum().to_dict()
    X["pdf_sum"] = X["id"].map(y_pdf_sum_dict)
    X["pdf"] /= X["pdf_sum"]
    X["y_hat_weighted"] = X["y_hat"] * X["pdf"]
    
    y_weighted_dict = X.groupby("id")["y_hat_weighted"].sum().to_dict()
    X["y_hat_weighted_sum"] = X["id"].map(y_weighted_dict)
    
    return(X[cols + ["y_hat_weighted_sum"]])

def discard_noisy_pixels(X, reflect_col, dark_lim, bright_lim):
    reflect_rank = X.groupby("id")[reflect_col].rank()
    index = (reflect_rank > dark_lim) & (reflect_rank < bright_lim)
    return(X.loc[index].reset_index(drop=True))

In [5]:
cols_dnn = cols_orig

models_weights = {"dnn_1": 1}#, "dnn_2": 0.2, "dnn_3": 0.2,
                  #"dnn_4": 0.2, "dnn_5": 0.2}
models_cols = {"dnn_1": cols_dnn}
#models_cols = {"dnn_1": cols_dnn, "dnn_2": cols_dnn, "dnn_3": cols_dnn,
 #              "dnn_4": cols_dnn, "dnn_5": cols_dnn}
    
#learning_rate = 0.1
reflect_col = "reflectance_3"

# Scoring function in the hyperopt hyperparameters tuning.
def scoring_function(parameters):
    print("Training the model with parameters: ")
    print(parameters)
    average_RMSE = 0.0
    n_splits = 5
    
    # Generate random integer for model_dir
    random_int = np.random.randint(1000)
    
    kf = KFold(n_splits=n_splits)
    nb_fold = 0
    for train_index, validation_index in kf.split(train):
        nb_fold += 1
        train_fold, validation_fold = train.loc[train_index], train.loc[validation_index]
        
        # Remove outliers
        #train_fold = train_fold[~train_fold["id"].isin(outliers_id)].reset_index(drop=True)
        
        # Remove darkest and brightest pixels
        train_fold = discard_noisy_pixels(train_fold, reflect_col,
                                          parameters["dark_lim"], parameters["bright_lim"])

        feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]
        
        count = 0
        model_dir = ("./log_"
                     + str(parameters["steps"]) + "_"
                     + str(parameters["nb_neurons_1"]) + "_"
                     #+ str(parameters["nb_neurons_2"])
                     + str(nb_fold) + "_"
                     + str(count) + "_"
                     + str(random_int)
                    )
        
        # Tune number of layers
        model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                                  hidden_units=[parameters["nb_neurons_1"]],
                                                                #parameters["nb_neurons_2"]],
                                                  #optimizer=tf.train.ProximalAdagradOptimizer(
                                                   #   learning_rate=learning_rate,
                                                    #  l2_regularization_strength=parameters["l2_reg"]),
                                                  #dropout=parameters["dropout"],
                                                  model_dir=model_dir)

        def input_fn(data_set):
            feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
            labels = tf.constant(data_set["y"].values)
            return feature_cols, labels
        
        model_dnn.fit(input_fn=lambda: input_fn(train_fold), steps=parameters["steps"])

        train_pred = train_fold[["id"]].assign(y_hat=0)
        #for i, m in models.items():
        temp = model_dnn.predict(input_fn=lambda: input_fn(train_fold))
        # .predict() returns an iterator; convert to an array
        y_hat = np.array(list(itertools.islice(temp, 0, None)))
        train_pred["y_hat"] = y_hat

        # Use median value by id
        y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

        RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med).values, train_fold["y"]))
        print("Pruning {0} RMSE: {1}".format(count, RMSE))
        
        # Prune outliers
        RMSE_decreasing = False
        while (RMSE_decreasing):
            count +=1
            train_pred["y_med"] = train_pred["id"].map(y_hat_med)

            # Distance from the median for each bag
            train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"])**2
            # Rank of each instance by bag
            train_pred["rank"] = train_pred.groupby("id")["score"].rank()
            bag_size_dict = train_pred.groupby("id")["score"].count().to_dict()
            train_pred["bag_size"] = train_pred["id"].map(bag_size_dict)
            train_pred["rank"] = train_pred["rank"] / train_pred["bag_size"]

            # Remove outliers
            outliers_index = train_pred["rank"] > (1 - parameters["outliers_threshold"])
            train_fold = train_fold.loc[~outliers_index, :].reset_index(drop=True)
            
            model_dir = ("./log_"
                         + str(parameters["steps"]) + "_"
                         + str(parameters["nb_neurons_1"]) + "_"
                         #+ str(parameters["nb_neurons_2"])
                         + str(nb_fold) + "_"
                         + str(count) + "_"
                         + str(random_int)
                        )

            model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                                      hidden_units=[parameters["nb_neurons_1"]],
                                                                    #parameters["nb_neurons_2"]],
                                                      #optimizer=tf.train.ProximalAdagradOptimizer(
                                                       #   learning_rate=learning_rate,
                                                        #  l2_regularization_strength=parameters["l2_reg"]),
                                                      #dropout=parameters["dropout"],
                                                      model_dir=model_dir)

            model_dnn.fit(input_fn=lambda: input_fn(train_fold), steps=parameters["steps"])

            # Compute new RMSE
            train_pred = train_fold[["id"]].assign(y_hat=0)
            
            #for i, m in models.items():
            temp = model_dnn.predict(input_fn=lambda: input_fn(train_fold))
            # .predict() returns an iterator; convert to an array
            y_hat = np.array(list(itertools.islice(temp, 0, None)))
            train_pred["y_hat"] = y_hat

            # Use median value by id
            y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

            new_RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med), train_fold["y"]))
            print("Pruning {0} RMSE: {1}".format(count, new_RMSE))
            
            if (abs(new_RMSE - RMSE) > parameters["gain_threshold"]):
            # 5 iterations of pruning
            #if (count < 5):
                RMSE = new_RMSE
            else:
                RMSE_decreasing = False
        
        # Changed to model_dnn instead of model_dnn_1
        models = {"dnn_1": model_dnn}
        
        # Compute RMSE on validation set
        validation_pred = validation_fold[["id", "y", reflect_col]].assign(y_hat=0).reset_index(drop=True)
        for i, m in models.items():
            temp = m.predict(input_fn=lambda: input_fn(validation_fold))
            # .predict() returns an iterator; convert to an array
            y_hat = np.array(list(itertools.islice(temp, 0, None)))
            validation_pred["y_hat"] += models_weights[i] * y_hat
        
        # Discard 20% darkest pixels and 50% brightest
        validation_pred = discard_noisy_pixels(validation_pred, reflect_col,
                                               parameters["dark_lim"], parameters["bright_lim"])
        
        # Weight each instance by gaussian pdf
        #validation_pred = pdf_weight(validation_pred)
        #RMSE = np.sqrt(mean_squared_error(validation_pred["y_hat_weighted_sum"], validation_pred["y"]))
        
        # Use median value by id
        y_hat_med = validation_pred.groupby("id").median()["y_hat"].to_dict()
        RMSE = np.sqrt(mean_squared_error(validation_fold["id"].map(y_hat_med).values, validation_fold["y"]))
        
        average_RMSE += RMSE
        print("Validation fold {0} RMSE: {1}".format(nb_fold, RMSE))

    average_RMSE /= n_splits

    print("Cross-validation score: {0}\n".format(average_RMSE))
    
    return {"loss": average_RMSE, "status": STATUS_OK}

In [6]:
t0 = time()

# Grid to pick parameters from.
parameters_grid = {"steps"             : hp.choice("steps", np.arange(1500, 3000, 100, dtype=int)),
                   "nb_neurons_1"      : hp.choice("nb_neurons_1", np.arange(8, 11, 1, dtype=int)),
                   "outliers_threshold": hp.quniform("outliers_threshold", 0.05, 0.051, 0.01),
                   "gain_threshold"    : hp.quniform("gain_threshold", 0.01, 0.015, 0.005),
                   "dark_lim"      : hp.choice("dark_lim", np.arange(5, 15, 2, dtype=int)),
                   "bright_lim"      : hp.choice("bright_lim", np.arange(45, 65, 3, dtype=int))
                   #"dropout": hp.quniform("dropout", 0.2, 0.4, 0.1)
                   #"l2_reg": hp.quniform("l2_reg", 0.00, 0.005, 0.01)
                   #"nb_neurons_2": hp.choice("nb_neurons_2", np.arange(5, 10, 1, dtype=int))
                  }
# Record the information about the cross-validation.
trials = Trials()

best = fmin(scoring_function, parameters_grid, algo=tpe.suggest, max_evals=10, 
            trials=trials)

computing_time = time() - t0

Training the model with parameters: 
{'gain_threshold': 0.015, 'steps': 1800, 'bright_lim': 57, 'dark_lim': 7, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Pruning 0 RMSE: 0.707926010804
Validation fold 1 RMSE: 0.833349159011
Pruning 0 RMSE: 0.666755427479
Validation fold 2 RMSE: 0.731854603826
Pruning 0 RMSE: 0.693536447791
Validation fold 3 RMSE: 0.733400432007
Pruning 0 RMSE: 0.68043379367
Validation fold 4 RMSE: 0.677572193735
Pruning 0 RMSE: 0.67317091118
Validation fold 5 RMSE: 0.761922833772
Cross-validation score: 0.74761984447

Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 1900, 'bright_lim': 45, 'dark_lim': 5, 'nb_neurons_1': 10, 'outliers_threshold': 0.05}
Pruning 0 RMSE: 0.679712874649
Validation fold 1 RMSE: 0.820366717526
Pruning 0 RMSE: 0.655368022442
Validation fold 2 RMSE: 0.749181939289
Pruning 0 RMSE: 0.664312444281
Validation fold 3 RMSE: 0.714015154714
Pruning 0 RMSE: 0.671914445365
Validation fold 4 RMSE: 0.677189442398
Pruning 0 RMSE: 0

In [None]:
Training the model with parameters: 
{'gain_threshold': 0.015, 'steps': 3800, 'bright_lim': 57, 'dark_lim': 5, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Pruning 0 RMSE: 0.676104764855
Validation fold 1 RMSE: 0.665041688234
Pruning 0 RMSE: 0.656412476559
Validation fold 2 RMSE: 0.748346517177
Pruning 0 RMSE: 0.68494431793
Validation fold 3 RMSE: 0.727735868085
Pruning 0 RMSE: 0.666319748271
Validation fold 4 RMSE: 0.689188331182
Pruning 0 RMSE: 0.648822767318
Validation fold 5 RMSE: 0.781039517149
Cross-validation score: 0.722270384365
    
Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 3800, 'bright_lim': 57, 'dark_lim': 5, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Pruning 0 RMSE: 0.662375939065
Validation fold 1 RMSE: 0.643900356172
Pruning 0 RMSE: 0.652475471155
Validation fold 2 RMSE: 0.734333964908
Pruning 0 RMSE: 0.657422623728
Validation fold 3 RMSE: 0.724885333197
Pruning 0 RMSE: 0.668428420927
Validation fold 4 RMSE: 0.660678209034
Pruning 0 RMSE: 0.641852729063
Validation fold 5 RMSE: 0.766109689459
Cross-validation score: 0.705981510554

In [None]:
Discard darkest and brightest pixels median 10% 50% reflectance_3

Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 3700, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Pruning 0 RMSE: 0.689159277446
Validation fold 1 RMSE: 0.655020591793
Pruning 0 RMSE: 0.651604181078
Validation fold 2 RMSE: 0.742418209554
Pruning 0 RMSE: 0.642671871749
Validation fold 3 RMSE: 0.720319036944
Pruning 0 RMSE: 0.67571478665
Validation fold 4 RMSE: 0.688608452945
Pruning 0 RMSE: 0.64606435882
Validation fold 5 RMSE: 0.758060023685
Cross-validation score: 0.712885262984


Training the model with parameters: 
{'gain_threshold': 0.015, 'steps': 3700, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Pruning 0 RMSE: 0.665400041022
Validation fold 1 RMSE: 0.660441234798
Pruning 0 RMSE: 0.654655116457
Validation fold 2 RMSE: 0.742938722534
Pruning 0 RMSE: 0.698126608064
Validation fold 3 RMSE: 0.745117735753
Pruning 0 RMSE: 0.661949196442
Validation fold 4 RMSE: 0.673310937534
Pruning 0 RMSE: 0.634165512306
Validation fold 5 RMSE: 0.757140157901
Cross-validation score: 0.715789757704


Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 3700, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Pruning 0 RMSE: 0.679150582438
Validation fold 1 RMSE: 0.802537439314
Pruning 0 RMSE: 0.629692451165
Validation fold 2 RMSE: 0.727601475198
Pruning 0 RMSE: 0.655218441308
Validation fold 3 RMSE: 0.709988029457
Pruning 0 RMSE: 0.668796556874
Validation fold 4 RMSE: 0.67992545289
Pruning 0 RMSE: 0.636853380555
Validation fold 5 RMSE: 0.758553027167
Cross-validation score: 0.735721084805

Training the model with parameters: 
{'gain_threshold': 0.015, 'steps': 2000, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Pruning 0 RMSE: 0.670019304715
Validation fold 1 RMSE: 0.669165593689
Pruning 0 RMSE: 0.685869124116
Validation fold 2 RMSE: 0.766783748327
Pruning 0 RMSE: 0.653148077411
Validation fold 3 RMSE: 0.736898856297
Pruning 0 RMSE: 0.676924728634
Validation fold 4 RMSE: 0.673257464038
Pruning 0 RMSE: 0.653467276753
Validation fold 5 RMSE: 0.763890032813
Cross-validation score: 0.721999139033

Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 900, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Validation fold 1 RMSE: 0.667057389166
Validation fold 2 RMSE: 0.745337904245
Validation fold 3 RMSE: 0.720320570752
Validation fold 4 RMSE: 0.709991037768
Validation fold 5 RMSE: 0.765899342935
Cross-validation score: 0.721721248973

In [None]:
MODIS
Discard darkest and brightest pixels median 10% 50% reflectance_2
Training the model with parameters: 
{'gain_threshold': 0.015, 'steps': 900, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Validation fold 1 RMSE: 0.0927701092668
Validation fold 2 RMSE: 0.179845503027
Validation fold 3 RMSE: 0.113763215729
Validation fold 4 RMSE: 0.0987231631294
Validation fold 5 RMSE: 0.160676485502
Cross-validation score: 0.129155695331

Discard darkest and brightest pixels median 20% 50% reflectance_0
Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 900, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Validation fold 1 RMSE: 0.112719750528
Validation fold 2 RMSE: 0.141518078026
Validation fold 3 RMSE: 0.13111356705
Validation fold 4 RMSE: 0.117360147513
Validation fold 5 RMSE: 0.212927251907
Cross-validation score: 0.143127759005

In [None]:
MODIS train: 980 bags
Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 900, 'nb_neurons_1': 9, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.105343361018
Validation fold 1 RMSE: 0.0987015791108
Pruning 1 RMSE: 0.100121169819
Pruning 2 RMSE: 0.0939525682587
Validation fold 2 RMSE: 0.139847916988
Pruning 1 RMSE: 0.121320922084
Pruning 2 RMSE: 0.111532420256
Validation fold 3 RMSE: 0.13168961676
Pruning 1 RMSE: 0.10949595882
Pruning 2 RMSE: 0.103120828612
Validation fold 4 RMSE: 0.107883013046
Pruning 1 RMSE: 0.112995428702
Validation fold 5 RMSE: 0.188379683693
Cross-validation score: 0.13330036192

In [None]:
MODIS train: 1364 bags
    
Training the model with parameters: 
{'gain_threshold': 0.015, 'steps': 900, 'nb_neurons_1': 11, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.119817931796
Validation fold 1 RMSE: 0.111377722376
Pruning 1 RMSE: 0.112396098948
Pruning 2 RMSE: 0.101808581135
Validation fold 2 RMSE: 0.148817162914
Pruning 1 RMSE: 0.115222994126
Pruning 2 RMSE: 0.108615616905
Validation fold 3 RMSE: 0.110455499827
Pruning 1 RMSE: 0.111609881409
Validation fold 4 RMSE: 0.158362010857
Pruning 1 RMSE: 0.108284906653
Validation fold 5 RMSE: 0.120915922243
Cross-validation score: 0.129985663643


In [None]:
1 DNN

Training the model with parameters: 
{'gain_threshold': 0.01, 'steps': 900, 'nb_neurons_1': 11, 'outliers_threshold': 0.05}
Pruning 1 RMSE: 0.720018452178
Pruning 2 RMSE: 0.694637808863
Pruning 3 RMSE: 0.682604548885
Pruning 4 RMSE: 0.675240927353
Validation fold 1 RMSE: 0.657330001629
Pruning 1 RMSE: 0.67466681128
Pruning 2 RMSE: 0.65951277037
Pruning 3 RMSE: 0.649785148888
Validation fold 2 RMSE: 0.748604123648
Pruning 1 RMSE: 0.68920733019
Pruning 2 RMSE: 0.673928761686
Pruning 3 RMSE: 0.663794898173
Pruning 4 RMSE: 0.656852425765
Validation fold 3 RMSE: 0.75238232566
Pruning 1 RMSE: 0.695945822399
Pruning 2 RMSE: 0.677436347483
Pruning 3 RMSE: 0.667360192164
Pruning 4 RMSE: 0.661270048569
Validation fold 4 RMSE: 0.703611985649
Pruning 1 RMSE: 0.675491954494
Pruning 2 RMSE: 0.654833366872
Pruning 3 RMSE: 0.641692836958
Pruning 4 RMSE: 0.633366415513
Validation fold 5 RMSE: 0.784596896669
Cross-validation score: 0.729305066651


In [42]:
min(trials.losses())

0.6523287945244072

In [43]:
# Save the best parameters as a csv.
best_parameters = pd.DataFrame({key: [value] for (key, value) in 
                                zip(space_eval(parameters_grid, best).keys(),
                                    space_eval(parameters_grid, best).values())})
# Add the corresponding score.
best_parameters["score"] = min(trials.losses())
best_parameters.to_csv("best_parameters_7.csv", encoding="utf-8", index=False)

best_parameters

Unnamed: 0,gain_threshold,nb_neurons_1,outliers_threshold,steps,score
0,0.0075,10,0.05,900,0.652329


### Training models

In [8]:
cols_dnn = cols_orig
models_weights = {"dnn_1": 1.0}
models_cols = {"dnn_1": cols_dnn}
best_parameters = pd.read_csv("best_parameters_6.csv", encoding="utf-8")
model_dir = "./log_submit_6"

feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]
        
# Tune number of layers
model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                          hidden_units=[best_parameters["nb_neurons_1"][0]],
                                          model_dir=model_dir)

def input_fn(data_set):
    feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
    labels = tf.constant(data_set["y"].values)
    return feature_cols, labels

model_dnn.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps"][0])
        
train_pred = train[["id"]].assign(y_hat=0)
temp = model_dnn.predict(input_fn=lambda: input_fn(train))
# .predict() returns an iterator; convert to an array
y_hat = np.array(list(itertools.islice(temp, 0, None)))
train_pred["y_hat"] = y_hat

# Use median value by id
y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med).values, train["y"]))
        
# Prune outliers
RMSE_decreasing = True
count = 0
while (RMSE_decreasing):
    count += 1
    train_pred["y_med"] = train_pred["id"].map(y_hat_med)

    # Distance from the median for each bag
    train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"])**2
    # Rank of each instance by bag
    train_pred["rank"] = train_pred.groupby("id")["score"].rank()
    bag_size_dict = train_pred.groupby("id")["score"].count().to_dict()
    train_pred["bag_size"] = train_pred["id"].map(bag_size_dict)
    train_pred["rank"] = train_pred["rank"] / train_pred["bag_size"]

    # Remove outliers
    outliers_index = train_pred["rank"] > (1 - best_parameters["outliers_threshold"][0])
    train = train.loc[~outliers_index, :].reset_index(drop=True)

    model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                              hidden_units=[best_parameters["nb_neurons_1"][0]],
                                              model_dir=model_dir)

    model_dnn.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps"][0])

    # Compute new RMSE
    train_pred = train[["id"]].assign(y_hat=0)
            
    temp = model_dnn.predict(input_fn=lambda: input_fn(train))
    # .predict() returns an iterator; convert to an array
    y_hat = np.array(list(itertools.islice(temp, 0, None)))
    train_pred["y_hat"] = y_hat

    # Use median value by id
    y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

    new_RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med), train["y"]))
    print("Pruning {0} RMSE: {1}".format(count, new_RMSE))

    if (abs(new_RMSE - RMSE) > best_parameters["gain_threshold"][0]):
        RMSE = new_RMSE
    else:
        RMSE_decreasing = False
        
# Training model
model_dnn_1 = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                            hidden_units=[best_parameters["nb_neurons_1"][0]])
model_dnn_1.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps"][0])

models = {"dnn_1": model_dnn_1}

Pruning 1 RMSE: 0.623854414971
Pruning 2 RMSE: 0.612112933147
Pruning 3 RMSE: 0.605667733108


### Predicting on test set

In [9]:
test_pred = test[["id"]].assign(y_hat=0).reset_index(drop=True)
for i, m in models.items():
    temp = m.predict(input_fn=lambda: input_fn(test))
    # .predict() returns an iterator; convert to an array
    y_hat = np.array(list(itertools.islice(temp, 0, None)))
    test_pred["y_hat"] += models_weights[i] * y_hat

# Use median value by id
y_hat_med = test_pred.groupby("id").median()["y_hat"]

In [10]:
RMSE = np.sqrt(mean_squared_error(test_pred["id"].map(y_hat_med).values, test["y"]))
RMSE
#0.65725435012348465 for Pred 4
#0.65362864377856866 for Pred 5

0.71423164615804291

In [11]:
kaggle_pred = pd.DataFrame({"Id": y_hat_med.index, "y": y_hat_med.values})
kaggle_pred.to_csv("Prediction_6.csv", encoding="utf-8", index=False)

### Benchmark:
* Submit 1 (ensemble of xgboost + 2 ridge with instances model)
eta	eval_metric	gamma	lambda	max_depth	min_child_weight	nthread	objective	seed	silent	subsample	score
0.91834 Public LB 300 trees 0.09	rmse	0.2	0.8	4	4.0	-1	reg:linear	22	0	0.7	0.883339 (cross-val)
    
* Submit 2

Pruning with linear regression
then add contributions of aggregated xgboost + linear model

0.78181 Public LB  0.779345 CV

* Submit 3
DNN pruning
(wrong CV)
0.73270 Public LB 0.663128 CV with DNN 10 neurons 900 steps gain_threshold = 0.01 outliers_threshold = 0.05

* Prediction 4
LB 0.74713
Ensemble of 5 DNN
{'gain_threshold': 0.015, 'steps': 3800, 'nb_neurons_1': 10, 'outliers_threshold': 0.05} CV 0.701658836521
using fist pruning DNN
* Prediction 5
LB 0.74453
Ensemble of 5 DNN
{'gain_threshold': 0.01, 'steps': 2400, 'nb_neurons_1': 10, 'outliers_threshold': 0.05} CV 0.707300896236

In [None]:
Dropout
L2 regu
Validation set instead of cross val

### EM algorithm

In [None]:
def gamma_i_j(j, pi_i, X_i, y_i, delta):
    out = 0.0
    out = pi_i[j] * norm.pdf(y_i, X_i[j], delta)
    out /= sum(pi_i * norm.pdf(y_i, X_i, delta))
    return(out)

def EM_Q(pi, X, y, delta):
    
    
    

cols_dnn = cols_orig

best_parameters = pd.read_csv("best_parameters_6.csv", encoding="utf-8")

feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]

model_dir_f = "./log_submit_6"

# Fit DNN regressor
model_dnn_f = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                            hidden_units=[best_parameters["nb_neurons_f"][0]],
                                            model_dir=model_dir_f)

def input_fn(data_set):
    feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
    labels = tf.constant(data_set["y"].values)
    return feature_cols, labels

model_dnn_f.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps_f"][0])

# Train prediction
train_pred = train[["id"]].assign(y_hat=0, pi_hat=0)
temp = model_dnn_f.predict(input_fn=lambda: input_fn(train))
y_hat = np.array(list(itertools.islice(temp, 0, None)))
train_pred["y_hat"] = y_hat

# Fit DNN softmax
model_dir_g = "./g_log_EM"
model_dnn_g = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                            hidden_units=[best_parameters["nb_neurons_g"][0]],
                                            model_dir=model_dir_g)

model_dnn_g.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps_g"][0])
temp = model_dnn_g.predict(input_fn=lambda: input_fn(train))
pi_hat = np.array(list(itertools.islice(temp, 0, None)))

# Compute softmax
train_pred["pi_hat"] = np.exp(pi_hat)
pi_hat_sum_dict = train_pred.groupby("id")["pi_hat"].sum().to_dict()
# "map" is actually much faster than "replace"
train_pred["pi_hat_sum"] = train_pred["id"].map(pi_hat_sum_dict)
train_pred["pi_hat"] /= train_pred["pi_hat_sum"]

# EM algorithm
# Change to Q
EM_decreasing = True
nb_iteration = 0
while (EM_decreasing):
    nb_iteration +=1

    model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                              hidden_units=[best_parameters["nb_neurons_f"][0]],
                                              model_dir=model_dir)

    model_dnn.fit(input_fn=lambda: input_fn(train), steps=best_parameters["steps_f"][0])

    # Compute new RMSE
    train_pred = train[["id"]].assign(y_hat=0)
            
    temp = model_dnn.predict(input_fn=lambda: input_fn(train))
    # .predict() returns an iterator; convert to an array
    y_hat = np.array(list(itertools.islice(temp, 0, None)))
    train_pred["y_hat"] = y_hat


    print("Iteration {0} EM: {1}".format(nb_iteration, new_RMSE))

    if (abs(new_RMSE - RMSE) > 0.1):
        RMSE = new_RMSE
    else:
        EM_decreasing = False

models = {"dnn_1": model_dnn}