In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from hyperopt import STATUS_OK, hp, fmin, tpe, Trials, space_eval

from time import time
import operator

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)

### Start training

In [2]:
# Import data
def load_data():
    full_data = pd.read_csv("X.csv")
    train_y = pd.read_csv("ytr.csv")
    # Rename columns to something more interpretable
    columns = (["reflectance_" + str(i) for i in range(7)]
               + ["solar_" + str(i) for i in range(5)] + ["id"])
    full_data.columns = columns
    # Add y to the data frame
    split = 98000
    y_id_dict = train_y.set_index("Id")["y"].to_dict()
    full_data.loc[:(split-1), "y"] = full_data.loc[:(split-1), "id"].replace(y_id_dict)

    train, test = full_data[:split], full_data[split:]
    return (train, test)

#columns = (["id"] + ["reflectance_" + str(i) for i in range(7)]
#           + ["solar_" + str(i) for i in range(5)] + ["y"])
#full_data = pd.read_csv("MODIS.csv", header=None, names=columns)
#split = 98000
#train, test = full_data[:split].copy(), full_data[split:].copy()
train, test = load_data()

# Parameters
outliers_threshold = 0.05
n_threads = -1
random_seed = 22

In [3]:
train.head()

Unnamed: 0,reflectance_0,reflectance_1,reflectance_2,reflectance_3,reflectance_4,reflectance_5,reflectance_6,solar_0,solar_1,solar_2,solar_3,solar_4,id,y
0,1.580642,2.482233,5.887092,4.732722,4.408482,3.830171,4.388508,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
1,2.338455,3.627796,4.723716,3.324726,2.743442,4.727652,2.810193,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
2,2.224569,3.522241,6.188831,4.389783,4.177616,4.945918,4.122848,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
3,1.717218,2.712012,5.024211,3.944907,3.393424,3.931973,3.489578,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082
4,2.378857,3.644976,4.515292,3.223825,2.739952,4.599662,2.781574,22.572888,63.58724,88.05048,4.495216,-50.699904,1,-3.998082


### Preprocessing

In [4]:
cols_excl = ["id", "y"]
cols_orig = [c for c in train.columns if c not in cols_excl]

### Cross-validation

In [7]:
## Gradient boosting
cols_dnn = cols_orig

models_weights = {"dnn": 1.0}
models_cols = {"dnn": cols_dnn}
tf.logging.set_verbosity(tf.logging.ERROR)

# Scoring function in the hyperopt hyperparameters tuning.
def scoring_function(parameters):
    print("Training the model with parameters: ")
    print(parameters)
    average_RMSE = 0.0
    n_splits = 5

    kf = KFold(n_splits=n_splits)
    for train_index, validation_index in kf.split(train):
        train_fold, validation_fold = train.loc[train_index], train.loc[validation_index] 

        #model_lr0 = LinearRegression()
        #model_lr0.fit(train_fold[cols_lr0], train_fold["y"])
        feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]
        
        # Tune number of layers
        model_dir = ("./temp_log_"
                     + str(parameters["steps"]) + "_"
                     + str(parameters["nb_neurons_1"]) #+ "_"
                     #+ str(parameters["nb_neurons_2"])
                    )
        model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                                  hidden_units=[parameters["nb_neurons_1"]],
                                                                #parameters["nb_neurons_2"]],
                                                  #optimizer=tf.train.ProximalAdagradOptimizer(
                                                   #   learning_rate=0.1,
                                                    #  l1_regularization_strength=0.001),
                                                  model_dir=model_dir)

        def input_fn(data_set):
            feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
            labels = tf.constant(data_set["y"].values)
            return feature_cols, labels

        model_dnn.fit(input_fn=lambda: input_fn(train_fold), steps=parameters["steps"])
        
        models = {"dnn": model_dnn}

        train_pred = train_fold[["id"]].assign(y_hat=0)
        for i, m in models.items():
            temp = m.predict(input_fn=lambda: input_fn(train_fold))
            # .predict() returns an iterator; convert to an array
            y_hat = np.array(list(itertools.islice(temp, 0, None)))
            train_pred["y_hat"] += models_weights[i] * y_hat

        # Use median value by id
        y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

        RMSE = np.sqrt(mean_squared_error(train_pred["id"].replace(y_hat_med).values, train_fold["y"]))
        
        # Prune outliers
        RMSE_decreasing = True
        count = 0
        while (RMSE_decreasing):
            count +=1
            if ((count % 2) == 0):
                print(count)
            train_pred["y_med"] = train_pred["id"].replace(y_hat_med)

            # Distance from the median for each bag
            train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"])**2
            # Rank of each instance by bag
            train_pred["rank"] = train_pred.groupby("id")["score"].rank()
            bag_size_dict = train_pred.groupby("id")["score"].count().to_dict()
            train_pred["bag_size"] = train_pred["id"].replace(bag_size_dict)
            train_pred["rank"] = train_pred["rank"] / train_pred["bag_size"]

            # Remove outliers
            outliers_index = train_pred["rank"] > (1 - outliers_threshold)
            train_fold = train_fold.loc[~outliers_index, :].reset_index(drop=True)

            #model_lr0 = LinearRegression()
            #model_lr0.fit(train_fold[cols_lr0], train_fold["y"])
            model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                                      hidden_units=[parameters["nb_neurons_1"]],
                                                                    #parameters["nb_neurons_2"]],
                                                      #optimizer=tf.train.ProximalAdagradOptimizer(
                                                       #   learning_rate=0.1,
                                                        #  l1_regularization_strength=0.001),
                                                      model_dir=model_dir)

            model_dnn.fit(input_fn=lambda: input_fn(train_fold), steps=parameters["steps"])

            models = {"dnn": model_dnn}

            # Compute new RMSE
            train_pred = train_fold[["id"]].assign(y_hat=0)
            
            for i, m in models.items():
                temp = m.predict(input_fn=lambda: input_fn(train_fold))
                # .predict() returns an iterator; convert to an array
                y_hat = np.array(list(itertools.islice(temp, 0, None)))
                train_pred["y_hat"] += models_weights[i] * y_hat

            # Use median value by id
            y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

            new_RMSE = np.sqrt(mean_squared_error(train_pred["id"].replace(y_hat_med), train_fold["y"]))
            if ((count % 2) == 0):
                print(new_RMSE)

            if (new_RMSE < RMSE):
                RMSE = new_RMSE
            else:
                RMSE_decreasing = False

        # Compute RMSE on validation set
        validation_pred = validation_fold[["id"]].assign(y_hat=0).reset_index(drop=True)
        for i, m in models.items():
            if (i == "dnn"):
                temp = m.predict(input_fn=lambda: input_fn(validation_fold))
                # .predict() returns an iterator; convert to an array
                y_hat = np.array(list(itertools.islice(temp, 0, None)))
                validation_pred["y_hat"] += models_weights[i] * y_hat
            else:
                validation_pred["y_hat"] += models_weights[i] * m.predict(validation_fold[models_cols[i]])
            
        # Use median value by id
        y_hat_med = validation_pred.groupby("id").median()["y_hat"].to_dict()
        
        RMSE = np.sqrt(mean_squared_error(validation_pred["id"].replace(y_hat_med).values, validation_fold["y"]))
        average_RMSE += RMSE
        print("Current validation RMSE: {0}".format(RMSE))

    average_RMSE /= n_splits

    print("Cross-validation score: {0}\n".format(average_RMSE))
    
    return {"loss": average_RMSE, "status": STATUS_OK}

In [8]:
t0 = time()

# Grid to pick parameters from.
parameters_grid = {#"eta"              : hp.quniform("eta", 0.1, 0.3, 0.1),
                   #"max_depth"        : hp.choice("max_depth", np.arange(4, 5, dtype=int)),
                   #"min_child_weight": hp.quniform("min_child_weight", 10, 15, 1),
                   "steps": hp.choice("steps", np.arange(500, 1000, 100, dtype=int)),
                   "nb_neurons_1": hp.choice("nb_neurons_1", np.arange(9, 11, 1, dtype=int))
                   #"nb_neurons_2": hp.choice("nb_neurons_2", np.arange(5, 10, 1, dtype=int))
                  }
# Record the information about the cross-validation.
trials = Trials()

best = fmin(scoring_function, parameters_grid, algo=tpe.suggest, max_evals=5, 
            trials=trials) 

computing_time = time() - t0

Training the model with parameters: 
{'nb_neurons_1': 9, 'steps': 500}


ValueError: ('num_outputs should be int or long, got %s.', 9)

In [None]:
0.818739649008
0.770475036604
0.771639976693
0.718388782706
0.817479974058
Cross-validation score: 0.779344683814

In [43]:
min(trials.losses())

0.7793446838139222

In [45]:
# Save the best parameters as a csv.
best_parameters = pd.DataFrame({key: [value] for (key, value) in 
                                zip(space_eval(parameters_grid, best).keys(),
                                    space_eval(parameters_grid, best).values())})
# Add the corresponding score.
best_parameters["score"] = min(trials.losses())
best_parameters.to_csv("best_parameters_2.csv", encoding="utf-8", index=False)

best_parameters

Unnamed: 0,eta,eval_metric,lambda,max_depth,nthread,objective,seed,silent,subsample,score
0,0.4,rmse,0.4,4,-1,reg:linear,22,0,0.9,0.779345


### Training models

### Predicting

In [13]:
kaggle_pred = pd.DataFrame({"Id": y_hat_med.index, "y": y_hat_med.values})
kaggle_pred.to_csv("Prediction_2.csv", encoding="utf-8", index=False)

### Benchmark:
Submit 1 (ensemble of xgboost + 2 ridge with instances model)
eta	eval_metric	gamma	lambda	max_depth	min_child_weight	nthread	objective	seed	silent	subsample	score
0.91834 Public LB 300 trees 0.09	rmse	0.2	0.8	4	4.0	-1	reg:linear	22	0	0.7	0.883339 (cross-val)
    
Submit 2

Pruning with linear regression
then add contributions of aggregated xgboost + linear model
0.78181 Public LB  0.779345 CV

In [22]:
tf.logging.set_verbosity(tf.logging.DEBUG)
average_RMSE = 0.0
n_splits = 5
steps = 900
nb_neurons_1 = 9

kf = KFold(n_splits=n_splits)
for train_index, validation_index in kf.split(train):
    train_fold, validation_fold = train.loc[train_index], train.loc[validation_index] 

    #model_lr0 = LinearRegression()
    #model_lr0.fit(train_fold[cols_lr0], train_fold["y"])
    feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]

    # Tune number of layers
    model_dir = ("./temp_log_"
                 + str(steps) + "_"
                 + str(nb_neurons_1) #+ "_"
                 #+ str(parameters["nb_neurons_2"])
                )
    
    validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
        input_fn=lambda: input_fn(train_fold),
        early_stopping_metric="loss",
        early_stopping_metric_minimize=True,
        early_stopping_rounds=200)

    model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                              hidden_units=[nb_neurons_1],
                                                            #parameters["nb_neurons_2"]],
                                              #optimizer=tf.train.ProximalAdagradOptimizer(
                                               #   learning_rate=0.1,
                                                #  l1_regularization_strength=0.001),
                                              model_dir=model_dir)

    def input_fn(data_set):
        feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
        labels = tf.constant(data_set["y"].values)
        return feature_cols, labels

    model_dnn.fit(input_fn=lambda: input_fn(train_fold), steps=steps)

    models = {"dnn": model_dnn}

    train_pred = train_fold[["id"]].assign(y_hat=0)
    for i, m in models.items():
        temp = m.predict(input_fn=lambda: input_fn(train_fold))
        # .predict() returns an iterator; convert to an array
        y_hat = np.array(list(itertools.islice(temp, 0, None)))
        train_pred["y_hat"] += models_weights[i] * y_hat

    # Use median value by id
    y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

    RMSE = np.sqrt(mean_squared_error(train_pred["id"].replace(y_hat_med).values, train_fold["y"]))

    # Prune outliers
    RMSE_decreasing = True
    count = 0
    while (RMSE_decreasing):
        count +=1
        if ((count % 2) == 0):
            print(count)
        train_pred["y_med"] = train_pred["id"].replace(y_hat_med)

        # Distance from the median for each bag
        train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"])**2
        # Rank of each instance by bag
        train_pred["rank"] = train_pred.groupby("id")["score"].rank()
        bag_size_dict = train_pred.groupby("id")["score"].count().to_dict()
        train_pred["bag_size"] = train_pred["id"].replace(bag_size_dict)
        train_pred["rank"] = train_pred["rank"] / train_pred["bag_size"]

        # Remove outliers
        outliers_index = train_pred["rank"] > (1 - outliers_threshold)
        train_fold = train_fold.loc[~outliers_index, :].reset_index(drop=True)
        
        validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
        input_fn=lambda: input_fn(train_fold),
        early_stopping_metric="loss",
        early_stopping_metric_minimize=True,
        early_stopping_rounds=200)

        #model_lr0 = LinearRegression()
        #model_lr0.fit(train_fold[cols_lr0], train_fold["y"])
        model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                                  hidden_units=[nb_neurons_1],
                                                                #parameters["nb_neurons_2"]],
                                                  #optimizer=tf.train.ProximalAdagradOptimizer(
                                                   #   learning_rate=0.1,
                                                    #  l1_regularization_strength=0.001),
                                                  model_dir=model_dir)

        model_dnn.fit(input_fn=lambda: input_fn(train_fold), steps=steps)

        models = {"dnn": model_dnn}

        # Compute new RMSE
        train_pred = train_fold[["id"]].assign(y_hat=0)

        for i, m in models.items():
            temp = m.predict(input_fn=lambda: input_fn(train_fold))
            # .predict() returns an iterator; convert to an array
            y_hat = np.array(list(itertools.islice(temp, 0, None)))
            train_pred["y_hat"] += models_weights[i] * y_hat

        # Use median value by id
        y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

        new_RMSE = np.sqrt(mean_squared_error(train_pred["id"].replace(y_hat_med), train_fold["y"]))
        if ((count % 2) == 0):
            print(new_RMSE)

        if (new_RMSE < RMSE):
            RMSE = new_RMSE
        else:
            RMSE_decreasing = False

    # Compute RMSE on validation set
    validation_pred = validation_fold[["id"]].assign(y_hat=0).reset_index(drop=True)
    for i, m in models.items():
        if (i == "dnn"):
            temp = m.predict(input_fn=lambda: input_fn(validation_fold))
            # .predict() returns an iterator; convert to an array
            y_hat = np.array(list(itertools.islice(temp, 0, None)))
            validation_pred["y_hat"] += models_weights[i] * y_hat
        else:
            validation_pred["y_hat"] += models_weights[i] * m.predict(validation_fold[models_cols[i]])

    # Use median value by id
    y_hat_med = validation_pred.groupby("id").median()["y_hat"].to_dict()

    RMSE = np.sqrt(mean_squared_error(validation_pred["id"].replace(y_hat_med).values, validation_fold["y"]))
    average_RMSE += RMSE
    print("Current validation RMSE: {0}".format(RMSE))

average_RMSE /= n_splits

print("Cross-validation score: {0}\n".format(average_RMSE))

Instructions for updating:
Monitors are deprecated. Please use tf.train.SessionRunHook.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_summary_steps': 100, '_task_id': 0, '_master': '', '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint_max': 5, '_tf_random_seed': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_task_type': None, '_is_chief': True, '_environment': 'local', '_num_ps_replicas': 0, '_evaluation_master': '', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000021664DC4C50>, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None}
DEBUG:tensorflow:Setting feature info to {'reflectance_6': TensorSignature(dtype=tf.float64, shape=TensorShape([Dimension(78400)]), is_sparse=False), 'solar_1': TensorSignature(dtype=tf.float64, shape=TensorShape([Dimension(78400)]), is_sparse=False), 'reflectance_2': TensorSignature(dtype=tf.float64, shape=TensorShape([Dimension(78400)]), is_sp

KeyboardInterrupt: 