In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#import keras as keras

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from sklearn.utils import resample
from hyperopt import STATUS_OK, hp, fmin, tpe, Trials, space_eval

from time import time
import operator

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.ERROR)

In [20]:

# IMPORT DATA

def load_data():
    full_data = pd.read_csv("X.csv")
    train_y = pd.read_csv("ytr.csv")
    # Rename columns to something more interpretable
    columns = (["reflectance_" + str(i) for i in range(7)]
               + ["solar_" + str(i) for i in range(5)] + ["id"])
    full_data.columns = columns
    # Add y to the data frame
    split = 98000
    y_id_dict = train_y.set_index("Id")["y"].to_dict()
    full_data.loc[:(split-1), "y"] = full_data.loc[:(split-1), "id"].map(y_id_dict)

    train, test = full_data[:split], full_data[split:]
    return (train, test)

train, test = load_data()


In [21]:
# SET UP DATA

cols_excl = ["id", "y"]
cols_orig = [c for c in train.columns if c not in cols_excl]


# Standardise data for LR
train[cols_orig] = scale(train[cols_orig])
test[cols_orig] = scale(test[cols_orig])
cols_dnn = cols_orig


In [22]:
# SET UP MODEL FIT

feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]
        
model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols, hidden_units=[10])

def input_fn(data_set):
    feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
    labels = tf.constant(data_set["y"].values)
    return feature_cols, labels


In [23]:
# MODEL FIT

model_dnn.fit(input_fn=lambda: input_fn(train), steps=900)

DNNRegressor(params={'head': <tensorflow.contrib.learn.python.learn.estimators.head._RegressionHead object at 0x11bac3150>, 'hidden_units': [10], 'feature_columns': (_RealValuedColumn(column_name='reflectance_0', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='reflectance_1', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='reflectance_2', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='reflectance_3', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='reflectance_4', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='reflectance_5', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='reflectance_6', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(colu

In [24]:
# GET THE COMPUTED TRAINED PREDICTIONS

train_pred = train[["id"]].assign(y_hat=0)
temp = model_dnn.predict(input_fn=lambda: input_fn(train))
# .predict() returns an iterator; convert to an array
y_hat = np.array(list(itertools.islice(temp, 0, None)))
train_pred["y_hat"] = y_hat

# Use median value by id
y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med).values, train["y"]))

In [8]:
RMSE

0.7353768786645285

In [15]:
# PRUNE 

RMSE_decreasing = True
count = 0
while (RMSE_decreasing):
    count +=1
    train_pred["y_med"] = train_pred["id"].map(y_hat_med)

    # Distance from the median for each bag
    train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"])**2
    # Rank of each instance by bag
    train_pred["rank"] = train_pred.groupby("id")["score"].rank()
    bag_size_dict = train_pred.groupby("id")["score"].count().to_dict()
    train_pred["bag_size"] = train_pred["id"].map(bag_size_dict)
    train_pred["rank"] = train_pred["rank"] / train_pred["bag_size"]

    # Remove outliers
    outliers_index = train_pred["rank"] > (1 - 0.05)
    train = train.loc[~outliers_index, :].reset_index(drop=True)

    model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
                                              hidden_units=[10])
                                              #model_dir=model_dir)

    model_dnn.fit(input_fn=lambda: input_fn(train), steps=900)

    # Compute new RMSE
    train_pred = train[["id"]].assign(y_hat=0)
            
    temp = model_dnn.predict(input_fn=lambda: input_fn(train))
    # .predict() returns an iterator; convert to an array
    y_hat = np.array(list(itertools.islice(temp, 0, None)))
    train_pred["y_hat"] = y_hat

    # Use median value by id
    y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

    new_RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med), train["y"]))
    print("Pruning {0} RMSE: {1}".format(count, new_RMSE))

    if (abs(new_RMSE - RMSE) > 0.005):
        RMSE = new_RMSE
    else:
        RMSE_decreasing = False

Pruning 1 RMSE: 0.71365308287
Pruning 2 RMSE: 0.714609723543


In [7]:
id_y_dict = dict(zip(train["id"], train["y"]))

train_pred["y"] = train_pred["id"].map(id_y_dict)


In [29]:
train_pred.sort_values(by=["id", "y_hat"]).reset_index(drop=True)

Unnamed: 0,id,y_hat
0,1,-4.261970
1,1,-4.007077
2,1,-3.996592
3,1,-3.906450
4,1,-3.856299
5,1,-3.813319
6,1,-3.784270
7,1,-3.781947
8,1,-3.777163
9,1,-3.768548


In [11]:
train_pred.groupby("id")["y_hat"].count().iloc[0]

100

In [44]:
nb_inst_bag = train_pred.groupby("id")["y"].count()[1]
train_ensemble = train_pred.groupby("id")["y"].first().reset_index()
cols_ensemble = []

for i in range(nb_inst_bag):
    cols_ensemble.append("y_hat_" + str(i))
    train_ensemble["y_hat_" + str(i)] = train_pred.groupby("id")["y_hat"].nth(i).values
train_ensemble

Unnamed: 0,id,y,y_hat_0,y_hat_1,y_hat_2,y_hat_3,y_hat_4,y_hat_5,y_hat_6,y_hat_7,...,y_hat_80,y_hat_81,y_hat_82,y_hat_83,y_hat_84,y_hat_85,y_hat_86,y_hat_87,y_hat_88,y_hat_89
0,1,-3.998082,-3.859344,-3.456238,-3.693035,-3.419001,-3.702095,-3.358011,-3.927630,-3.377776,...,-3.443006,-3.281841,-3.453025,-3.466390,-3.547405,-3.332742,-3.511861,-3.413926,-3.531065,-3.599279
1,2,-4.137141,-4.803325,-4.774505,-4.821686,-4.343932,-4.968723,-4.434314,-4.572211,-4.709503,...,-4.559758,-4.700127,-4.652413,-4.448805,-4.435921,-4.699460,-4.684771,-4.763186,-4.892138,-4.942204
2,3,-2.694732,-3.128916,-2.192714,-3.240518,-2.851692,-2.912837,-2.751898,-2.601789,-2.559187,...,-2.997061,-2.656528,-2.801065,-3.006781,-2.862188,-2.981581,-3.072716,-2.972567,-2.720337,-3.045454
3,4,-3.296275,-3.816792,-4.116613,-3.891380,-3.577295,-3.499766,-3.690180,-2.847764,-3.421826,...,-3.883896,-3.259821,-3.015216,-4.127945,-3.308504,-3.513927,-3.504202,-3.389528,-3.675390,-3.356594
4,5,-3.181391,-4.140049,-3.637087,-3.894979,-3.552995,-4.039518,-3.677432,-4.048384,-3.866789,...,-3.649606,-3.680826,-3.964107,-3.923158,-3.697424,-3.952579,-3.372876,-3.663009,-3.639478,-4.069396
5,6,-3.146784,-1.527227,-0.854673,-2.679666,-2.384652,-2.665643,-2.514122,-1.603060,-2.659630,...,-2.268713,-2.513000,-1.365386,-2.281217,-1.969198,-1.984604,-2.418079,-2.459711,-2.372190,-2.673518
6,7,-3.438566,-3.531390,-2.822509,-2.824214,-2.948595,-2.636122,-2.984314,-2.866295,-2.783478,...,-3.205682,-3.124938,-2.508930,-2.675796,-2.744712,-2.362571,-2.214516,-2.080705,-3.439501,-2.685672
7,8,-4.399075,-4.116776,-4.435511,-4.211863,-4.240291,-4.149982,-4.206227,-4.123328,-4.211972,...,-4.318620,-4.324581,-4.377769,-4.259295,-4.139483,-4.249089,-4.233586,-4.079061,-4.369022,-4.363676
8,9,-3.392373,-3.821851,-4.219061,-3.564632,-3.617903,-3.468403,-3.482295,-4.021415,-3.767387,...,-3.790689,-4.127939,-4.002644,-4.029432,-3.967799,-3.616828,-3.970571,-4.059723,-4.011108,-3.959126
9,10,-2.602236,-3.410192,-2.534967,-3.166157,-3.284461,-3.055584,-3.407826,-2.537336,-3.196321,...,-3.088762,-3.200032,-3.443712,-2.394433,-3.571020,-3.159284,-3.388689,-3.365442,-2.844266,-2.805498


In [8]:
#cols_ensemble

In [9]:
true_y = []
predicted_y_training = []


for i in range(1,981):
    xx = (train_pred[train_pred["id"] == i])
    true_y.append(xx.iloc[0,2])
    
for i in range(1,981):
    xx = (train_pred[train_pred["id"] == i])
    predicted_y_training.append(xx.loc[:,"y_hat"])

    

In [10]:
np.shape(predicted_y_training)

(980, 100)

In [11]:
FEATURES2 = []
COLUMNS2 = []
for i in range(1,101):
    COLUMNS2.append(str(i))
    FEATURES2.append(str(i))
    
LABEL = ["y"]
ID = ["id"]

id_column = []
for i in range(1,981):
    id_column.append(i)
    

In [12]:
predicted_y_trainingArray = np.asarray(predicted_y_training)
true_yArray = np.asarray(true_y)
id_columnArray = np.asarray(id_column)

df_predicted_y = pd.DataFrame(predicted_y_trainingArray, columns=COLUMNS2)
df_true_y =  pd.DataFrame(true_yArray, columns=LABEL)
df_id_columnArray = pd.DataFrame(id_columnArray, columns=ID)

df_predicted_true = pd.concat([df_predicted_y,df_id_columnArray,df_true_y], axis=1)


In [13]:
df_predicted_true

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,93,94,95,96,97,98,99,100,id,y
0,-3.159446,-3.501325,-3.042236,-2.843617,-3.723236,-2.918945,-3.710563,-3.116051,-3.651973,-3.027824,...,-3.177308,-3.406432,-3.510448,-3.418299,-2.865390,-3.366580,-3.588528,-2.936657,1,-3.998082
1,-4.804088,-4.660945,-5.182220,-4.823056,-3.907509,-4.907156,-3.809191,-4.299606,-4.309426,-4.609455,...,-4.581844,-4.090062,-4.127841,-4.604957,-4.614108,-4.660537,-4.870795,-4.763065,2,-4.137141
2,-1.790119,-2.883604,-2.673269,-3.081977,-2.741258,-2.796185,-2.684700,-2.632223,-2.906919,-2.871891,...,-2.727520,-2.698284,-2.479816,-2.794676,-2.714730,-2.267010,-2.669554,-2.824945,3,-2.694732
3,-3.856306,-4.246832,-4.204176,-3.522083,-2.720106,-3.519457,-3.702021,-3.127253,-2.426281,-3.831623,...,-3.232083,-4.270637,-3.504523,-3.560226,-3.562913,-3.492655,-3.782131,-3.469902,4,-3.296275
4,-4.002868,-3.451197,-3.807913,-3.543169,-3.859925,-3.585530,-3.918734,-3.786559,-3.864620,-3.984488,...,-3.546720,-3.927731,-3.325034,-4.258395,-3.644309,-4.406235,-3.435799,-4.017644,5,-3.181391
5,-1.608765,-1.417577,-2.423932,-2.472132,-1.344228,-2.976734,-2.802207,-1.585454,-2.920187,-2.036543,...,-1.252967,-2.513019,-1.844701,-1.922057,-2.524942,-2.619901,-2.638763,-2.540039,6,-3.146784
6,-3.318175,-3.112370,-2.902468,-1.931896,-3.125812,-2.756730,-1.537858,-3.199245,-2.904653,-2.749606,...,-2.062240,-2.812541,-2.989025,-2.609334,-2.501808,-2.821502,-3.600509,-2.880526,7,-3.438566
7,-4.004420,-4.312627,-4.602446,-4.410610,-4.699922,-4.374997,-3.657599,-4.631783,-4.281044,-4.482018,...,-4.409747,-4.366132,-4.513160,-4.437128,-4.312216,-4.512620,-3.586151,-4.587131,8,-4.399075
8,-3.807797,-4.416164,-3.586531,-3.779976,-3.912961,-3.836113,-3.941078,-4.261414,-3.940553,-3.473589,...,-3.988688,-4.192327,-4.077269,-3.927782,-4.007051,-4.136343,-4.203362,-3.962286,9,-3.392373
9,-3.331349,-2.762888,-3.100077,-3.216733,-3.554101,-3.122984,-3.256404,-2.420988,-3.172643,-3.574921,...,-3.639903,-2.608058,-3.655092,-3.163979,-3.270485,-3.308025,-2.771565,-2.646424,10,-2.602236


In [14]:
cols_excl2 = ["y","id"]
cols_orig2 = [c for c in df_predicted_true.columns if c not in cols_excl2]

In [16]:
# CROSS VAL
tf.logging.set_verbosity(tf.logging.ERROR)

average_RMSE = 0.0
n_splits = 5
    
kf = KFold(n_splits=n_splits)
nb_fold = 0
for train_index, validation_index in kf.split(df_predicted_true):
    
    nb_fold += 1
    train_fold, validation_fold = df_predicted_true.loc[train_index], df_predicted_true.loc[validation_index] 

    feature_cols2 = [tf.contrib.layers.real_valued_column(k) for k in cols_orig2]
    
    model_dir = ("tmp/model102")
        
    regressor = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols2,hidden_units=[24],model_dir=model_dir)

    def input_fn2(data_set):
        feature_cols2 = {k: tf.constant(data_set[k].values) for k in cols_orig2}
        labels2 = tf.constant(data_set["y"].values)
        return feature_cols2, labels2
        
    regressor.fit(input_fn=lambda: input_fn2(df_predicted_true), steps=1000)

    train_pred = train_fold[["id"]].assign(y_hat=0)
    #for i, m in models.items():
    temp = regressor.predict(input_fn=lambda: input_fn2(train_fold))
    # .predict() returns an iterator; convert to an array
    y_hat = np.array(list(itertools.islice(temp, 0, None)))
    train_pred["y_hat"] = y_hat
    
    
    validation_pred = validation_fold[["id"]].assign(y_hat=0).reset_index(drop=True)
   
    temp = regressor.predict(input_fn=lambda: input_fn2(validation_fold))
    # .predict() returns an iterator; convert to an array
    y_hat = np.array(list(itertools.islice(temp, 0, None)))
    validation_pred["y_hat"] += y_hat
            
    RMSE = np.sqrt(mean_squared_error(validation_pred["y_hat"],validation_fold["y"]))
        
    average_RMSE += RMSE
    print("Validation fold {0} RMSE: {1}".format(nb_fold, RMSE))

average_RMSE /= n_splits

print("Cross-validation score: {0}\n".format(average_RMSE))

Validation fold 1 RMSE: 0.658269448616
Validation fold 2 RMSE: 0.743730352313


KeyboardInterrupt: 