In [2]:
%matplotlib inline

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.cross_validation import train_test_split
from ml_metrics import rmsle
import xgboost as xgb
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import datasets, linear_model
from datetime import datetime
import tensorflow as tf
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt



In [3]:
def preprocess_features(bimbo_dataframe):
  """This function takes an input dataframe and returns a version of it that has
  various features selected and pre-processed.  The input dataframe contains
  data from the california_housing data set."""
  # Select fewer columns to allow training a bit faster.
  output_features = bimbo_dataframe[
    ["d1",
     "d2",
     "d3",
     "d4",
     "d5",
     "d6",
     "p_total_demand",
     "total_demand"]].copy()
  return output_features


def preprocess_targets(bimbo_dataframe):
  """This function selects and potentially transforms the output target from
  an input dataframe containing data from the california_housing data set.
  The object returned is a pandas Series."""
  output_targets = pd.DataFrame()
  # Scale the target to be in units of thousands of dollars.
  output_targets["Demanda_uni_equil"] = (
    bimbo_dataframe["Demanda_uni_equil"])
  return output_targets

In [4]:
train_joined = pd.read_csv('../input/train_1000000_alld_totals.csv')
test_joined = pd.read_csv('../input/test_1000000_alld_totals.csv')

In [5]:
# Randomize the data before selecting train / validation splits.
raw_training_df = train_joined.reindex(np.random.permutation(
  train_joined.index))

In [6]:
training_examples = preprocess_features(raw_training_df.head(100000))
training_targets = preprocess_targets(raw_training_df.head(100000))

validation_examples = preprocess_features(raw_training_df.tail(500))
validation_targets = preprocess_targets(raw_training_df.tail(500))

In [7]:
training_examples=training_examples.replace(-999,np.NaN)
validation_examples=training_examples.replace(-999,np.NaN)

In [8]:
# Sanity check that we've done the right thing.
print("Training examples summary:")
print(training_examples.describe())
print("Validation examples summary:")
print(validation_examples.describe())

print("Training targets summary:")
print(training_targets.describe())
print("Validation targets summary:")
print(validation_targets.describe())

Training examples summary:
                 d1            d2            d3            d4            d5  \
count  85042.000000  70237.000000  55974.000000  42293.000000  28231.000000   
mean       5.614896      4.789541      4.686765      4.588726      4.461514   
std       26.031033     17.158641     18.098319     16.162761     16.162597   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.000000      0.000000      0.000000      0.000000      0.000000   
50%        2.000000      1.000000      0.000000      0.000000      0.000000   
75%        5.000000      4.000000      4.000000      4.000000      4.000000   
max     3960.000000   1089.000000   1695.000000    806.000000    710.000000   

                 d6  p_total_demand   total_demand  
count  14161.000000    1.000000e+05  100000.000000  
mean       4.427159    4.915619e+06    1547.898540  
std       17.374971    5.352383e+06    5129.317755  
min        0.000000    0.000000e+00       0.00000

In [21]:
d1 = tf.contrib.layers.real_valued_column("d1")
d2 = tf.contrib.layers.real_valued_column("d2")
d3 = tf.contrib.layers.real_valued_column("d3")
d4 = tf.contrib.layers.real_valued_column("d4")
d5 = tf.contrib.layers.real_valued_column("d5")
d6 = tf.contrib.layers.real_valued_column("d6")

p_total_demand = tf.contrib.layers.real_valued_column("p_total_demand")
total_demand = tf.contrib.layers.real_valued_column("total_demand")


feature_columns=set([
  d1,
  d2,
  d3,
  d4,
  d5,
  d6,
  p_total_demand,
  total_demand])

In [22]:
#@test {"output": "ignore"}

def _input_fn(examples_df, targets_df):
  # Converts a pair of examples/targets DataFrames to Tensors. The Tensors are
  # reshaped into (N,1) where N is number of examples in the DataFrames.
  # -1 is a special value to tf.reshape that means "replace me with whatever
  # size allows all the input values to fit into the output tensor."
  features = {}
  for column_name in examples_df.keys():
    features[column_name] = tf.to_float(
      tf.reshape(tf.constant(examples_df[column_name].values), [-1,1]))
  label_tensor = tf.to_float(
    tf.reshape(tf.constant(targets_df[targets_df.keys()[0]].values), [-1,1]))

  # Return a dict of feature Tensors and label Tensor.
  return features, label_tensor

In [15]:
#@test {"output": "ignore", "timeout": 180}

LEARNING_RATE = 0.0007  # @param
STEPS = 5000  # @param
BATCH_SIZE = 70  # @param
HIDDEN_UNITS = [10, 10]  # @param
periods = 10
steps_per_period = STEPS / periods

# Set up our NN with the desired learning settings.
dnn_regressor = tf.contrib.learn.DNNRegressor(
  feature_columns=feature_columns,
  hidden_units=HIDDEN_UNITS,
  optimizer=tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE),
#  gradient_clip_norm=5.0
)
print "Training model..."
print "RMSE:"
root_mean_squared_errors_training = []
root_mean_squared_errors_validation = []
for period in range (0, periods):
  dnn_regressor.fit(
    training_examples,
    training_targets,
    steps=steps_per_period,
    batch_size=BATCH_SIZE
  )
  predictions_validation = dnn_regressor.predict(validation_examples)
  predictions_training = dnn_regressor.predict(training_examples)

  root_mean_squared_error_validation = math.sqrt(metrics.mean_squared_error(
    predictions_validation, validation_targets))
  root_mean_squared_error_training = math.sqrt(metrics.mean_squared_error(
    predictions_training, training_targets))

  root_mean_squared_errors_validation.append(root_mean_squared_error_validation)
  root_mean_squared_errors_training.append(root_mean_squared_error_training)

  print "  period %02d : %3.2f" % (period, root_mean_squared_error_training)

# Output a graph of loss metrics over periods.
plt.ylabel("RMSE")
plt.xlabel("Periods")
plt.title("Root Mean Squared Error vs. Periods")
plt.plot(root_mean_squared_errors_training, label='training')
plt.plot(root_mean_squared_errors_validation, label='validation')
plt.legend()

# Display some summary information.
print "Final RMSE (on training data):   %0.2f" % root_mean_squared_error_training
print "Final RMSE (on validation data): %0.2f" % root_mean_squared_error_validation

Training model...
RMSE:


KeyError: 'd1'

In [None]:
plt.ylabel("RMSE")
plt.xlabel("Periods")
plt.title("Root Mean Squared Error vs. Periods")
plt.tight_layout()
plt.plot(training_rmse, label="training")
plt.plot(validation_rmse, label="validation")
plt.legend()
plt.show()
print("Final RMSE (on training data): %0.2f" % (
  training_root_mean_squared_error))
print("Final RMSE (on validation data): %0.2f" % (
  validation_root_mean_squared_error))