In [11]:
%matplotlib inline

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.cross_validation import train_test_split
from ml_metrics import rmsle
import xgboost as xgb
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import datasets, linear_model
from datetime import datetime
import tensorflow as tf
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt

In [12]:
def preprocess_features(bimbo_dataframe):
  """Pick out the columns used as model inputs.

  Args:
    bimbo_dataframe: pandas DataFrame that contains (at least) the columns
      d1..d6, p_total_demand and total_demand.

  Returns:
    A new DataFrame (an independent copy) holding only those eight columns,
    in the order listed below. No values are transformed.
  """
  # Restricting the frame to a handful of columns keeps training faster.
  selected_columns = [
    "d1", "d2", "d3", "d4", "d5", "d6",
    "p_total_demand",
    "total_demand",
  ]
  return bimbo_dataframe[selected_columns].copy()


def preprocess_targets(bimbo_dataframe):
  """Extract the regression target from the input DataFrame.

  Args:
    bimbo_dataframe: pandas DataFrame containing a "Demanda_uni_equil" column.

  Returns:
    A single-column DataFrame (not a Series) named "Demanda_uni_equil" that
    carries the input's index. The values are passed through untransformed.
  """
  target_column = "Demanda_uni_equil"
  return pd.DataFrame({target_column: bimbo_dataframe[target_column]})

In [13]:
# Read the training and test CSV files into DataFrames. The paths are relative
# to the notebook's working directory.
train_joined = pd.read_csv('../input/train_1000000_alld_totals.csv')
test_joined = pd.read_csv('../input/test_1000000_alld_totals.csv')

In [14]:
# Randomize the row order before selecting train / validation splits.
# A fixed seed makes the shuffle (and therefore the split and every metric
# computed from it) reproducible across kernel restarts.
SHUFFLE_SEED = 42
raw_training_df = train_joined.reindex(
  np.random.RandomState(SHUFFLE_SEED).permutation(train_joined.index))

In [22]:
# Carve the shuffled frame into a training slice (first 100000 rows) and a
# validation slice (last 500 rows), then derive features and targets from each.
training_rows = raw_training_df.head(100000)
validation_rows = raw_training_df.tail(500)

training_examples = preprocess_features(training_rows)
training_targets = preprocess_targets(training_rows)

validation_examples = preprocess_features(validation_rows)
validation_targets = preprocess_targets(validation_rows)

In [23]:
# Swap the -999 placeholder for NaN so that describe() and other statistics
# skip those cells (presumably -999 marks a missing value in the CSV -- TODO
# confirm against the data preparation step).
# Each split is cleaned from ITSELF: deriving validation_examples from
# training_examples would silently replace the 500 validation rows with the
# training rows and leave them misaligned with validation_targets.
# NOTE(review): these NaNs are passed to _input_fn unchanged; a linear model fed
# NaN features produces a NaN loss, so impute or drop them before training.
training_examples = training_examples.replace(-999, np.nan)
validation_examples = validation_examples.replace(-999, np.nan)

In [24]:
# Sanity check: print summary statistics for every split's features and targets.
summaries = (
  ("Training examples summary:", training_examples),
  ("Validation examples summary:", validation_examples),
  ("Training targets summary:", training_targets),
  ("Validation targets summary:", validation_targets),
)
for heading, frame in summaries:
  print(heading)
  print(frame.describe())

Training examples summary:
                 d1            d2            d3            d4            d5  \
count  85162.000000  70330.000000  55896.000000  42091.000000  28287.000000   
mean       6.033454      4.616053      4.518284      4.425578      4.348110   
std      159.630247     16.348238     17.292253     17.025387     16.123085   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.000000      0.000000      0.000000      0.000000      0.000000   
50%        2.000000      1.000000      0.000000      0.000000      0.000000   
75%        5.000000      4.000000      4.000000      4.000000      4.000000   
max    46326.000000   1820.000000   2008.000000   1706.000000    732.000000   

                 d6  p_total_demand  total_demand  
count  14142.000000    1.000000e+05  1.000000e+05  
mean       4.072974    4.899412e+06  1.909206e+03  
std       15.329737    5.343123e+06  8.005744e+04  
min        0.000000    0.000000e+00  0.000000e+00  
2

In [25]:
# One real-valued feature column per model input. Each column name must match
# a key of the feature dict built by _input_fn, i.e. a column of the examples
# DataFrame.
d1, d2, d3, d4, d5, d6 = (
  tf.contrib.layers.real_valued_column(column_name)
  for column_name in ("d1", "d2", "d3", "d4", "d5", "d6"))

p_total_demand = tf.contrib.layers.real_valued_column("p_total_demand")
total_demand = tf.contrib.layers.real_valued_column("total_demand")

# The full set of columns handed to the regressor.
feature_columns = {
  d1, d2, d3, d4, d5, d6,
  p_total_demand,
  total_demand,
}

In [26]:
#@test {"output": "ignore"}

def _input_fn(examples_df, targets_df):
  """Turn a pair of examples/targets DataFrames into TensorFlow inputs.

  Args:
    examples_df: DataFrame whose columns are the input features.
    targets_df: DataFrame whose first column holds the labels.

  Returns:
    A (features, label_tensor) tuple: `features` maps each column name of
    `examples_df` to a float Tensor, and `label_tensor` is a float Tensor
    built from the first column of `targets_df`. Every Tensor is reshaped
    to (N, 1), where N is the number of rows.
  """
  def _as_float_column(values):
    # -1 tells tf.reshape to infer that dimension, so the result is (N, 1)
    # for however many values are supplied.
    return tf.to_float(tf.reshape(tf.constant(values), [-1, 1]))

  features = {
    column_name: _as_float_column(examples_df[column_name].values)
    for column_name in examples_df.keys()
  }
  label_column = targets_df.keys()[0]
  label_tensor = _as_float_column(targets_df[label_column].values)

  return features, label_tensor

In [29]:
#@test {"output": "ignore", "timeout": 180}

LEARNING_RATE = 0.0000001  #@param
STEPS = 500000  #@param
# No batch size - train expects input_fn to batch the data, but
# here we'll just train on the full set for simplicity.
periods = 100
# Floor division keeps the per-period step count an integer on Python 3 too
# (true division would yield a float); on Python 2 the value is unchanged.
steps_per_period = STEPS // periods

# Build a linear regression model, making sure we pass
# our explicit set of feature columns.

# Alternative optimizer, kept for reference:
#linear_regressor = tf.contrib.learn.LinearRegressor(feature_columns=feature_columns, optimizer=tf.train.FtrlOptimizer(LEARNING_RATE))

linear_regressor = tf.contrib.learn.LinearRegressor(
  feature_columns=feature_columns,
  optimizer=tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE))

# NOTE(review): the recorded run of this cell aborted with
# NanLossDuringTrainingError. The features are fed unscaled and may contain NaN
# (from the -999 replacement earlier in the notebook); either can drive the
# loss to NaN -- TODO: impute and normalize the inputs before training.

# Train the model in `periods` chunks, recording the RMSE on both splits after
# each chunk so the learning curves can be plotted afterwards.
training_losses = []
validation_losses = []
print("Training model...")
print("RMSE (on training data):")
training_rmse = []
validation_rmse = []
for period in range(periods):
  # Train the model, starting from the prior state.
  linear_regressor.fit(
    input_fn=lambda: _input_fn(training_examples, training_targets),
    steps=steps_per_period
  )
  # Score the current model on both splits.
  training_predictions = linear_regressor.predict(
    input_fn=lambda: _input_fn(training_examples, training_targets))
  validation_predictions = linear_regressor.predict(
    input_fn=lambda: _input_fn(validation_examples, validation_targets))
  training_root_mean_squared_error = math.sqrt(
    metrics.mean_squared_error(training_predictions, training_targets))
  validation_root_mean_squared_error = math.sqrt(
    metrics.mean_squared_error(validation_predictions, validation_targets))
  # print() with a single pre-formatted string behaves the same on Python 2 and
  # 3, and matches the print(...) calls used in the rest of the notebook.
  print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
  training_rmse.append(training_root_mean_squared_error)
  validation_rmse.append(validation_root_mean_squared_error)

Training model...
RMSE (on training data):


ERROR:tensorflow:Model diverged with loss = NaN.


NanLossDuringTrainingError: NaN loss during training.

In [None]:
# Plot the per-period RMSE recorded during training for both splits, then
# report the values from the final period.
plt.ylabel("RMSE")
plt.xlabel("Periods")
plt.title("Root Mean Squared Error vs. Periods")
plt.tight_layout()
for curve_label, rmse_history in (("training", training_rmse),
                                  ("validation", validation_rmse)):
  plt.plot(rmse_history, label=curve_label)
plt.legend()
plt.show()

final_training_rmse = training_root_mean_squared_error
final_validation_rmse = validation_root_mean_squared_error
print("Final RMSE (on training data): %0.2f" % final_training_rmse)
print("Final RMSE (on validation data): %0.2f" % final_validation_rmse)