In [1]:
#import python libraries
import gcp.bigquery as bq
import pandas as pd
import numpy as np

In [2]:
%%bigquery schema --table "nyc-tlc:green.trips_2015"

In [3]:
%%sql --module taxiquery
#daily ride counts: one row per day of the year, with the number of trips picked up on that day
#($YEAR selects which yearly trips table is read and is supplied when the query is run)
SELECT
  daynumber,
  COUNT(*) AS numtrips
FROM (
  SELECT DAYOFYEAR(pickup_datetime) AS daynumber
  FROM [nyc-tlc:green.trips_$YEAR]
)
GROUP BY daynumber
ORDER BY daynumber

In [4]:
#bind the daily-rides query to the year 2015, then execute it and
#load the result set into a pandas DataFrame
trips_query = bq.Query(taxiquery, YEAR=2015)
trips = trips_query.to_dataframe()

In [5]:
%%sql --module wxquery
#daily weather for station 725030 (the New York station, per the original note), one row per day of the year:
#day of week, lowest min temperature, highest max temperature, and the largest precipitation reading
#(a prcp value of 99.99 is counted as 0 -- presumably the dataset's missing-value marker, TODO confirm)
SELECT
  DAYOFYEAR(TIMESTAMP('$YEAR'+mo+da)) AS daynumber,
  FIRST(DAYOFWEEK(TIMESTAMP('$YEAR'+mo+da))) AS dayofweek,
  MIN(min) AS mintemp,
  MAX(max) AS maxtemp,
  MAX(IF(prcp=99.99,0,prcp)) AS rain
FROM [fh-bigquery:weather_gsod.gsod$YEAR]
WHERE stn='725030'
GROUP BY 1
ORDER BY daynumber DESC

In [6]:
#bind the weather query to the year 2015, then execute it and
#load the result set into a pandas DataFrame
weather_query = bq.Query(wxquery, YEAR=2015)
weather = weather_query.to_dataframe()

In [7]:
#join each day's weather row with that day's ride count; the join key is the
#day-of-year column, and only days present in both frames are kept (inner join)
data = weather.merge(trips, on='daynumber')

In [8]:
#invoke TensorFlow and train the neural network
import tensorflow as tf
shuffled = data.sample(frac=1)
predictors = shuffled.iloc[:,1:5]
targets = shuffled.iloc[:,5]

SCALE_NUM_TRIPS = 100000
trainsize = int(len(shuffled['numtrips']) * 0.8)
testsize = len(shuffled['numtrips']) - trainsize
npredictors = len(predictors.columns)
noutputs = 1
nhidden = 5
numiter = 10000
modelfile = '/tmp/trained_model'
with tf.Session() as sess:
  feature_data = tf.placeholder("float", [None, npredictors])
  target_data = tf.placeholder("float", [None, noutputs])
  weights1 = tf.Variable(tf.truncated_normal([npredictors, nhidden], stddev=0.01))
  weights2 = tf.Variable(tf.truncated_normal([nhidden, noutputs], stddev=0.01))
  biases1 = tf.Variable(tf.ones([nhidden]))
  biases2 = tf.Variable(tf.ones([noutputs]))
  model = (tf.matmul(tf.nn.relu(tf.matmul(feature_data, weights1) + biases1), weights2) + biases2) * SCALE_NUM_TRIPS
  cost = tf.nn.l2_loss(model - target_data)
  training_step = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)
  init = tf.initialize_all_variables()
  sess.run(init)

  saver = tf.train.Saver({'weights1' : weights1, 'biases1' : biases1, 'weights2' : weights2, 'biases2' : biases2})
  for iter in xrange(0, numiter):
    sess.run(training_step, feed_dict = {
        feature_data : predictors[:trainsize].values,
        target_data : targets[:trainsize].values.reshape(trainsize, noutputs)
      })
    
   # if iter%1000 == 0:
   #   print '{0} error={1}'.format(iter, np.sqrt(cost.eval(feed_dict = {
   #      feature_data : predictors[:trainsize].values,
   #      target_data : targets[:trainsize].values.reshape(trainsize, noutputs)
   #  }) / trainsize))
    
  filename = saver.save(sess, modelfile, global_step=numiter)
  print 'Model written to {0}'.format(filename)

  #print 'testerror={0}'.format(np.sqrt(cost.eval(feed_dict = {
  #        feature_data : predictors[trainsize:].values,
  #        target_data : targets[trainsize:].values.reshape(testsize, noutputs)
  #    }) / testsize))

Model written to /tmp/trained_model-10000


In [16]:
#Training completed!
#How many taxi rides we’ll see tomorrow if expected min/max temp is 50/80 and expected rain is 0?
input = pd.DataFrame.from_dict(data = 
                               {'dayofweek' : [4, 5, 6],
                                'mintemp' : [50, 36, 54],
                                'maxtemp' : [80, 40, 75],
                                'rain' : [0, 0.8, 0.3]})
with tf.Session() as sess:
    filename = modelfile + '-' + str(numiter)
    saver = tf.train.Saver({'weights1' : weights1, 'biases1' : biases1, 'weights2' : weights2, 'biases2' : biases2})
    saver.restore(sess, filename)
    feature_data = tf.placeholder("float", [None, npredictors])
    predict_operation = (tf.matmul(tf.nn.relu(tf.matmul(feature_data, weights1) + biases1), weights2) + biases2) * SCALE_NUM_TRIPS
    predicted = sess.run(predict_operation, feed_dict = {
        feature_data : input.values
      })
    print predicted

[[ 43445.4140625 ]
 [ 49276.5859375 ]
 [ 50152.28515625]]
