In [1]:
import gcp.bigquery as bq
import pandas as pd
import numpy as np

In [2]:
%%bigquery schema --table "nyc-tlc:yellow.trips"

In [3]:
%%sql --module taxiquery
SELECT DAYOFYEAR(pickup_datetime) AS daynumber,
       COUNT(*) AS numtrips
FROM [nyc-tlc:yellow.trips]
GROUP BY daynumber
ORDER BY daynumber

In [4]:
# Run the per-day-of-year trip-count query and pull the result into pandas.
# NOTE(review): taxiquery has no year filter, so each count appears to pool
# that calendar day across every year present in the table -- confirm this is
# what should be joined against a single year's weather.
taxi_query = bq.Query(taxiquery)
trips = taxi_query.to_dataframe()

In [5]:
%%sql --module wxquery
SELECT DAYOFYEAR(TIMESTAMP('$YEAR'+mo+da)) AS daynumber,
       FIRST(DAYOFWEEK(TIMESTAMP('$YEAR'+mo+da))) AS dayofweek,
       MIN(min) AS mintemp,
       MAX(max) AS maxtemp,
       MAX(IF(prcp=99.99,0,prcp)) AS rain
FROM [fh-bigquery:weather_gsod.gsod$YEAR]
WHERE stn='725030'
GROUP BY daynumber
ORDER BY daynumber DESC

In [6]:
# Daily weather for 2015 (wxquery with YEAR substituted), joined to the trip
# counts on the shared day-of-year key.
weather = bq.Query(wxquery, YEAR=2015).to_dataframe()
data = weather.merge(trips, on='daynumber')

In [7]:
# Peek at the first ten merged rows.
data.head(10)

Unnamed: 0,daynumber,dayofweek,mintemp,maxtemp,rain,numtrips
0,365,5,46.0,48.2,0.17,2668294
1,364,4,34.0,48.0,0.13,2494007
2,363,3,33.8,46.9,0.37,2415137
3,362,2,39.0,62.1,0.02,2155247
4,361,1,46.0,62.6,0.14,1815101
5,360,7,50.0,64.0,0.03,1630893
6,359,6,54.0,72.0,0.01,1491734
7,358,5,50.0,72.0,1.46,2203949
8,357,4,50.0,62.1,0.18,2642256
9,356,3,55.0,62.6,0.0,2760196


In [8]:
# Number of rows in the merged frame. A DataFrame has no `.length`
# attribute; the built-in len() returns the row count.
len(data)

365

In [9]:
# .size is the total number of cells (rows x columns), not the row count.
data.size

2190

In [10]:
# Positional slice of up to the first 400 rows of the merged frame.
data.iloc[:400]

Unnamed: 0,daynumber,dayofweek,mintemp,maxtemp,rain,numtrips
0,365,5,46.0,48.2,0.17,2668294
1,364,4,34.0,48.0,0.13,2494007
2,363,3,33.8,46.9,0.37,2415137
3,362,2,39.0,62.1,0.02,2155247
4,361,1,46.0,62.6,0.14,1815101
5,360,7,50.0,64.0,0.03,1630893
6,359,6,54.0,72.0,0.01,1491734
7,358,5,50.0,72.0,1.46,2203949
8,357,4,50.0,62.1,0.18,2642256
9,356,3,55.0,62.6,0.00,2760196


In [11]:
# Repeat the weather/trips join for 2014 and stack it under the 2015 rows.
# The same `trips` frame is reused; only the weather year changes.
weather = bq.Query(wxquery, YEAR=2014).to_dataframe()
data2014 = weather.merge(trips, on='daynumber')
data2 = pd.concat([data, data2014])

In [12]:
# Total cell count (rows x columns) of the 2015 + 2014 frame.
data2.size

4380

In [13]:
# Same weather/trips join for 2013 and then 2012; `weather` is left holding
# the last year queried.
merged_by_year = {}
for year in (2013, 2012):
    weather = bq.Query(wxquery, YEAR=year).to_dataframe()
    merged_by_year[year] = weather.merge(trips, on='daynumber')

data2013 = merged_by_year[2013]
data2012 = merged_by_year[2012]

# Keep stacking: data3 adds 2013 to data2, data4 adds 2012 to data3.
data3 = pd.concat([data2, data2013])
data4 = pd.concat([data3, data2012])

In [14]:
# Total cell count (rows x columns) after stacking all four years.
data4.size

8766

In [15]:
import tensorflow as tf

# Fixed seed so the row order (and anything fitted on it) is identical on
# every Restart & Run All instead of changing with each execution.
SHUFFLE_SEED = 42

# sample(frac=1) returns every row, in random order.
# NOTE(review): this shuffles data2 (2015 + 2014 only) although data4 holds
# all four years -- confirm which frame is intended.
shuffled = data2.sample(frac=1, random_state=SHUFFLE_SEED)

# With more data it would be a good idea to treat the day of the week as a
# categorical variable; with the small amount of data we have, though, the
# model tends to overfit, so that variant is left disabled:
#predictors = shuffled.iloc[:,2:5]
#for day in xrange(1,8):
#  matching = shuffled['dayofweek'] == day
#  predictors.loc[matching, 'day_' + str(day)] = 1
#  predictors.loc[~matching, 'day_' + str(day)] = 0

# Columns 1-4 (dayofweek, mintemp, maxtemp, rain) are the predictors;
# column 5 (numtrips) is the target. Column 0 (daynumber) is dropped.
predictors = shuffled.iloc[:, 1:5]
targets = shuffled.iloc[:, 5]