In [51]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

import csv
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.linearmodels as corplt
import numpy as np
import sklearn.linear_model
import time
from datetime import datetime
from numpy.linalg import inv
from sklearn.feature_extraction import DictVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn import linear_model
from sklearn import preprocessing

np.set_printoptions(precision=4, suppress=True)

In [3]:
with open('dailyRidesWeather.csv', 'rb') as f:
    reader = csv.reader(f)
    dat = list(reader)
    header = dat[0]
    rides = np.asarray(dat[1:])

print rides.shape

(149711, 32)


In [5]:
print rides[0]

['0' 'Yellow' '2016' '1' '15' '07114' '38.0' '1.1875' '32' '2016-01-15'
 'True' '9.50' 'False' '21.90' '51.10' '41.20' 'False' 'False' 'False'
 '0.00' '27.50' '-5.61' '10.61' '5.11' '1007.10' '1012.00' 'False' ''
 'False' '3.80' '8.00' '15.00']


In [21]:
counter = 0
feature_idx = {val:idx for idx, val in enumerate(header)}
print feature_idx
# 0 : rnbr | 1 : ride_source | 2 : pickup_year | 3 : pickup_month | 4 : pickup_day | 5 : pickup_zipcode | 
# 6 : passenger_count | 7 : avg_passenger_count | 8 : ride_count | 9 : datestamp | 10 : weather_ok | 
# 11 : visibility | 12 : fog | 13 : min_temp | 14 : max_temp | 15 : mean_temp | 16 : rain | 17 : hail | 
# 18 : thunder | 19 : precipitation | 20 : dew_point | 21 : min_temp_c | 22 : max_temp_c | 23 : mean_temp_c | 
# 24 : station_pressure | 25 : sea_level_pressure | 26 : snow | 27 : snow_depth | 28 : tornado | 29 : mean_wind_speed | 
# 30 : max_wind_speed | 31 : max_wind_gus |

{'max_temp_c': 22, 'weather_ok': 10, 'hail': 17, 'datestamp': 9, 'min_temp': 13, 'mean_temp_c': 23, 'mean_wind_speed': 29, 'max_wind_speed': 30, 'pickup_month': 3, 'rnbr': 0, 'pickup_zipcode': 5, 'snow': 26, 'sea_level_pressure': 25, 'fog': 12, 'pickup_day': 4, 'pickup_year': 2, 'tornado': 28, 'rain': 16, 'max_wind_gust': 31, 'visibility': 11, 'ride_count': 8, 'avg_passenger_count': 7, 'precipitation': 19, 'dew_point': 20, 'mean_temp': 15, 'min_temp_c': 21, 'snow_depth': 27, 'passenger_count': 6, 'ride_source': 1, 'thunder': 18, 'station_pressure': 24, 'max_temp': 14}


In [9]:
rides_2015 = rides[rides[:,2] == '2015']
y_rides_2015 = rides_2015[rides_2015[:,1]=='Yellow']
print y_rides_2015.shape

(57040, 32)


In [98]:
#Feature Extraction
def get_feature_dict(x):
    feature_dict = {}
    # Get pickup date
    pickup_date = datetime.strptime("%s-%s-%s" % (x[feature_idx["pickup_year"]],\
                                     x[feature_idx["pickup_month"]],\
                                     x[feature_idx["pickup_day"]]), '%Y-%m-%d')
    
    feature_dict["zipcode"] = x[feature_idx["pickup_zipcode"]]
    feature_dict["month"] = x[feature_idx["pickup_month"]].zfill(2)
    feature_dict["day"] = x[feature_idx["pickup_day"]].zfill(2)
    feature_dict["weekday"] = '%02d' % pickup_date.weekday()
    
    #mean temp
    if float(x[feature_idx["mean_temp"]]) < 55.:
        feature_dict["temp"] = "Cold"
    elif float(x[feature_idx["mean_temp"]]) > 75.:
        feature_dict["temp"] = "Hot"
    else:
        feature_dict["temp"] = "Normal"
        
    #mean wind speed
    feature_dict["wind_speed"] = "%0d" % (float(x[feature_idx["mean_wind_speed"]] or 12.))
        
    #mean wind speed
    feature_dict["precip"] = "%00d" % (float(x[feature_idx["precipitation"]] or 10.))
    
    #zipcode-weekday
    feature_dict["zipcode_weekday"] = "%s_%s" % (feature_dict["zipcode"], feature_dict["weekday"])
    
    #zipcode-weekday-precip
    feature_dict["zipcode_weekday_precip"] = "%s_%s_%s" % (feature_dict["zipcode"], feature_dict["weekday"], feature_dict["precip"])
    
    #zipcode-weekday-temp
    feature_dict["zipcode_weekday_temp"] = "%s_%s_%s" % (feature_dict["zipcode"], feature_dict["weekday"], feature_dict["temp"])
    
    #zipcode-weekday-wind
    feature_dict["zipcode_weekday_wind"] = "%s_%s_%s" % (feature_dict["zipcode"], feature_dict["weekday"], feature_dict["wind_speed"])
    
    #zipcode-weekday-wind-precip
    feature_dict["zipcode_weekday_temp_precip"] = "%s_%s_%s_%s" % (feature_dict["zipcode"], feature_dict["weekday"], feature_dict["temp"], feature_dict["precip"])
    
    #zipcode-weekday-wind-precip
    feature_dict["zipcode_weekday_temp_precip_wind"] = "%s_%s_%s_%s_%s" % (feature_dict["zipcode"], feature_dict["weekday"], feature_dict["temp"], feature_dict["precip"], feature_dict["wind_speed"])
    
    return feature_dict

#print get_feature_dict(y_rides_2015[1])

In [106]:
vectorizer = DictVectorizer()
def get_feature_vector(feat_dict, is_test):
    transformed = vectorizer.transform(feat_dict) if is_test else vectorizer.fit_transform(feat_dict)
    return transformed

In [107]:
feature_dicts = [get_feature_dict(x) for x in y_rides_2015]
train_feature_vectors = get_feature_vector(feature_dicts[:50000], False)
test_feature_vectors = get_feature_vector(feature_dicts[50000:], True)

In [108]:
y = np.array([row[feature_idx['ride_count']] for row in y_rides_2015])
small_train_y = y[:50000]
small_test_y = y[50000:]

In [None]:
# using stocastic gradient descent.
sgd_regressor = linear_model.SGDRegressor(
    n_iter=5000, # Takes many iterations to converge.
    alpha=0.0, # Works better without regularization.
    learning_rate='invscaling',
    eta0=0.2, # Converges faster with higher-than-default initial learning rate.
    power_t=0.4,
    verbose=1
)
sgd_regressor.fit(train_feature_vectors, small_train_y)

-- Epoch 1
Norm: 24479.49, NNZs: 14070, Bias: 3.087900, T: 50000, Avg. loss: 1604221.408993
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 30727.08, NNZs: 14070, Bias: -1.394671, T: 100000, Avg. loss: 1133553.067568
Total training time: 0.02 seconds.
-- Epoch 3
Norm: 34189.12, NNZs: 14070, Bias: -3.306708, T: 150000, Avg. loss: 898294.752925
Total training time: 0.03 seconds.
-- Epoch 4
Norm: 36405.21, NNZs: 14070, Bias: -5.602986, T: 200000, Avg. loss: 754914.387606
Total training time: 0.04 seconds.
-- Epoch 5
Norm: 37960.30, NNZs: 14070, Bias: -7.518436, T: 250000, Avg. loss: 657919.149257
Total training time: 0.05 seconds.
-- Epoch 6
Norm: 39106.51, NNZs: 14070, Bias: -9.242384, T: 300000, Avg. loss: 587722.559024
Total training time: 0.06 seconds.
-- Epoch 7
Norm: 39987.37, NNZs: 14070, Bias: -10.711175, T: 350000, Avg. loss: 534488.313230
Total training time: 0.07 seconds.
-- Epoch 8
Norm: 40686.92, NNZs: 14070, Bias: -11.574821, T: 400000, Avg. loss: 492643.273581
Total tra

In [112]:
y_hat = sgd_regressor.predict(test_feature_vectors)

In [115]:
print "Predicted"
print y_hat[:25]
print "Actual"
print small_test_y[:25]

[ 10038.9173    166.4433     59.453      88.2767     33.343     -17.9078
    -24.1408   8384.508    5301.0208     33.8599    216.9594    328.229
    -63.3961    134.6278    -82.4755    122.8659    -43.3134   -119.8531
     17.5807   -121.4209    -94.8331    150.148     189.1699     20.0705
     73.6871]
['9569' '147' '113' '35' '10' '7' '4' '7334' '5250' '3' '47' '96' '15'
 '103' '2' '1' '22' '1' '1' '19' '60' '144' '181' '39' '9']


In [117]:
from sklearn.externals import joblib
joblib.dump(sgd_regressor, 'nyc_taxi_predictor.pkl') 

['nyc_taxi_predictor.pkl']

(57040, 100630)
(1000, 100630)
10000
