In [47]:
from __future__ import print_function, division
from scipy.stats.stats import pearsonr, linregress
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import math
import datetime
%matplotlib inline

In [48]:
taxi_dat = pd.read_csv("Taxi_Train.csv")
print(taxi_dat.count())
taxi_dat.head()

id            41898
start_time    41897
end_time      41897
fare          41897
number_pax    41897
start_lng     41897
start_lat     41897
end_lng       41897
end_lat       41897
start_taz     41897
end_taz       41897
dtype: int64


Unnamed: 0,id,start_time,end_time,fare,number_pax,start_lng,start_lat,end_lng,end_lat,start_taz,end_taz
0,0,9/1/12 0:11,9/1/12 0:20,13.2,1.0,-122.41354,37.802683,-122.421277,37.785395,38.0,30.0
1,1,9/1/12 0:23,9/1/12 0:31,10.65,1.0,-122.4197,37.78609,-122.435217,37.762177,30.0,94.0
2,2,9/1/12 0:45,9/1/12 0:49,9.0,1.0,-122.41512,37.774672,-122.407657,37.782615,10.0,11.0
3,3,9/1/12 0:41,9/1/12 0:54,13.95,2.0,-122.419392,37.806622,-122.415393,37.778115,40.0,10.0
4,4,9/1/12 1:09,9/1/12 1:13,7.35,1.0,-122.429722,37.79779,-122.41806,37.789032,45.0,32.0


In [49]:
#the id column of taxi_data is redundant, so I'll remove it here
taxi_data = taxi_dat.drop('id', 1)[:-1]
taxi_data.describe()
#taxi_data.head()

Unnamed: 0,fare,number_pax,start_lng,start_lat,end_lng,end_lat,start_taz,end_taz
count,41897.0,41897.0,41897.0,41897.0,41897.0,41897.0,41897.0,41897.0
mean,17.047906,1.100699,-122.415726,37.770983,-122.414716,37.769447,61.074612,80.488794
std,16.058616,0.422372,0.020074,0.045916,0.037718,0.052593,66.375489,135.680716
min,0.0,0.0,-122.515832,37.459648,-122.63442,37.22564,0.0,0.0
25%,8.45,1.0,-122.424645,37.77162,-122.429705,37.768903,12.0,17.0
50%,11.2,1.0,-122.412197,37.785703,-122.414028,37.785255,37.0,43.0
75%,16.7,1.0,-122.40287,37.792302,-122.403683,37.792668,88.0,99.0
max,281.25,6.0,-122.14239,37.940077,-121.353468,38.387638,1441.0,1453.0


In [50]:
def get_datetime(obj):
    month, obj = obj[:obj.index('/')], obj[obj.index('/')+1:]
    day, obj = obj[:obj.index('/')], obj[obj.index('/')+1:]
    year, obj = obj[:obj.index(' ')], obj[obj.index(' ')+1:]
    hour, obj = obj[:obj.index(':')], obj[obj.index(':')+1:]
    minute = obj
    return datetime.timedelta(int(year), int(month), int(day), int(hour), int(minute))

def get_difference(day1_string, day2_string):
    day1 = get_datetime(day1_string)
    day2 = get_datetime(day2_string)
    diff = (day2-day1).total_seconds()
    if diff<0:
        return int(3600+diff)
    else:
        return diff

start_times = np.array(taxi_data.get('start_time'))
end_times = np.array(taxi_data.get('end_time'))
ride_time = np.array([])

for i in np.arange(len(start_times)):
    day1_string = start_times.item(i)
    #print(day1_string)
    day2_string = end_times.item(i)
    #print(day2_string)
    diff = get_difference(day1_string, day2_string)
    #print(diff)
    ride_time = np.append(ride_time, diff)
    #print()

ride_time
    
taxi_with_time = taxi_data[:]
taxi_with_time['ride_time'] = pd.Series(ride_time, index=taxi_with_time.index)
taxi_with_time.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,fare,number_pax,start_lng,start_lat,end_lng,end_lat,start_taz,end_taz,ride_time
count,41897.0,41897.0,41897.0,41897.0,41897.0,41897.0,41897.0,41897.0,41897.0
mean,17.047906,1.100699,-122.415726,37.770983,-122.414716,37.769447,61.074612,80.488794,668.410936
std,16.058616,0.422372,0.020074,0.045916,0.037718,0.052593,66.375489,135.680716,436.404415
min,0.0,0.0,-122.515832,37.459648,-122.63442,37.22564,0.0,0.0,0.001
25%,8.45,1.0,-122.424645,37.77162,-122.429705,37.768903,12.0,17.0,360.0
50%,11.2,1.0,-122.412197,37.785703,-122.414028,37.785255,37.0,43.0,540.0
75%,16.7,1.0,-122.40287,37.792302,-122.403683,37.792668,88.0,99.0,840.0
max,281.25,6.0,-122.14239,37.940077,-121.353468,38.387638,1441.0,1453.0,3540.0


In [52]:
def get_distance(start_1, end_1, start_2, end_2):
    return abs(end_1-start_1) + abs(end_2-start_2)

start_long = np.array(taxi_data.get('start_lng'))
end_long = np.array(taxi_data.get('end_lng'))
start_lat = np.array(taxi_data.get('start_lat'))
end_lat = np.array(taxi_data.get('end_lat'))
ride_dist = np.array([])

for i in np.arange(len(start_long)):
    ride_dist = np.append(ride_dist, get_distance(
        start_long.item(i), 
        end_long.item(i), 
        start_lat.item(i), 
        end_lat.item(i)
    ))

ride_dist

taxi_time_dist = taxi_with_time[:]
taxi_time_dist['ride_dist'] = pd.Series(ride_dist, index=taxi_time_dist.index)
taxi_time_dist.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,start_time,end_time,fare,number_pax,start_lng,start_lat,end_lng,end_lat,start_taz,end_taz,ride_time,ride_dist
0,9/1/12 0:11,9/1/12 0:20,13.2,1.0,-122.41354,37.802683,-122.421277,37.785395,38.0,30.0,540.0,0.025025
1,9/1/12 0:23,9/1/12 0:31,10.65,1.0,-122.4197,37.78609,-122.435217,37.762177,30.0,94.0,480.0,0.03943
2,9/1/12 0:45,9/1/12 0:49,9.0,1.0,-122.41512,37.774672,-122.407657,37.782615,10.0,11.0,240.0,0.015406
3,9/1/12 0:41,9/1/12 0:54,13.95,2.0,-122.419392,37.806622,-122.415393,37.778115,40.0,10.0,780.0,0.032506
4,9/1/12 1:09,9/1/12 1:13,7.35,1.0,-122.429722,37.79779,-122.41806,37.789032,45.0,32.0,240.0,0.02042


In [59]:
def print_results(wine_model_in, X, y, i):
    predicted_results = np.array(wine_model_in.predict(X.head()))
    actual_results = np.array(y.head()['fare'])
    print("\nThe predictions are:")
    print(predicted_results)
    print("\nThe actual fares are:")
    print(actual_results)
    print("\nThe differences are:")
    model_error_list = predicted_results - actual_results 
    print(model_error_list)
    print("\nThe MSE of model " + str(i) + " is:")
    mse = sum(model_error_list.__pow__(2))
    print(mse)
    
    
k = 10
taxi_len = taxi_time_dist.shape[0]
#taxi_time_dist = shuffle(taxi_time_dist)
models = np.array([])
for i in range(k):
    #find the indices to take
    lower_bound = i*taxi_len/k
    upper_bound = (i+1)*taxi_len/k
    test_range = np.arange(lower_bound, upper_bound) # 1/kth of the data, used for testing
    train_range = np.concatenate((np.arange(0, lower_bound), 
                                  np.arange(upper_bound, taxi_len))) #(k-1)/kth of the data, used for training
    #define testing and training samples for fold i
    taxi_test = taxi_time_dist.take(test_range)
    taxi_train = taxi_time_dist.take(train_range)
    #set the features of the data
    taxi_features = ['ride_time', 'ride_dist']
    X = taxi_train.get(taxi_features)
    y = taxi_train.get(['fare'])
    # Define model. Specify a number for random_state to ensure same results each run
    taxi_model_alpha = DecisionTreeRegressor(random_state=1)
    # Fit model
    taxi_model_alpha.fit(X, y)
    #print the results of the model
    print_results(taxi_model_alpha, X, y, i)
    models = np.append(models, taxi_model_alpha)

models


The predictions are:
[ 6.8 13.4  7.9 11.   6.8]

The actual qualities are:
[ 6.8 13.4  7.9 11.   6.8]

The differences are:
[ 0.0000000e+00  0.0000000e+00 -8.8817842e-16  0.0000000e+00
  0.0000000e+00]

The MSE of model 0 is:
7.888609052210118e-31

The predictions are:
[11.1   10.375  9.    13.95   7.35 ]

The actual qualities are:
[13.2  10.65  9.   13.95  7.35]

The differences are:
[-2.1   -0.275  0.     0.     0.   ]

The MSE of model 1 is:
4.485624999999999

The predictions are:
[11.1   10.375  9.    13.95   7.35 ]

The actual qualities are:
[13.2  10.65  9.   13.95  7.35]

The differences are:
[-2.1   -0.275  0.     0.     0.   ]

The MSE of model 2 is:
4.485624999999999

The predictions are:
[11.1   10.375  9.    13.95   7.35 ]

The actual qualities are:
[13.2  10.65  9.   13.95  7.35]

The differences are:
[-2.1   -0.275  0.     0.     0.   ]

The MSE of model 3 is:
4.485624999999999

The predictions are:
[11.1   10.375  9.    13.95   7.35 ]

The actual qualities are:
[13.2  1

array([DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best'),
       DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best'),
       DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best'),
       DecisionTreeRegressor(criterion='mse', 