In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
sns.set_style('whitegrid')

In [3]:
sample_fraction = 0.01

In [4]:
selected_cols = 'fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count'.split(',')
selected_cols

['fare_amount',
 'pickup_datetime',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count']

In [5]:
import random

In [9]:
dtypes = {
    'fare_amount' : 'float32',
 'pickup_longitude' : 'float32',
 'pickup_latitude' : 'float32',
 'dropoff_longitude' : 'float32',
 'dropoff_latitude' : 'float32',
 'passenger_count' : 'uint8'
}

def skip_row(row_idx):
    if row_idx == 0:
        return False
    return random.random() > sample_fraction

random.seed(42)
df = pd.read_csv('train.csv', usecols=selected_cols, parse_dates=['pickup_datetime'], dtype=dtypes, skiprows=skip_row)

In [11]:
df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.0,2014-12-06 20:36:22+00:00,-73.979813,40.751904,-73.979446,40.755482,1
1,8.0,2013-01-17 17:22:00+00:00,0.000000,0.000000,0.000000,0.000000,2
2,8.9,2011-06-15 18:07:00+00:00,-73.996330,40.753223,-73.978897,40.766964,3
3,6.9,2009-12-14 12:33:00+00:00,-73.982430,40.745747,-73.982430,40.745747,1
4,7.0,2013-11-06 11:26:54+00:00,-73.959061,40.781059,-73.962059,40.768604,1
...,...,...,...,...,...,...,...
552445,45.0,2014-02-06 23:59:45+00:00,-73.973587,40.747669,-73.999916,40.602894,1
552446,22.5,2015-01-05 15:29:08+00:00,-73.935928,40.799656,-73.985710,40.726952,2
552447,4.5,2013-02-17 22:27:00+00:00,-73.992531,40.748619,-73.998436,40.740143,1
552448,14.5,2013-01-27 12:41:00+00:00,-74.012115,40.706635,-73.988724,40.756218,1


In [13]:
test_df = pd.read_csv('test.csv', dtype=dtypes , parse_dates=['pickup_datetime'])

In [15]:
test_df

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.746140,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751637,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1
...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6


### Training Set

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 552450 entries, 0 to 552449
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   fare_amount        552450 non-null  float32            
 1   pickup_datetime    552450 non-null  datetime64[ns, UTC]
 2   pickup_longitude   552450 non-null  float32            
 3   pickup_latitude    552450 non-null  float32            
 4   dropoff_longitude  552450 non-null  float32            
 5   dropoff_latitude   552450 non-null  float32            
 6   passenger_count    552450 non-null  uint8              
dtypes: datetime64[ns, UTC](1), float32(5), uint8(1)
memory usage: 15.3 MB


In [20]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,552450.0,552450.0,552450.0,552450.0,552450.0,552450.0
mean,11.354059,-72.497063,39.9105,-72.504326,39.934265,1.684983
std,9.810809,11.622035,8.041162,12.065184,9.226158,1.337664
min,-52.0,-1183.362793,-3084.490234,-3356.729736,-2073.150635,0.0
25%,6.0,-73.99202,40.734875,-73.991425,40.73399,1.0
50%,8.5,-73.981819,40.752621,-73.980179,40.753101,1.0
75%,12.5,-73.967155,40.767036,-73.963737,40.768059,2.0
max,499.0,2420.209473,404.983337,2467.752686,3351.403076,208.0


In [22]:
df['pickup_datetime'].min(), df['pickup_datetime'].max()

(Timestamp('2009-01-01 00:11:46+0000', tz='UTC'),
 Timestamp('2015-06-30 23:59:54+0000', tz='UTC'))

#### Observations about training data:
- 550k+ rows, as expected
- No missing data(in the sample)
- fare_amount rangs from $-52.0 to $499.0
- There seen to be some errors in the latitude & longitude values
- Data range from 1st Jan 2009 to 30th june 2015
- The dataset takes up ~15MB of space in the RAM
We may need to deal with outliers and data entry errors before we train our model.

### Test data :

In [42]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   key                9914 non-null   object             
 1   pickup_datetime    9914 non-null   datetime64[ns, UTC]
 2   pickup_longitude   9914 non-null   float32            
 3   pickup_latitude    9914 non-null   float32            
 4   dropoff_longitude  9914 non-null   float32            
 5   dropoff_latitude   9914 non-null   float32            
 6   passenger_count    9914 non-null   uint8              
dtypes: datetime64[ns, UTC](1), float32(4), object(1), uint8(1)
memory usage: 319.6+ KB


In [44]:
test_df.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9914.0,9914.0,9914.0,9914.0,9914.0
mean,-73.974716,40.751041,-73.973656,40.75174,1.671273
std,0.042799,0.033542,0.039093,0.035436,1.278747
min,-74.25219,40.573143,-74.263245,40.568974,1.0
25%,-73.9925,40.736125,-73.991249,40.735253,1.0
50%,-73.982327,40.753052,-73.980015,40.754065,1.0
75%,-73.968012,40.767113,-73.964062,40.768757,2.0
max,-72.986534,41.709557,-72.990967,41.696682,6.0


In [46]:
test_df['pickup_datetime'].min(), test_df['pickup_datetime'].max()

(Timestamp('2009-01-01 11:04:24+0000', tz='UTC'),
 Timestamp('2015-06-30 20:03:50+0000', tz='UTC'))

#### Some observations about the test set:
- 9914 rows of data
- No missing values
- No obvious data entry errors
- 1 to 6 passengers(we can limit training data to this range)
- Latitude lie between 40 and 42
- Longitude lie between -75 and -72
- Pickup dates range from Jan 1st 2009 to Jun 30th 2015(same as training set)
We can use range of the test set to drop outliers/invalid data from the training set.

### Preparing Dataset for Training

#### Split Training & Validation Set :
We will set aside 20% of the training data as the validation set, to evaluate the models we train on previously unseen data.
Since the test set and training set have the same data ranges, we can pick a random 20% fraction.

In [52]:
from sklearn.model_selection import train_test_split

In [54]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [56]:
len(train_df), len(val_df)

(441960, 110490)

#### Filling/Removing Missing Values :
There are no missing values in our sample, but if there were, we could simply drop the rows with missing values instead of trying to fill them.
(Since we have a lot of training data)>

In [59]:
train_df = train_df.dropna()
val_df = val_df.dropna()

#### Extracting Inputs and Outputs :

In [62]:
train_df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [64]:
input_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']

In [66]:
target_col = 'fare_amount'

#### Training :

In [69]:
train_inputs = train_df[input_cols]

In [71]:
train_targets = train_df[target_col]

In [73]:
train_inputs

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
353352,-73.993652,40.741543,-73.977974,40.742352,4
360070,-73.993805,40.724579,-73.993805,40.724579,1
372609,-73.959160,40.780750,-73.969116,40.761230,1
550895,-73.952187,40.783951,-73.978645,40.772602,1
444151,-73.977112,40.746834,-73.991104,40.750404,2
...,...,...,...,...,...
110268,-73.987152,40.750633,-73.979073,40.763168,1
259178,-73.972656,40.764042,-74.013176,40.707840,2
365838,-73.991982,40.749767,-73.989845,40.720551,3
131932,-73.969055,40.761398,-73.990814,40.751328,1


In [75]:
train_targets

353352     6.0
360070     3.7
372609    10.0
550895     8.9
444151     7.3
          ... 
110268     9.3
259178    18.5
365838    10.1
131932    10.9
121958     9.5
Name: fare_amount, Length: 441960, dtype: float32

In [77]:
val_inputs = val_df[input_cols]

In [79]:
val_targets = val_df[target_col]

In [81]:
val_inputs

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
15971,-73.995834,40.759190,-73.973679,40.739086,1
149839,-73.977386,40.738335,-73.976143,40.751205,1
515867,-73.983910,40.749470,-73.787170,40.646645,1
90307,-73.790794,40.643463,-73.972252,40.690182,1
287032,-73.976593,40.761944,-73.991463,40.750309,2
...,...,...,...,...,...
467556,-73.968567,40.761238,-73.983406,40.750019,3
19482,-73.986725,40.755920,-73.985855,40.731171,1
186063,0.000000,0.000000,0.000000,0.000000,1
382260,-73.980057,40.760334,-73.872589,40.774300,1


In [83]:
val_targets

15971     14.000000
149839     6.500000
515867    49.570000
90307     49.700001
287032     8.500000
            ...    
467556     6.100000
19482      7.300000
186063     4.500000
382260    32.900002
18838     11.500000
Name: fare_amount, Length: 110490, dtype: float32

##### Test :

In [86]:
test_inputs = test_df[input_cols]

In [88]:
test_inputs

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,-73.973320,40.763805,-73.981430,40.743835,1
1,-73.986862,40.719383,-73.998886,40.739201,1
2,-73.982521,40.751259,-73.979652,40.746140,1
3,-73.981163,40.767807,-73.990448,40.751637,1
4,-73.966049,40.789776,-73.988564,40.744427,1
...,...,...,...,...,...
9909,-73.968124,40.796997,-73.955643,40.780388,6
9910,-73.945511,40.803600,-73.960213,40.776371,6
9911,-73.991600,40.726608,-73.789742,40.647011,6
9912,-73.985573,40.735432,-73.939178,40.801731,6


### Training Hardcoded & Baseline Models :

#### Training & Evaluating Hardcoded Model :
Creating a simple model that always predicts the average.

In [92]:
class MeanRegressor:
    def fit(self, inputs, targets):
        self.mean = targets.mean()
    def predict(self, inputs) :
        return np.full(inputs.shape[0], self.mean)

In [94]:
mean_model = MeanRegressor()

In [96]:
mean_model.fit(train_inputs, train_targets)

In [98]:
mean_model.mean

11.354714

In [100]:
train_preds = mean_model.predict(train_inputs)

In [102]:
train_preds

array([11.354714, 11.354714, 11.354714, ..., 11.354714, 11.354714,
       11.354714], dtype=float32)

In [104]:
train_targets

353352     6.0
360070     3.7
372609    10.0
550895     8.9
444151     7.3
          ... 
110268     9.3
259178    18.5
365838    10.1
131932    10.9
121958     9.5
Name: fare_amount, Length: 441960, dtype: float32

In [106]:
val_preds = mean_model.predict(val_inputs)

In [108]:
val_preds

array([11.354714, 11.354714, 11.354714, ..., 11.354714, 11.354714,
       11.354714], dtype=float32)

In [110]:
val_targets

15971     14.000000
149839     6.500000
515867    49.570000
90307     49.700001
287032     8.500000
            ...    
467556     6.100000
19482      7.300000
186063     4.500000
382260    32.900002
18838     11.500000
Name: fare_amount, Length: 110490, dtype: float32

In [112]:
from sklearn.metrics import mean_squared_error

In [114]:
def rmse(targets, preds):
    return mean_squared_error(targets, preds, squared=False)

In [116]:
train_rmse = rmse(train_targets, train_preds)
train_rmse



9.789782

- Our dumb hard-coded model is off by USD 9.789782 on average, which is pretty bad considering the average fare is USD11.35

#### Training & Evaluating Baseline Model :

We will train a linear regression model as our baseline, which tries to express the target as a weighted sum of the inputs.

In [121]:
from sklearn.linear_model import LinearRegression

In [123]:
linear_model = LinearRegression()

In [125]:
linear_model.fit(train_inputs, train_targets)

In [127]:
train_preds = linear_model.predict(train_inputs)

In [129]:
train_preds

array([11.546233, 11.284611, 11.284141, ..., 11.458915, 11.284281,
       11.284449], dtype=float32)

In [131]:
rmse(train_targets, train_preds)



9.788632

In [133]:
val_preds = linear_model.predict(val_inputs)

In [135]:
rmse(val_targets, val_preds)



9.898088

The linear regression model is off by USD9.898, which isn't much better than simply predicting the average.
This is mainly because the training data(geocoordinates) is not in a format that's useful for the model, and we're not using one of the most important columns:pickup data & time.
However, now we have a baseline that our models should ideally beat.

In [138]:
test_inputs

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,-73.973320,40.763805,-73.981430,40.743835,1
1,-73.986862,40.719383,-73.998886,40.739201,1
2,-73.982521,40.751259,-73.979652,40.746140,1
3,-73.981163,40.767807,-73.990448,40.751637,1
4,-73.966049,40.789776,-73.988564,40.744427,1
...,...,...,...,...,...
9909,-73.968124,40.796997,-73.955643,40.780388,6
9910,-73.945511,40.803600,-73.960213,40.776371,6
9911,-73.991600,40.726608,-73.789742,40.647011,6
9912,-73.985573,40.735432,-73.939178,40.801731,6


In [140]:
test_preds = linear_model.predict(test_inputs)

In [142]:
sub_df = pd.read_csv('sample_submission.csv')

In [144]:
sub_df

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,11.35
1,2015-01-27 13:08:24.0000003,11.35
2,2011-10-08 11:53:44.0000002,11.35
3,2012-12-01 21:12:12.0000002,11.35
4,2012-12-01 21:12:12.0000003,11.35
...,...,...
9909,2015-05-10 12:37:51.0000002,11.35
9910,2015-01-12 17:05:51.0000001,11.35
9911,2015-04-19 20:44:15.0000001,11.35
9912,2015-01-31 01:05:19.0000005,11.35


In [146]:
sub_df['fare_amount'] = test_preds

In [148]:
sub_df.to_csv('linear_model_submission.csv',index=None)

In [150]:
def predict_and_submission(model, test_inputs, fname):
    test_preds = model.predict(test_inputs)
    sub_df = pd.read_csv('sample_submission.csv')
    sub_df.to_csv(fname, index=None)
    return sub_df

### Feature Engineering :

##### Extracting Parts of Date :
- Year
- Month
- Day
- Weekday
- Hour


In [154]:
def add_dateparts(df, col):
    df[col + '_year'] = df[col].dt.year
    df[col + '_month'] = df[col].dt.month
    df[col + '_day'] = df[col].dt.day
    df[col + '_weekday'] = df[col].dt.weekday
    df[col + '_hour'] = df[col].dt.hour

In [156]:
add_dateparts(train_df, 'pickup_datetime')

In [158]:
add_dateparts(val_df, 'pickup_datetime')

In [160]:
add_dateparts(test_df, 'pickup_datetime')

In [162]:
df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.0,2014-12-06 20:36:22+00:00,-73.979813,40.751904,-73.979446,40.755482,1
1,8.0,2013-01-17 17:22:00+00:00,0.000000,0.000000,0.000000,0.000000,2
2,8.9,2011-06-15 18:07:00+00:00,-73.996330,40.753223,-73.978897,40.766964,3
3,6.9,2009-12-14 12:33:00+00:00,-73.982430,40.745747,-73.982430,40.745747,1
4,7.0,2013-11-06 11:26:54+00:00,-73.959061,40.781059,-73.962059,40.768604,1
...,...,...,...,...,...,...,...
552445,45.0,2014-02-06 23:59:45+00:00,-73.973587,40.747669,-73.999916,40.602894,1
552446,22.5,2015-01-05 15:29:08+00:00,-73.935928,40.799656,-73.985710,40.726952,2
552447,4.5,2013-02-17 22:27:00+00:00,-73.992531,40.748619,-73.998436,40.740143,1
552448,14.5,2013-01-27 12:41:00+00:00,-74.012115,40.706635,-73.988724,40.756218,1


In [164]:
def haversine_np (lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

In [166]:
def add_trip_distance(df):
    df['trip_distance'] = haversine_np(df['pickup_longitude'],
                                       df['pickup_latitude'],
                                df['dropoff_longitude'], 
                                       df['dropoff_latitude'])


In [168]:
add_trip_distance(train_df)

In [170]:
add_trip_distance(val_df)

In [172]:
add_trip_distance(test_df)

In [174]:
train_df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance
353352,6.0,2015-04-12 03:40:38+00:00,-73.993652,40.741543,-73.977974,40.742352,4,2015,4,12,6,3,1.323411
360070,3.7,2011-01-26 19:21:00+00:00,-73.993805,40.724579,-73.993805,40.724579,1,2011,1,26,2,19,0.0
372609,10.0,2012-10-03 10:40:17+00:00,-73.95916,40.78075,-73.969116,40.76123,1,2012,10,3,2,10,2.325504
550895,8.9,2012-03-14 13:44:27+00:00,-73.952187,40.783951,-73.978645,40.772602,1,2012,3,14,2,13,2.558912
444151,7.3,2012-02-05 15:33:00+00:00,-73.977112,40.746834,-73.991104,40.750404,2,2012,2,5,6,15,1.243267


#### Adding Distance From Popular Landmark :
- JFK Airport
- LGA Airport
- EWR Airport
- Times Airport
- Times Airport
- Met Meuseum
- World Trade Center

We'll be adding distance from drop location.

In [177]:
jfk_lonlat = -73.7781, 40.6413
lga_lonlat = -73.8740, 40.7769
ewr_lonlat = -74.1745, 40.6895
met_lonlat = -73.9632, 40.7794
wtc_lonlat = -74.0099, 40.7126

In [179]:
def add_landmark_dropoff_distance(df, landmark_name, landmark_lonlat):
    lon, lat = landmark_lonlat
    df[landmark_name + 'drop_distance'] = haversine_np(lon, lat, df['dropoff_longitude'], df['dropoff_latitude'])
    

In [181]:
def add_landmarks(a_df):
    landmarks = [('jfk', jfk_lonlat), ('lga', lga_lonlat), ('ewr', ewr_lonlat), ('met', met_lonlat), ('wtc', wtc_lonlat)]
    for name, lonlat in landmarks :
        add_landmark_dropoff_distance(a_df, name, lonlat)

In [183]:
add_landmarks(train_df)

In [185]:
add_landmarks(val_df)

In [187]:
add_landmarks(test_df)

In [189]:
train_df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance,jfkdrop_distance,lgadrop_distance,ewrdrop_distance,metdrop_distance,wtcdrop_distance
353352,6.0,2015-04-12 03:40:38+00:00,-73.993652,40.741543,-73.977974,40.742352,4,2015,4,12,6,3,1.323411,20.2414,9.556355,17.56444,4.300385,4.261684
360070,3.7,2011-01-26 19:21:00+00:00,-73.993805,40.724579,-73.993805,40.724579,1,2011,1,26,2,19,0.0,20.39752,11.641132,15.713149,6.614004,1.900218
372609,10.0,2012-10-03 10:40:17+00:00,-73.95916,40.78075,-73.969116,40.76123,1,2012,10,3,2,10,2.325504,20.894815,8.192266,19.044893,2.079418,6.402866
550895,8.9,2012-03-14 13:44:27+00:00,-73.952187,40.783951,-73.978645,40.772602,1,2012,3,14,2,13,2.558912,22.322773,8.819165,18.902145,1.503061,7.168338
444151,7.3,2012-02-05 15:33:00+00:00,-73.977112,40.746834,-73.991104,40.750404,2,2012,2,5,6,15,1.243267,21.658104,10.286617,16.863903,3.986955,4.489382


#### Removing Outliers and Invalid Data :
There seems to be some invalid data in each of the following columns :
- Fare amount
- Passenger count
- Pickup latitude & longitude
- Drop latitude & longitude

In [192]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,552450.0,552450.0,552450.0,552450.0,552450.0,552450.0
mean,11.354059,-72.497063,39.9105,-72.504326,39.934265,1.684983
std,9.810809,11.622035,8.041162,12.065184,9.226158,1.337664
min,-52.0,-1183.362793,-3084.490234,-3356.729736,-2073.150635,0.0
25%,6.0,-73.99202,40.734875,-73.991425,40.73399,1.0
50%,8.5,-73.981819,40.752621,-73.980179,40.753101,1.0
75%,12.5,-73.967155,40.767036,-73.963737,40.768059,2.0
max,499.0,2420.209473,404.983337,2467.752686,3351.403076,208.0


In [194]:
test_df.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance,jfkdrop_distance,lgadrop_distance,ewrdrop_distance,metdrop_distance,wtcdrop_distance
count,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0
mean,-73.974716,40.751041,-73.973656,40.75174,1.671273,2011.815816,6.857979,16.19417,2.852834,13.46742,3.433216,20.916754,9.67518,18.546659,4.512898,6.037652
std,0.042799,0.033542,0.039093,0.035436,1.278747,1.803347,3.353272,8.838482,1.994451,6.868584,3.969877,3.30394,3.295646,4.035816,4.018422,4.252535
min,-74.25219,40.573143,-74.263245,40.568974,1.0,2009.0,1.0,1.0,0.0,0.0,0.0,0.4019,0.285629,0.28468,0.085747,0.040269
25%,-73.9925,40.736125,-73.991249,40.735253,1.0,2010.0,4.0,9.0,1.0,8.0,1.297261,20.513337,8.311565,16.520517,2.126287,3.670107
50%,-73.982327,40.753052,-73.980015,40.754065,1.0,2012.0,7.0,16.0,3.0,15.0,2.215648,21.181472,9.477797,18.02435,3.698123,5.541466
75%,-73.968012,40.767113,-73.964062,40.768757,2.0,2014.0,10.0,25.0,5.0,19.0,4.043051,21.909794,10.965272,19.880536,5.922544,7.757612
max,-72.986534,41.709557,-72.990967,41.696682,6.0,2015.0,12.0,31.0,6.0,23.0,99.933281,134.497726,126.062576,149.400787,130.347153,138.619492


We'll be using the following ranges :
- fare_amount $1 to $500
- longitudes : -75 to -72
- latitudes : 40 to 42
- passenger_count :1 to 6

In [197]:
def remove_outliers(df):
    return df[(df['fare_amount'] >= 1.) &
              (df['fare_amount'] <= 500.) &
              (df['pickup_longitude'] >= -75) &
              (df['pickup_longitude'] <= -72) &
              (df['dropoff_longitude'] >= -75) &
              (df['dropoff_longitude'] <= -72) &
              (df['pickup_latitude'] >= 40) &
              (df['pickup_latitude'] <= 42) &
              (df['dropoff_latitude'] >= 40) &
              (df['dropoff_latitude'] <= 42) &
              (df['passenger_count'] >= 1) &
              (df['passenger_count'] <= 6)]
    

In [199]:
train_df = remove_outliers(train_df)

In [201]:
val_df = remove_outliers(val_df)

##### Splitting Inputs & Targets :


In [204]:
train_df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count',
       'pickup_datetime_year', 'pickup_datetime_month', 'pickup_datetime_day',
       'pickup_datetime_weekday', 'pickup_datetime_hour', 'trip_distance',
       'jfkdrop_distance', 'lgadrop_distance', 'ewrdrop_distance',
       'metdrop_distance', 'wtcdrop_distance'],
      dtype='object')

In [206]:
input_cols = ['pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count',
       'pickup_datetime_year', 'pickup_datetime_month', 'pickup_datetime_day',
       'pickup_datetime_weekday', 'pickup_datetime_hour', 'trip_distance',
       'jfkdrop_distance', 'lgadrop_distance', 'ewrdrop_distance',
       'metdrop_distance', 'wtcdrop_distance']

In [208]:
target_col = ['fare_amount']

In [210]:
train_inputs = train_df[input_cols]
train_targets = train_df[target_col]

In [212]:
val_inputs = val_df[input_cols]
val_targets = val_df[target_col]

In [214]:
test_inputs = test_df[input_cols]

In [216]:
def evaluate(model):
    train_preds = model.predict(train_inputs)
    train_rmse = rmse(train_targets, train_preds)
    val_preds = model.predict(val_inputs)
    val_rmse = rmse(val_targets, val_preds)
    return train_rmse, val_rmse, train_preds, val_preds

##### Ridge Regression :

In [219]:
from sklearn.linear_model import Ridge

In [221]:
Ridge_model = Ridge(random_state=42)

In [223]:
Ridge_model.fit(train_inputs, train_targets)

In [224]:
evaluate(Ridge_model)



(5.049315152711235,
 5.217865657340078,
 array([[ 8.12925918],
        [ 4.11578439],
        [ 8.75063014],
        ...,
        [10.47234932],
        [ 8.2305928 ],
        [10.58672774]]),
 array([[10.91955339],
        [ 6.20493172],
        [46.21787888],
        ...,
        [ 8.0463052 ],
        [25.56885585],
        [ 8.45342102]]))

#### Our model was able to get to an RMSE of USD 5.2, much better than our baseline model.

In [228]:
train_inputs.shape

(431098, 16)

In [230]:
test_preds = Ridge_model.predict(test_inputs)

In [232]:
sub_df = pd.read_csv('sample_submission.csv')

In [234]:
sub_df

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,11.35
1,2015-01-27 13:08:24.0000003,11.35
2,2011-10-08 11:53:44.0000002,11.35
3,2012-12-01 21:12:12.0000002,11.35
4,2012-12-01 21:12:12.0000003,11.35
...,...,...
9909,2015-05-10 12:37:51.0000002,11.35
9910,2015-01-12 17:05:51.0000001,11.35
9911,2015-04-19 20:44:15.0000001,11.35
9912,2015-01-31 01:05:19.0000005,11.35


In [236]:
sub_df['fare_amount'] = test_preds

In [238]:
sub_df.to_csv('ridge_model_submission_final.csv',index=None)

In [240]:
sub_df

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.082151
1,2015-01-27 13:08:24.0000003,11.399499
2,2011-10-08 11:53:44.0000002,5.356923
3,2012-12-01 21:12:12.0000002,8.763571
4,2012-12-01 21:12:12.0000003,14.609918
...,...,...
9909,2015-05-10 12:37:51.0000002,9.024995
9910,2015-01-12 17:05:51.0000001,11.218598
9911,2015-04-19 20:44:15.0000001,47.926481
9912,2015-01-31 01:05:19.0000005,22.600022


##### Random Forest :

In [243]:
from sklearn.ensemble import RandomForestRegressor


In [245]:
Random_forest_model = RandomForestRegressor(random_state=42, n_jobs=-1, max_depth=10, n_estimators=100)

In [247]:
Random_forest_model.fit(train_inputs, train_targets)

  return fit_method(estimator, *args, **kwargs)


In [251]:
evaluate(Random_forest_model)



(3.5955102502414125,
 4.161216507784617,
 array([ 6.99399909,  9.10190057,  9.09379987, ..., 10.43082088,
         7.782156  , 10.400694  ]),
 array([12.65433613,  6.14604627, 47.31069124, ...,  8.36589355,
        29.27069612,  8.24300662]))

In [257]:
test_preds = Random_forest_model.predict(test_inputs)

In [259]:
sub_df = pd.read_csv('sample_submission.csv')

In [261]:
sub_df['fare_amount'] = test_preds

In [265]:
sub_df.to_csv('rf_model.csv',index=None)