# New York Taxi Fare Amount Prediction

Kaggle Link : https://www.kaggle.com/competitions/new-york-city-taxi-fare-prediction

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import  train_test_split
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as MSE

/kaggle/input/new-york-city-taxi-fare-prediction/sample_submission.csv
/kaggle/input/new-york-city-taxi-fare-prediction/GCP-Coupons-Instructions.rtf
/kaggle/input/new-york-city-taxi-fare-prediction/train.csv
/kaggle/input/new-york-city-taxi-fare-prediction/test.csv


In [2]:
train = pd.read_csv("../input/new-york-city-taxi-fare-prediction/train.csv", nrows = 1000000)
test = pd.read_csv("../input/new-york-city-taxi-fare-prediction/test.csv")

In [3]:
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [4]:
test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [5]:
train.shape,test.shape

((1000000, 8), (9914, 7))

In [6]:
train.isnull().sum()

key                   0
fare_amount           0
pickup_datetime       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude    10
dropoff_latitude     10
passenger_count       0
dtype: int64

In [7]:
train = train.dropna(how ='any', axis ='rows') # eliminated rows with null values

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 999990 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   key                999990 non-null  object 
 1   fare_amount        999990 non-null  float64
 2   pickup_datetime    999990 non-null  object 
 3   pickup_longitude   999990 non-null  float64
 4   pickup_latitude    999990 non-null  float64
 5   dropoff_longitude  999990 non-null  float64
 6   dropoff_latitude   999990 non-null  float64
 7   passenger_count    999990 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 68.7+ MB


In [10]:
train['fare_amount'].describe()

count    999990.000000
mean         11.347953
std           9.821790
min         -44.900000
25%           6.000000
50%           8.500000
75%          12.500000
max         500.000000
Name: fare_amount, dtype: float64

In [11]:
train.drop(train[train['passenger_count'] == 0].index, axis=0, inplace = True)

In [13]:
train['pickup_datetime'] = pd.to_datetime(train.pickup_datetime)
test['pickup_datetime'] = pd.to_datetime(test.pickup_datetime)

In [15]:
train.loc[:, 'pickup_hour'] = train['pickup_datetime'].dt.hour
train.loc[:, 'pickup_weekday'] = train['pickup_datetime'].dt.day_name()
train.loc[:, 'pickup_date'] = train['pickup_datetime'].dt.day
train.loc[:, 'pickup_month'] = train['pickup_datetime'].dt.month
train.loc[:, 'pickup_day'] = train['pickup_datetime'].dt.dayofweek
test.loc[:, 'pickup_hour'] = test['pickup_datetime'].dt.hour
test.loc[:, 'pickup_weekday'] = test['pickup_datetime'].dt.day_name()
test.loc[:, 'pickup_date'] = test['pickup_datetime'].dt.day
test.loc[:, 'pickup_month'] = test['pickup_datetime'].dt.month
test.loc[:, 'pickup_day'] = test['pickup_datetime'].dt.dayofweek

In [16]:
train.drop(['key','pickup_datetime'], axis=1,inplace=True)

The city of New York longitude ranges between -75 and -72. The latitude ranges between 40 and 42. So,

In [17]:
train.dropna(inplace=True)

train.drop(train.index[(train.pickup_longitude < -75) | 
           (train.pickup_longitude > -72) | 
           (train.pickup_latitude < 40) | 
           (train.pickup_latitude > 42)],inplace=True)
train.drop(train.index[(train.dropoff_longitude < -75) | 
           (train.dropoff_longitude > -72) | 
           (train.dropoff_latitude < 40) | 
           (train.dropoff_latitude > 42)],inplace=True)

In [19]:
train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_hour,pickup_weekday,pickup_date,pickup_month,pickup_day
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1,17,Monday,15,6,0
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1,16,Tuesday,5,1,1
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2,0,Thursday,18,8,3
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1,4,Saturday,21,4,5
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1,7,Tuesday,9,3,1


In [25]:
train = train.drop(['pickup_weekday'],axis=1)

## Model Training

In [27]:
x, y = train.drop('fare_amount', axis = 1), train['fare_amount']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=12)

In [28]:
scaler = StandardScaler()
scaler.fit_transform(x_train,x_test)

array([[-0.50910543, -1.10240075, -0.8668322 , ..., -1.3472513 ,
         0.5033584 , -1.5592937 ],
       [-0.43654057,  0.27544186,  0.6601215 , ..., -1.46243925,
         0.7942589 , -0.5337492 ],
       [-0.74933517, -1.27666273, -0.76334361, ..., -0.08018377,
        -0.3693431 ,  1.00456755],
       ...,
       [ 0.25570344,  0.7023167 ,  0.39967348, ...,  0.72613193,
         0.5033584 ,  0.4917953 ],
       [ 0.13827377,  0.40496543, -0.09778329, ...,  1.53244763,
         1.3760599 , -1.5592937 ],
       [-0.84636239, -0.40444798, -0.53941482, ..., -0.08018377,
        -0.6602436 , -1.5592937 ]])

In [29]:
xgb_r = xgb.XGBRegressor(objective ='reg:linear',n_estimators = 400, seed = 123)

In [30]:
xgb_r.fit(x_train,y_train)



XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=400, n_jobs=0,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=123, reg_alpha=0, ...)

In [31]:
y_pred = xgb_r.predict(x_test)

In [32]:
rmse = np.sqrt(MSE(y_test, y_pred))
rmse

4.305091807264246

In [33]:
test.drop(['key','pickup_datetime'], axis=1,inplace=True)

In [34]:
test.dropna(inplace=True)

test.drop(test.index[(test.pickup_longitude < -75) | 
           (test.pickup_longitude > -72) | 
           (test.pickup_latitude < 40) | 
           (test.pickup_latitude > 42)],inplace=True)
test.drop(test.index[(test.dropoff_longitude < -75) | 
           (test.dropoff_longitude > -72) | 
           (test.dropoff_latitude < 40) | 
           (test.dropoff_latitude > 42)],inplace=True)

In [36]:
test.drop(['pickup_weekday'], axis=1,inplace=True)

In [37]:
scaler.fit_transform(test)

array([[ 0.03278367,  0.38058296, -0.19896544, ...,  1.22265065,
        -1.74703242, -0.92904143],
       [-0.28383094, -0.94387891, -0.64575618, ...,  1.22265065,
        -1.74703242, -0.92904143],
       [-0.18240469,  0.00653781, -0.15350703, ..., -0.92714831,
         0.93704904,  1.07662393],
       ...,
       [-0.39460146, -0.72846192,  4.70733595, ...,  0.31747214,
        -0.8523386 ,  1.57804027],
       [-0.25368566, -0.46538934,  0.88247126, ...,  1.6752399 ,
        -1.74703242,  1.07662393],
       [-0.31094385,  0.09032734, -0.68149163, ...,  0.20432483,
        -1.74703242,  1.57804027]])

In [38]:
new_pred = xgb_r.predict(test)

In [39]:
sample_new=pd.read_csv('../input/new-york-city-taxi-fare-prediction/test.csv')

In [43]:
sample_new.drop(['pickup_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count'], axis=1,inplace=True)

In [44]:
sample_new['fare_amount'] = new_pred

In [45]:
sample_new.head()

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,8.052094
1,2015-01-27 13:08:24.0000003,9.322865
2,2011-10-08 11:53:44.0000002,6.166053
3,2012-12-01 21:12:12.0000002,8.809672
4,2012-12-01 21:12:12.0000003,15.723586


In [46]:
submission=sample_new.to_csv("submission.csv", index=False)