In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import math
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read only the first 1,000,000 rows of the training CSV; the test CSV is read in full.
train_csv = '/kaggle/input/new-york-city-taxi-fare-prediction/train.csv'
test_csv = '/kaggle/input/new-york-city-taxi-fare-prediction/test.csv'
train = pd.read_csv(train_csv, nrows=1000000)
test = pd.read_csv(test_csv)

In [None]:
# Report the (rows, columns) shape of each loaded frame.
for label, frame in (("Train data shape", train), ("Test data shape", test)):
    print(label, frame.shape)

In [None]:
# Summary statistics of the training data.
train.describe()

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Number of missing values per training column, largest count first.
train.isnull().sum().sort_values(ascending=False)

In [None]:
# Discard every training row that contains at least one missing value.
train = train.dropna()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Number of missing values per test column, largest count first.
test.isnull().sum().sort_values(ascending=False)

In [None]:
# Summary statistics of fare_amount, the column later used as the prediction target.
train['fare_amount'].describe()

In [None]:
# Remove rows with a negative fare, then rows with an outlier passenger count
# (a single record has 208 passengers and several have 0).
negative_fare = train['fare_amount'] < 0
train = train.drop(index=train[negative_fare].index)
bad_passenger_count = train['passenger_count'].gt(6) | train['passenger_count'].eq(0)
train = train.drop(index=train[bad_passenger_count].index)

In [None]:
# Column dtypes of the training frame.
train.dtypes

In [None]:
def convertDateTime(dataset):
    """Parse ``pickup_datetime`` and expand it into calendar feature columns.

    The frame is modified in place: ``pickup_datetime`` is replaced by its
    ``pd.to_datetime`` conversion, and the columns ``hour``, ``day``,
    ``month``, ``weekday`` and ``year`` are added (in that order) from the
    parsed timestamps.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``pickup_datetime`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object that was passed in.
    """
    dataset['pickup_datetime'] = pd.to_datetime(dataset['pickup_datetime'])

    pickup = dataset['pickup_datetime'].dt
    # Each feature column is named after the ``.dt`` accessor attribute it copies.
    for part in ('hour', 'day', 'month', 'weekday', 'year'):
        dataset[part] = getattr(pickup, part)

    return dataset

In [None]:
# https://en.wikipedia.org/wiki/Haversine_formula - compute distance between two points given lat/long coords.
def haversineDist(pick_lat, pick_long, drop_lat, drop_long):
    """Great-circle (haversine) distance in kilometres between two points.

    All arithmetic uses NumPy ufuncs, so the inputs may be scalars, NumPy
    arrays or pandas Series; the computation is element-wise.

    Parameters
    ----------
    pick_lat, pick_long : float or array-like
        Latitude and longitude of the pickup point, in degrees.
    drop_lat, drop_long : float or array-like
        Latitude and longitude of the drop-off point, in degrees.

    Returns
    -------
    float or array-like
        Distance(s) in kilometres, computed with an Earth radius of 6371 km.
    """
    R = 6371  # mean Earth radius in kilometres
    pick_lat, pick_long, drop_lat, drop_long = map(
        np.radians, [pick_lat, pick_long, drop_lat, drop_long]
    )
    distanceLat = drop_lat - pick_lat
    distanceLong = drop_long - pick_long
    # Haversine of the central angle between the two points.
    a = (np.sin(distanceLat / 2.0) ** 2
         + np.cos(pick_lat) * np.cos(drop_lat) * np.sin(distanceLong / 2.0) ** 2)
    # Central angle is 2 * arcsin(sqrt(a)); omitting the factor 2 halves every distance.
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

In [None]:
# Expand pickup_datetime into calendar features and add the pickup-to-dropoff distance.
train = convertDateTime(train)
train_pickup = (train['pickup_latitude'], train['pickup_longitude'])
train_dropoff = (train['dropoff_latitude'], train['dropoff_longitude'])
train['distance'] = haversineDist(*train_pickup, *train_dropoff)

# 'key' and the raw timestamp are not used as model features.
train = train.drop(columns=['key', 'pickup_datetime'])

In [None]:
# Apply the same feature engineering to the test data as to the training data.
test = convertDateTime(test)
test_pickup = (test['pickup_latitude'], test['pickup_longitude'])
test_dropoff = (test['dropoff_latitude'], test['dropoff_longitude'])
test['distance'] = haversineDist(*test_pickup, *test_dropoff)

# Keep the row keys for the submission file before removing the non-feature columns.
testKeys = test['key']
test = test.drop(columns=['key', 'pickup_datetime'])

In [None]:
# Separate the target column from the feature columns.
target_column = 'fare_amount'
y = train[target_column]
train = train.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows for evaluation; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(train, y, random_state=0, test_size = 0.2)

In [None]:
from xgboost import XGBRegressor
import xgboost as xgb

def XGBModel(X_train, X_test, y_train, y_test, params=None,
             num_boost_round=5000, early_stopping_rounds=10):
    """Train an XGBoost regression booster with early stopping.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the booster.
    X_test, y_test
        Held-out features and target. They are passed to ``xgb.train`` as the
        evaluation set (named ``'test'``) that early stopping monitors.
    params : dict, optional
        XGBoost training parameters. Defaults to depth-6 trees, ``eta`` 0.3,
        squared-error objective and RMSE evaluation metric.
    num_boost_round : int, optional
        Maximum number of boosting rounds (default 5000).
    early_stopping_rounds : int, optional
        Number of rounds without improvement on the evaluation set after
        which training stops (default 10).

    Returns
    -------
    xgboost.Booster
        The trained booster returned by ``xgb.train``.
    """
    if params is None:
        # 'reg:squarederror' is the current name of the squared-error objective;
        # the previous spelling 'reg:linear' is a deprecated alias for it.
        params = {
            'max_depth': 6,
            'eta': 0.3,
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
        }
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    model = xgb.train(params=params,
                      dtrain=dtrain,
                      early_stopping_rounds=early_stopping_rounds,
                      num_boost_round=num_boost_round,
                      evals=[(dtest, 'test')])
    return model

# Fit on the training split; the held-out split is the evaluation set for early stopping.
model = XGBModel(X_train, X_test, y_train, y_test)

In [None]:
# Predict with only the boosting rounds up to the best early-stopping iteration.
# `iteration_range` replaces the deprecated `ntree_limit` / `best_ntree_limit` API;
# the range end is exclusive, hence best_iteration + 1.
best_rounds = model.best_iteration + 1
preds = model.predict(xgb.DMatrix(test), iteration_range=(0, best_rounds))

In [None]:
# Build the submission frame: one predicted fare, rounded to 2 decimals, per test key.
submission = pd.DataFrame({
    "key": testKeys,
    "fare_amount": preds.round(2)
})

# Write with an explicit .csv extension so the output is recognisable as a CSV file.
submission.to_csv('taxi_submission_1.csv', index=False)