In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from geopy.distance import geodesic
import sklearn.feature_selection
from sklearn.model_selection import train_test_split
import pickle
from sklearn import linear_model
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from time import time

#import warnings
#warnings.filterwarnings('ignore')

In [2]:
#Functions

#add new features to df:
def add_newFeats(df):
    
    #compute distance function
    def haversine_distance(df_pickup_lat, df_pickup_long,df_dropoff_lat,df_dropoff_long):

    #6367 earth radius
        distance_col = 6367 * 2 * np.arcsin(np.sqrt(np.sin((np.radians(df_pickup_lat) /
                                                - np.radians(df_dropoff_lat))/2)**2 /
                                                + np.cos(np.radians(df_dropoff_lat))*np.cos(np.radians(df_pickup_lat))*np.sin((np.radians(df_pickup_long) - np.radians(df_dropoff_long))/2)**2))
        return distance_col
    
     #add features related with datetime
    df['key'] = pd.to_datetime(df['key'])
    df['weekday'] = df['key'].dt.dayofweek
    df['year']=df['key'].dt.year
    df['month']=df['key'].dt.month
    df['day']=df['key'].dt.day
    df['hour']=df['key'].dt.hour

    #Absolute differences between longitudes and between latitudes
    df ['abs_diff_longitude']=np.abs(df.pickup_longitude-df.dropoff_longitude)
    df ['abs_diff_latitude']=np.abs(df.pickup_latitude-df.dropoff_latitude)
    
    #there are some flat rates on trips from/to some of the airports in NY (JFK, LaGuardia, Newark) so we add features of
    #distances from/to these airports
    # coordinates from https://get-direction.com
    jfk_latitude = 40.644538879
    jfk_longitude = -73.795356750
    laguardia_latitude = 40.774009705
    laguardia_longitude =-73.872497559
    newark_latitude = 40.692878723
    newark_longitude = -74.185447693 
        
    #distances: between pickup and dropoffs, and distances to the main 3 airports in NYC
    #note: Newark is actually New Jersey, that's also why the bounding box was also larger
    df['distance'] = haversine_distance(df['pickup_latitude'],df['pickup_longitude'],df['dropoff_latitude'],df['dropoff_longitude'])
    df['pickup_distance_jfk'] = haversine_distance(df['pickup_latitude'],df['pickup_longitude'],jfk_latitude, jfk_longitude)
    df['pickup_distance_laguardia'] = haversine_distance(df['pickup_latitude'],df['pickup_longitude'],laguardia_latitude, laguardia_longitude)
    df['pickup_distance_newark'] = haversine_distance(df['pickup_latitude'],df['pickup_longitude'],newark_latitude, newark_longitude)
    df['dropoff_distance_jfk'] = haversine_distance(df['dropoff_latitude'],df['dropoff_longitude'],jfk_latitude, jfk_longitude)
    df['dropoff_distance_newark'] = haversine_distance(df['dropoff_latitude'],df['dropoff_longitude'],newark_latitude, newark_longitude)

    df['dropoff_distance_laguardia'] = haversine_distance(df['dropoff_latitude'],df['dropoff_longitude'],laguardia_latitude, laguardia_longitude)
    
    #log transform for distance help?
    df['distance_log'] = df['distance'].transform(lambda x: np.log(x+sys.float_info.epsilon))
    df['abs_diff_latitude_log'] = df['abs_diff_latitude'].transform(lambda x: np.log(x+sys.float_info.epsilon))
    df['abs_diff_longitude_log'] = df['abs_diff_longitude'].transform(lambda x: np.log(x+sys.float_info.epsilon))
    
    return df


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')
#train_path = '/content/drive/My Drive/Colab Notebooks/train.csv'
train_path = 'train_orig.csv'

In [None]:
train = pd.read_csv(train_path, nrows=1_000_000) #we don't select all rows, since it is a big dataset
train.head()

In [None]:
train.head()

In [None]:
train.dtypes

## Pre-processing (denoising data) and feature engineering

### Drop outliers (given amount of data is large)

In [None]:
#drop duplicates
train.drop_duplicates(keep = 'first', inplace = True)
train.shape[0]

In [None]:
#we drop already some clear errors in data set (NaN rows, and duplicates)

#check if there are some NaN values
train.columns[train.isna().any()].tolist()

In [None]:
#we drop these values given that the data set is quite big
#original rows: 1_000_000
train.dropna(inplace=True)
train.shape[0]

In [None]:
#we limit the coordinates to proper ranges to compute distances

#Latitudes range from -90 to 90. Longitudes range from -180 to 180.
train = train[ (train.pickup_longitude >= -180) & (train.pickup_longitude <= 180) & \
              (train.dropoff_longitude >= -180) & (train.dropoff_longitude <= 180) & \
              (train.pickup_latitude >= -90) & (train.pickup_latitude <= 90) & \
              (train.dropoff_latitude >= -90) & (train.dropoff_latitude <= 90) ]

In [None]:
#based on these descriptors, we can at first sight already remove some other outliers:
#2) passengers in a taxi, up to 6 (icnluding suvs) https://ride.guru/lounge/p/how-many-people-can-ride-in-a-cab 
#leave drives of 0 passengers in case documents are transported?
#3) fare has to be positive value, over 2.50$? (that seems to be the initial charge) https://www1.nyc.gov/site/tlc/passengers/taxi-fare.page
#also fare now seems too high (500) and the variance is pretty hight too. so we cut the max fare now

#4) new york city actually has the following bounding box of coordinates (https://boundingbox.klokantech.com/)
#eastlimit_longitude=-73.700181
#southlimit_latitude =40.47739894
#westlimit_longitude=-74.25909
#northlimit_latitude=40.916178


#trying to use a bounding box slightly larger than the actual NY, so there is more flexibility for trips around airport
eastlimit_longitude=-73.664465
southlimit_latitude=40.477399
westlimit_longitude=-74.307165
northlimit_latitude=40.935154
                                                                                                               
train = train[ (train.pickup_longitude >= westlimit_longitude) & (train.pickup_longitude <= eastlimit_longitude) & \
              (train.dropoff_longitude >= westlimit_longitude) & (train.dropoff_longitude <= eastlimit_longitude) & \
              (train.pickup_latitude >= southlimit_latitude ) & (train.pickup_latitude <= northlimit_latitude) & \
              (train.dropoff_latitude >= southlimit_latitude) & (train.dropoff_latitude <= northlimit_latitude) ]                                                                              

train = train[(train.passenger_count<= 6) & \
              (train.fare_amount > 2.50)  ]
   


In [None]:
train.fare_amount.hist(bins=50)

In [None]:
train = train[ (train.fare_amount < 100)]

### Adding features

In [None]:
train = add_newFeats(train)

In [None]:
train.describe()

In [None]:
#keeping the cases of distances = 0 that we were going to keep seem to decrease the performance,
#so all cases are removed:
#removed any row with distance equal to 0:
train = train[ (train.distance > 0)]

In [None]:
train.isna().sum()

In [None]:
train.dropna(inplace=True)
train.shape[0]

## Data visualization (exploratory analysis)

In [None]:
#check distributions of all the features
fig = plt.figure(figsize = (15,20))
ax = fig.gca()
train.hist(ax = ax, bins=50)


In [None]:
corr_pearson = train.corr(method='pearson')
mask = np.zeros_like(corr_pearson, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

In [None]:
fig4, ax4 = plt.subplots(figsize=(16, 10))
sns.heatmap(corr_pearson, mask=mask, cmap='RdBu_r', vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},vmin=-1)

In [None]:
#visualizer = FeatureCorrelation( method='mutual_info-classification', feature_names=train_out.columns, sort=True)
X = train.drop(['fare_amount','key','pickup_datetime'],axis=1, inplace=False).to_numpy()
Y = train.fare_amount.to_numpy()
#lab_enc = sklearn.preprocessing.LabelEncoder()
#Y_encoded = lab_enc.fit_transform(Y)

#mutual information has too expensive computationally to run for all data, so we just take 
X = X[1:100000,:]
Y = Y[1:100000]

feature_MIscores = sklearn.feature_selection.mutual_info_regression(X, Y)

In [None]:
feature_MIscores.shape

#plot also the fare vs the date (year), and maybe check as hypothesis if there is a positive correlation between year and fare

In [None]:
feats_labels = train.drop(['fare_amount','key','pickup_datetime'],axis=1, inplace=False).columns
#print(feats_labels)
index = np.arange(len(feats_labels))
plt.figure(figsize=(10,10))
plt.barh(index, feature_MIscores)
plt.xlabel('mutual information score', fontsize=20)
plt.yticks(index, feats_labels, fontsize=10, rotation=30)

In [None]:
#sns.pairplot(train, y_vars=['fare_amount'], x_vars=['distance', 'distance_log', 'hour','day','month','year','weekday','passenger_count','dropoff_latitude','dropoff_longitude','pickup_latitude','pickup_longitude'])

In [None]:
sns.scatterplot(x="distance", y="fare_amount", data=train)

In [None]:
#there is also a kind of straight line, that can be explained by the flat rates used for taxi rides from airport
 #"Taxis at JFK Airport charge a flat fare of $52 for trips between the airport and Manhattan." 
#https://www.jfkairport.com/to-from-airport/taxi-car-and-van-service
sns.scatterplot(x="distance_log", y="fare_amount", data=train)

In [None]:
#using hex bins because regular scatter plot does not work (too many points)
sns.jointplot(x="distance", y="fare_amount", data=train, kind='reg',joint_kws={'line_kws':{'color':'cyan'}})

In [None]:
#using hex bins because regular scatter plot does not work (too many points)
#easier to see with dostance_log since the scale is shrunk
sns.jointplot(x="distance", y="fare_amount", data=train, kind='hex')

In [None]:
#TODO: maybe also plot histograms grouping by day, or by hour, to see what times are more common, and see if the fair is similar for 

In [None]:
#TODO:maybe a heat map of the fares would be interesting, doing a geomap, maybe use plotly?

In [None]:
#TODO: maybe employ clustering to get some extra information of the data, like outliers? 

## Split of training data (train + validation)

In [None]:
train2, val = train_test_split(train, test_size=0.3)

In [None]:
train.shape

In [None]:
train2.shape

In [None]:
train.columns

In [None]:
val.shape

## Modeling 
### (Comparison of several regression models, plus a dummy model as baseline)

In [None]:
features_set1=['year','distance','pickup_distance_jfk','pickup_distance_laguardia',\
                  'dropoff_distance_jfk','dropoff_distance_laguardia','abs_diff_latitude','abs_diff_longitude','dropoff_distance_newark','pickup_distance_newark']

X_train = train2[features_set1]
y_train = train2[['fare_amount']]

X_val = val[features_set1]
y_val = val[['fare_amount']]

### Dummy model

In [None]:
dummy_regr = DummyRegressor(strategy="mean")
t0_dummy=time()
dummy_regr.fit(X_train, y_train)
print("Dummy model regression, training time:" + str(round(time()-t0_dummy, 3)) + "s")
print(dummy_regr)


In [None]:
y_valPred_dummy = dummy_regr.predict(X_val)
y_trainPred_dummy = dummy_regr.predict(X_train)

In [None]:
#R2 is the proportion of the variance in the dependent variable that is predictable from the independent
#variable(s).

r2_val_dummy = r2_score(y_val, y_valPred_dummy)
r2_train_dummy = r2_score(y_train, y_trainPred_dummy)

print('Dummy model- training, R2:' + str(r2_train_dummy))
print('Dummy model- validation, R2:' + str(r2_val_dummy))


In [None]:
mse_val_dummy = mean_squared_error(y_val, y_valPred_dummy)
mse_train_dummy = mean_squared_error(y_train, y_trainPred_dummy)

print('Dummy model - training, MSE:' + str(mse_train_dummy))
print('Dummy model - validation, MSE:' + str(mse_val_dummy))

### Linear regression

In [None]:
scaler = StandardScaler()
scaler.fit(X_train)

#use same scaler for both, based on X_train data
X_trainNorm = scaler.transform(X_train)
X_valNorm = scaler.transform(X_val)

In [None]:
# create linear regression model
linRegr = linear_model.LinearRegression()
print(linRegr)

In [None]:
# train the model with training data

t0_linR=time()
linRegr.fit(X_trainNorm, y_train)
print("Linear regression, training time:" + str(round(time()-t0_linR, 3)) + "s")
print(linRegr)

In [None]:
y_valPred_linReg = linRegr.predict(X_valNorm) #to check validation error measures
y_trainPred_linReg = linRegr.predict(X_trainNorm) #to check training error measures

In [None]:
r2_val_linReg = r2_score(y_val, y_valPred_linReg)
r2_train_linReg = r2_score(y_train, y_trainPred_linReg)

print('Linear regression - training, R2:' + str(r2_train_linReg))
print('Linear regression - validation, R2:' + str(r2_val_linReg))


In [None]:
mse_val_linReg = mean_squared_error(y_val, y_valPred_linReg)
mse_train_linReg = mean_squared_error(y_train, y_trainPred_linReg)

print('Linear regression - training, MSE:' + str(mse_train_linReg))
print('Linear regression - validation, MSE:' + str(mse_val_linReg))


### Linear regression with Lasso regularization (i.e. feature selection)

In [None]:
train2.columns

In [None]:
X_train_lasso = train2.drop(['fare_amount','key','pickup_datetime','distance_log','abs_diff_latitude_log','abs_diff_longitude_log'],axis=1, inplace=False)
y_train_lasso = train2[['fare_amount']]

X_val_lasso = val.drop(['fare_amount','key','pickup_datetime','distance_log','abs_diff_latitude_log','abs_diff_longitude_log'],axis=1, inplace=False)
y_val_lasso = val[['fare_amount']]

In [None]:
X_train_lasso.shape

In [None]:
y_train_lasso.shape

In [None]:
scaler_lasso = StandardScaler()
scaler_lasso.fit(X_train_lasso)

#use same scaler for both, based on X_train_lasso data
X_trainLassoNorm = scaler_lasso.transform(X_train_lasso)
X_valLassoNorm = scaler_lasso.transform(X_val_lasso)

In [None]:
lassoReg = linear_model.LassoCV(cv=5)
t0_linRLasso=time()
lassoReg.fit(X_trainLassoNorm, y_train_lasso.values.ravel())
print("Linear regression with Lasso reg., training time:" + str(round(time()-t0_linRLasso, 3)) + "s")
print(lassoReg)

In [None]:
y_valPred_lassoReg = lassoReg.predict(X_valLassoNorm)
y_trainPred_lassoReg = lassoReg.predict(X_trainLassoNorm)

In [None]:
r2_val_lassoReg = r2_score(y_val_lasso, y_valPred_lassoReg)
r2_train_lassoReg = r2_score(y_train_lasso, y_trainPred_lassoReg)

print('Linear regression with Lasso reg. - training, R2:' + str(r2_train_lassoReg))
print('Linear regression with Lasso reg. - validation, R2:' + str(r2_val_lassoReg))

In [None]:
mse_val_lassoReg = mean_squared_error(y_val_lasso, y_valPred_lassoReg)
mse_train_lassoReg = mean_squared_error(y_train_lasso, y_trainPred_lassoReg)

print('Linear regression with Lasso reg. - training, MSE:' + str(mse_train_lassoReg))
print('Linear regression with Lasso reg. - validation, MSE:' + str(mse_val_lassoReg))

In [None]:
#TODO: include a plotbar showing the weights obtained

### Random forest regression

Decision trees can deal with non-linearity relationships between independent variables and the dependent variable (fare_amount), unlike linear regressors.

In [None]:
randFor = RandomForestRegressor(n_estimators=10) #current default, future is 100, but it is too much computation

#decisions trees don't need normalized data since they don't assume any distribution

t0_randFor=time()
randFor.fit(X_train_lasso, y_train_lasso.values.ravel())
print("Random forest regression, training time:" + str(round(time()-t0_randFor, 3)) + "s")
print(randFor)

In [None]:
importances = randFor.feature_importances_

indices = np.argsort(importances)

features =  X_train_lasso.columns

plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()


#Avenues generally run north and south in NY.
# all streets run east and west
#https://becomeanewyorker.com/streets-and-avenues-there-is-a-differenc/
#it may be that taxis go mainly through avenues, and that is why longitude affects so much

In [None]:
y_valPred_randFor = randFor.predict(X_val_lasso)
y_trainPred_randFor = randFor.predict(X_train_lasso)

In [None]:
r2_val_randFor = r2_score(y_val_lasso, y_valPred_randFor)
r2_train_randFor = r2_score(y_train_lasso, y_trainPred_randFor)

print('Random forest regression - training, R2:' + str(r2_train_randFor))
print('Random forest regression - validation, R2:' + str(r2_val_randFor))

In [None]:
mse_val_randFor = mean_squared_error(y_val_lasso, y_valPred_randFor)
mse_train_randFor = mean_squared_error(y_train_lasso, y_trainPred_randFor)

print('Random forest regression - training, MSE:' + str(mse_train_randFor))
print('Random forest regression - validation, MSE:' + str(mse_val_randFor))

### XGBoost regression

With this approach with reduce not only variance (as in random forests) but also the bias.
It has also regularization to avoid overfitting.

In [None]:
# https://datascience.stackexchange.com/questions/23789/why-do-we-need-xgboost-and-random-forest
#xgbr = xgb.XGBRegressor(objective ='reg:squarederror')  #objective changed based on warning: "reg:linear is now deprecated in favor of reg:squarederror."
#print(xgbr)

In [None]:
#again here, using decision trees, so no need 

#t0_xgbr=time()
#xgbr.fit(X_train_lasso, y_train_lasso.values.ravel())
#print("XGBoost regression, training time:" + str(round(time()-t0_xgbr, 3)) + "s")
#print(xgbr)

#filename = 'XGBoost_model.sav'
#pickle.dump(xgbr, open(filename, 'wb'))

In [None]:
#y_valPred_xgbr = xgbr.predict(X_val_lasso)
#y_trainPred_xgbr = xgbr.predict(X_train_lasso)

In [None]:
#r2_val_xgbr = r2_score(y_val_lasso, y_valPred_xgbr)
#r2_train_xgbr = r2_score(y_train_lasso, y_trainPred_xgbr)

#print('XGBoost regression - training, R2:' + str(r2_train_xgbr))
#print('XGBoost regression - validation, R2:' + str(r2_val_xgbr))

In [None]:
#mse_val_xgbr = mean_squared_error(y_val_lasso, y_valPred_xgbr)
#mse_train_xgbr = mean_squared_error(y_train_lasso, y_trainPred_xgbr)

#print('XGBoost regression - training, MSE:' + str(mse_train_xgbr))
#print('XGBoost forest regression - validation, MSE:' + str(mse_val_xgbr))

- #### XGBoost with parameter tuning

In [None]:
#https://www.kaggle.com/omarito/gridsearchcv-xgbregressor-0-556-lb
#https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
#https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f

In [None]:
# Parameters that are going to be tuned are those given by an :
params ={}
params = {
    'max_depth': [3,5,7,9],
    'min_child_weight': [1,3,5],
    'learning_rate': [0.05,0.1,0.3],
    'subsample': [1],
    'colsample_bytree': [1],
    # Other parameters
    'objective': ['reg:squarederror'],
}
#num_boost_round = 999

In [None]:
# initializing XGBoost and GridSearchCV

#k=10 #10-folding
#xgb_regr_tuning = xgb.XGBRegressor() 
#xgbr_grid = GridSearchCV(xgb_regr_tuning, params, cv=k, scoring='neg_mean_squared_error')

In [None]:
##we use just a subset of the values, otherwise it takes too much time 
##(also, batches will be smaller, which may be better for results in the end)
#X_train_grid = X_train_lasso.head(300_000)
#y_train_grid = y_train_lasso.head(300_000)

#t0_xgbrGrid=time()
#grid_result = xgbr_grid.fit(X_train_grid, y_train_grid.values.ravel())
#print("XGBoost grid search tuning, time:" + str(round(time()-t0_xgbrGrid, 3)) + "s")
#print(xgbr_grid)

#filename = 'XGBoostGrid_model.sav'
#pickle.dump(grid_result, open(filename, 'wb'))

In [None]:
#best_params = grid_result.best_params_
#params['max_depth']=[best_params['max_depth']]
#params['min_child_weight']=[best_params['min_child_weight']]
#params['learning_rate']=[best_params['learning_rate']]

#params

In [3]:
#tuned_xgbr = xgb.XGBRegressor(max_depth=9,min_child_weight=1,learning_rate= 0.1,objective ='reg:squarederror')
#t0_xgbrTune=time()
#tuned_xgbr.fit(X_train_lasso, y_train_lasso.values.ravel())
#print("XGBoost regression with tuning, training time:" + str(round(time()-t0_xgbrTune, 3)) + "s")
#print(xgbr_grid)

filename = 'XGBoostTuned_model.sav'
#pickle.dump(tuned_xgbr, open(filename, 'wb'))
tuned_xgbr = pickle.load(open(filename, 'rb'))

In [None]:
#y_valPred_xgbrGrid = xgbr_grid.predict(X_val_lasso)
#y_trainPred_xgbrGrid = xgbr_grid.predict(X_train_lasso)

y_valPred_tuned_xgbr = tuned_xgbr.predict(X_val_lasso)
y_trainPred_tuned_xgbr = tuned_xgbr.predict(X_train_lasso)

In [None]:
#r2_val_xgbrGrid = r2_score(y_val_lasso, y_valPred_xgbrGrid)
#r2_train_xgbrGrid = r2_score(y_train_lasso, y_trainPred_xgbrGrid)
r2_val_tuned_xgbr = r2_score(y_val_lasso, y_valPred_tuned_xgbr)
r2_train_tuned_xgbr = r2_score(y_train_lasso, y_trainPred_tuned_xgbr)


#print('XGBoost regression, with tuning - training, R2:' + str(r2_train_xgbrGrid))
#print('XGBoost regression, with tuning - validation, R2:' + str(r2_val_xgbrGrid))

print('XGBoost regression, with tuning - training, R2:' + str(r2_train_tuned_xgbr))
print('XGBoost regression, with tuning - validation, R2:' + str(r2_val_tuned_xgbr))

In [None]:
#mse_val_xgbrGrid = mean_squared_error(y_val_lasso, y_valPred_xgbrGrid)
#mse_train_xgbrGrid = mean_squared_error(y_train_lasso, y_trainPred_xgbrGrid)

mse_val_tuned_xgbr = mean_squared_error(y_val_lasso, y_valPred_tuned_xgbr)
mse_train_tuned_xgbr = mean_squared_error(y_train_lasso, y_trainPred_tuned_xgbr)

print('XGBoost regression - training, MSE:' + str(mse_train_tuned_xgbr))
print('XGBoost forest regression - validation, MSE:' + str(mse_val_tuned_xgbr))

In [None]:
#been on end of notebook running
import os
duration = 1  # seconds
freq = 400  # Hz
os.system('play -nq -t alsa synth {} sine {}'.format(duration, freq))

## Predictions on test set

In [4]:
test_path = 'test_orig.csv'
test = pd.read_csv(test_path)
test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [5]:
test.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9914.0,9914.0,9914.0,9914.0,9914.0
mean,-73.974722,40.751041,-73.973657,40.751743,1.671273
std,0.042774,0.033541,0.039072,0.035435,1.278747
min,-74.252193,40.573143,-74.263242,40.568973,1.0
25%,-73.992501,40.736125,-73.991247,40.735254,1.0
50%,-73.982326,40.753051,-73.980015,40.754065,1.0
75%,-73.968013,40.767113,-73.964059,40.768757,2.0
max,-72.986532,41.709555,-72.990963,41.696683,6.0


In [6]:
test = add_newFeats(test)

In [None]:
test.describe()

In [None]:
#data is clean of NaNs
test.isna().sum()

In [None]:
#fill using median rather than mean since it is more robust to outliers
#test['weekday'].fillna((test['weekday'].median()), inplace=True)
#test['year'].fillna((test['year'].median()), inplace=True)
#test['month'].fillna((test['month'].median()), inplace=True)
#test['hour'].fillna((test['hour'].median()), inplace=True)

In [None]:
#Check also about the coordinates

test[ (test.pickup_longitude < -180) | (test.pickup_longitude > 180) | \
              (test.dropoff_longitude < -180) | (test.dropoff_longitude > 180) | \
              (test.pickup_latitude < -90) | (test.pickup_latitude > 90) | \
              (test.dropoff_latitude < -90) | (test.dropoff_latitude > 90) ]


In [None]:
sum(test.distance == 0)

In [7]:
test_feats = test.drop(['key','pickup_datetime','distance_log','abs_diff_latitude_log','abs_diff_longitude_log'],axis=1, inplace=False)

In [8]:
test_feats.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,weekday,year,month,day,hour,abs_diff_longitude,abs_diff_latitude,distance,pickup_distance_jfk,pickup_distance_laguardia,pickup_distance_newark,dropoff_distance_jfk,dropoff_distance_newark,dropoff_distance_laguardia
count,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0
mean,-73.974722,40.751041,-73.973657,40.751743,1.671273,2.852834,2011.815816,6.857979,16.19417,13.46742,0.023348,0.022133,1.243933,9.648597,5.685396,11.237838,9.570293,11.294917,5.551162
std,0.042774,0.033541,0.039072,0.035435,1.278747,1.994451,1.803347,3.353272,8.838482,6.868584,0.036719,0.025589,1.958653,1.933168,1.589563,2.283139,1.812506,2.083768,1.597437
min,-74.252193,40.573143,-74.263242,40.568973,1.0,0.0,2009.0,1.0,1.0,0.0,0.0,0.0,0.0,0.009949,0.005357,1.905468,0.041663,0.088896,0.000563
25%,-73.992501,40.736125,-73.991247,40.735254,1.0,1.0,2010.0,4.0,9.0,8.0,0.006354,0.007279,0.338505,9.220034,5.094757,10.287253,9.010791,10.354199,4.90447
50%,-73.982326,40.753051,-73.980015,40.754065,1.0,3.0,2012.0,7.0,16.0,15.0,0.013123,0.014715,0.699284,9.979375,5.851,10.833266,9.855911,10.952961,5.731228
75%,-73.968013,40.767113,-73.964059,40.768757,2.0,5.0,2014.0,10.0,25.0,19.0,0.024557,0.028261,1.308633,10.521719,6.393444,11.599358,10.45313,11.808672,6.326527
max,-72.986532,41.709555,-72.990963,41.696683,6.0,6.0,2015.0,12.0,31.0,23.0,0.849168,0.633213,45.645175,43.763288,47.844913,64.822849,43.515665,64.571588,47.597004


In [9]:
y_testPred_tuned_xgbr = tuned_xgbr.predict(test_feats)

In [10]:
y_testPred_tuned_xgbr.shape

(9914,)

In [15]:
submission = pd.read_csv('sample_submission.csv')
submission['fare_amount'] = y_testPred_tuned_xgbr
submission.to_csv('submission_test.csv', index=False)

In [16]:
submission.head(5)

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.473106
1,2015-01-27 13:08:24.0000003,10.243099
2,2011-10-08 11:53:44.0000002,4.685626
3,2012-12-01 21:12:12.0000002,8.538606
4,2012-12-01 21:12:12.0000003,15.312421
