In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost
import lightgbm
import geopy.distance as gd

In [13]:
train_df = pd.read_csv("../data/train.csv", parse_dates=["pickup_datetime"])
test_df = pd.read_csv("../data/test.csv", parse_dates=["pickup_datetime"])

In [14]:
train_df.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration'],
      dtype='object')

In [15]:
test_df.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'store_and_fwd_flag'],
      dtype='object')

In [16]:
def calculate_distance(x):
    return gd.geodesic((x["pickup_latitude"],
                                                x["pickup_longitude"]),
                                                (x["dropoff_latitude"],
                                                x["dropoff_longitude"])).kilometers

In [17]:
train_df["geo_distance"] = train_df.apply(lambda x: calculate_distance(x), axis=1)

In [18]:
test_df["geo_distance"] = test_df.apply(lambda x: calculate_distance(x), axis=1)

In [19]:
train_df["day"] = train_df["pickup_datetime"].dt.day
train_df["month"] = train_df["pickup_datetime"].dt.month
train_df["year"] = train_df["pickup_datetime"].dt.year
train_df["hour"] = train_df["pickup_datetime"].dt.hour
train_df["minute"] = train_df["pickup_datetime"].dt.minute
train_df["seconds"] = train_df["pickup_datetime"].dt.second

In [20]:
test_df["day"] = test_df["pickup_datetime"].dt.day
test_df["month"] = test_df["pickup_datetime"].dt.month
test_df["year"] = test_df["pickup_datetime"].dt.year
test_df["hour"] = test_df["pickup_datetime"].dt.hour
test_df["minute"] = test_df["pickup_datetime"].dt.minute
test_df["seconds"] = test_df["pickup_datetime"].dt.second

In [21]:
map_dict = {"Y":0,
            "N":1}
train_df["store_and_fwd_flag"] = train_df["store_and_fwd_flag"].map(map_dict)
test_df["store_and_fwd_flag"] = test_df["store_and_fwd_flag"].map(map_dict)

In [22]:
test_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,geo_distance,day,month,year,hour,minute,seconds
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,1,2.742863,30,6,2016,23,59,58
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,1,2.755774,30,6,2016,23,59,53
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,1,1.307112,30,6,2016,23,59,47
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,1,5.266978,30,6,2016,23,59,41
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,1,0.961745,30,6,2016,23,59,33


In [23]:
train_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,geo_distance,day,month,year,hour,minute,seconds
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,1,455,1.502172,14,3,2016,17,24,55
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,1,663,1.80866,12,6,2016,0,43,35
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,1,2124,6.379687,19,1,2016,11,35,24
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,1,429,1.483632,6,4,2016,19,32,31
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,1,435,1.187038,26,3,2016,13,30,55


In [24]:
train_y = np.log(train_df["trip_duration"].values)

In [25]:
train_df.drop(["id","pickup_datetime","dropoff_datetime", "trip_duration"], inplace=True, axis=1)

In [26]:
test_id = test_df["id"].values

In [27]:
test_df.drop(["id","pickup_datetime"], inplace=True, axis=1)

In [28]:
print(train_df.columns,"\n", test_df.columns)

Index(['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'geo_distance', 'day', 'month', 'year', 'hour', 'minute', 'seconds'],
      dtype='object') 
 Index(['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'geo_distance', 'day', 'month', 'year', 'hour', 'minute', 'seconds'],
      dtype='object')


In [29]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [30]:
kfold_object = KFold(n_splits=10, shuffle=True, random_state=2000)

In [37]:
def run_xgb(train_x, train_y,val_x, val_y,test_x, params, number_of_rounds=1000, early_stopping_rounds=10):
    
    train = xgboost.DMatrix(train_x, label=train_y)
    validate = xgboost.DMatrix(val_x, val_y)
    test = xgboost.DMatrix(test_x)
    eval_list = [(train,"train"),(validate,"test")]
    model = xgboost.train(params=params,
                          dtrain=train,
                          num_boost_round=number_of_rounds,
                          evals=eval_list,
                          early_stopping_rounds=early_stopping_rounds
                         )
    pred_test = model.predict(test)
    print("pred test {}".format(pred_test))
    return pred_test

In [38]:
def create_param_dict(*args, **kwargs):
    params = dict()
    for key, value in kwargs.items():
        params[key] = value
    return params

In [43]:
params = create_param_dict(verbosity=1,
                           eta=0.05,
                           num_parallel_tree=4,
                           objective="reg:squarederror",
                           eval_metric="rmse",
                           max_depth=5, 
                           seed=2000,
                           colsample_bytree=0.7,
                           subsample=0.8,
                           min_child_weight=1)

In [49]:
prediction_for_test = 0  # np.zeros(train_df.shape[0])
for train_index, test_index in kfold_object.split(train_df):
    train_x, val_x = train_df.iloc[train_index], train_df.iloc[test_index]
    tran_y, val_y = train_y[train_index], train_y[test_index]
    prediction = run_xgb(train_x, tran_y, val_x, val_y, test_df, params,number_of_rounds=100, early_stopping_rounds=10)
    prediction_for_test += prediction
    
prediction_for_test = prediction_for_test/5
prediction_for_test = np.exp(prediction_for_test)

[0]	train-rmse:5.72035	test-rmse:5.7181
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 10 rounds.
[1]	train-rmse:5.43806	test-rmse:5.43585
[2]	train-rmse:5.16845	test-rmse:5.16635
[3]	train-rmse:4.91347	test-rmse:4.91144
[4]	train-rmse:4.67119	test-rmse:4.66923
[5]	train-rmse:4.44177	test-rmse:4.43985
[6]	train-rmse:4.22398	test-rmse:4.22209
[7]	train-rmse:4.01685	test-rmse:4.015
[8]	train-rmse:3.82017	test-rmse:3.81837
[9]	train-rmse:3.63385	test-rmse:3.63208
[10]	train-rmse:3.45658	test-rmse:3.45484
[11]	train-rmse:3.28725	test-rmse:3.2856
[12]	train-rmse:3.12706	test-rmse:3.12546
[13]	train-rmse:2.97659	test-rmse:2.97497
[14]	train-rmse:2.83181	test-rmse:2.83025
[15]	train-rmse:2.69443	test-rmse:2.69294
[16]	train-rmse:2.56409	test-rmse:2.56267
[17]	train-rmse:2.44052	test-rmse:2.43916
[18]	train-rmse:2.32463	test-rmse:2.32325
[19]	train-rmse:2.21475	test-rmse:2.21336
[20]	train-rmse:2.11024	test-rm

[83]	train-rmse:0.465282	test-rmse:0.463167
[84]	train-rmse:0.464336	test-rmse:0.46223
[85]	train-rmse:0.463397	test-rmse:0.461307
[86]	train-rmse:0.462593	test-rmse:0.460523
[87]	train-rmse:0.461832	test-rmse:0.45978
[88]	train-rmse:0.461125	test-rmse:0.459084
[89]	train-rmse:0.460465	test-rmse:0.458439
[90]	train-rmse:0.459841	test-rmse:0.457825
[91]	train-rmse:0.459244	test-rmse:0.457244
[92]	train-rmse:0.458711	test-rmse:0.456726
[93]	train-rmse:0.45819	test-rmse:0.456217
[94]	train-rmse:0.457703	test-rmse:0.455746
[95]	train-rmse:0.457262	test-rmse:0.455315
[96]	train-rmse:0.456868	test-rmse:0.45494
[97]	train-rmse:0.456461	test-rmse:0.454545
[98]	train-rmse:0.45609	test-rmse:0.454179
[99]	train-rmse:0.455757	test-rmse:0.453859
pred test [6.599051  6.4715652 6.0488067 ... 7.059986  7.3267064 6.9146624]
[0]	train-rmse:5.72006	test-rmse:5.72089
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 10 round

[65]	train-rmse:0.507217	test-rmse:0.510668
[66]	train-rmse:0.502756	test-rmse:0.506263
[67]	train-rmse:0.49866	test-rmse:0.502222
[68]	train-rmse:0.494852	test-rmse:0.498469
[69]	train-rmse:0.491409	test-rmse:0.495077
[70]	train-rmse:0.488284	test-rmse:0.492001
[71]	train-rmse:0.485394	test-rmse:0.489159
[72]	train-rmse:0.482774	test-rmse:0.486575
[73]	train-rmse:0.480429	test-rmse:0.484272
[74]	train-rmse:0.478105	test-rmse:0.481982
[75]	train-rmse:0.476074	test-rmse:0.479986
[76]	train-rmse:0.47423	test-rmse:0.47818
[77]	train-rmse:0.472518	test-rmse:0.4765
[78]	train-rmse:0.470928	test-rmse:0.474942
[79]	train-rmse:0.469468	test-rmse:0.473512
[80]	train-rmse:0.468076	test-rmse:0.472145
[81]	train-rmse:0.466854	test-rmse:0.470943
[82]	train-rmse:0.465713	test-rmse:0.469833
[83]	train-rmse:0.464655	test-rmse:0.468799
[84]	train-rmse:0.463711	test-rmse:0.46787
[85]	train-rmse:0.462784	test-rmse:0.466965
[86]	train-rmse:0.46198	test-rmse:0.466184
[87]	train-rmse:0.461197	test-rmse:0.46

[47]	train-rmse:0.698319	test-rmse:0.699054
[48]	train-rmse:0.679102	test-rmse:0.679798
[49]	train-rmse:0.661405	test-rmse:0.662073
[50]	train-rmse:0.645033	test-rmse:0.645661
[51]	train-rmse:0.629581	test-rmse:0.630177
[52]	train-rmse:0.615363	test-rmse:0.615924
[53]	train-rmse:0.602364	test-rmse:0.602895
[54]	train-rmse:0.590411	test-rmse:0.59091
[55]	train-rmse:0.579186	test-rmse:0.579649
[56]	train-rmse:0.568921	test-rmse:0.569359
[57]	train-rmse:0.559304	test-rmse:0.559713
[58]	train-rmse:0.550656	test-rmse:0.551039
[59]	train-rmse:0.542609	test-rmse:0.542953
[60]	train-rmse:0.535207	test-rmse:0.535518
[61]	train-rmse:0.52845	test-rmse:0.528734
[62]	train-rmse:0.522269	test-rmse:0.522527
[63]	train-rmse:0.516455	test-rmse:0.516682
[64]	train-rmse:0.511174	test-rmse:0.511379
[65]	train-rmse:0.506434	test-rmse:0.506618
[66]	train-rmse:0.502079	test-rmse:0.502245
[67]	train-rmse:0.497991	test-rmse:0.49813
[68]	train-rmse:0.494286	test-rmse:0.494405
[69]	train-rmse:0.490939	test-rmse:

[28]	train-rmse:1.4444	test-rmse:1.44259
[29]	train-rmse:1.38004	test-rmse:1.37815
[30]	train-rmse:1.31925	test-rmse:1.31728
[31]	train-rmse:1.26271	test-rmse:1.26073
[32]	train-rmse:1.20874	test-rmse:1.2067
[33]	train-rmse:1.1581	test-rmse:1.15602
[34]	train-rmse:1.11014	test-rmse:1.10803
[35]	train-rmse:1.06519	test-rmse:1.06306
[36]	train-rmse:1.02239	test-rmse:1.02019
[37]	train-rmse:0.982382	test-rmse:0.980134
[38]	train-rmse:0.945221	test-rmse:0.942966
[39]	train-rmse:0.909855	test-rmse:0.907556
[40]	train-rmse:0.876983	test-rmse:0.874664
[41]	train-rmse:0.845915	test-rmse:0.84356
[42]	train-rmse:0.81685	test-rmse:0.814454
[43]	train-rmse:0.789783	test-rmse:0.787368
[44]	train-rmse:0.764605	test-rmse:0.76218
[45]	train-rmse:0.741048	test-rmse:0.738605
[46]	train-rmse:0.719133	test-rmse:0.71668
[47]	train-rmse:0.69836	test-rmse:0.695873
[48]	train-rmse:0.679151	test-rmse:0.676642
[49]	train-rmse:0.661473	test-rmse:0.658958
[50]	train-rmse:0.645098	test-rmse:0.642584
[51]	train-rms

[9]	train-rmse:3.62916	test-rmse:3.6295
[10]	train-rmse:3.45149	test-rmse:3.45183
[11]	train-rmse:3.28275	test-rmse:3.28311
[12]	train-rmse:3.12263	test-rmse:3.12298
[13]	train-rmse:2.97081	test-rmse:2.97117
[14]	train-rmse:2.82706	test-rmse:2.8274
[15]	train-rmse:2.69061	test-rmse:2.69093
[16]	train-rmse:2.56075	test-rmse:2.56106
[17]	train-rmse:2.43762	test-rmse:2.43792
[18]	train-rmse:2.3212	test-rmse:2.3215
[19]	train-rmse:2.21018	test-rmse:2.21047
[20]	train-rmse:2.10534	test-rmse:2.10561
[21]	train-rmse:2.00557	test-rmse:2.00585
[22]	train-rmse:1.9114	test-rmse:1.91166
[23]	train-rmse:1.82241	test-rmse:1.82265
[24]	train-rmse:1.73819	test-rmse:1.73841
[25]	train-rmse:1.65861	test-rmse:1.6588
[26]	train-rmse:1.58321	test-rmse:1.58338
[27]	train-rmse:1.51189	test-rmse:1.51204
[28]	train-rmse:1.44443	test-rmse:1.44456
[29]	train-rmse:1.38011	test-rmse:1.38023
[30]	train-rmse:1.31931	test-rmse:1.31941
[31]	train-rmse:1.26276	test-rmse:1.26283
[32]	train-rmse:1.20877	test-rmse:1.20882

In [50]:
prediction_for_test

array([ 541914.06,  412498.53,  182226.66, ..., 1410846.5 , 2439314.8 ,
       1018208.44], dtype=float32)

In [51]:
test_pred_df = pd.DataFrame({'id':test_id})
test_pred_df['trip_duration'] = prediction_for_test
test_pred_df.to_csv("output.csv", index=False)

