In [1]:
# Importing necessary modules

import json
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [2]:
# Loading the data

# Read the gzip-compressed training CSV.
train_df = pd.read_csv("train_after_further_explorations.gz", compression='gzip')

# Discard the 'Unnamed: 0' column (presumably a row index written out by an
# earlier to_csv call -- confirm); it is not a feature.
train_df.drop('Unnamed: 0', axis=1, inplace=True)

In [8]:
# Display the column labels of the loaded training frame.
train_df.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'pickup_date', 'pickup_day', 'dropoff_date',
       'dropoff_day', 'trip_in_minutes', 'pickup_weekend_or_not'],
      dtype='object')

In [9]:
# Preview the first five rows of the training frame.
train_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_date,pickup_day,dropoff_date,dropoff_day,trip_in_minutes,pickup_weekend_or_not
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,455,2016-03-14,Monday,2016-03-14,Monday,7,0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,663,2016-06-12,Sunday,2016-06-12,Sunday,11,1
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,2124,2016-01-19,Tuesday,2016-01-19,Tuesday,35,0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,429,2016-04-06,Wednesday,2016-04-06,Wednesday,7,0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,435,2016-03-26,Saturday,2016-03-26,Saturday,7,1


In [44]:
# LightGBM baseline: fit a gradient-boosted regressor on the log-transformed trip
# duration, evaluate it on a hold-out split, and persist the model.

# Separating label and inputs

# Target is log(1 + trip_duration). np.log1p is the numerically stable spelling of
# np.log(x + 1); its inverse is np.expm1 (equivalently np.exp(p) - 1).
# ('trip_in_minutes' was tried as an alternative target and dropped.)
y = np.log1p(train_df['trip_duration'])

X = train_df[['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude',
              'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
              'pickup_weekend_or_not']]

# Splitting training data into train and hold-out data (33% held out, fixed seed)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# create dataset for lightgbm; the eval set is tied to the training set via `reference`

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
# NOTE(review): bagging_fraction=1 presumably makes bagging_freq a no-op -- confirm.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'num_leaves': 5,
    'learning_rate': 0.6,
    'feature_fraction': 0.9,
    'bagging_fraction': 1,
    'bagging_freq': 2,
    'verbose': 1
}

print('Start training...')
# train: at most 1000 boosting rounds, stopping once the validation l2 has not
# improved for 50 consecutive rounds. Early stopping is requested through the
# callback API because the `early_stopping_rounds=` keyword of lgb.train was
# removed in LightGBM 4.0; `valid_sets` is passed as a list, as documented.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(50)])

print('Save model...')
# save model to file
gbm.save_model('model.txt')

print('Start predicting...')
# predict with the best iteration found by early stopping
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval: this MSE is on the log1p-transformed target, not on raw durations.
# NOTE(review): the same hold-out split also drives early stopping above, so this
# figure is likely optimistic; a separate validation split would give a cleaner estimate.
print('The mse of prediction is:', mean_squared_error(y_test, y_pred))

print('Dump model to JSON...')
# dump model to json (and save to file)
model_json = gbm.dump_model()

# plain write mode is sufficient: the file is only written, never read back here
with open('model.json', 'w') as f:
    json.dump(model_json, f, indent=4)


print('Feature names:', gbm.feature_name())

print('Calculate feature importances...')
# feature importances
print('Feature importances:', list(gbm.feature_importance()))



Start training...
[1]	valid_0's l2: 0.571204
Train until valid scores didn't improve in 50 rounds.
[2]	valid_0's l2: 0.556648
[3]	valid_0's l2: 0.545577
[4]	valid_0's l2: 0.510276
[5]	valid_0's l2: 0.487493
[6]	valid_0's l2: 0.482992
[7]	valid_0's l2: 0.459522
[8]	valid_0's l2: 0.45837
[9]	valid_0's l2: 0.450618
[10]	valid_0's l2: 0.432709
[11]	valid_0's l2: 0.42912
[12]	valid_0's l2: 0.428033
[13]	valid_0's l2: 0.425418
[14]	valid_0's l2: 0.419572
[15]	valid_0's l2: 0.417837
[16]	valid_0's l2: 0.412018
[17]	valid_0's l2: 0.409525
[18]	valid_0's l2: 0.407173
[19]	valid_0's l2: 0.405905
[20]	valid_0's l2: 0.405262
[21]	valid_0's l2: 0.404037
[22]	valid_0's l2: 0.403831
[23]	valid_0's l2: 0.403503
[24]	valid_0's l2: 0.403212
[25]	valid_0's l2: 0.401792
[26]	valid_0's l2: 0.399973
[27]	valid_0's l2: 0.39913
[28]	valid_0's l2: 0.396976
[29]	valid_0's l2: 0.396059
[30]	valid_0's l2: 0.395463
[31]	valid_0's l2: 0.393522
[32]	valid_0's l2: 0.393001
[33]	valid_0's l2: 0.39201
[34]	valid_0's l2

[287]	valid_0's l2: 0.355663
[288]	valid_0's l2: 0.355663
[289]	valid_0's l2: 0.355658
[290]	valid_0's l2: 0.35565
[291]	valid_0's l2: 0.355641
[292]	valid_0's l2: 0.355607
[293]	valid_0's l2: 0.355587
[294]	valid_0's l2: 0.355558
[295]	valid_0's l2: 0.355531
[296]	valid_0's l2: 0.355504
[297]	valid_0's l2: 0.355498
[298]	valid_0's l2: 0.35504
[299]	valid_0's l2: 0.355036
[300]	valid_0's l2: 0.35503
[301]	valid_0's l2: 0.355011
[302]	valid_0's l2: 0.355006
[303]	valid_0's l2: 0.354968
[304]	valid_0's l2: 0.354953
[305]	valid_0's l2: 0.354932
[306]	valid_0's l2: 0.354915
[307]	valid_0's l2: 0.354904
[308]	valid_0's l2: 0.354899
[309]	valid_0's l2: 0.354863
[310]	valid_0's l2: 0.354842
[311]	valid_0's l2: 0.354826
[312]	valid_0's l2: 0.354784
[313]	valid_0's l2: 0.354782
[314]	valid_0's l2: 0.354781
[315]	valid_0's l2: 0.354771
[316]	valid_0's l2: 0.354756
[317]	valid_0's l2: 0.354756
[318]	valid_0's l2: 0.354704
[319]	valid_0's l2: 0.35459
[320]	valid_0's l2: 0.35455
[321]	valid_0's l2:

[574]	valid_0's l2: 0.35242
[575]	valid_0's l2: 0.352421
[576]	valid_0's l2: 0.352412
[577]	valid_0's l2: 0.352411
[578]	valid_0's l2: 0.352412
[579]	valid_0's l2: 0.352415
[580]	valid_0's l2: 0.352413
[581]	valid_0's l2: 0.352418
[582]	valid_0's l2: 0.352417
[583]	valid_0's l2: 0.352422
[584]	valid_0's l2: 0.352404
[585]	valid_0's l2: 0.352404
[586]	valid_0's l2: 0.352397
[587]	valid_0's l2: 0.352395
[588]	valid_0's l2: 0.352386
[589]	valid_0's l2: 0.352385
[590]	valid_0's l2: 0.352379
[591]	valid_0's l2: 0.352365
[592]	valid_0's l2: 0.352345
[593]	valid_0's l2: 0.352348
[594]	valid_0's l2: 0.352344
[595]	valid_0's l2: 0.352321
[596]	valid_0's l2: 0.352317
[597]	valid_0's l2: 0.35231
[598]	valid_0's l2: 0.352312
[599]	valid_0's l2: 0.352311
[600]	valid_0's l2: 0.35231
[601]	valid_0's l2: 0.352303
[602]	valid_0's l2: 0.352299
[603]	valid_0's l2: 0.352299
[604]	valid_0's l2: 0.352297
[605]	valid_0's l2: 0.352297
[606]	valid_0's l2: 0.352293
[607]	valid_0's l2: 0.352283
[608]	valid_0's l

[858]	valid_0's l2: 0.351356
[859]	valid_0's l2: 0.351358
[860]	valid_0's l2: 0.351365
[861]	valid_0's l2: 0.351366
[862]	valid_0's l2: 0.351364
[863]	valid_0's l2: 0.351366
[864]	valid_0's l2: 0.351365
[865]	valid_0's l2: 0.351367
[866]	valid_0's l2: 0.351363
[867]	valid_0's l2: 0.351363
[868]	valid_0's l2: 0.351365
[869]	valid_0's l2: 0.351365
[870]	valid_0's l2: 0.351361
[871]	valid_0's l2: 0.35136
[872]	valid_0's l2: 0.351362
[873]	valid_0's l2: 0.351349
[874]	valid_0's l2: 0.351354
[875]	valid_0's l2: 0.351354
[876]	valid_0's l2: 0.351354
[877]	valid_0's l2: 0.351349
[878]	valid_0's l2: 0.351352
[879]	valid_0's l2: 0.351348
[880]	valid_0's l2: 0.351345
[881]	valid_0's l2: 0.351342
[882]	valid_0's l2: 0.351344
[883]	valid_0's l2: 0.351326
[884]	valid_0's l2: 0.351324
[885]	valid_0's l2: 0.351327
[886]	valid_0's l2: 0.351325
[887]	valid_0's l2: 0.35133
[888]	valid_0's l2: 0.351327
[889]	valid_0's l2: 0.351327
[890]	valid_0's l2: 0.351325
[891]	valid_0's l2: 0.351297
[892]	valid_0's 

In [32]:
# Setting up the test dataset: read the CSV directly out of the zip archive.

test_df = pd.read_csv('test.zip',compression='zip')
# Display the column labels of the freshly loaded frame.
test_df.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'store_and_fwd_flag'],
      dtype='object')

In [33]:
def replace_column_values(dataframe, column_name, original, replacement):
    """Replace one value with another inside a single column, in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; it is mutated directly.
    column_name : str
        Column in which the substitution is applied.
    original :
        Value to look for.
    replacement :
        Value written wherever ``original`` is found.

    Returns
    -------
    None
    """
    # Nested mapping restricts the substitution to `column_name` only.
    per_column_mapping = {column_name: {original: replacement}}
    dataframe.replace(to_replace=per_column_mapping, inplace=True)

In [34]:
# Encode store_and_fwd_flag numerically, in place on test_df: 'Y' -> 1, then 'N' -> 0.

for flag_letter, flag_code in (('Y', 1), ('N', 0)):
    replace_column_values(test_df, 'store_and_fwd_flag', flag_letter, flag_code)

In [35]:
def get_day_date(col_name, dataframe=None):
    """Derive ``<prefix>_date`` and ``<prefix>_day`` columns from a datetime-string column.

    For ``col_name='pickup_datetime'`` this adds ``pickup_date`` (datetime64, time
    part dropped) and ``pickup_day`` (English weekday name, e.g. ``'Monday'``).

    Parameters
    ----------
    col_name : str
        Column holding strings that start with ``YYYY-MM-DD``, such as
        ``'pickup_datetime'`` or ``'dropoff_datetime'``. Everything before the
        last underscore is used as the prefix of the new columns.
    dataframe : pandas.DataFrame, optional
        Frame to read from and add the columns to (modified in place). When
        omitted, the notebook-level ``test_df`` is used.

    Returns
    -------
    None
    """
    if dataframe is None:
        # Default target: the notebook-level test frame.
        dataframe = test_df

    # 'pickup_datetime' -> 'pickup'; rsplit keeps names with extra underscores working.
    src_dest = col_name.rsplit('_', 1)[0]
    date_col = src_dest + '_date'
    day_col = src_dest + '_day'

    # Keep only the leading 'YYYY-MM-DD' of each string, then parse in one vectorised call.
    dataframe[date_col] = pd.to_datetime(dataframe[col_name].str[:10])

    # dt.weekday_name was removed in pandas 1.0; map the weekday number (Monday == 0)
    # to its name explicitly so the result does not depend on pandas version or locale.
    day_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday')
    dataframe[day_col] = dataframe[date_col].dt.dayofweek.map(dict(enumerate(day_names)))
    

In [37]:
# Add the pickup_date / pickup_day columns to test_df.
get_day_date('pickup_datetime')
# Left disabled: the test_df column listing shown earlier has no 'dropoff_datetime'.
#get_day_date('dropoff_datetime')

In [38]:
# Flag weekend pickups: 1 for Saturday/Sunday, 0 for every other day.
# One vectorised membership test replaces seven separate `replace` calls and always
# yields an integer column, independent of how pandas infers the dtype after
# swapping strings for numbers.
# NOTE: rows whose pickup_day is missing or not a weekday name become 0 here
# (with the replace-based version they were left untouched).
test_df['pickup_weekend_or_not'] = (
    test_df['pickup_day'].isin(['Saturday', 'Sunday']).astype(int)
)


In [None]:
# Grid Search


In [7]:
# Label: natural log of (trip_duration + 1).
y = np.log(train_df['trip_duration'] + 1)

# Inputs: the numeric columns used as model features.
feature_columns = [
    'vendor_id', 'passenger_count',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'store_and_fwd_flag', 'pickup_weekend_or_not',
]
X = train_df[feature_columns]

# Hold out 33% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [8]:
# Base LightGBM settings; some of them are reused when the estimator for the
# grid search is built.
# Original author's note: scores ~0.784 (without tuning and early stopping)
params = dict(
    boosting_type='gbdt',
    max_depth=4,
    objective='regression',
    nthread=2,
    silent=True,
    num_leaves=64,
    learning_rate=0.05,
    max_bin=512,
    subsample_for_bin=200,
    subsample=1,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=5,
    reg_lambda=10,
    min_split_gain=0.5,
    min_child_weight=1,
    min_child_samples=5,
    scale_pos_weight=1,
    num_class=1,
    metric='l2',
)

In [9]:
# Candidate values for the exhaustive search; every combination is tried
# (1 * 3 * 4 * 1 * 1 * 1 * 3 * 2 * 3 * 3 = 648 combinations).
gridParams = dict(
    learning_rate=[0.01],
    n_estimators=[8, 24, 48],
    num_leaves=[6, 12, 16, 22],
    boosting_type=['gbdt'],
    objective=['regression'],
    seed=[500],
    colsample_bytree=[0.65, 0.75, 0.8],
    subsample=[0.7, 0.75],
    reg_alpha=[1, 2, 6],
    reg_lambda=[1, 2, 6],
)

In [10]:
# Base estimator handed to the grid search: an LGBMRegressor whose shared
# settings are copied from `params`; the remaining arguments are given literally.
_fixed_keys = ['max_depth', 'max_bin', 'subsample_for_bin', 'subsample',
               'subsample_freq', 'min_split_gain', 'min_child_weight',
               'min_child_samples', 'scale_pos_weight']
mdl = lgb.LGBMRegressor(
    boosting_type='gbdt',
    objective='regression',
    nthread=2,
    silent=True,
    **{key: params[key] for key in _fixed_keys}
)

In [None]:
# Look up the estimator's parameter names (the value is discarded because this
# is not the last expression of the cell).
mdl.get_params().keys()

# Exhaustive 4-fold cross-validated search over gridParams on all available cores.
grid = GridSearchCV(estimator=mdl, param_grid=gridParams, cv=4, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

# Report the best parameter combination and its cross-validated score.
print(grid.best_params_)
print(grid.best_score_)

Fitting 4 folds for each of 648 candidates, totalling 2592 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   49.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.6min


In [39]:
# Number of rows in the test frame (one prediction is needed per row).
len(test_df)

625134

In [40]:
# Check the test frame's columns after the feature-engineering cells.
test_df.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'store_and_fwd_flag', 'pickup_date', 'pickup_day',
       'pickup_weekend_or_not'],
      dtype='object')

In [41]:
# Feature columns fed to the model at prediction time, in order.
X_test_final_columns = [
    'vendor_id',
    'passenger_count',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'store_and_fwd_flag',
    'pickup_weekend_or_not',
]

In [42]:
# Select only the model's feature columns from the test frame.
X_test_submit = test_df[X_test_final_columns]

In [43]:
# Predict on the submission features using the booster's best iteration.
y_pred = gbm.predict(X_test_submit, num_iteration=gbm.best_iteration)

# The model was trained on log(trip_duration + 1); np.expm1(p) == np.exp(p) - 1
# maps predictions back to the original scale and stays accurate for p near 0.
y_pred = np.expm1(y_pred)

# `.values` returns the underlying NumPy array; it replaces `.as_matrix()`,
# which was removed in pandas 1.0.
x_data = test_df['id'].values

# Write the two-column submission file without the index.
df = pd.DataFrame({"id" : x_data, "trip_duration" : y_pred})
df.to_csv("submission.csv", index=False)