In [None]:
# Aim to get a baseline accuracy using lightgbm on the modified dataset

In [1]:
# Importing necessary modules

import json
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Loading the data

#train_df = pd.read_csv("train_after_further_explorations.gz", compression = 'gzip')
train_df = pd.read_csv("dmerge.csv")
#del train_df['Unnamed: 0']

In [None]:
# Removing redundant columns

'''del train_df['trip_duration']

del train_df['pickup_datetime'], train_df['dropoff_datetime']'''

In [3]:
train_df.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'Month', 'Week', 'Weekday', 'Hour', 'tempm', 'tempi',
       'dewptm', 'dewpti', 'hum', 'wspdm', 'wspdi', 'wgustm', 'wgusti',
       'wdird', 'wdire', 'vism', 'visi', 'pressurem', 'pressurei',
       'windchillm', 'windchilli', 'heatindexm', 'heatindexi', 'precipm',
       'precipi', 'conds', 'icon', 'fog', 'rain', 'snow', 'hail', 'thunder',
       'tornado'],
      dtype='object')

In [4]:
# Creating a random mean basleline model and checking its accuracy

def baseline_random(train_df):

    # Seperating label and inputs

    #y = np.log(train_df['trip_in_minutes'] + 1)
    y = train_df['trip_in_minutes']

    X = train_df[['vendor_id', 'passenger_count', 'pickup_longitude',
           'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
           'store_and_fwd_flag','pickup_weekend_or_not']]

    # Splitting training data into train and test data

    X_train, X_test, y_train, y_test = train_test_split(
                                        X, y, test_size=0.33, random_state=42)

    answer = np.mean(y_train)

    y_pred = []

    for i in range(len(y_test)):
        y_pred.append(answer)

    print('The mse of prediction is:', mean_squared_error(y_test, y_pred))

In [7]:
baseline_random(train_df)

The mse of prediction is: 2904.55424542


In [16]:
# Lightgbm

# Seperating label and inputs

#y = np.log(train_df['trip_in_minutes'] + 1)
#y = train_df['trip_in_minutes']
y = np.log(train_df['trip_duration'] + 1)

X = train_df[['vendor_id',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude',
       'Month', 'Week', 'Hour', 'tempm', 'tempi', 'precipm',
       'precipi', 'fog', 'rain']]

# Splitting training data into train and test data

X_train, X_test, y_train, y_test = train_test_split(
                                    X, y, test_size=0.33, random_state=42)

# create dataset for lightgbm

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'num_leaves': 5,
    'learning_rate': 0.6,
    'feature_fraction': 0.9,
    'bagging_fraction': 1,
    'bagging_freq': 2,
    'verbose': 0
}

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval,
                early_stopping_rounds=50)

print('Save model...')
# save model to file
gbm.save_model('model.txt')

print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The mse of prediction is:', mean_squared_error(y_test, y_pred))

print('Dump model to JSON...')
# dump model to json (and save to file)
model_json = gbm.dump_model()

with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)


print('Feature names:', gbm.feature_name())

print('Calculate feature importances...')
# feature importances
print('Feature importances:', list(gbm.feature_importance()))



Start training...
[1]	valid_0's l2: 0.572946
Train until valid scores didn't improve in 50 rounds.
[2]	valid_0's l2: 0.550504
[3]	valid_0's l2: 0.523053
[4]	valid_0's l2: 0.489071
[5]	valid_0's l2: 0.483012
[6]	valid_0's l2: 0.478852
[7]	valid_0's l2: 0.476108
[8]	valid_0's l2: 0.472142
[9]	valid_0's l2: 0.461108
[10]	valid_0's l2: 0.46007
[11]	valid_0's l2: 0.454496
[12]	valid_0's l2: 0.451973
[13]	valid_0's l2: 0.45066
[14]	valid_0's l2: 0.445541
[15]	valid_0's l2: 0.444288
[16]	valid_0's l2: 0.443481
[17]	valid_0's l2: 0.431203
[18]	valid_0's l2: 0.42932
[19]	valid_0's l2: 0.427715
[20]	valid_0's l2: 0.424206
[21]	valid_0's l2: 0.409806
[22]	valid_0's l2: 0.408798
[23]	valid_0's l2: 0.408051
[24]	valid_0's l2: 0.407358
[25]	valid_0's l2: 0.406953
[26]	valid_0's l2: 0.404384
[27]	valid_0's l2: 0.401679
[28]	valid_0's l2: 0.40095
[29]	valid_0's l2: 0.400655
[30]	valid_0's l2: 0.399704
[31]	valid_0's l2: 0.399488
[32]	valid_0's l2: 0.395481
[33]	valid_0's l2: 0.395072
[34]	valid_0's l2

[288]	valid_0's l2: 0.338961
[289]	valid_0's l2: 0.338949
[290]	valid_0's l2: 0.33894
[291]	valid_0's l2: 0.338941
[292]	valid_0's l2: 0.338922
[293]	valid_0's l2: 0.338905
[294]	valid_0's l2: 0.338869
[295]	valid_0's l2: 0.338864
[296]	valid_0's l2: 0.338854
[297]	valid_0's l2: 0.338843
[298]	valid_0's l2: 0.338823
[299]	valid_0's l2: 0.338797
[300]	valid_0's l2: 0.338803
[301]	valid_0's l2: 0.338784
[302]	valid_0's l2: 0.338751
[303]	valid_0's l2: 0.338697
[304]	valid_0's l2: 0.338679
[305]	valid_0's l2: 0.338642
[306]	valid_0's l2: 0.338615
[307]	valid_0's l2: 0.338573
[308]	valid_0's l2: 0.338558
[309]	valid_0's l2: 0.338547
[310]	valid_0's l2: 0.338535
[311]	valid_0's l2: 0.338522
[312]	valid_0's l2: 0.338528
[313]	valid_0's l2: 0.338489
[314]	valid_0's l2: 0.338489
[315]	valid_0's l2: 0.338468
[316]	valid_0's l2: 0.33844
[317]	valid_0's l2: 0.33841
[318]	valid_0's l2: 0.338325
[319]	valid_0's l2: 0.338315
[320]	valid_0's l2: 0.338278
[321]	valid_0's l2: 0.338201
[322]	valid_0's l

[577]	valid_0's l2: 0.334083
[578]	valid_0's l2: 0.334072
[579]	valid_0's l2: 0.334066
[580]	valid_0's l2: 0.33406
[581]	valid_0's l2: 0.334054
[582]	valid_0's l2: 0.334044
[583]	valid_0's l2: 0.334043
[584]	valid_0's l2: 0.334033
[585]	valid_0's l2: 0.334016
[586]	valid_0's l2: 0.334006
[587]	valid_0's l2: 0.33399
[588]	valid_0's l2: 0.333986
[589]	valid_0's l2: 0.333991
[590]	valid_0's l2: 0.333976
[591]	valid_0's l2: 0.333963
[592]	valid_0's l2: 0.333964
[593]	valid_0's l2: 0.333948
[594]	valid_0's l2: 0.333947
[595]	valid_0's l2: 0.333944
[596]	valid_0's l2: 0.333937
[597]	valid_0's l2: 0.333934
[598]	valid_0's l2: 0.333932
[599]	valid_0's l2: 0.33392
[600]	valid_0's l2: 0.333909
[601]	valid_0's l2: 0.333895
[602]	valid_0's l2: 0.333888
[603]	valid_0's l2: 0.33389
[604]	valid_0's l2: 0.333876
[605]	valid_0's l2: 0.33387
[606]	valid_0's l2: 0.333868
[607]	valid_0's l2: 0.333859
[608]	valid_0's l2: 0.33385
[609]	valid_0's l2: 0.333846
[610]	valid_0's l2: 0.333846
[611]	valid_0's l2: 

[865]	valid_0's l2: 0.332331
[866]	valid_0's l2: 0.332344
[867]	valid_0's l2: 0.33234
[868]	valid_0's l2: 0.332337
[869]	valid_0's l2: 0.332339
[870]	valid_0's l2: 0.332339
[871]	valid_0's l2: 0.332333
[872]	valid_0's l2: 0.332327
[873]	valid_0's l2: 0.332317
[874]	valid_0's l2: 0.33232
[875]	valid_0's l2: 0.332321
[876]	valid_0's l2: 0.332311
[877]	valid_0's l2: 0.332309
[878]	valid_0's l2: 0.332284
[879]	valid_0's l2: 0.332278
[880]	valid_0's l2: 0.332278
[881]	valid_0's l2: 0.332276
[882]	valid_0's l2: 0.332275
[883]	valid_0's l2: 0.332277
[884]	valid_0's l2: 0.33228
[885]	valid_0's l2: 0.332282
[886]	valid_0's l2: 0.332276
[887]	valid_0's l2: 0.332277
[888]	valid_0's l2: 0.332267
[889]	valid_0's l2: 0.332262
[890]	valid_0's l2: 0.332259
[891]	valid_0's l2: 0.332255
[892]	valid_0's l2: 0.33225
[893]	valid_0's l2: 0.33224
[894]	valid_0's l2: 0.332233
[895]	valid_0's l2: 0.33223
[896]	valid_0's l2: 0.33223
[897]	valid_0's l2: 0.332216
[898]	valid_0's l2: 0.332212
[899]	valid_0's l2: 0

In [6]:
params = {'boosting_type': 'gbdt',
          'max_depth' : 4,
          'objective': 'regression', 
          'nthread': 5, 
          'silent': True,
          'num_leaves': 64, 
          'learning_rate': 0.05, 
          'max_bin': 512, 
          'subsample_for_bin': 200,
          'subsample': 1, 
          'subsample_freq': 1, 
          'colsample_bytree': 0.8, 
          'reg_alpha': 5, 
          'reg_lambda': 10,
          'min_split_gain': 0.5, 
          'min_child_weight': 1, 
          'min_child_samples': 5, 
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'l2'}

In [7]:
# Grid Search in Light GBM

# Create parameters to search
gridParams = {
    'learning_rate': [0.01],
    'n_estimators': [8,24,48],
    'num_leaves': [6,12,16,22],
    'boosting_type' : ['gbdt'],
    'objective' : ['regression'],
    'seed' : [500],
    'colsample_bytree' : [0.65, 0.75, 0.8],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,2,6],
    'reg_lambda' : [1,2,6],
    }

 #Create classifier to use. Note that parameters have to be input manually
# not as a dict!
mdl = lgb.LGBMRegressor(boosting_type= 'gbdt', 
          objective = 'regression', 
          nthread = 5, 
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'], 
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'], 
          subsample_freq = params['subsample_freq'], 
          min_split_gain = params['min_split_gain'], 
          min_child_weight = params['min_child_weight'], 
          min_child_samples = params['min_child_samples'], 
          scale_pos_weight = params['scale_pos_weight'])

# To view the default model params:
mdl.get_params().keys()

# Create the grid
grid = GridSearchCV(mdl, gridParams, verbose=1, cv=4, n_jobs=-1)
# Run the grid
grid.fit(X_train, y_train)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

NameError: name 'GridSearchCV' is not defined

In [None]:
'''dmerge.csv dataset gave very poor results without any further pre preocessing.
The mse of prediction is: 20458773.863
This shows bigger isn't always better especially when lots of NaNs? And also shows that the
features engineered in earlier data explorations on the original dataset were of value. '''

In [17]:
X_test_final = pd.read_csv("tmerge.csv")

In [19]:
X_test_final.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'store_and_fwd_flag', 'Month', 'Week', 'Weekday',
       'Hour', 'tempm', 'tempi', 'dewptm', 'dewpti', 'hum', 'wspdm', 'wspdi',
       'wgustm', 'wgusti', 'wdird', 'wdire', 'vism', 'visi', 'pressurem',
       'pressurei', 'windchillm', 'windchilli', 'heatindexm', 'heatindexi',
       'precipm', 'precipi', 'conds', 'icon', 'fog', 'rain', 'snow', 'hail',
       'thunder', 'tornado'],
      dtype='object')

In [20]:
X_test_final_columns = ['vendor_id',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude',
       'Month', 'Week', 'Hour', 'tempm', 'tempi', 'precipm',
       'precipi', 'fog', 'rain']

In [21]:
X_test_submit = X_test_final[X_test_final_columns]

In [22]:
y_pred = gbm.predict(X_test_submit, num_iteration=gbm.best_iteration)

y_pred = np.exp(y_pred) - 1

x_data = X_test_final['id'].as_matrix()

df = pd.DataFrame({"id" : x_data, "trip_duration" : y_pred})
df.to_csv("submission.csv", index=False)