In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
data_path = r"C:\Users\USER-PC\Desktop\Akash Cloudy ML\3. ML Projects\4. Seoul Bike Trip Duraion Prediction\cleaned_seoul_bike_data.csv"

In [3]:
df = pd.read_csv(
        data_path,
        dtype={
            'Duration':'int8',
            'Distance':'int8',
            'PLong':'float32',
            'PLatd':'float32',
            'DLong':'float32',
            'DLatd':'float32',
            'Haversine':'float32',
            'Pmonth':'int8',
            'Pday':'int8',
            'Phour':'int8',
            'Pmin':'int8',
            'PDweek':'int8',
            'Dmonth':'int8',
            'Dday':'int8',
            'Dhour':'int8',
            'Dmin':'int8',
            'DDweek':'int8',
            'Temp':'float32',
            'Precip':'float32',
            'Wind':'float32',
            'Humid':'float32',
            'Solar':'float32',
            'Snow':'float32',
            'GroundTemp':'float32',
            'Dust':'float32'
        },
)
df.head()

Unnamed: 0,Duration,Distance,PLong,PLatd,DLong,DLatd,Haversine,Pmonth,Pday,Phour,...,Dmin,DDweek,Temp,Precip,Wind,Humid,Solar,Snow,GroundTemp,Dust
0,21,118,37.571068,126.998192,37.565331,127.007843,1.063239,12,21,18,...,29,4,8.7,0.0,1.2,35.0,0.0,0.0,3.6,119.0
1,14,68,37.545166,127.05751,37.55603,127.078644,2.220476,9,12,14,...,34,2,26.9,0.0,1.6,45.0,2.27,0.0,39.5,18.0
2,3,68,37.47818,126.897408,37.476952,126.891869,0.507496,11,6,7,...,23,1,8.3,0.0,0.9,84.0,0.0,0.0,6.1,72.0
3,18,42,37.510658,126.842537,37.530338,126.838257,2.220635,11,11,17,...,37,6,8.9,0.0,3.0,57.0,0.03,0.0,8.7,78.0
4,45,86,37.55125,127.035103,37.582592,127.028976,3.526667,7,3,23,...,35,2,26.4,0.0,0.3,78.0,0.0,0.0,23.9,13.0


In [4]:
df.shape

(8583860, 25)

In [5]:
x = df.iloc[:, 1:]
y = df.iloc[:, 0]

In [6]:
print(x.head(5))

   Distance      PLong       PLatd      DLong       DLatd  Haversine  Pmonth  \
0       118  37.571068  126.998192  37.565331  127.007843   1.063239      12   
1        68  37.545166  127.057510  37.556030  127.078644   2.220476       9   
2        68  37.478180  126.897408  37.476952  126.891869   0.507496      11   
3        42  37.510658  126.842537  37.530338  126.838257   2.220635      11   
4        86  37.551250  127.035103  37.582592  127.028976   3.526667       7   

   Pday  Phour  Pmin  ...  Dmin  DDweek  Temp  Precip  Wind  Humid  Solar  \
0    21     18     7  ...    29       4   8.7     0.0   1.2   35.0   0.00   
1    12     14    18  ...    34       2  26.9     0.0   1.6   45.0   2.27   
2     6      7    20  ...    23       1   8.3     0.0   0.9   84.0   0.00   
3    11     17    18  ...    37       6   8.9     0.0   3.0   57.0   0.03   
4     3     23    49  ...    35       2  26.4     0.0   0.3   78.0   0.00   

   Snow  GroundTemp   Dust  
0   0.0         3.6  119.0 

In [7]:
print(y)

0          21
1          14
2           3
3          18
4          45
           ..
8583855     3
8583856     9
8583857    10
8583858    18
8583859    55
Name: Duration, Length: 8583860, dtype: int8


In [8]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.3,random_state=1)
print(xtrain.shape,ytrain.shape)
print(xtest.shape,ytest.shape)

(6008702, 24) (6008702,)
(2575158, 24) (2575158,)


## Scaling the data:

In [9]:
%%time
sc = StandardScaler()
scaled_xtrain = sc.fit_transform(xtrain)
scaled_xtest = sc.transform(xtest)

Wall time: 3.51 s


In [10]:
def model_evaluation(model, xtrain, ytrain, xtest, ytest):
    ytrain_pred = model.predict(xtrain)
    ytest_pred = model.predict(xtest)

    train_mae = mean_absolute_error(ytrain, ytrain_pred)
    train_mse = mean_squared_error(ytrain, ytrain_pred)
    train_rmse = np.sqrt(train_mse)
    train_r2 = r2_score(ytrain, ytrain_pred)

    test_mae = mean_absolute_error(ytest, ytest_pred)
    test_mse = mean_squared_error(ytest, ytest_pred)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(ytest, ytest_pred)

    print("Training Scores:")
    print(f"MAE: {train_mae}\nMSE: {train_mse}\nRMSE: {train_rmse}\nR2: {train_r2}\n")

    print("Testing Scores:")
    print(f"MAE: {test_mae}\nMSE: {test_mse}\nRMSE: {test_rmse}\nR2: {test_r2}\n")

## Linear Regression

In [11]:
%%time
lr_Model = LinearRegression()
lr_Model.fit(scaled_xtrain, ytrain)

# Evaluate the Linear Regression model
model_evaluation(lr_Model, scaled_xtrain, ytrain, scaled_xtest, ytest)

Training Scores:
MAE: 12.114371299743652
MSE: 349.1412353515625
RMSE: 18.685321807861328
R2: 0.332176807181751

Testing Scores:
MAE: 12.11842155456543
MSE: 349.44696044921875
RMSE: 18.693500518798828
R2: 0.33116409360916876

Wall time: 3.41 s


## XGBoost Regressor

In [12]:
%%time
xgbr_Model = XGBRegressor()
xgbr_Model.fit(scaled_xtrain,ytrain)

# Evaluate the XGBoost Regressor model
model_evaluation(xgbr_Model, scaled_xtrain, ytrain, scaled_xtest, ytest)

Training Scores:
MAE: 2.7964985370635986
MSE: 26.280874252319336
RMSE: 5.126487731933594
R2: 0.9497310154123892

Testing Scores:
MAE: 2.805436134338379
MSE: 26.642532348632812
RMSE: 5.161640644073486
R2: 0.9490066460240496

Wall time: 1min 56s


## CatBoost Regressor

In [14]:
%%time
cat_Model = CatBoostRegressor()
cat_Model.fit(scaled_xtrain,ytrain)

# Evaluate the CATBoost Regressor model
model_evaluation(cat_Model, scaled_xtrain, ytrain, scaled_xtest, ytest)

Learning rate set to 0.161893
0:	learn: 21.7038548	total: 1.01s	remaining: 16m 49s
1:	learn: 20.8137903	total: 1.85s	remaining: 15m 23s
2:	learn: 20.1123919	total: 2.7s	remaining: 14m 58s
3:	learn: 19.5481914	total: 3.57s	remaining: 14m 50s
4:	learn: 19.1435189	total: 4.37s	remaining: 14m 30s
5:	learn: 18.7996921	total: 5.21s	remaining: 14m 23s
6:	learn: 18.5612400	total: 6.01s	remaining: 14m 12s
7:	learn: 18.3775893	total: 6.77s	remaining: 13m 59s
8:	learn: 18.1422260	total: 7.64s	remaining: 14m 1s
9:	learn: 18.0015218	total: 8.49s	remaining: 14m
10:	learn: 17.8488848	total: 9.34s	remaining: 14m
11:	learn: 17.6844861	total: 10.1s	remaining: 13m 54s
12:	learn: 17.4416823	total: 10.9s	remaining: 13m 43s
13:	learn: 17.3341876	total: 11.6s	remaining: 13m 37s
14:	learn: 17.1947955	total: 12.4s	remaining: 13m 34s
15:	learn: 17.0489899	total: 13.2s	remaining: 13m 33s
16:	learn: 16.7564370	total: 14s	remaining: 13m 30s
17:	learn: 16.5283417	total: 14.7s	remaining: 13m 22s
18:	learn: 16.239874

152:	learn: 7.3198367	total: 2m 4s	remaining: 11m 27s
153:	learn: 7.3050914	total: 2m 4s	remaining: 11m 26s
154:	learn: 7.2966207	total: 2m 5s	remaining: 11m 25s
155:	learn: 7.2663052	total: 2m 6s	remaining: 11m 24s
156:	learn: 7.2327518	total: 2m 7s	remaining: 11m 23s
157:	learn: 7.2131565	total: 2m 8s	remaining: 11m 23s
158:	learn: 7.1923636	total: 2m 8s	remaining: 11m 21s
159:	learn: 7.1853042	total: 2m 9s	remaining: 11m 20s
160:	learn: 7.1677519	total: 2m 10s	remaining: 11m 18s
161:	learn: 7.1328094	total: 2m 11s	remaining: 11m 18s
162:	learn: 7.0966035	total: 2m 12s	remaining: 11m 18s
163:	learn: 7.0725790	total: 2m 12s	remaining: 11m 17s
164:	learn: 7.0505565	total: 2m 13s	remaining: 11m 15s
165:	learn: 7.0275163	total: 2m 14s	remaining: 11m 14s
166:	learn: 7.0235973	total: 2m 15s	remaining: 11m 14s
167:	learn: 6.9971088	total: 2m 16s	remaining: 11m 13s
168:	learn: 6.9842242	total: 2m 16s	remaining: 11m 12s
169:	learn: 6.9498680	total: 2m 17s	remaining: 11m 11s
170:	learn: 6.9366

304:	learn: 4.9726520	total: 4m 8s	remaining: 9m 26s
305:	learn: 4.9610224	total: 4m 9s	remaining: 9m 25s
306:	learn: 4.9559253	total: 4m 10s	remaining: 9m 24s
307:	learn: 4.9525915	total: 4m 10s	remaining: 9m 23s
308:	learn: 4.9419636	total: 4m 11s	remaining: 9m 22s
309:	learn: 4.9361514	total: 4m 12s	remaining: 9m 22s
310:	learn: 4.9306541	total: 4m 13s	remaining: 9m 21s
311:	learn: 4.9262612	total: 4m 14s	remaining: 9m 20s
312:	learn: 4.9211961	total: 4m 14s	remaining: 9m 19s
313:	learn: 4.9102992	total: 4m 15s	remaining: 9m 18s
314:	learn: 4.9005102	total: 4m 16s	remaining: 9m 17s
315:	learn: 4.8942288	total: 4m 17s	remaining: 9m 16s
316:	learn: 4.8870773	total: 4m 17s	remaining: 9m 15s
317:	learn: 4.8792921	total: 4m 18s	remaining: 9m 14s
318:	learn: 4.8702874	total: 4m 19s	remaining: 9m 13s
319:	learn: 4.8607037	total: 4m 20s	remaining: 9m 12s
320:	learn: 4.8478277	total: 4m 21s	remaining: 9m 12s
321:	learn: 4.8436841	total: 4m 21s	remaining: 9m 11s
322:	learn: 4.8374269	total: 4

457:	learn: 3.9188664	total: 6m 12s	remaining: 7m 20s
458:	learn: 3.9167804	total: 6m 12s	remaining: 7m 19s
459:	learn: 3.9079199	total: 6m 13s	remaining: 7m 18s
460:	learn: 3.8997842	total: 6m 14s	remaining: 7m 18s
461:	learn: 3.8950753	total: 6m 15s	remaining: 7m 17s
462:	learn: 3.8845119	total: 6m 16s	remaining: 7m 16s
463:	learn: 3.8797506	total: 6m 17s	remaining: 7m 15s
464:	learn: 3.8745532	total: 6m 17s	remaining: 7m 14s
465:	learn: 3.8696065	total: 6m 18s	remaining: 7m 13s
466:	learn: 3.8661449	total: 6m 19s	remaining: 7m 12s
467:	learn: 3.8624308	total: 6m 20s	remaining: 7m 12s
468:	learn: 3.8560943	total: 6m 21s	remaining: 7m 11s
469:	learn: 3.8458362	total: 6m 21s	remaining: 7m 10s
470:	learn: 3.8434941	total: 6m 22s	remaining: 7m 9s
471:	learn: 3.8428245	total: 6m 23s	remaining: 7m 9s
472:	learn: 3.8411005	total: 6m 24s	remaining: 7m 8s
473:	learn: 3.8379123	total: 6m 25s	remaining: 7m 7s
474:	learn: 3.8348054	total: 6m 25s	remaining: 7m 6s
475:	learn: 3.8311076	total: 6m 2

610:	learn: 3.3244784	total: 8m 17s	remaining: 5m 16s
611:	learn: 3.3226719	total: 8m 17s	remaining: 5m 15s
612:	learn: 3.3212355	total: 8m 18s	remaining: 5m 14s
613:	learn: 3.3161508	total: 8m 19s	remaining: 5m 14s
614:	learn: 3.3071967	total: 8m 20s	remaining: 5m 13s
615:	learn: 3.3042007	total: 8m 21s	remaining: 5m 12s
616:	learn: 3.2991374	total: 8m 21s	remaining: 5m 11s
617:	learn: 3.2971856	total: 8m 22s	remaining: 5m 10s
618:	learn: 3.2952228	total: 8m 23s	remaining: 5m 10s
619:	learn: 3.2903729	total: 8m 24s	remaining: 5m 9s
620:	learn: 3.2871110	total: 8m 25s	remaining: 5m 8s
621:	learn: 3.2858081	total: 8m 26s	remaining: 5m 7s
622:	learn: 3.2736030	total: 8m 27s	remaining: 5m 6s
623:	learn: 3.2712064	total: 8m 27s	remaining: 5m 6s
624:	learn: 3.2637136	total: 8m 28s	remaining: 5m 5s
625:	learn: 3.2604734	total: 8m 29s	remaining: 5m 4s
626:	learn: 3.2580224	total: 8m 30s	remaining: 5m 3s
627:	learn: 3.2513445	total: 8m 31s	remaining: 5m 2s
628:	learn: 3.2493369	total: 8m 31s	r

763:	learn: 2.9284063	total: 10m 19s	remaining: 3m 11s
764:	learn: 2.9248338	total: 10m 20s	remaining: 3m 10s
765:	learn: 2.9231217	total: 10m 21s	remaining: 3m 9s
766:	learn: 2.9222292	total: 10m 22s	remaining: 3m 9s
767:	learn: 2.9212537	total: 10m 23s	remaining: 3m 8s
768:	learn: 2.9190609	total: 10m 23s	remaining: 3m 7s
769:	learn: 2.9169234	total: 10m 24s	remaining: 3m 6s
770:	learn: 2.9125932	total: 10m 25s	remaining: 3m 5s
771:	learn: 2.9100932	total: 10m 26s	remaining: 3m 4s
772:	learn: 2.9090824	total: 10m 27s	remaining: 3m 4s
773:	learn: 2.9063579	total: 10m 27s	remaining: 3m 3s
774:	learn: 2.9029824	total: 10m 28s	remaining: 3m 2s
775:	learn: 2.9017882	total: 10m 29s	remaining: 3m 1s
776:	learn: 2.8998610	total: 10m 30s	remaining: 3m
777:	learn: 2.8980752	total: 10m 31s	remaining: 3m
778:	learn: 2.8960690	total: 10m 32s	remaining: 2m 59s
779:	learn: 2.8940319	total: 10m 32s	remaining: 2m 58s
780:	learn: 2.8916512	total: 10m 33s	remaining: 2m 57s
781:	learn: 2.8892389	total: 

914:	learn: 2.6808769	total: 12m 22s	remaining: 1m 8s
915:	learn: 2.6788981	total: 12m 23s	remaining: 1m 8s
916:	learn: 2.6773016	total: 12m 24s	remaining: 1m 7s
917:	learn: 2.6751935	total: 12m 24s	remaining: 1m 6s
918:	learn: 2.6741260	total: 12m 25s	remaining: 1m 5s
919:	learn: 2.6728780	total: 12m 26s	remaining: 1m 4s
920:	learn: 2.6712638	total: 12m 27s	remaining: 1m 4s
921:	learn: 2.6698252	total: 12m 28s	remaining: 1m 3s
922:	learn: 2.6688419	total: 12m 29s	remaining: 1m 2s
923:	learn: 2.6682340	total: 12m 30s	remaining: 1m 1s
924:	learn: 2.6676231	total: 12m 30s	remaining: 1m
925:	learn: 2.6670991	total: 12m 31s	remaining: 1m
926:	learn: 2.6645963	total: 12m 32s	remaining: 59.3s
927:	learn: 2.6637933	total: 12m 33s	remaining: 58.4s
928:	learn: 2.6630950	total: 12m 34s	remaining: 57.6s
929:	learn: 2.6612985	total: 12m 34s	remaining: 56.8s
930:	learn: 2.6595309	total: 12m 35s	remaining: 56s
931:	learn: 2.6590195	total: 12m 36s	remaining: 55.2s
932:	learn: 2.6569355	total: 12m 37s

## Light Gradient Boosting Resgressor

In [16]:
lgbm_Model = LGBMRegressor()
lgbm_Model.fit(scaled_xtrain,ytrain)

# Evaluate the LGBM Regressor model
model_evaluation(lgbm_Model, scaled_xtrain, ytrain, scaled_xtest, ytest)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2777
[LightGBM] [Info] Number of data points in the train set: 6008702, number of used features: 24
[LightGBM] [Info] Start training from score 23.124859
Training Scores:
MAE: 4.592815621722957
MSE: 65.67211195722649
RMSE: 8.103833164449185
R2: 0.8743850167176628

Testing Scores:
MAE: 4.602311265882582
MSE: 66.10972503374526
RMSE: 8.1307887092056
R2: 0.8734670663085532



### Hyperparameter tuning of LGBM Regressor:

In [18]:
%%time
lgbm_params = {
    "n_estimators":[100,200],
    "learning_rate":[0.001,0.01],
    "num_leaves":[20,80]
}

lgbm_reg = LGBMRegressor(random_state=24,n_jobs=-1)
lgbm_grid_search = GridSearchCV(
                    lgbm_reg,
                    param_grid=lgbm_params,
                    cv=3,
                    verbose=3,
                    n_jobs=-1
)
lgbm_grid_search.fit(scaled_xtrain,ytrain)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2782
[LightGBM] [Info] Number of data points in the train set: 6008702, number of used features: 24
[LightGBM] [Info] Start training from score 23.124859
Wall time: 24min 58s


GridSearchCV(cv=3, estimator=LGBMRegressor(n_jobs=-1, random_state=24),
             n_jobs=-1,
             param_grid={'learning_rate': [0.001, 0.01],
                         'n_estimators': [100, 200], 'num_leaves': [20, 80]},
             verbose=3)

In [20]:
%%time
lgbm_final_model = LGBMRegressor(**{'learning_rate':0.01,'n_estimators':200,'num_leaves':80},random_state=1)
lgbm_final_model.fit(scaled_xtrain,ytrain)
# Evaluate the final LGBM Regressor model
model_evaluation(lgbm_final_model, scaled_xtrain, ytrain, scaled_xtest, ytest)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2781
[LightGBM] [Info] Number of data points in the train set: 6008702, number of used features: 24
[LightGBM] [Info] Start training from score 23.124859
Training Scores:
MAE: 8.521531922748682
MSE: 170.76812906528798
RMSE: 13.06782801636477
R2: 0.67336156797175

Testing Scores:
MAE: 8.530687990255307
MSE: 171.19491464401588
RMSE: 13.084147455757899
R2: 0.6723357301530595

Wall time: 3min 21s


### In this notebook I have trained Linear Regression (base line model), XG Boost Models, Cat Boost Models and LGBM Models. But, will give a check for the ensemble model too in the new notebook.