In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
data_path = r"C:\Users\USER-PC\Desktop\Akash Cloudy ML\3. ML Projects\4. Seoul Bike Trip Duraion Prediction\cleaned_seoul_bike_data.csv"

In [3]:
df = pd.read_csv(
        data_path,
        dtype={
            'Duration':'int8',
            'Distance':'int8',
            'PLong':'float32',
            'PLatd':'float32',
            'DLong':'float32',
            'DLatd':'float32',
            'Haversine':'float32',
            'Pmonth':'int8',
            'Pday':'int8',
            'Phour':'int8',
            'Pmin':'int8',
            'PDweek':'int8',
            'Dmonth':'int8',
            'Dday':'int8',
            'Dhour':'int8',
            'Dmin':'int8',
            'DDweek':'int8',
            'Temp':'float32',
            'Precip':'float32',
            'Wind':'float32',
            'Humid':'float32',
            'Solar':'float32',
            'Snow':'float32',
            'GroundTemp':'float32',
            'Dust':'float32'
        }
)
df.head()

Unnamed: 0,Duration,Distance,PLong,PLatd,DLong,DLatd,Haversine,Pmonth,Pday,Phour,...,Dmin,DDweek,Temp,Precip,Wind,Humid,Solar,Snow,GroundTemp,Dust
0,21,118,37.571068,126.998192,37.565331,127.007843,1.063239,12,21,18,...,29,4,8.7,0.0,1.2,35.0,0.0,0.0,3.6,119.0
1,14,68,37.545166,127.05751,37.55603,127.078644,2.220476,9,12,14,...,34,2,26.9,0.0,1.6,45.0,2.27,0.0,39.5,18.0
2,3,68,37.47818,126.897408,37.476952,126.891869,0.507496,11,6,7,...,23,1,8.3,0.0,0.9,84.0,0.0,0.0,6.1,72.0
3,18,42,37.510658,126.842537,37.530338,126.838257,2.220635,11,11,17,...,37,6,8.9,0.0,3.0,57.0,0.03,0.0,8.7,78.0
4,45,86,37.55125,127.035103,37.582592,127.028976,3.526667,7,3,23,...,35,2,26.4,0.0,0.3,78.0,0.0,0.0,23.9,13.0


In [4]:
df.shape

(8583860, 25)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8583860 entries, 0 to 8583859
Data columns (total 25 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Duration    int8   
 1   Distance    int8   
 2   PLong       float32
 3   PLatd       float32
 4   DLong       float32
 5   DLatd       float32
 6   Haversine   float32
 7   Pmonth      int8   
 8   Pday        int8   
 9   Phour       int8   
 10  Pmin        int8   
 11  PDweek      int8   
 12  Dmonth      int8   
 13  Dday        int8   
 14  Dhour       int8   
 15  Dmin        int8   
 16  DDweek      int8   
 17  Temp        float32
 18  Precip      float32
 19  Wind        float32
 20  Humid       float32
 21  Solar       float32
 22  Snow        float32
 23  GroundTemp  float32
 24  Dust        float32
dtypes: float32(13), int8(12)
memory usage: 523.9 MB


In [6]:
x = df.iloc[:, 1:]
y = df.iloc[:, 0]

In [7]:
print(x.head())

   Distance      PLong       PLatd      DLong       DLatd  Haversine  Pmonth  \
0       118  37.571068  126.998192  37.565331  127.007843   1.063239      12   
1        68  37.545166  127.057510  37.556030  127.078644   2.220476       9   
2        68  37.478180  126.897408  37.476952  126.891869   0.507496      11   
3        42  37.510658  126.842537  37.530338  126.838257   2.220635      11   
4        86  37.551250  127.035103  37.582592  127.028976   3.526667       7   

   Pday  Phour  Pmin  ...  Dmin  DDweek  Temp  Precip  Wind  Humid  Solar  \
0    21     18     7  ...    29       4   8.7     0.0   1.2   35.0   0.00   
1    12     14    18  ...    34       2  26.9     0.0   1.6   45.0   2.27   
2     6      7    20  ...    23       1   8.3     0.0   0.9   84.0   0.00   
3    11     17    18  ...    37       6   8.9     0.0   3.0   57.0   0.03   
4     3     23    49  ...    35       2  26.4     0.0   0.3   78.0   0.00   

   Snow  GroundTemp   Dust  
0   0.0         3.6  119.0 

In [8]:
print(y.head())

0    21
1    14
2     3
3    18
4    45
Name: Duration, dtype: int8


In [9]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.25,random_state=1)
print(xtrain.shape,ytrain.shape)
print(xtest.shape,ytest.shape)

(6437895, 24) (6437895,)
(2145965, 24) (2145965,)


In [10]:
# Scaling:-
sc = StandardScaler()
scaled_xtrain = sc.fit_transform(xtrain)
scaled_xtest = sc.transform(xtest)

In [11]:
def model_evaluation(model, xtrain, ytrain, xtest, ytest):
    ytrain_pred = model.predict(xtrain)
    ytest_pred = model.predict(xtest)

    train_mae = mean_absolute_error(ytrain, ytrain_pred)
    train_mse = mean_squared_error(ytrain, ytrain_pred)
    train_rmse = np.sqrt(train_mse)
    train_r2 = r2_score(ytrain, ytrain_pred)

    test_mae = mean_absolute_error(ytest, ytest_pred)
    test_mse = mean_squared_error(ytest, ytest_pred)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(ytest, ytest_pred)

    print("Training Scores:")
    print(f"MAE: {train_mae}\nMSE: {train_mse}\nRMSE: {train_rmse}\nR2: {train_r2}\n")

    print("Testing Scores:")
    print(f"MAE: {test_mae}\nMSE: {test_mse}\nRMSE: {test_rmse}\nR2: {test_r2}\n")

## Random Forest Regressor

In [12]:
%%time
rf_model = RandomForestRegressor(**{'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 40},random_state=1,n_jobs=-1)
rf_model.fit(scaled_xtrain,ytrain)

# Evaluate the Random Forest Regressor model
model_evaluation(rf_model, scaled_xtrain, ytrain, scaled_xtest, ytest)

Training Scores:
MAE: 0.6684590401274568
MSE: 3.4669571013672185
RMSE: 1.8619766650973955
R2: 0.9933663644535675

Testing Scores:
MAE: 1.4314514249012558
MSE: 14.299472225985776
RMSE: 3.7814642965372256
R2: 0.9726545062924915

Wall time: 1h 9min 18s


In [14]:
import joblib
joblib.dump(rf_model, 'model.joblib')

['model.joblib']