Random Forest Regressor for Bike Sharing UCI Dataset

Variable,Name,Role,Type,Description,Units,Missing Values
1. instant,ID,Integer,record index,,no
2. dteday,Feature,Date,date,,no
3. season,Feature,Categorical,1:winter, 2:spring, 3:summer, 4:fall,,no
4. yr,Feature,Categorical,year (0: 2011, 1: 2012),,no
5. mnth,Feature,Categorical,month (1 to 12),,no
6. hr,Feature,Categorical,hour (0 to 23),,no
7. holiday,Feature,Binary,weather day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule),,no
8. weekday,Feature,Categorical,day of the week,,no
9. workingday,Feature,Binary,if day is neither weekend nor holiday is 1, otherwise is 0,,no
10. weathersit,Feature,Categorical,- 1: Clear, Few clouds, Partly cloudy, Partly cloudy,,no
11. temp,Feature,Continuous,Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale),C,no
12. atemp,Feature,Continuous,Normalized feeling temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale),C,no
13. hum,Feature,Continuous,Normalized humidity. The values are divided to 100 (max),,no
14. windspeed,Feature,Continuous,Normalized wind speed. The values are divided to 67 (max),,no
15. casual,Other,Integer,count of casual users,,no
16. registered,Other,Integer,count of registered users,,no
17. cnt,Target,Integer,count of total rental bikes including both casual and registered,,no

In [537]:
from ucimlrepo import fetch_ucirepo
bike_sharing = fetch_ucirepo(id=275)
X = bike_sharing.data.features
y = bike_sharing.data.targets
print(bike_sharing.metadata)
print(bike_sharing.variables)

{'uci_id': 275, 'name': 'Bike Sharing', 'repository_url': 'https://archive.ics.uci.edu/dataset/275/bike+sharing+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/275/data.csv', 'abstract': 'This dataset contains the hourly and daily count of rental bikes between years 2011 and 2012 in Capital bikeshare system with the corresponding weather and seasonal information.', 'area': 'Social Science', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 17389, 'num_features': 13, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['cnt'], 'index_col': ['instant'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2013, 'last_updated': 'Sun Mar 10 2024', 'dataset_doi': '10.24432/C5W894', 'creators': ['Hadi Fanaee-T'], 'intro_paper': {'ID': 422, 'type': 'NATIVE', 'title': 'Event labeling combining ensemble detectors and background knowledge', 'authors': 'Hadi Fanaee-T, João Gama', 'venue': 'Progress

In [805]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import boxcox
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler,PolynomialFeatures,PowerTransformer
from sklearn.pipeline import FunctionTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split,GridSearchCV

In [787]:
x=pd.DataFrame(X)

In [788]:
x["dteday"]=pd.to_datetime(x["dteday"])
x["timestamp"]=x.apply(lambda row: pd.Timestamp(row["dteday"]) + pd.to_timedelta(row["hr"], unit='h'), axis=1)
x["timestamp"]=x["timestamp"].astype("int64")//10**9

In [789]:
x["dom"]=x["dteday"].dt.day
x.drop("dteday",axis=1,inplace=True)

In [790]:
x["hour_sin"]=np.sin(2*np.pi*x["hr"]/24)
x["hour_cos"]=np.cos(2*np.pi*x["hr"]/24)
x["month_sin"]=np.sin(2*np.pi*x["mnth"]/12)
x["month_cos"]=np.cos(2*np.pi*x["mnth"]/12)
x["dom_sin"]=np.sin(2*np.pi*x["dom"]/30.4585)
x["dom_cos"]=np.cos(2*np.pi*x["dom"]/30.4585)

In [791]:
a,b,c,d=train_test_split(x,y["cnt"],test_size=0.2, random_state=1)

In [792]:
model=RandomForestRegressor(n_jobs=-1)
model.fit(a,c)
preds=model.predict(b)
mean_squared_error(d,preds)

1496.9149296892983

In [None]:
perm_importance=permutation_importance(model,b,d, n_repeats=10, random_state=1)
feature_importance=pd.DataFrame({"Feature": a.columns, "Importance": perm_importance.importances_mean})

In [794]:
ai=a[feature_importance[feature_importance["Importance"]>0]["Feature"]]
bi=b[feature_importance[feature_importance["Importance"]>0]["Feature"]]
model.fit(ai,c)
preds=model.predict(bi)
print(mean_squared_error(d,preds))

1495.0017225834295


In [795]:
poly=PolynomialFeatures(degree=2)
a_=pd.DataFrame(poly.fit_transform(a))
b_=poly.transform(b)
model.fit(a_,c)
preds=model.predict(b_)
print(mean_squared_error(d,preds))

1605.6042446202532


In [796]:
poly=PolynomialFeatures(degree=2, interaction_only=True)
a_=poly.fit_transform(a)
b_=poly.transform(b)
names=poly.get_feature_names_out(input_features=a.columns)
a_=pd.DataFrame(data=a_, columns=names)
b_=pd.DataFrame(data=b_, columns=names)
model.fit(a_,c)
preds=model.predict(b_)
mse=mean_squared_error(d,preds)
print(mse)

1623.6773578826237


In [797]:
rs=RobustScaler()
ms=MinMaxScaler()
ss=StandardScaler()
for i in [rs,ms,ss]:
    ac=i.fit_transform(a_)
    ac=pd.DataFrame(data=ac,columns=a_.columns)
    bc=i.transform(b_)
    bc=pd.DataFrame(data=bc,columns=a_.columns) 
    model.fit(ac,c)
    preds=model.predict(bc)
    print(mean_squared_error(d,preds))

1638.1357468066742
1569.237510356732
1596.9524989643269


In [798]:
perm_importance = permutation_importance(model,b_,d, n_repeats=10, random_state=1)
feature_importance=pd.DataFrame({"Feature": a_.columns, "Importance": perm_importance.importances_mean})

In [799]:
ai_=a_[feature_importance[feature_importance["Importance"]>0]["Feature"]]
bi_=b_[feature_importance[feature_importance["Importance"]>0]["Feature"]]
model.fit(ai_,c)
preds=model.predict(bi_)
print(mean_squared_error(d,preds))

1680.5562847813578


In [800]:
def inverset(x):
    return 1/(x+1)
pwrt=PowerTransformer(method="yeo-johnson")
boxt=FunctionTransformer(lambda x: boxcox(x)[0],validate=False)
logt=FunctionTransformer(np.log1p)
invt=FunctionTransformer(inverset)
sqrr=FunctionTransformer(np.sqrt)

for i in a.columns:
    j=a[i].skew()
    ap=(a[i],b[i])
    if j<=-0.5:
        print(i,"Before: ",j)
        if (a[i] <= 0).any() or (b[i] <= 0).any():
            a[i] = a[i] + abs(a[i].min())
            b[i] = b[i] + abs(a[i].min())
        try:
            iT=invt.fit_transform(a[i])
            it=invt.transform(b[i])
        except:
            iT=a[i]
            it=b[i]
        try:
            bT=boxt.fit_transform(a[i])
            bTD=pd.Series(data=bT,name=i)
            bt=boxt.transform(b[i])
            btd=pd.Series(data=bt,name=i)
        except:
            bTD=a[i]
            btd=b[i]
        try:
            sT=sqrr.fit_transform(a[i])
            st=sqrr.transform(b[i])
        except:
            sT=a[i]
            st=b[i]
        ap=sorted([(iT,it),(sT,st),(bTD,btd), ap], key=lambda x:abs(x[0].skew()))
        a[i]=ap[0][0]
        b[i]=ap[0][1]
        print(i,"After: ",a[i].skew())
    elif j>=0.5:
        print(i,"Before: ",j)
        if (a[i] <= 0).any() or (b[i] <= 0).any():
            a[i] = a[i] + abs(a[i].min()+1)
            b[i] = b[i] + abs(a[i].min()+1)
        try:

            lT=logt.fit_transform(a[i])
            lt=logt.transform(b[i])
        except:
            lT=a[i]
            lt=b[i]
        try:
            sT=sqrr.fit_transform(a[i])            
            st=sqrr.transform(b[i])
        except:
            sT=a[i]
            st=b[i]
        try:        
            bT=boxt.fit_transform(a[i])
            bt=boxt.transform(b[i])
            bTD=pd.Series(data=bT,name=i)
            btd=pd.Series(data=bt,name=i)
        except:
            bTD=a[i]
            btd=b[i]
        try:        
            zT=pwrt.fit_transform(np.array(a[i]).reshape(-1,1))
            zt=pwrt.transform(np.array(b[i]).reshape(-1,1))
            zTD=pd.Series(data=zT.ravel(),name=i)
            ztd=pd.Series(data=zt.ravel(),name=i)
        except:
            xTD=a[i]
            ztd=b[i]
        ap=sorted([(lT,lt),(sT,st),(bTD,btd),(zTD,ztd), ap], key=lambda x:abs(x[0].skew()))
        a[i]=ap[0][0]
        b[i]=ap[0][1]
        print(i,"After: ",a[i].skew())

holiday Before:  5.600461351032705
holiday After:  0.0
workingday Before:  -0.7757142815812474
workingday After:  0.7757142815812469
weathersit Before:  1.2318362420718578
weathersit After:  0.6759882552420668
windspeed Before:  0.5517372130317661
windspeed After:  0.007060154121852272


In [801]:
model.fit(a,c)
preds=model.predict(b)
print(mean_squared_error(d,preds))

1714.6806250575373


In [802]:
for i in [rs,ms,ss]:
    ac=i.fit_transform(a)
    ac=pd.DataFrame(data=ac,columns=a.columns)
    bc=i.transform(b)
    bc=pd.DataFrame(data=bc,columns=a.columns) 
    model.fit(ac,c)
    preds=model.predict(bc)
    print(mean_squared_error(d,preds))

1712.825222733666
1724.5932781357883
1736.6232327387802


In [810]:
a,b,c,d=train_test_split(x,y["cnt"],test_size=0.2, random_state=1)

In [811]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'ccp_alpha': [0.0001,0.001,0.01,0.1,1],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2']
}
rf = RandomForestRegressor(n_jobs=-1, random_state=1)
grid_search = GridSearchCV(rf, param_grid, cv=7, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(a,c)
best=grid_search.best_estimator_
pred=best.predict(b)
print(mean_squared_error(d,pred))

1631.40007287112
