<font size=6><b>Bike Sharing Demand - ML

* ref : https://www.kaggle.com/competitions/bike-sharing-demand/data <br>

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


sns.set()

#-------------------- 차트 관련 속성 (한글처리, 그리드) -----------
plt.rcParams['font.family']= 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

#-------------------- 주피터 , 출력결과 넓이 늘리기 ---------------
# from IPython.core.display import display, HTML
from IPython.display import display, HTML

display(HTML("<style>.container{width:100% !important;}</style>"))
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', None)

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_squared_log_error, make_scorer

from sklearn.ensemble     import RandomForestRegressor
from sklearn.tree         import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
# ---- 추가 모델
from sklearn.ensemble     import AdaBoostRegressor, VotingRegressor
from xgboost              import XGBRegressor
from lightgbm             import LGBMRegressor

# Data Load

In [3]:
train = pd.read_csv("./datasets/train.csv", parse_dates=['datetime'])
test  = pd.read_csv("./datasets/test.csv" , parse_dates=['datetime'])

In [4]:
df_list = [train, test]
for df in df_list:
    df.rename(columns = {'datetime' : 'regdate', 'count' : 'regcount'}, inplace = True)
    df.columns = df.columns.str.lower()

# Feature Engineering

* 파생피쳐 생성
    * regdate -> y, m, d, h, w
    * holiday, workingday -> day_type
    * h, workingday -> peek
    * temp, windspeed -> ideal
    * humidity, workingday -> sticky
* 자연재해, 공휴일 처리
    * Sandy
    * Christmas
* Outlier 삭제 (train만 해당)
    * temp > 40
    * windspeed > 50
* 연속형 Feature Scaling (logScaling)
    * temp, atemp, humidity, windspeed
* 다중공선 처리
    * temp, atemp  ->  temp
    * season, m -> season
    * w, day_type, holiday, workingday  -> w, day_type
* 이산형 Feature OneHotEncoding
    * season, weather, y, h, w, day_type
* Target data Scaling (logScaling)
* 불필요한 컬럼 삭제
    * d, reg_data

In [5]:
df_list = [train, test]
df_name = ['train', 'test']
for i, df in enumerate(df_list):
    # 파생피쳐 생성
    df['y'] = df['regdate'].dt.year
    df['m'] = df['regdate'].dt.month
    df['d'] = df['regdate'].dt.day
    df['h'] = df['regdate'].dt.hour
    df['w'] = df['regdate'].dt.dayofweek
    #df['woy'] = df['regdate'].dt.weekofyear
    
    df['day_type'] = 0
    df['day_type'] = np.where( (df['holiday']==0) & (df['workingday'] == 1),   1,  df['day_type'])
    df['day_type'] = np.where( (df['holiday']==1) & (df['workingday'] == 0),   2,  df['day_type'])
    
    df['peak']   = df[['h', 'workingday']].apply(lambda x: (0, 1)[(x['workingday'] == 1 and  ( x['h'] == 8 or 17 <= x['h'] <= 18 or 12 <= x['h'] <= 12)) or (x['workingday'] == 0 and  10 <= x['h'] <= 19)], axis = 1)
    df['ideal']  = df[['temp', 'windspeed']].apply(lambda x: (0, 1)[x['temp'] > 27 and x['windspeed'] < 30], axis = 1)
    df['sticky'] = df[['humidity', 'workingday']].apply(lambda x: (0, 1)[x['workingday'] == 1 and x['humidity'] >= 60], axis = 1)
    
    # 자연재해, sandy
    df['holiday'] = df[['m', 'd', 'holiday', 'y']].apply(lambda x: (x['holiday'], 1)[x['y'] == 2012 and x['m'] == 10 and (x['m'] in [30])], axis = 1)
    
    # 공휴일, christmas day and others
    df['holiday'] = df[['m', 'd', 'holiday']].apply(lambda x: (x['holiday'], 1)[x['m'] == 12 and (x['m'] in [24, 26, 31])], axis = 1)
    df['workingday'] = df[['m', 'd', 'workingday']].apply(lambda x: (x['workingday'], 0)[x['m'] == 12 and x['m'] in [24, 31]], axis = 1)
    
    # 불필요한 컬럼 삭제
    df.set_index('regdate', inplace=True)
    df.drop('d', axis=1, inplace=True)
    
    # Outlier 삭제 
    if df_name[i] == 'train':
        del_idx_list = []
        # idx = df[df['weather']==4].index
        # del_idx_list.extend(idx)
        idx = df[df['temp']>40].index
        del_idx_list.extend(idx)
        idx = df[df['windspeed']>50].index
        del_idx_list.extend(idx)
        df.drop(del_idx_list, axis=0, inplace=True)
    
    # 연속형 피쳐 스케일링
    df['temp']      = np.log1p( df['temp'] )
    df['atemp']     = np.log1p( df['atemp'] )
    df['humidity']  = np.log1p( df['humidity'] )
    df['windspeed'] = np.log1p( df['windspeed'] )
        
    # 다중공선 처리
    df.drop(['atemp'], axis=1, inplace=True)
    df.drop(['m'], axis=1, inplace=True)
    df.drop(['holiday', 'workingday'], axis=1, inplace=True)

            
    # 이산형 피쳐 원핫인코딩
    # df = pd.get_dummies(df, columns=['season', 'weather','m', 'y', 'h', 'w', 'day_type'])
    df = pd.get_dummies(df, columns=['season', 'weather','y', 'h', 'w', 'day_type'])
    
    # Target data Scaling
    if df_name[i] == 'train':
        df['casual'] = np.log1p( df['casual'] )
        df['registered'] = np.log1p( df['registered'] )
    
    globals()[df_name[i]] = df.copy()

## windspeed 0 채우기
* 모델 학습을 통해 WindSpeed가 0인 데이터를 채움.

In [6]:
df = pd.concat([train, test], axis=0, ignore_index=True)

target = df[['regcount','casual','registered']]
df = df.drop(['regcount','casual','registered'], axis=1)

df1   = df[df['windspeed'] != 0]
y_df1 = df1['windspeed']
X_df1 = df1.drop('windspeed', axis=1)

rf = LGBMRegressor(random_state=0)  #RandomForestRegressor(random_state=11)
X_df1_8, X_df1_2, y_df1_8, y_df1_2 = train_test_split(X_df1, y_df1, test_size=0.2, random_state=11)
rf.fit(X_df1_8, y_df1_8)
    
pred = rf.predict(X_df1_2)
mse_score = mean_squared_error(y_df1_2, pred)
print("RMSE : ", np.sqrt(mse_score) )

df0 = train[train['windspeed'] == 0]
widx = df0['windspeed'].index.values
X_df0 = df0.drop(['regcount','casual','registered', 'windspeed'], axis=1)
pred = rf.predict(X_df0)
train.loc[widx, 'windspeed'] = pred
    
df0 = test[test['windspeed'] == 0]
widx = df0['windspeed'].index.values
X_df0 = df0.drop('windspeed', axis=1)
pred = rf.predict(X_df0)
test.loc[widx, 'windspeed'] = pred

RMSE :  0.385180981022705


# 학습

## 기본 모델, Training Score 확인

In [7]:
def my_fit_score(df, chart_view=False) :
    model_list = [ 
                   ("RIDGE"  , Ridge(alpha=1.0, random_state=0)),
                   ("LASSO"  , Lasso(alpha=1.0, random_state=0)),
                   ("DTR"    , DecisionTreeRegressor(random_state=0)),
                   ("RF"     , RandomForestRegressor(random_state=0)),
                   ("LR"     , LinearRegression()                   ),
                   ("ABOOST" , AdaBoostRegressor(random_state=0)                  ),
                   ("XGB"    , XGBRegressor(random_state=0)                       ),   #booster=gblinear
                   ("LGBM"   , LGBMRegressor(random_state=0)                      ),
                   ("VR-XGB-LGBM"  , VotingRegressor([("XGB", XGBRegressor(random_state=0)), ("LGBM", LGBMRegressor(random_state=0))]) ) ,
                   ("VR-RF-LGBM"   , VotingRegressor([("DTR", RandomForestRegressor(random_state=0)), ("LGBM", LGBMRegressor(random_state=0))]) )
                 ]
   
    y_c = df['casual'] 
    y_r = df['registered'] 

    X = df.drop(['regcount','casual','registered'], axis=1)
    
    for tpl in model_list :
        print( tpl[0] ) 
        model = tpl[1]
                
        X_train, X_test, y_train, y_r_test = train_test_split(X, y_r, random_state=0, test_size=0.2)
        model.fit(X_train, y_train)
        
        #------------------------------------------------
        # feature_importance 차트 그리기
        if bool(chart_view) : 
            my_view_chart(tpl[0], model, X_train)
        #------------------------------------------------
        y_r_pred = model.predict(X_test)
        
        model = tpl[1]
        X_train, X_test, y_train, y_c_test = train_test_split(X, y_c, random_state=0, test_size=0.2)
        model.fit(X_train, y_train)
        y_c_pred = model.predict(X_test)
        
        #---------(타켓피쳐:로그스케일링) 복원-----------
        y_r_pred = np.maximum(0, np.expm1(y_r_pred))
        y_c_pred = np.maximum(0, np.expm1(y_c_pred))
        
        y_r_test = np.maximum(0, np.expm1(y_r_test))
        y_c_test = np.maximum(0, np.expm1(y_c_test))
        
        y_pred_comb = y_r_pred+y_c_pred
        y_real_comb = y_r_test+y_c_test
        y_pred_comb[y_pred_comb < 0] = 0
        
        msle_r_score = mean_squared_log_error(y_r_test, y_r_pred)
        msle_c_score = mean_squared_log_error(y_c_test, y_c_pred)
        msle_score = mean_squared_log_error(y_real_comb, y_pred_comb)
    
        print("registered RMSLE: ", np.sqrt(msle_r_score)) 
        print("casual RMSLE: ", np.sqrt(msle_c_score))
        print("comb RMSLE: ", np.sqrt(msle_score)) 
        print("-"*30)
        

In [8]:
my_fit_score(train)

RIDGE
registered RMSLE:  0.574229176780559
casual RMSLE:  0.590686610021352
comb RMSLE:  0.5458461815870496
------------------------------
LASSO
registered RMSLE:  1.398876371576369
casual RMSLE:  1.5097386288234824
comb RMSLE:  1.4200361615605412
------------------------------
DTR
registered RMSLE:  0.4311970168970477
casual RMSLE:  0.7091016944197763
comb RMSLE:  0.4140689678128788
------------------------------
RF
registered RMSLE:  0.32321383880222976
casual RMSLE:  0.5051756816314089
comb RMSLE:  0.31406543644804247
------------------------------
LR
registered RMSLE:  0.5742576541177937
casual RMSLE:  0.5903494194884685
comb RMSLE:  0.5459382546378583
------------------------------
ABOOST
registered RMSLE:  1.0423204145938123
casual RMSLE:  0.9714074830825923
comb RMSLE:  1.0150184825862532
------------------------------
XGB
registered RMSLE:  0.3169817132234814
casual RMSLE:  0.5035251935262637
comb RMSLE:  0.30649808359877145
------------------------------
LGBM
registered RMSLE:

## GridSearchCV를 통한 모델 파라메터 튜닝

In [8]:
def my_scoring(y_true, y_pred):
    y_pred = np.expm1(y_pred)
    y_true = np.expm1(y_true)
    y_pred = np.maximum(0, y_pred)
    y_true = np.maximum(0, y_true)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [9]:
def my_fit_score_cv(df) :
    model_list = [ 
                   ("RF"     , RandomForestRegressor(random_state=0)),           
                   ("XGB"    , XGBRegressor(random_state=0)                       ),   
                   ("LGBM"   , LGBMRegressor(random_state=0)                      ),
                 ]
   
    mydic = [{'n_estimators':[100, 200], 'min_samples_split':[2, 3], 'min_samples_leaf':[1, 2, 3]},
        {'n_estimators':[100, 200, 300], 'learning_rate':[0.1, 0.01, 0.005]},
        {'n_estimators':[100, 200, 300], 'learning_rate':[0.1, 0.01, 0.005]}]
    
    y_c = df['casual'] 
    y_r = df['registered'] 

    X = df.drop(['regcount','casual','registered'], axis=1)
       
    ret_model = []
    for i, tpl in enumerate(model_list) :
        # print( tpl[0] ) 
        model = tpl[1]
        kf = KFold(n_splits=5, shuffle=True, random_state=11)

        models1 = GridSearchCV(model, scoring=make_scorer(my_scoring, greater_is_better=False), param_grid=mydic[i], cv=kf)
        models1.fit(X, y_r)
        
        models2 = GridSearchCV(model, scoring=make_scorer(my_scoring, greater_is_better=False), param_grid=mydic[i], cv=kf)
        models2.fit(X, y_c)

        ret_model.append((tpl[0], models1, models2))
        # print(models1.best_score_, models2.best_score_)
        # print("-"*30)
        
    return ret_model    

In [11]:
models = my_fit_score_cv(train)

for m in models:
    print(m[0])
    print("registered model best score", m[1].best_score_)
    print("registered model best parameter", m[1].best_params_)
    print("registered model best estimator", m[1].best_estimator_)
    print("casual model best score", m[2].best_score_)
    print("casual model best parameter", m[2].best_params_)
    print("casual model best estimator", m[2].best_estimator_)
    print("-"*30)

RF
registered model best score -0.31316624830242534
registered model best parameter {'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
registered model best estimator RandomForestRegressor(min_samples_leaf=2, n_estimators=200, random_state=0)
casual model best score -0.5147379935728651
casual model best parameter {'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
casual model best estimator RandomForestRegressor(n_estimators=200, random_state=0)
------------------------------
XGB
registered model best score -0.30212669491561267
registered model best parameter {'learning_rate': 0.1, 'n_estimators': 300}
registered model best estimator XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, impor

# 점수보기

## RandomForest

In [12]:
y_pred1 = models[0][1].predict(test)
y_pred2 = models[0][2].predict(test)

result = pd.read_csv('./datasets/sampleSubmission.csv')
result['count'] = np.expm1(y_pred1)+np.expm1(y_pred2)

result.to_csv('./submit_rf.csv', index=False)

## XGB Model

In [13]:
y_pred1 = models[1][1].predict(test)
y_pred2 = models[1][2].predict(test)

result = pd.read_csv('./datasets/sampleSubmission.csv')
cnt_result  = np.expm1(y_pred1)+np.expm1(y_pred2)
result['count'] = np.maximum(0, cnt_result)

result.to_csv('./submit_xgb.csv', index=False)

## LGBM Moel

In [14]:
y_pred1 = models[2][1].predict(test)
y_pred2 = models[2][2].predict(test)

result = pd.read_csv('./datasets/sampleSubmission.csv')
result['count'] = np.expm1(y_pred1)+np.expm1(y_pred2)

result.to_csv('./submit_lgbm.csv', index=False)

## Voting Model

In [None]:
# VotingRegressor만 따로 fit predict
model1= VotingRegressor([("XGB", XGBRegressor(n_estimators=300, random_state=0)), 
                         ("LGBM", LGBMRegressor(n_estimators=300, random_state=0))]) 
model2= VotingRegressor([("XGB", XGBRegressor(n_estimators=300, random_state=0)), 
                         ("LGBM", LGBMRegressor(n_estimators=300, random_state=0))]) 

y_c = train['casual'] 
y_r = train['registered'] 

X = train.drop(['regcount','casual','registered'], axis=1)

model1.fit(X, y_c)
model2.fit(X, y_r)

y_pred1 = model1.predict(test)
y_pred2 = model2.predict(test)

result = pd.read_csv('./datasets/sampleSubmission.csv')
cnt_result  = np.expm1(y_pred1)+np.expm1(y_pred2)
result['count'] = np.maximum(0, cnt_result)
result.to_csv('./submit_vt.csv', index=False)

# Pycaret

In [12]:
from pycaret.regression import *
s = setup(train, target = 'casual', silent=True, session_id=11)

Unnamed: 0,Description,Value
0,session_id,11
1,Target,casual
2,Original Data,"(10881, 53)"
3,Missing Values,False
4,Numeric Features,49
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(7616, 51)"


In [13]:
best_model_list = compare_models(sort='rmsle', n_select = 3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
xgboost,Extreme Gradient Boosting,0.1645,0.058,0.2407,0.9735,0.1053,0.0819,0.537
lightgbm,Light Gradient Boosting Machine,0.1791,0.0647,0.2543,0.9704,0.1123,0.0863,0.065
rf,Random Forest Regressor,0.177,0.0776,0.2785,0.9645,0.1236,0.0884,1.241
et,Extra Trees Regressor,0.2004,0.0916,0.3026,0.9581,0.1271,0.0968,1.388
dt,Decision Tree Regressor,0.2543,0.1725,0.4149,0.9212,0.1749,0.1303,0.027
gbr,Gradient Boosting Regressor,0.3279,0.1814,0.4258,0.9171,0.1861,0.1528,0.431
lr,Linear Regression,0.3946,0.2601,0.5099,0.8812,0.2221,0.1868,0.452
ridge,Ridge Regression,0.3947,0.2601,0.5099,0.8812,0.2221,0.1867,0.007
br,Bayesian Ridge,0.3947,0.2601,0.5099,0.8812,0.2221,0.1867,0.015
knn,K Neighbors Regressor,0.469,0.3708,0.6086,0.8305,0.2263,0.2156,0.043


In [14]:
s = setup(train, target = 'registered', silent=True, session_id=11)

Unnamed: 0,Description,Value
0,session_id,11
1,Target,registered
2,Original Data,"(10881, 53)"
3,Missing Values,False
4,Numeric Features,49
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(7616, 51)"


In [15]:
best_model_list = compare_models(sort='rmsle', n_select = 3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
xgboost,Extreme Gradient Boosting,0.014,0.0007,0.0254,0.9997,0.0084,0.0037,0.524
rf,Random Forest Regressor,0.0111,0.001,0.0295,0.9995,0.0113,0.0032,1.179
et,Extra Trees Regressor,0.0136,0.0012,0.0329,0.9994,0.0121,0.0039,1.509
dt,Decision Tree Regressor,0.0214,0.002,0.0444,0.999,0.0138,0.0054,0.025
lightgbm,Light Gradient Boosting Machine,0.0153,0.001,0.0312,0.9995,0.0141,0.0048,0.055
gbr,Gradient Boosting Regressor,0.0301,0.0025,0.0501,0.9987,0.0153,0.0087,0.431
knn,K Neighbors Regressor,0.0771,0.0133,0.1149,0.9932,0.0368,0.0253,0.043
ada,AdaBoost Regressor,0.1723,0.0448,0.2113,0.9769,0.0565,0.0518,0.184
br,Bayesian Ridge,0.3365,0.2193,0.4679,0.8871,0.1303,0.1212,0.015
ridge,Ridge Regression,0.3366,0.2193,0.4679,0.8871,0.1303,0.1213,0.007


In [10]:
# VotingRegressor만 따로 fit predict
model1= VotingRegressor([("XGB", XGBRegressor(n_estimators=700, random_state=0)), 
                         ("LGBM", LGBMRegressor(n_estimators=700, random_state=0))]) 
model2= VotingRegressor([("XGB", XGBRegressor(n_estimators=700, random_state=0)), 
                         ("RF", RandomForestRegressor())]) 

y_c = train['casual'] 
y_r = train['registered'] 

X = train.drop(['regcount','casual','registered'], axis=1)

model1.fit(X, y_c)
model2.fit(X, y_r)

y_pred1 = model1.predict(test)
y_pred2 = model2.predict(test)

result = pd.read_csv('./datasets/sampleSubmission.csv')
cnt_result  = np.expm1(y_pred1)+np.expm1(y_pred2)
result['count'] = np.maximum(0, cnt_result)
result.to_csv('./submit_vt2.csv', index=False)