## 5.9 Regression 실습 - Bike Sharing Demand 예제 코드를 동별 안전시설 비율 / 여성 1인 피해자 데이터(all_data.csv)에 적용
### 데이터 클렌징 및 가공

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Load the dataset from a CSV file decoded as cp949.
# NOTE(review): the name `bike_df` looks like a leftover from a bike-sharing
# example; the columns shown in the output below are not bike-sharing data.
bike_df = pd.read_csv('all_data.csv', encoding='cp949')
# Print (rows, columns), then show the first three rows as the cell output.
print(bike_df.shape)
bike_df.head(3)

(423, 11)


Unnamed: 0,자치구,code,name,유흥업소수비율,안심지킴이집수비율,비상벨수비율,안심센터수비율,경찰관서수비율,cctv수비율,보안등수비율,여성1인피해자(천백만명)
0,송파구,1124066,가락1동,2.238806,0.0,0.746269,0.0,0.0,18.656716,48.507463,207.47341
1,송파구,1124067,가락2동,35.416667,2.083333,46.875,1.041667,0.0,129.166667,423.958333,247.12825
2,송파구,1124065,가락본동,105.309735,1.769912,35.39823,0.0,1.769912,97.345133,364.60177,203.692505


In [2]:
# Summarise dtype and non-null count per column to spot missing values.
bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423 entries, 0 to 422
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   자치구            423 non-null    object 
 1   code           423 non-null    int64  
 2   name           423 non-null    object 
 3   유흥업소수비율        423 non-null    float64
 4   안심지킴이집수비율      423 non-null    float64
 5   비상벨수비율         423 non-null    float64
 6   안심센터수비율        423 non-null    float64
 7   경찰관서수비율        423 non-null    float64
 8   cctv수비율        423 non-null    float64
 9   보안등수비율         418 non-null    float64
 10  여성1인피해자(천백만명)  423 non-null    float64
dtypes: float64(8), int64(1), object(2)
memory usage: 36.5+ KB


In [3]:
# Columns that are not used as features or target in the rest of the notebook.
drop_columns = ['자치구', 'code', 'name']
# Rebind to a new frame instead of dropping in place, and ignore labels that
# are already gone, so re-running this cell does not raise KeyError.
bike_df = bike_df.drop(columns=drop_columns, errors='ignore')

In [4]:
# Show which columns remain after the drop.
bike_df.columns

Index(['유흥업소수비율', '안심지킴이집수비율', '비상벨수비율', '안심센터수비율', '경찰관서수비율', 'cctv수비율',
       '보안등수비율', '여성1인피해자(천백만명)'],
      dtype='object')

In [5]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# RMSLE is computed with log1p() rather than log() to avoid NaN-type issues
# when taking the logarithm.
def rmsle(y, pred):
    """Return the root mean squared logarithmic error between ``y`` and ``pred``.

    Both inputs are passed through ``np.log1p`` before the squared difference
    is averaged and square-rooted.
    """
    log_diff = np.log1p(y) - np.log1p(pred)
    return np.sqrt(np.mean(log_diff ** 2))

# RMSE computed from scikit-learn's mean_squared_error().
def rmse(y,pred):
    """Return the root mean squared error between ``y`` and ``pred``."""
    mse = mean_squared_error(y, pred)
    return np.sqrt(mse)

# Compute and print RMSLE, RMSE and MAE together.
def evaluate_regr(y,pred):
    """Print RMSLE, RMSE and MAE for ``pred`` against ``y``, three decimals each."""
    scores = (
        rmsle(y, pred),
        rmse(y, pred),
        # MAE comes straight from scikit-learn's mean_absolute_error().
        mean_absolute_error(y, pred),
    )
    print('RMSLE: {0:.3f}, RMSE: {1:.3F}, MAE: {2:.3F}'.format(*scores))

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column of the frame (features and target alike).
# NOTE(review): the scaler is fit on the full frame, target column included,
# and before the train/test split further down, so test-row statistics take
# part in the scaling and the errors reported later are in scaled units -
# confirm this is intended.
scaler = MinMaxScaler()
bike_scaled = scaler.fit_transform(bike_df)

# Rebuild a DataFrame that keeps the source column labels and index; a bare
# pd.DataFrame(ndarray) would fall back to integer column names.
bike_data = pd.DataFrame(bike_scaled, columns=bike_df.columns, index=bike_df.index)

In [7]:
# Put the descriptive column labels back on the scaled frame.
bike_data.columns = [
    "유흥업소수비율",
    "안심지킴이집수비율",
    "비상벨수비율",
    "안심센터수비율",
    "경찰관서수비율",
    "cctv수비율",
    "보안등수비율",
    "여성1인피해자(천백만명)",
]

In [8]:
# Display the scaled frame (the rendered output elides the middle rows).
bike_data

Unnamed: 0,유흥업소수비율,안심지킴이집수비율,비상벨수비율,안심센터수비율,경찰관서수비율,cctv수비율,보안등수비율,여성1인피해자(천백만명)
0,0.009427,0.000000,0.002642,0.000000,0.000000,0.016771,0.012679,0.073316
1,0.149123,0.126736,0.165929,0.286458,0.000000,0.136326,0.110815,0.087505
2,0.443409,0.107670,0.125303,0.000000,0.353982,0.101900,0.095301,0.071963
3,0.463158,0.152083,1.000000,0.000000,0.500000,1.000000,0.390767,0.056139
4,0.110276,0.048280,0.105352,0.000000,0.079365,0.100479,0.069598,0.262599
...,...,...,...,...,...,...,...,...
418,0.028708,0.000000,0.402253,0.000000,0.000000,0.114607,0.177027,0.123854
419,0.058752,0.070736,0.366331,0.000000,0.232558,0.126157,0.267158,0.219023
420,0.100251,0.000000,0.196657,0.000000,0.000000,0.209522,0.209106,0.100314
421,0.044110,0.115873,0.195533,0.261905,0.190476,0.205744,0.153594,0.163775


In [9]:
# From here on `bike_df` refers to the min-max scaled frame; the unscaled
# values are no longer reachable through this name.
bike_df=bike_data

### 학습/테스트 데이터 분리, 모델 학습/예측/평가

In [19]:
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.linear_model import LinearRegression , Ridge , Lasso

# Separate the predictors from the regression target column.
X_features = bike_df.drop(columns=['여성1인피해자(천백만명)'])
y_target = bike_df['여성1인피해자(천백만명)']

# Hold out half of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_target,
    test_size=0.5,
    random_state=0,
)

In [20]:
# Import the estimators in the cell that first uses them: they were previously
# imported only in a later cell, so this cell raised NameError on a fresh
# kernel (Restart & Run All).
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Both regressors are created with their library default hyperparameters.
lgbm_reg = LGBMRegressor()
xgb_reg = XGBRegressor()

In [21]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Fit both regressors on the training split.
xgb_reg.fit(X_train, y_train)
lgbm_reg.fit(X_train, y_train)
# Predict the held-out split with each model.
xgb_pred = xgb_reg.predict(X_test)
lgbm_pred = lgbm_reg.predict(X_test)
# Blend the two predictions with equal weights.
pred = 0.5 * xgb_pred + 0.5 * lgbm_pred
# Label -> predictions; the 'XGBM' entry holds the XGBRegressor output.
preds = {'최종 혼합': pred,
         'XGBM': xgb_pred,
         'LGBM': lgbm_pred}

In [22]:
def get_rmse_pred(preds):
    """Print the RMSE of each entry in ``preds`` against the global ``y_test``.

    ``preds`` maps a display label to an array of predictions; one line is
    printed per entry, in the mapping's iteration order.
    """
    for label, y_hat in preds.items():
        score = np.sqrt(mean_squared_error(y_test , y_hat))
        print('{0} 모델의 RMSE: {1}'.format(label, score))

In [23]:
# Print the RMSE of the blended prediction and of each individual model.
get_rmse_pred(preds)

최종 혼합 모델의 RMSE: 0.13262858570263777
XGBM 모델의 RMSE: 0.13845415298052838
LGBM 모델의 RMSE: 0.13135388155215894
