# 보충 자료

## 01_linear_regression.ipynb의 주요 내용

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only provides the legacy module
    from sklearn.cross_validation import train_test_split

%matplotlib inline

In [2]:
# Load the bike-share data set
data = pd.read_csv('data/bikeshare.csv')

# Derive year, month and hour columns from the 'datetime' column
datetime = pd.DatetimeIndex(data['datetime'])
data['year'], data['month'], data['hour'] = datetime.year, datetime.month, datetime.hour

# "count" is also a DataFrame method, so give that column a safer name
data = data.rename(columns={'count': 'total'})

# One-hot encode 'season'; the first dummy column is left out as the baseline
season_dummies = pd.get_dummies(data['season'], prefix='season').iloc[:, 1:]
data = pd.concat([data, season_dummies], axis=1)

# Derived flag "daytime": 1 for hours 7 through 20 (inclusive), 0 otherwise
data['daytime'] = data['hour'].between(7, 20).astype(int)

# One-hot encode 'hour'; the first dummy column is left out as the baseline
hour_dummies = pd.get_dummies(data['hour'], prefix='hour').iloc[:, 1:]
data = pd.concat([data, hour_dummies], axis=1)

## 선형회귀모델을 학습하는 함수를 조금 수정하였습니다.
다음을 포함하는 dictionary를 반환하는 함수로 변경하였습니다.
- 각 변수에 대응하는 계수들(coefficients)과 intercept
- Train set에서의 RMSE, R^2
- Test set에서의 RMSE, R^2

In [3]:
# Define a function that accepts a list of features and
# returns coefficients, intercept, training RMSE/R^2 and testing RMSE/R^2
def train_test_linreg(d, feature_cols, test_size=0.3, random_state=123):
    """Fit a linear regression on a train/test split and summarise its fit.

    Parameters
    ----------
    d : pandas.DataFrame
        Data containing the columns named in ``feature_cols`` and the
        target column ``total``.
    feature_cols : list of str
        Names of the columns of ``d`` used as predictors.
    test_size : float, optional
        Passed to ``train_test_split`` as ``test_size``. Defaults to 0.3,
        the value that used to be hard-coded.
    random_state : int, optional
        Passed to ``train_test_split`` as ``random_state``. Defaults to 123,
        the value that used to be hard-coded.

    Returns
    -------
    dict
        ``'formula'`` (pandas.Series of coefficients indexed by feature
        name), ``'intercept'``, ``'rmse_train'``, ``'rmse_test'``,
        ``'rsquared_train'`` and ``'rsquared_test'``.
    """
    X = d[feature_cols]
    Y = d['total']
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    model = LinearRegression()
    model.fit(X_train, Y_train)

    # Pair every selected feature with its fitted coefficient
    formula = pd.Series(model.coef_, index=feature_cols)

    # RMSE is the square root of the mean squared error on each split
    rmse_train = np.sqrt(metrics.mean_squared_error(Y_train, model.predict(X_train)))
    rmse_test = np.sqrt(metrics.mean_squared_error(Y_test, model.predict(X_test)))

    return {
        'formula': formula,
        'intercept': model.intercept_,
        'rmse_train': rmse_train,
        'rmse_test': rmse_test,
        # R-square on the training and testing splits
        'rsquared_train': model.score(X_train, Y_train),
        'rsquared_test': model.score(X_test, Y_test),
    }

In [4]:
# Fit the linear regression using only the binary hour dummy variables
hour_cols = data.columns[data.columns.str.startswith('hour_')].tolist()
result = train_test_linreg(data, feature_cols=hour_cols)

In [5]:
# Display the result dictionary: coefficients, intercept,
# training RMSE/R^2 and testing RMSE/R^2
result

{'formula': hour_1     -22.580917
 hour_2     -32.757474
 hour_3     -44.209704
 hour_4     -49.961957
 hour_5     -36.711049
 hour_6      16.801172
 hour_7     159.656510
 hour_8     309.943473
 hour_9     160.149618
 hour_10    121.193570
 hour_11    147.090587
 hour_12    202.425259
 hour_13    200.110370
 hour_14    195.261156
 hour_15    195.486156
 hour_16    266.890255
 hour_17    419.952457
 hour_18    379.068371
 hour_19    264.786324
 hour_20    173.622659
 hour_21    125.548656
 hour_22     77.631922
 hour_23     34.730308
 dtype: float64,
 'intercept': 56.263843648209075,
 'rmse_test': 128.47511657303033,
 'rmse_train': 124.92842235488435,
 'rsquared_test': 0.49192464950577053,
 'rsquared_train': 0.52631453386835414}

## Ridge regression & Lasso regression
### 두 모델의 공통점
- **Regularization**: 모델 계수가 커지는 것에 대한 penalty를 부여함으로써 모델의 overfitting(과적합)을 방지
- 기본적인 multiple linear regression (다중선형회귀분석) 은 변수 간의 [다중공선성(multicollinearity)](https://ko.wikipedia.org/wiki/%EB%8B%A4%EC%A4%91%EA%B3%B5%EC%84%A0%EC%84%B1)에 의해 성능이 하락하는데, 이 두 회귀모델은 이에 대해 대처할 수 있는 모델
- 모델의 parameter(모수)가 존재: 계수 크기에 대한 penalty를 얼마나 줄 것인가 (**alpha**)
- alpha가 0이면 단순 다중선형회귀모델과 일치한다.


### Lasso regression의 강점
- Lasso regression은 ridge regression과는 달리 일부 변수의 계수를 정확히 0으로 만들 수 있습니다. 특정 변수의 계수가 0이 아니라는 것은 **lasso regression 모델이 그 변수를 선택**했다고 볼 수 있습니다.
- Lasso regression은 모든 변수가 선택되는 것이 아니라는 점에서 **sparse model** (희소모델)


두 모델을 적용하기에 앞서 다음과 같은 데이터 전처리를 다시 실시하였습니다.
- X에서 가능한 모든 변수를 사용하여, 모델의 성능이 어떻게 나오는지 파악
- 제거한 변수: datetime (수치형 변수가 아니며, year/month/hour로 이미 분리됨), casual & registered (타겟변수인 'total'과 함께 움직이는 변수), total (타겟 변수)

In [6]:
from sklearn.linear_model import Ridge, Lasso

In [7]:
def train_test_ridge(data, alpha_value, test_size=0.3, random_state=123):
    """Fit a ridge regression on a train/test split and summarise its fit.

    Every column of ``data`` except ``datetime``, ``casual``,
    ``registered`` and ``total`` is used as a predictor; ``total`` is the
    target.

    Parameters
    ----------
    data : pandas.DataFrame
        Data containing the four columns named above plus the predictors.
    alpha_value : float
        Passed to ``Ridge`` as ``alpha``.
    test_size : float, optional
        Passed to ``train_test_split`` as ``test_size``. Defaults to 0.3,
        the value that used to be hard-coded.
    random_state : int, optional
        Passed to ``train_test_split`` as ``random_state``. Defaults to 123,
        the value that used to be hard-coded.

    Returns
    -------
    dict
        ``'formula'`` (pandas.Series of coefficients indexed by feature
        name), ``'intercept'``, ``'rmse_train'``, ``'rmse_test'``,
        ``'rsquared_train'`` and ``'rsquared_test'``.
    """
    X = data.drop(['datetime', 'casual', 'registered', 'total'], axis=1)
    Y = data['total']
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    model = Ridge(alpha=alpha_value)
    model.fit(X_train, Y_train)

    # Pair every feature with its fitted coefficient
    formula = pd.Series(model.coef_, index=list(X.columns))

    # RMSE is the square root of the mean squared error on each split
    rmse_train = np.sqrt(metrics.mean_squared_error(Y_train, model.predict(X_train)))
    rmse_test = np.sqrt(metrics.mean_squared_error(Y_test, model.predict(X_test)))

    return {
        'formula': formula,
        'intercept': model.intercept_,
        'rmse_train': rmse_train,
        'rmse_test': rmse_test,
        # R-square on the training and testing splits
        'rsquared_train': model.score(X_train, Y_train),
        'rsquared_test': model.score(X_test, Y_test),
    }

In [8]:
# Train and test the ridge regression (alpha = 0.1)
result = train_test_ridge(data, alpha_value=0.1)

In [9]:
# Display the result dictionary returned by train_test_ridge
result

{'formula': season         -2.644511
 holiday        -0.187192
 workingday      3.602478
 weather       -24.457755
 temp            2.633739
 atemp           2.680801
 humidity       -0.717789
 windspeed      -0.548883
 year           86.747894
 month           8.384052
 hour            4.331612
 season_2       22.823900
 season_3      -11.585274
 season_4       -0.765954
 daytime       140.853699
 hour_1        -24.092058
 hour_2        -35.758448
 hour_3        -53.349607
 hour_4        -60.911974
 hour_5        -45.178525
 hour_6          5.419792
 hour_7         -2.819784
 hour_8        144.635743
 hour_9        -19.253897
 hour_10       -74.057873
 hour_11       -56.503838
 hour_12       -14.484937
 hour_13       -25.203273
 hour_14       -43.032744
 hour_15       -43.953162
 hour_16        19.865097
 hour_17       174.271810
 hour_18       131.144534
 hour_19        20.719563
 hour_20       -70.473541
 hour_21        17.001661
 hour_22       -25.495563
 hour_23       -67.961303
 

In [10]:
def train_test_lasso(data, alpha_value, test_size=0.3, random_state=123):
    """Fit a lasso regression on a train/test split and summarise its fit.

    Every column of ``data`` except ``datetime``, ``casual``,
    ``registered`` and ``total`` is used as a predictor; ``total`` is the
    target.

    Parameters
    ----------
    data : pandas.DataFrame
        Data containing the four columns named above plus the predictors.
    alpha_value : float
        Passed to ``Lasso`` as ``alpha``.
    test_size : float, optional
        Passed to ``train_test_split`` as ``test_size``. Defaults to 0.3,
        the value that used to be hard-coded.
    random_state : int, optional
        Passed to ``train_test_split`` as ``random_state``. Defaults to 123,
        the value that used to be hard-coded.

    Returns
    -------
    dict
        ``'formula'`` (pandas.Series of coefficients indexed by feature
        name), ``'intercept'``, ``'rmse_train'``, ``'rmse_test'``,
        ``'rsquared_train'`` and ``'rsquared_test'``.
    """
    X = data.drop(['datetime', 'casual', 'registered', 'total'], axis=1)
    Y = data['total']
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    model = Lasso(alpha=alpha_value)
    model.fit(X_train, Y_train)

    # Pair every feature with its fitted coefficient
    formula = pd.Series(model.coef_, index=list(X.columns))

    # RMSE is the square root of the mean squared error on each split
    rmse_train = np.sqrt(metrics.mean_squared_error(Y_train, model.predict(X_train)))
    rmse_test = np.sqrt(metrics.mean_squared_error(Y_test, model.predict(X_test)))

    return {
        'formula': formula,
        'intercept': model.intercept_,
        'rmse_train': rmse_train,
        'rmse_test': rmse_test,
        # R-square on the training and testing splits
        'rsquared_train': model.score(X_train, Y_train),
        'rsquared_test': model.score(X_test, Y_test),
    }

In [13]:
# Train and test the lasso regression (alpha = 0.1)
# In lasso, the larger alpha gets, the more variables end up with a coefficient of 0
result = train_test_lasso(data, alpha_value=0.1)

In [12]:
# Display the result dictionary returned by train_test_lasso
result

{'formula': season         -1.118508
 holiday        -0.000000
 workingday      3.143370
 weather       -24.069284
 temp            2.647267
 atemp           2.660164
 humidity       -0.727897
 windspeed      -0.547569
 year           86.293023
 month           7.805339
 hour            3.530808
 season_2       22.837538
 season_3      -10.490398
 season_4        0.000000
 daytime       136.388611
 hour_1        -13.337234
 hour_2        -24.053905
 hour_3        -40.848064
 hour_4        -47.640757
 hour_5        -31.120189
 hour_6         15.524011
 hour_7         12.668550
 hour_8        160.830004
 hour_9         -0.000000
 hour_10       -51.624457
 hour_11       -33.323522
 hour_12         4.643776
 hour_13        -0.447771
 hour_14       -17.510896
 hour_15       -17.623873
 hour_16        42.110149
 hour_17       197.513982
 hour_18       155.005382
 hour_19        45.429748
 hour_20       -40.123880
 hour_21        39.051951
 hour_22        -0.000000
 hour_23       -39.726194
 

### 파악할 부분
- Ridge regression과 Lasso regression의 결과와 앞서 학습한 일반 선형회귀모델(LinearRegression)의 결과를 비교해보세요.
- 위의 Ridge regression과 Lasso regression에서 alpha값을 변형해가면서 결과가 달라지는지 살펴보세요.