## 다중 (선형)회귀분석
연속형 종속변수와 두 개 이상의 독립변수 간 선형관계 및 설명력을 확인하는 기법

필요 시 모델 성능 향상을 위해 파생변수를 생성하고, 생성 전후의 모델 성능을 비교

명목형 변수가 독립변수인 경우 가변수(pd.get_dummies) 변환 후 모델 적합

### 다중 공선성 문제
독립변수 간 강한 상관관계가 나타나는 문제

독립변수 간 상관계수를 확인하여 그 절댓값이 높은 변수 쌍 중 하나를 사전에 제거

회귀 모델 생성 이후 분산 팽창 계수(VIF)를 확인하여, 값이 10 이상인 변수는 다중 공선성이 있다고 보고 제거 등으로 처리

#### patsy - dmatrices()
수식을 기반으로 데이터 행렬을 생성하는 라이브러리 patsy의 함수

분산 팽창 계수(VIF) 확인을 위해 입력 데이터를 전처리할 때 필요한 함수

return_type 인자를 'dataframe'으로 설정 시 후처리 용이

#### statsmodels - variance_inflation_factor()
분산 팽창 계수를 연산하기 위한 statsmodels 함수

분산 팽창 계수 연산을 위해 반복문 또는 list comprehension 사용

In [1]:
import pandas as pd
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [22]:
# Load the bike dataset from ex/bike.csv and preview its first two rows.
df = pd.read_csv(filepath_or_buffer='ex/bike.csv')
df.head(n=2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [7]:
# Keep every row and the columns from 'season' through 'casual'.
# Label-based slicing with .loc includes both endpoints, so 'casual' is kept.
df_sub = df.loc[:, slice('season', 'casual')]
df_sub.head(n=2)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual
0,1,0,0,1,9.84,14.395,81,0.0,3
1,1,0,0,1,9.02,13.635,80,0.0,8


In [9]:
# A patsy formula has the form "response ~ predictor1 + predictor2 + predictor3":
# the response goes before "~", the predictors after it, joined with "+".
# Instead of typing it by hand, assemble the right-hand side with str.join.
# Every column of df_sub except the last one is used as a predictor.
predictor_terms = " + ".join(df_sub.columns[:-1])
formula = "casual ~ " + predictor_terms
y, X = dmatrices(formula, data=df_sub, return_type='dataframe')

In [10]:
# Preview the first two rows of y, the left-hand-side matrix returned by dmatrices.
y.head(n=2)

Unnamed: 0,casual
0,3.0
1,8.0


In [11]:
# Preview the first two rows of X, the right-hand-side design matrix returned by dmatrices.
X.head(n=2)

Unnamed: 0,Intercept,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,1.0,1.0,0.0,0.0,1.0,9.84,14.395,81.0,0.0
1,1.0,1.0,0.0,0.0,1.0,9.02,13.635,80.0,0.0


In [13]:
# Build a table holding one VIF per column of the design matrix X.
# vif() takes the whole matrix plus the position of the column to evaluate,
# so it is called once for each column position.
design_values = X.values
df_vif = pd.DataFrame({
    'colname': X.columns,
    'VIF': [vif(design_values, col_pos) for col_pos in range(len(X.columns))],
})
df_vif

Unnamed: 0,colname,VIF
0,Intercept,34.029472
1,season,1.137211
2,holiday,1.069731
3,workingday,1.071196
4,weather,1.23615
5,temp,35.516012
6,atemp,35.550831
7,humidity,1.425034
8,windspeed,1.195704


In [18]:
# Rebuild df_sub from two column ranges of df ('season'..'temp' and
# 'humidity'..'casual'), placed side by side; columns lying between
# 'temp' and 'humidity' in df are therefore left out.
cols_up_to_temp = df.loc[:, 'season':'temp']
cols_from_humidity = df.loc[:, 'humidity':'casual']
df_sub = pd.concat([cols_up_to_temp, cols_from_humidity], axis=1)
df_sub.head(n=2)

Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,casual
0,1,0,0,1,9.84,81,0.0,3
1,1,0,0,1,9.02,80,0.0,8


In [19]:
# Compare VIFs: rebuild the formula and design matrix from the current df_sub,
# then recompute the VIF of every design-matrix column.
predictor_terms = " + ".join(df_sub.columns[:-1])
formula = "casual ~ " + predictor_terms
y, X = dmatrices(formula, data=df_sub, return_type='dataframe')

design_values = X.values
df_vif = pd.DataFrame({
    'colname': X.columns,
    'VIF': [vif(design_values, col_pos) for col_pos in range(len(X.columns))],
})
df_vif

Unnamed: 0,colname,VIF
0,Intercept,31.375118
1,season,1.136866
2,holiday,1.068094
3,workingday,1.070025
4,weather,1.235251
5,temp,1.089028
6,humidity,1.421256
7,windspeed,1.14965


In [20]:
# temp VIF: 35 -> 1.xxx
# The multicollinearity problem is resolved (Intercept is the intercept term, so its VIF is ignored)

In [24]:
# Pairwise correlations between the numeric columns of df, rounded to two decimals.
corr_matrix = df.corr(numeric_only=True)
corr_matrix.round(decimals=2)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
season,1.0,0.03,-0.01,0.01,0.26,0.26,0.19,-0.15,0.1,0.16,0.16
holiday,0.03,1.0,-0.25,-0.01,0.0,-0.01,0.0,0.01,0.04,-0.02,-0.01
workingday,-0.01,-0.25,1.0,0.03,0.03,0.02,-0.01,0.01,-0.32,0.12,0.01
weather,0.01,-0.01,0.03,1.0,-0.06,-0.06,0.41,0.01,-0.14,-0.11,-0.13
temp,0.26,0.0,0.03,-0.06,1.0,0.98,-0.06,-0.02,0.47,0.32,0.39
atemp,0.26,-0.01,0.02,-0.06,0.98,1.0,-0.04,-0.06,0.46,0.31,0.39
humidity,0.19,0.0,-0.01,0.41,-0.06,-0.04,1.0,-0.32,-0.35,-0.27,-0.32
windspeed,-0.15,0.01,0.01,0.01,-0.02,-0.06,-0.32,1.0,0.09,0.09,0.1
casual,0.1,0.04,-0.32,-0.14,0.47,0.46,-0.35,0.09,1.0,0.5,0.69
registered,0.16,-0.02,0.12,-0.11,0.32,0.31,-0.27,0.09,0.5,1.0,0.97
