### 변수와 상수
- 상수는 변하지 않는 수(컬럼)
- 변수는 변하는 수(컬럼)
- 머신러닝에서 학습을 위해 주어지는 데이터는 변수에 해당하고 학습을 통해 찾아야 할 기울기, 절편 등은 상수에 해당한다.

### 독립변수와 종속변수
- 독립 변수는 다른 컬럼의 결과에 영향을 최소로 받는 것(입력 데이터)
- 종속 변수는 다른 컬럼의 결과에 영향을 많이 받는 것(결과 데이터) 

### 회귀분석에서의 종속관계
- 회귀분석에서 입력 데이터의 모든 컬럼은 결과 데이터와의 종속 관계가 매우 높아야 한다.
- 하지만 입력 데이터의 모든 컬럼간의 종속 관계는 매우 낮아야 한다.
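- 입력 컬럼들 사이의 상관 관계를 확인하여 서로 상관 관계가 높은 컬럼은 제외해야 하는데, 이를 판단하는 지표가 VIF(Variance Inflation Factor, 분산 팽창 계수)이다. 어떤 컬럼을 나머지 입력 컬럼들로 회귀했을 때의 결정계수를 $R_i^2$ 라 하면 $VIF_i = 1 / (1 - R_i^2)$ 이며, 보통 10 이상이면 다중공선성이 높다고 본다.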

In [1]:
# 기본
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning for the rest of the session so that library
# notices do not clutter the notebook output.
import warnings
warnings.simplefilter('ignore')

# Plot settings
# Pick a Korean-capable font by operating system so this cell does not have
# to be edited by hand when the notebook moves between machines.
import platform

plt.rcParams['font.family'] = {
    'Windows': 'Malgun Gothic',
    'Darwin': 'AppleGothic',
    # NOTE(review): NanumGothic is not bundled with every Linux distro;
    # install it if Korean glyphs still render as empty boxes.
    'Linux': 'NanumGothic',
}.get(platform.system(), 'AppleGothic')  # unknown OS: keep the previous default
plt.rcParams['font.size'] = 16
plt.rcParams['figure.figsize'] = 20, 10
# Draw negative numbers with an ASCII hyphen instead of the Unicode minus
# sign, which the selected font may not contain.
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 하이퍼 파라미터 튜닝
from sklearn.model_selection import GridSearchCV

# 분류 알고리즘
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 회귀 알고리즘
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 분류용 평가 함수
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용 평가 함수
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 상관관계
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Alternative dataset, left disabled:
# df1 = pd.read_csv('data/boston.csv')
# df1

# Load the house-price sheet.
df1 = pd.read_excel('data/house_price.xlsx')

# Take the price column out of the frame; it is re-attached below as the
# last column under the name 'target'.
a1 = df1.pop('price')

# Remove the date, street and statezip columns.
df1.drop(columns=['date', 'street', 'statezip'], inplace=True)
df1['target'] = a1

# Replace the city labels with integer codes.
encoder1 = LabelEncoder()
df1['city'] = encoder1.fit_transform(df1['city'])

# Replace the country labels with integer codes.
encoder2 = LabelEncoder()
df1['country'] = encoder2.fit_transform(df1['country'])


df1

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,country,target
0,3,1.50,1340,7912,1.5,0,0,3,1340,0,1955,2005,36,0,3.130000e+05
1,5,2.50,3650,9050,2.0,0,4,5,3370,280,1921,0,35,0,2.384000e+06
2,3,2.00,1930,11947,1.0,0,0,4,1930,0,1966,0,18,0,3.420000e+05
3,3,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,3,0,4.200000e+05
4,4,2.50,1940,10500,1.0,0,0,4,1140,800,1976,1992,31,0,5.500000e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4545,4,2.75,4230,31747,2.0,0,0,4,4230,0,1985,0,42,0,4.051250e+05
4546,3,1.75,1570,15330,1.0,0,0,3,1080,490,1956,2001,6,0,2.899875e+05
4547,3,2.50,1630,7700,1.0,0,0,3,1120,510,1978,0,18,0,2.642700e+05
4548,4,2.50,2770,10274,2.0,0,0,3,2770,0,1989,0,16,0,6.027610e+05


In [3]:
# Feature-only frame: everything in df1 except the target column.
df2 = df1.drop(columns='target')
df2

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,country
0,3,1.50,1340,7912,1.5,0,0,3,1340,0,1955,2005,36,0
1,5,2.50,3650,9050,2.0,0,4,5,3370,280,1921,0,35,0
2,3,2.00,1930,11947,1.0,0,0,4,1930,0,1966,0,18,0
3,3,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,3,0
4,4,2.50,1940,10500,1.0,0,0,4,1140,800,1976,1992,31,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4545,4,2.75,4230,31747,2.0,0,0,4,4230,0,1985,0,42,0
4546,3,1.75,1570,15330,1.0,0,0,3,1080,490,1956,2001,6,0
4547,3,2.50,1630,7700,1.0,0,0,3,1120,510,1978,0,18,0
4548,4,2.50,2770,10274,2.0,0,0,3,2770,0,1989,0,16,0


In [4]:
# sns.pairplot(df2)
# plt.show()

In [5]:
# Append a column of ones named 'intercept'; it acts as the constant term
# of the OLS model fitted below.
df1['intercept'] = 1

# Ordinary least squares: regress the target on every other column of df1.
lm = sm.OLS(endog=df1['target'], exog=df1.drop(columns='target'))

results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.224
Model:,OLS,Adj. R-squared:,0.222
Method:,Least Squares,F-statistic:,109.2
Date:,"Thu, 24 Feb 2022",Prob (F-statistic):,2.55e-239
Time:,13:48:53,Log-Likelihood:,-66148.0
No. Observations:,4550,AIC:,132300.0
Df Residuals:,4537,BIC:,132400.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
bedrooms,-5.845e+04,1.06e+04,-5.513,0.000,-7.92e+04,-3.77e+04
bathrooms,6.026e+04,1.71e+04,3.520,0.000,2.67e+04,9.38e+04
sqft_living,164.7937,9.812,16.795,0.000,145.558,184.030
sqft_lot,-0.7166,0.213,-3.365,0.001,-1.134,-0.299
floors,2.556e+04,1.93e+04,1.323,0.186,-1.23e+04,6.34e+04
waterfront,3.776e+05,9.51e+04,3.971,0.000,1.91e+05,5.64e+05
view,4.953e+04,1.11e+04,4.474,0.000,2.78e+04,7.12e+04
condition,3.612e+04,1.31e+04,2.747,0.006,1.03e+04,6.19e+04
sqft_above,104.9221,9.852,10.649,0.000,85.607,124.238

0,1,2,3
Omnibus:,12766.76,Durbin-Watson:,1.981
Prob(Omnibus):,0.0,Jarque-Bera (JB):,610708373.762
Skew:,35.979,Prob(JB):,0.0
Kurtosis:,1796.36,Cond. No.,inf


- Adj. R-squared : 수정된 결정계수(adjusted R²). 입력 컬럼들이 결과 데이터의 변동을 얼마나 설명하는지를 나타내는 수치로, 입력 컬럼의 개수에 따른 보정이 들어가 있다. 1과 가까울수록 좋다.
- P>|t| : P Value. 해당 컬럼의 회귀계수가 실제로는 0(영향 없음)인데도 이 정도 크기의 t 값이 우연히 나올 확률. 0에 가까울수록 해당 컬럼이 통계적으로 유의하며, 보통 0.05 미만을 기준으로 삼는다.

### VIF 계수

In [6]:
# Build a patsy formula of the form "target ~ col1+col2+...":
# the result column goes to the left of '~', the input columns go to the
# right, separated by '+'.
# dmatrices returns the left-hand side as y and the right-hand side as X.
# The first column of X is the constant term (named 'Intercept').

a1 = df1.columns.drop(['target', 'intercept'])
a2 = '+'.join(a1)

y, X = dmatrices('target ~ ' + a2, data=df1, return_type='dataframe')
display(y)
display(X)

Unnamed: 0,target
0,3.130000e+05
1,2.384000e+06
2,3.420000e+05
3,4.200000e+05
4,5.500000e+05
...,...
4545,4.051250e+05
4546,2.899875e+05
4547,2.642700e+05
4548,6.027610e+05


Unnamed: 0,Intercept,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,country
0,1.0,3.0,1.50,1340.0,7912.0,1.5,0.0,0.0,3.0,1340.0,0.0,1955.0,2005.0,36.0,0.0
1,1.0,5.0,2.50,3650.0,9050.0,2.0,0.0,4.0,5.0,3370.0,280.0,1921.0,0.0,35.0,0.0
2,1.0,3.0,2.00,1930.0,11947.0,1.0,0.0,0.0,4.0,1930.0,0.0,1966.0,0.0,18.0,0.0
3,1.0,3.0,2.25,2000.0,8030.0,1.0,0.0,0.0,4.0,1000.0,1000.0,1963.0,0.0,3.0,0.0
4,1.0,4.0,2.50,1940.0,10500.0,1.0,0.0,0.0,4.0,1140.0,800.0,1976.0,1992.0,31.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4545,1.0,4.0,2.75,4230.0,31747.0,2.0,0.0,0.0,4.0,4230.0,0.0,1985.0,0.0,42.0,0.0
4546,1.0,3.0,1.75,1570.0,15330.0,1.0,0.0,0.0,3.0,1080.0,490.0,1956.0,2001.0,6.0,0.0
4547,1.0,3.0,2.50,1630.0,7700.0,1.0,0.0,0.0,3.0,1120.0,510.0,1978.0,0.0,18.0,0.0
4548,1.0,4.0,2.50,2770.0,10274.0,2.0,0.0,0.0,3.0,2770.0,0.0,1989.0,0.0,16.0,0.0


In [7]:
# Variance inflation factor of every column of the design matrix X.
# Labels are taken from X itself, the matrix that is actually scored, so
# each score stays attached to its own column even if df1's column order
# changes. The constant column (named 'Intercept' in X) is reported as
# 'intercept'.
vif = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    'features': ['intercept' if name == 'Intercept' else name for name in X.columns],
})
vif

Unnamed: 0,VIF Factor,features
0,9298.303912,intercept
1,1.692676,bedrooms
2,3.266551,bathrooms
3,inf,sqft_living
4,1.077202,sqft_lot
5,1.967472,floors
6,1.153493,waterfront
7,1.341704,view
8,1.446659,condition
9,inf,sqft_above
