# 2. Multiple Linear Regression

## 원-핫 인코딩

In [1]:
import pandas as pd

In [2]:
# Load the data set from the CSV file and preview its first five rows.
dataset = pd.read_csv('MultipleLinearRegressionData.csv')
dataset.head(n=5)

Unnamed: 0,hour,absent,place,score
0,0.5,3,Home,10
1,1.2,4,Library,8
2,1.8,2,Cafe,14
3,2.4,0,Cafe,26
4,2.6,2,Home,22


In [3]:
# Separate features from the target: every column except the last one forms
# the feature matrix X, and the last column is the target vector y.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [4]:
# Display the feature matrix (every column of the data set except the last).
X

array([[0.5, 3, 'Home'],
       [1.2, 4, 'Library'],
       [1.8, 2, 'Cafe'],
       [2.4, 0, 'Cafe'],
       [2.6, 2, 'Home'],
       [3.2, 0, 'Home'],
       [3.9, 0, 'Library'],
       [4.4, 0, 'Library'],
       [4.5, 5, 'Home'],
       [5.0, 1, 'Cafe'],
       [5.3, 2, 'Cafe'],
       [5.8, 0, 'Cafe'],
       [6.0, 3, 'Library'],
       [6.1, 1, 'Cafe'],
       [6.2, 1, 'Library'],
       [6.9, 4, 'Home'],
       [7.2, 2, 'Cafe'],
       [8.4, 1, 'Home'],
       [8.6, 1, 'Library'],
       [10.0, 0, 'Library']], dtype=object)

In [5]:
# Display the target vector (the last column of the data set).
y

array([ 10,   8,  14,  26,  22,  30,  42,  48,  38,  58,  60,  72,  62,
        68,  72,  58,  76,  86,  90, 100])

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at index 2 and pass the remaining
# columns through unchanged. drop='first' omits one category, so the encoded
# values occupy positions 0 and 1 of every row of the result:
#   1 0 -> Home
#   0 1 -> Library
#   0 0 -> Cafe (the dropped category)
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), [2])],
    remainder='passthrough',
)
X = ct.fit_transform(X)
X

array([[1.0, 0.0, 0.5, 3],
       [0.0, 1.0, 1.2, 4],
       [0.0, 0.0, 1.8, 2],
       [0.0, 0.0, 2.4, 0],
       [1.0, 0.0, 2.6, 2],
       [1.0, 0.0, 3.2, 0],
       [0.0, 1.0, 3.9, 0],
       [0.0, 1.0, 4.4, 0],
       [1.0, 0.0, 4.5, 5],
       [0.0, 0.0, 5.0, 1],
       [0.0, 0.0, 5.3, 2],
       [0.0, 0.0, 5.8, 0],
       [0.0, 1.0, 6.0, 3],
       [0.0, 0.0, 6.1, 1],
       [0.0, 1.0, 6.2, 1],
       [1.0, 0.0, 6.9, 4],
       [0.0, 0.0, 7.2, 2],
       [1.0, 0.0, 8.4, 1],
       [0.0, 1.0, 8.6, 1],
       [0.0, 1.0, 10.0, 0]], dtype=object)

## 데이터 세트 분리

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set; random_state=0 keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## 학습 (다중 선형 회귀)

In [8]:
from sklearn.linear_model import LinearRegression
# Create the linear regression model and fit it on the training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

## 예측값과 실제값 비교 (테스트 세트)

In [10]:
# Predict the target for the held-out test samples and display the result.
y_pred = lr.predict(X=X_test)
y_pred

array([ 92.15457859,  10.23753043, 108.36245302,  38.14675204])

In [11]:
# Display the actual test-set targets for comparison with y_pred.
y_test

array([ 90,   8, 100,  38])

* 일부 예측값은 실제값과 근사하지만, score 값이 100을 넘는 경우(108.36)처럼 예측이 잘못된 경우도 있음.

In [16]:
# Show the slope (coefficient) values of the fitted regression equation,
# one per feature column.
lr.coef_

array([-5.82712824, -1.04450647, 10.40419528, -1.64200104])

In [17]:
# Show the intercept of the fitted regression equation.
lr.intercept_

5.365006706544754

## 모델 평가

In [18]:
# Evaluate the model on the training set (R^2 score).
lr.score(X=X_train, y=y_train)

0.9623352565265527

In [19]:
# Evaluate the model on the test set (R^2 score).
lr.score(X=X_test, y=y_test)

0.9859956178877445

## 다양한 모델 평가 지표
#### 1. MAE (Mean Absolute Error): 실제 값과 예측 값 차이의 절대값의 평균
#### 2. MSE (Mean Squared Error): 실제 값과 예측 값 차이의 제곱의 평균
#### 3. RMSE (Root Mean Squared Error): MSE의 제곱근
#### 4. R^2: 결정 계수

In [20]:
# MAE: mean absolute error between the actual and predicted test targets.
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true=y_test, y_pred=y_pred)

3.2253285188287926

In [21]:
# MSE: mean squared error between the actual and predicted test targets.
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true=y_test, y_pred=y_pred)

19.90022698151505

In [23]:
# RMSE: root mean squared error, i.e. the square root of the MSE.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the square root is taken explicitly
# here; this form works on every scikit-learn version. (On 1.4 and later,
# sklearn.metrics.root_mean_squared_error is the dedicated function.)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred) ** 0.5

4.4609670455535815

In [24]:
# R^2: coefficient of determination of the test-set predictions.
from sklearn.metrics import r2_score
r2_score(y_true=y_test, y_pred=y_pred)

0.9859956178877445