# 보스톤 다중선형회귀

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Training set: load the Boston training split from CSV.
df_train = pd.read_csv('../static/data/regression/boston_train.csv')
df_train.head(3)  # preview the first three rows

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,6.53876,0.0,18.1,1.0,0.631,7.016,97.5,1.2024,24.0,666.0,20.2,392.05,2.96,50.0
1,0.0187,85.0,4.15,0.0,0.429,6.516,27.7,8.5353,4.0,351.0,17.9,392.43,6.36,23.1
2,9.82349,0.0,18.1,0.0,0.671,6.794,98.8,1.358,24.0,666.0,20.2,396.9,21.24,13.3


In [4]:
# Show the column labels of the training frame (the last one is `target`).
df_train.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'target'],
      dtype='object')

target과 임의의 feature에 대한 회귀식<br>

In [6]:
# Features received from the client; multiple features may be selected.
# Here the selection is hard-coded: every column of the training frame
# except the last four (PTRATIO, B, LSTAT and target).
# NOTE: the frame is named `df_train`; no `df` is defined in this notebook.
feature_list = list(df_train.columns[:-4])
print(feature_list)
# Display the training data restricted to the selected features.
df_train[feature_list]

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX']


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX
0,6.53876,0.0,18.10,1.0,0.631,7.016,97.5,1.2024,24.0,666.0
1,0.01870,85.0,4.15,0.0,0.429,6.516,27.7,8.5353,4.0,351.0
2,9.82349,0.0,18.10,0.0,0.671,6.794,98.8,1.3580,24.0,666.0
3,8.98296,0.0,18.10,1.0,0.770,6.212,97.4,2.1222,24.0,666.0
4,0.15445,25.0,5.13,0.0,0.453,6.145,29.2,7.8148,8.0,284.0
...,...,...,...,...,...,...,...,...,...,...
374,0.26363,0.0,8.56,0.0,0.520,6.229,91.2,2.5451,5.0,384.0
375,0.32543,0.0,21.89,0.0,0.624,6.431,98.8,1.8125,4.0,437.0
376,0.01432,100.0,1.32,0.0,0.411,6.816,40.5,8.3248,5.0,256.0
377,0.01301,35.0,1.52,0.0,0.442,7.241,49.3,7.0379,1.0,284.0


scikit-learn은 DataFrame도 입력으로 받지만, 여기서는 뒤에서 `np.dot`으로 예측값을 직접 계산하므로 데이터를 numpy array로 바꿔서 fit한다.

In [21]:
# Build the training arrays from `df_train` (no `df` exists in this notebook).
# X_train: one row per sample, one column per selected feature.
X_train = df_train[feature_list].values
# reshape(-1, 1) turns the target into a column vector; with a 2-D target
# the fitted coef_ is 2-D as well, which is why later cells use `weight.T`.
y_train = df_train['target'].values.reshape(-1, 1)
# Sanity-check the shapes: (n_samples, n_features) and (n_samples, 1).
X_train.shape, y_train.shape

((379, 10), (379, 1))

In [22]:
from sklearn.linear_model import LinearRegression
# Unfitted linear regression estimator with default hyperparameters.
lr = LinearRegression()

In [23]:
# Fit the model on the training arrays; fit() returns the estimator itself,
# which is what the notebook displays as the cell result.
lr.fit(X_train, y_train)

LinearRegression()

In [24]:
# Learned parameters of the fitted model: the per-feature coefficients
# (2-D here because the target was fitted as a column vector) and the intercept.
weight = lr.coef_
bias = lr.intercept_
for param in (weight, bias):
    print(param)

[[ -0.19878607   0.07199088  -0.12275418   5.84042453 -13.86027594
    6.69673948  -0.05275528  -1.83295231   0.21417975  -0.01526654]]
[3.99963004]


In [25]:
# Test set: load the Boston test split from CSV.
df_test = pd.read_csv('../static/data/regression/boston_test.csv')
df_test.head(3)  # preview the first three rows

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.17446,0.0,10.59,1.0,0.489,5.96,92.1,3.8771,4.0,277.0,18.6,393.25,17.27,21.7
1,0.75026,0.0,8.14,0.0,0.538,5.924,94.1,4.3996,4.0,307.0,21.0,394.33,16.3,15.6
2,0.09744,0.0,5.96,0.0,0.499,5.841,61.4,3.3779,5.0,279.0,19.2,377.56,11.41,20.0


In [26]:
# Row index of the test sample, as received from the client (hard-coded here).
index = 20

In [27]:
# Convert the selected feature columns of the test frame to a 2-D NumPy array.
df_test[feature_list].values

array([[1.74460e-01, 0.00000e+00, 1.05900e+01, ..., 3.87710e+00,
        4.00000e+00, 2.77000e+02],
       [7.50260e-01, 0.00000e+00, 8.14000e+00, ..., 4.39960e+00,
        4.00000e+00, 3.07000e+02],
       [9.74400e-02, 0.00000e+00, 5.96000e+00, ..., 3.37790e+00,
        5.00000e+00, 2.79000e+02],
       ...,
       [7.24400e-02, 6.00000e+01, 1.69000e+00, ..., 1.07103e+01,
        4.00000e+00, 4.11000e+02],
       [1.71200e-01, 0.00000e+00, 8.56000e+00, ..., 2.21100e+00,
        5.00000e+00, 3.84000e+02],
       [5.29305e+00, 0.00000e+00, 1.81000e+01, ..., 2.16780e+00,
        2.40000e+01, 6.66000e+02]])

In [28]:
# Pick the client-selected test sample by *position* for both the features
# and the target, so the two always refer to the same row even if the frame's
# index labels are not 0..n-1.
X_test = df_test[feature_list].values[index, :]
y_test = df_test['target'].iloc[index]
# Show the 1-D feature vector together with its true target value.
X_test, y_test

(array([6.1510e-02, 0.0000e+00, 5.1900e+00, 0.0000e+00, 5.1500e-01,
        5.9680e+00, 5.8500e+01, 4.8122e+00, 5.0000e+00, 2.2400e+02]),
 18.7)

In [31]:
# Option 1: evaluate the regression equation by hand (w . x + b).
# `weight` is 2-D, so it is transposed to line up with the 1-D sample.
weighted_sum = X_test.dot(weight.T)
pred = weighted_sum + bias
pred[0]

21.922884590302807

In [32]:
# Option 2: let the fitted estimator compute the prediction.
# predict() takes a 2-D input, so the single sample is passed as one row.
tmp = lr.predict(X_test[np.newaxis, :])
# Round the prediction for this sample to two decimal places.
pred = tmp[0].round(2)
pred[0]

21.92

In [35]:
# Feature names and values of the selected test row, as a dict.
# NOTE: `:-1` keeps every column except the last one (`target`), so the dict
# also contains columns that are NOT in `feature_list` (PTRATIO, B, LSTAT),
# i.e. more than the features actually used to fit the regression.
df_test.iloc[index, :-1].to_dict()

{'CRIM': 0.06151,
 'ZN': 0.0,
 'INDUS': 5.19,
 'CHAS': 0.0,
 'NOX': 0.515,
 'RM': 5.968,
 'AGE': 58.5,
 'DIS': 4.8122,
 'RAD': 5.0,
 'TAX': 224.0,
 'PTRATIO': 20.2,
 'B': 396.9,
 'LSTAT': 9.29}