# 집값 예측(Linear Regression)
---

In [418]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import myutils as my

### 데이터 준비

In [419]:
from sklearn.datasets import fetch_california_housing # fetch라는 것은 sklearn에 포함되어있는게 아니고 인터넷에서 다운받아서 그걸 불러일으켜줌.

housing = fetch_california_housing()
type(housing)

sklearn.utils._bunch.Bunch

In [420]:
housing.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [421]:
df = pd.DataFrame(housing.data,
                  columns = housing.feature_names)
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [422]:
df['target'] = housing.target

In [423]:
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [424]:
df.shape

(20640, 9)

In [425]:
# housing.DESCR

In [426]:
df.isna().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
target        0
dtype: int64

In [427]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   target      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [428]:
# df.astype('int')

In [429]:
df.duplicated().sum() # 중복치 없음

0

In [430]:
x = df[['MedInc','HouseAge','AveRooms']]
y = df['target']

In [431]:
# 테스트 데이터 분리
# 스케일링 x
# gkrtmq
# RMSE

In [432]:
# 스케일링
# 학슴
# RMSE

In [433]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 2022)

In [434]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(x_train, y_train) # 선형 회귀로 컴퓨터에 학습시킨다.

In [435]:
lr.score(x_train, y_train) # svm.score = 정확도, lr.score = 0~1사이의 R2값인데, 높을수록 좋다.

0.5147637855350439

In [436]:
lr.coef_, lr.intercept_

(array([ 0.44622735,  0.01654159, -0.03686961]), 0.07072991132201079)

In [437]:
y_pred = lr.predict(x_train)

In [438]:
from sklearn.metrics import mean_squared_error, mean_absolute_error # mse 오차의 제곱의 평균

# MSE
mse = mean_squared_error(y_train, y_pred)
mse

0.6504991625699948

In [439]:
rmse = np.sqrt(mse)
rmse

0.8065352829045949

In [440]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [441]:
scaler.fit(x_train) # x_train_s = scaler.fit_transform(x_train)
x_train_s = scaler.transform(x_train)
y_train = y_train.values

In [442]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(x_train_s, y_train)

In [443]:
y_pred = lr.predict(x_train_s)

In [444]:
mse = mean_squared_error(y_train, y_pred)
mse

0.6504991625699948

In [445]:
from sklearn.model_selection import cross_val_score

mse = cross_val_score(lr, x_test, y_test,
                scoring='neg_mean_squared_error',
               cv=3) 
# cv 는 3등분으로 나눈다는 뜻
# neg_를 쓰는 이유는 mse는 낮을수록 높은건데,
# 여기서의 scoring은 높을수록 좋아야하므로 붙여준다.

In [446]:
np.mean(np.sqrt(-mse))

0.8108427192491199

In [447]:
# x_train, y_train
# x_test, y_test
# 스케일링까지 되어있는 상태...

In [448]:
### 결정트리
from sklearn.tree import DecisionTreeRegressor

clf = DecisionTreeRegressor(random_state = 2022)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_train)

In [449]:
mse = cross_val_score(clf, x_test, y_test,
                scoring='neg_mean_squared_error',
               cv=3) 
mse
np.mean(np.sqrt(-mse))

1.038232836911093

In [450]:
np.mean(np.sqrt(-mse))

1.038232836911093

In [451]:
### RandomForest
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(random_state = 2022)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_train)

In [452]:
mse = cross_val_score(clf, x_test, y_test,
                scoring='neg_mean_squared_error',
               cv=3) 
mse
np.mean(np.sqrt(-mse))

0.788670759943944

In [453]:
### Support Vector Machine Regression
from sklearn.svm import SVR

clf = SVR(kernel='poly')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_train)

In [454]:
mse = cross_val_score(clf, x_test, y_test,
                scoring='neg_mean_squared_error',
               cv=3) 
mse
np.mean(np.sqrt(-mse)) 
# 위에서의 rmse계산법은 cross validation, 데이터를 cv값에 따라 등분하여 테스트한 검증값이다.
# rmse = np.sqrt(mean_squared_error(y_train, y_pred))
# rmse

0.9979032749691248

In [456]:
# sklearn.linear_model_lasso
# sklearn.linear_model_