# 1. Data Fetching

In [1]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/rickiepark/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [2]:
fetch_housing_data()

In [3]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [4]:
df = load_housing_data()

In [5]:
hw = df.copy()

In [6]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# 2. Data Preprocessing

In [7]:
# 결측치 확인 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [8]:
# total_bedrooms 에 대한 결측치 존재>> imputer를 사용하여 결측치에 대한 보간 진행 

In [9]:
# Scikit-Learn class to deal with missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

In [10]:
df['total_bedrooms_filled'] = imputer.fit_transform(df.total_bedrooms.to_frame())

In [11]:
# 결측지 확인 : 신규 컬럼(total_bedrooms_filled) 에 결측치가 존재하는지 확인
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   longitude              20640 non-null  float64
 1   latitude               20640 non-null  float64
 2   housing_median_age     20640 non-null  float64
 3   total_rooms            20640 non-null  float64
 4   total_bedrooms         20433 non-null  float64
 5   population             20640 non-null  float64
 6   households             20640 non-null  float64
 7   median_income          20640 non-null  float64
 8   median_house_value     20640 non-null  float64
 9   ocean_proximity        20640 non-null  object 
 10  total_bedrooms_filled  20640 non-null  float64
dtypes: float64(10), object(1)
memory usage: 1.7+ MB


In [12]:
# 결측치 확인 2 : 결측치에 대하여 중앙값이 정상적으로 입력되었는지 확인 
# df[['total_bedrooms_filled', 'total_bedrooms']]

df.loc[df['total_bedrooms'].isna(), 'total_bedrooms_filled']

290      435.0
341      435.0
538      435.0
563      435.0
696      435.0
         ...  
20267    435.0
20268    435.0
20372    435.0
20460    435.0
20484    435.0
Name: total_bedrooms_filled, Length: 207, dtype: float64

In [13]:
# 결측치 확인 2 : 결측치에 대하여 중앙값이 정상적으로 입력되었는지 확인 
df.loc[~(df['total_bedrooms'].isna()), ['total_bedrooms', 'total_bedrooms_filled']]

Unnamed: 0,total_bedrooms,total_bedrooms_filled
0,129.0,129.0
1,1106.0,1106.0
2,190.0,190.0
3,235.0,235.0
4,280.0,280.0
...,...,...
20635,374.0,374.0
20636,150.0,150.0
20637,485.0,485.0
20638,409.0,409.0


# 3. Data Split

In [14]:
#  독립, 종속 변수 정의 
y = df['median_house_value']
# X = df.drop(columns=['median_house_value', 'ocean_proximity'])
X = df[df.columns.difference(['median_house_value', 'ocean_proximity'])]


In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16512, 9), (4128, 9), (16512,), (4128,))

# 4. Model Selection

## Homework Assignment: Exercise

### 1.

Question: Try a Support Vector Machine regressor (`sklearn.svm.SVR`), with various hyperparameters such as `kernel="linear"` (with various values for the `C` hyperparameter) or `kernel="rbf"` (with various values for the `C` and `gamma` hyperparameters). Don't worry about what these hyperparameters mean for now. How does the best `SVR` predictor perform?

In [17]:
import numpy as np
from sklearn.svm import SVR

In [18]:
# SVR 모델 중 linear 와 rbf 에 train data 학습


linear_regr = SVR(kernel="linear", C=1.0, epsilon=0.2)
linear_regr.fit(X_train, y_train)

rbf_regr = SVR(kernel="rbf", C=1.0, epsilon=0.2)
rbf_regr.fit(X_train, y_train)


SVR(epsilon=0.2)

In [19]:
#...
import joblib
# linear_regr_loaded = joblib.load("linear_regr.pkl") # DIFF
# rbf_regr_loaded = joblib.load("rbf_regr.pkl") # DIFF
# grid_search_loaded = joblib.load("grid_search.pkl") # DIFF

In [20]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [21]:
# # 학습한 모델에 대한 평가 진행(1. linear_regr)
# from sklearn.model_selection import cross_val_score

# linear_scores = cross_val_score(linear_regr_loaded, X_train, y_train,
#                                 scoring="neg_mean_squared_error", cv=10)
# linear_rmse_scores = np.sqrt(-linear_scores)
# display_scores(linear_rmse_scores)


In [22]:
# # 학습한 모델에 대한 평가 진행(2. rbf)
# from sklearn.model_selection import cross_val_score

# rbf_scores = cross_val_score(rbf_regr_loaded, X_train, y_train,
#                                 scoring="neg_mean_squared_error", cv=10)
# rbf_rmse_scores = np.sqrt(-rbf_scores)
# display_scores(rbf_rmse_scores)


In [23]:
# # 모델 수행에 시간이 오래 걸리므로 저장된 내역 호출하도록 변경 
import joblib
joblib.dump(linear_regr, "linear_regr.pkl") # DIFF
joblib.dump(rbf_regr, "rbf_regr.pkl") # DIFF


['rbf_regr.pkl']

### 2. 

Question: Try replacing `GridSearchCV` with `RandomizedSearchCV`.

In [24]:
rbf_regr = SVR(kernel="rbf")

In [25]:
# 1번 모델링 시 더 좋은 결과를 출력한 rbf 파라미터 모델로 RandomizedSearchCV 모델 수행
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'C': randint(low=1, high=2),
        'epsilon': randint(low=1, high=2),
    }


rnd_svr_search = RandomizedSearchCV(rbf_regr, param_distributions=param_distribs,
                                n_iter=2, cv=2, scoring='neg_mean_squared_error', random_state=42)
rnd_svr_search.fit(X_train, y_train)

RandomizedSearchCV(cv=2, estimator=SVR(), n_iter=2,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fbd88066890>,
                                        'epsilon': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fbd88066450>},
                   random_state=42, scoring='neg_mean_squared_error')

In [26]:
# from sklearn.model_selection import GridSearchCV

# param_grid = [
#     # 12(=3×4)개의 하이퍼파라미터 조합을 시도합니다.
# #     {'C': [3, 10, 30], 'epsilon': [2, 4, 6, 8]}
#     {'C': [3], 'epsilon': [2]}
#   ]

# grid_svr_search = GridSearchCV(rbf_regr_loaded, param_grid
# #                            , cv=5,
#                            ,
#                            scoring='neg_mean_squared_error',
#                            return_train_score=True)
# grid_svr_search.fit(X_train, y_train)

In [27]:
# joblib.dump(grid_search, "grid_search.pkl") # DIFF

### 3. 

Question: Try adding a transformer in the preparation pipeline to select only the most important attributes.

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance

In [29]:
XXX = pd.DataFrame()

In [30]:
XXX

In [31]:
col_test = []

In [32]:
from sklearn.base import BaseEstimator, TransformerMixin

class bestAttr(BaseEstimator, TransformerMixin):
    def __init__(self, attrNum):
        self.attrNum = attrNum
        print("bestAttr init")
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        result = permutation_importance(rnd_svr_search, X_train, y_train, n_repeats=1, random_state=0)
        col = np.array(pd.DataFrame(result.importances_mean, index = X_train.columns, columns  = ['importances_mean']).sort_values(by = 'importances_mean', ascending=False).head(int(self.attrNum)).index)
        print("bestAttr two_bestAttr!!")
        print(X_train.loc[:, col])
        return X_train.loc[:, col]
#         return X_train.loc[:, col] , col
#         return X_train[:, col] >> 오늘의 깨달음!!! 슬라이싱 할때는 loc쓰자

In [33]:
# X_train[['total_rooms', 'population' ,'households']]

In [34]:
from sklearn.pipeline import Pipeline
K = 3
attrPipeline = Pipeline([
        ('attr', bestAttr(K))
    ])

# best_attr_df, col = attrPipeline.fit_transform(X_train)
best_attr_df = attrPipeline.fit_transform(X_train)

bestAttr init
bestAttr two_bestAttr!!
       total_rooms  population  households
14196       3126.0      2300.0       623.0
8267        3382.0      1314.0       756.0
17445       1897.0       915.0       336.0
14265       1421.0      1418.0       355.0
2271        2382.0       874.0       380.0
...            ...         ...         ...
11284       1330.0       658.0       217.0
11964       3084.0      1753.0       449.0
5390        2101.0      1756.0       527.0
860         3575.0      1777.0       559.0
15795       4226.0      2619.0      1242.0

[16512 rows x 3 columns]


Question: Try creating a single pipeline that does the full data preparation plus the final prediction.

4-1. 데이터 전처리  
4-2. 하이퍼 파라미터 튜닝을 위한 모델 수행  
4-3. feature selection을 위한 모델 수행  
4-4. 상위 N개 feature 선택  
4-5. 최종 모델 수행  

In [35]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,total_bedrooms_filled
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,129.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,1106.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,190.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,235.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,280.0


In [36]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

class one_preprocess(BaseEstimator, TransformerMixin):
    def __init__(self):
#         self.attrNum = attrNum
        print("one_preprocess init")
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        imputer = SimpleImputer(strategy="median")
        float_col = X.select_dtypes(include='float64').columns
        X[float_col] = imputer.fit_transform(X[float_col])
        object_col = X.select_dtypes(include='object').columns
        X = pd.get_dummies(X, columns = object_col, drop_first=True)
        print("one_preprocess transform!!")
        return X
    
    

In [37]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

class three_modelrunning(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("init")
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        print(X)
        return X
    
    

In [38]:
# 4-1.  데이터 전처리  
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

full_pipeline = Pipeline([
        # 4-1 머신러닝 알고리즘을 위한 데이터 준비
        ('one', one_preprocess()), 
        # 4-2 feature selection
        ('two', attrPipeline, ),
        # 4-3 모델 수행
#         ('three', SVR(C=rnd_svr_search.best_params_.get('C'), epsilon = rnd_svr_search.best_params_.get('epsilon')))])
        ('three', SVR(**rnd_svr_search.best_params_))])


result = full_pipeline.fit(X_train, y_train)

one_preprocess init
one_preprocess transform!!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


bestAttr two_bestAttr!!
       total_rooms  population  households
14196       3126.0      2300.0       623.0
8267        3382.0      1314.0       756.0
17445       1897.0       915.0       336.0
14265       1421.0      1418.0       355.0
2271        2382.0       874.0       380.0
...            ...         ...         ...
11284       1330.0       658.0       217.0
11964       3084.0      1753.0       449.0
5390        2101.0      1756.0       527.0
860         3575.0      1777.0       559.0
15795       4226.0      2619.0      1242.0

[16512 rows x 3 columns]


In [39]:
result.steps[2]

('three', SVR(C=1, epsilon=1))

In [40]:
# 4-1.  데이터 전처리  
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

param_full_pipeline = Pipeline([
        # 4-1 머신러닝 알고리즘을 위한 데이터 준비
        ('one', one_preprocess()), 
        # 4-2 feature selection
        ('two', bestAttr(K) ),
        # 4-3 모델 수행
#         ('three', SVR(C=rnd_svr_search.best_params_.get('C'), epsilon = rnd_svr_search.best_params_.get('epsilon')))])
        ('three', SVR(**rnd_svr_search.best_params_))])


one_preprocess init
bestAttr init


### 5.

Question: Automatically explore some preparation options using `GridSearchCV`.

In [41]:
# https://rudolf-2434.tistory.com/15

p_grid = {
    'two__attrNum': list(str(np.random.randint(1,(len(X_train.columns) + 1))))
#     list(range(1, len(X_train.columns) + 1))
}

grid_search_prep = GridSearchCV(param_full_pipeline, p_grid, cv=2,
                                scoring='neg_mean_squared_error', verbose=2)



grid_search_prep.fit(X_train, y_train)

one_preprocess init
bestAttr init
Fitting 2 folds for each of 1 candidates, totalling 2 fits
one_preprocess init
bestAttr init
[CV] two__attrNum=4 ..................................................
one_preprocess transform!!


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


bestAttr two_bestAttr!!
       total_rooms  population  households  total_bedrooms
14196       3126.0      2300.0       623.0           627.0
8267        3382.0      1314.0       756.0           787.0
17445       1897.0       915.0       336.0           331.0
14265       1421.0      1418.0       355.0           367.0
2271        2382.0       874.0       380.0           431.0
...            ...         ...         ...             ...
11284       1330.0       658.0       217.0           201.0
11964       3084.0      1753.0       449.0           570.0
5390        2101.0      1756.0       527.0           569.0
860         3575.0      1777.0       559.0           597.0
15795       4226.0      2619.0      1242.0          1315.0

[16512 rows x 4 columns]
[CV] ................................... two__attrNum=4, total=  32.6s
one_preprocess init
bestAttr init
[CV] two__attrNum=4 ..................................................
one_preprocess transform!!


Traceback (most recent call last):
  File "/Users/boysbeanxious/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/boysbeanxious/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/boysbeanxious/opt/anaconda3/lib/python3.7/site-packages/sklearn/svm/_base.py", line 162, in fit
    accept_large_sparse=False)
  File "/Users/boysbeanxious/opt/anaconda3/lib/python3.7/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/boysbeanxious/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/Users/boysbeanxious/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 812, in check_X_y
    check_consistent_length(X, y)


bestAttr two_bestAttr!!
       total_rooms  population  households  total_bedrooms
14196       3126.0      2300.0       623.0           627.0
8267        3382.0      1314.0       756.0           787.0
17445       1897.0       915.0       336.0           331.0
14265       1421.0      1418.0       355.0           367.0
2271        2382.0       874.0       380.0           431.0
...            ...         ...         ...             ...
11284       1330.0       658.0       217.0           201.0
11964       3084.0      1753.0       449.0           570.0
5390        2101.0      1756.0       527.0           569.0
860         3575.0      1777.0       559.0           597.0
15795       4226.0      2619.0      1242.0          1315.0

[16512 rows x 4 columns]
[CV] ................................... two__attrNum=4, total=  32.6s
one_preprocess init
bestAttr init
one_preprocess init
bestAttr init
one_preprocess transform!!


Traceback (most recent call last):
  File "/Users/boysbeanxious/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/boysbeanxious/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/boysbeanxious/opt/anaconda3/lib/python3.7/site-packages/sklearn/svm/_base.py", line 162, in fit
    accept_large_sparse=False)
  File "/Users/boysbeanxious/opt/anaconda3/lib/python3.7/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/boysbeanxious/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/Users/boysbeanxious/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 812, in check_X_y
    check_consistent_length(X, y)


bestAttr two_bestAttr!!
       total_rooms  population  households  total_bedrooms
14196       3126.0      2300.0       623.0           627.0
8267        3382.0      1314.0       756.0           787.0
17445       1897.0       915.0       336.0           331.0
14265       1421.0      1418.0       355.0           367.0
2271        2382.0       874.0       380.0           431.0
...            ...         ...         ...             ...
11284       1330.0       658.0       217.0           201.0
11964       3084.0      1753.0       449.0           570.0
5390        2101.0      1756.0       527.0           569.0
860         3575.0      1777.0       559.0           597.0
15795       4226.0      2619.0      1242.0          1315.0

[16512 rows x 4 columns]


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('one', one_preprocess()),
                                       ('two', bestAttr(attrNum=3)),
                                       ('three', SVR(C=1, epsilon=1))]),
             param_grid={'two__attrNum': ['4']},
             scoring='neg_mean_squared_error', verbose=2)