# feature scaling

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [18]:
movie = {'netflix': [2, 4, 6, 8, 10], 
         'cgv': [1, 2, 3, 4, 5]
         }

In [19]:
movie = pd.DataFrame(data=movie)
movie

Unnamed: 0,netflix,cgv
0,2,1
1,4,2
2,6,3
3,8,4
4,10,5


In [20]:
from sklearn.preprocessing import MinMaxScaler

In [21]:
mms = MinMaxScaler()

In [22]:
mms.fit(movie) # 데이터의 통계적 정보를 학습 (최소값, 최대값, 평균, 분산)

In [23]:
mmsed = mms.transform(movie) # 정규화해서 데이터 변환

In [24]:
mmsed

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [0.75, 0.75],
       [1.  , 1.  ]])

In [25]:
pd.DataFrame(mmsed, columns=['naver', 'netflix'])

Unnamed: 0,naver,netflix
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,0.75,0.75
4,1.0,1.0


In [26]:
mms.fit_transform(movie) # 한번에

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [0.75, 0.75],
       [1.  , 1.  ]])

In [27]:
import warnings
warnings.filterwarnings('ignore')

# 회귀 학습기 KFold 수행

In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 데이터 확인

In [29]:
columns = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'price']
hpd_df = pd.read_csv('bostton_house_prices.csv', header=None, delimiter=r'\s+', names=columns)
hpd_df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


# 데이터 정규화 - MinMaxScaler()

In [30]:
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

In [33]:
# scaling function

def data_pre(df, y):
    X = df.drop(y, axis=1).values
    X = MinMaxScaler().fit_transform(X)
    y = df[y]
    return X, y

In [34]:
X, y = data_pre(hpd_df, 'price')
print(X[:2])
print()
print(y[:2])

[[0.00000000e+00 1.80000000e-01 6.78152493e-02 0.00000000e+00
  3.14814815e-01 5.77505269e-01 6.41606591e-01 2.69203139e-01
  0.00000000e+00 2.08015267e-01 2.87234043e-01 1.00000000e+00
  8.96799117e-02]
 [2.35922539e-04 0.00000000e+00 2.42302053e-01 0.00000000e+00
  1.72839506e-01 5.47997701e-01 7.82698249e-01 3.48961980e-01
  4.34782609e-02 1.04961832e-01 5.53191489e-01 1.00000000e+00
  2.04470199e-01]]

0    24.0
1    21.6
Name: price, dtype: float64


In [35]:
print('Extended Feature Shape :', X.shape)

Extended Feature Shape : (506, 13)


# baseline 성능

In [36]:
from sklearn.model_selection import KFold

num_split = 5

kf = KFold(n_splits=num_split)

tot_MSE = 0.0

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model_lr = LinearRegression()
    model_lr.fit(X_train, y_train)
    y_pred = model_lr.predict(X_test)
    tot_MSE += mean_squared_error(y_test, y_pred)

avg_MSE = tot_MSE / num_split
print('Avg MSE :', avg_MSE)
print('Avg RMSE :', np.sqrt(avg_MSE))

Avg MSE : 37.13180746769891
Avg RMSE : 6.0935874054368755


# KFold 교차검증 + L2 규제 알고리즘

In [48]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

num_split = 5

kf = KFold(n_splits=num_split)

tot_MSE = 0.0
r2 = 0.0

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    ridge_reg = Ridge(alpha=0.2) # 높을수록 과적합
    ridge_reg.fit(X_train, y_train)
    y_pred = ridge_reg.predict(X_test)
    tot_MSE += mean_squared_error(y_test, y_pred)
    r2 += r2_score(y_test, y_pred)

avg_MSE = tot_MSE / num_split
avg_r2 = r2 / num_split
print('Avg MSE :', avg_MSE)
print('Avg RMSE :', np.sqrt(avg_MSE))
print('Avg R2 :', avg_r2)

Avg MSE : 35.586892081276716
Avg RMSE : 5.965475008855265
Avg R2 : 0.3886029597732203


# KFold 교차검증 + L1 규제 알고리즘

In [49]:
from sklearn.linear_model import Lasso

num_split = 5

kf = KFold(n_splits=num_split)

tot_MSE = 0.0
r2 = 0.0

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    lasso_reg = Lasso(alpha=0.2) # 높을수록 과적합
    lasso_reg.fit(X_train, y_train)
    y_pred = lasso_reg.predict(X_test)
    tot_MSE += mean_squared_error(y_test, y_pred)
    r2 += r2_score(y_test, y_pred)

avg_MSE = tot_MSE / num_split
avg_r2 = r2 / num_split
print('Avg MSE :', avg_MSE)
print('Avg RMSE :', np.sqrt(avg_MSE))
print('Avg R2 :', avg_r2)

Avg MSE : 41.674593841516916
Avg RMSE : 6.455586250799916
Avg R2 : 0.3308877620610328


# KFold 교차검증 + ElasticNet(L1+L2) 규제 알고리즘

In [50]:
from sklearn.linear_model import ElasticNet

num_split = 5

kf = KFold(n_splits=num_split)

tot_MSE = 0.0
r2 = 0.0

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    elasticnet_reg = ElasticNet(alpha=0.2) # 높을수록 과적합
    elasticnet_reg.fit(X_train, y_train)
    y_pred = elasticnet_reg.predict(X_test)
    tot_MSE += mean_squared_error(y_test, y_pred)
    r2 += r2_score(y_test, y_pred)

avg_MSE = tot_MSE / num_split
avg_r2 = r2 / num_split
print('Avg MSE :', avg_MSE)
print('Avg RMSE :', np.sqrt(avg_MSE))
print('Avg R2 :', avg_r2)

Avg MSE : 54.36201321138468
Avg RMSE : 7.373059962551822
Avg R2 : 0.12013702035459942


# GridSearchCV

In [51]:
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, make_scorer, mean_squared_error

## 모델정의

In [56]:
model_lasso = Lasso()

## pipeline 객체 생성

In [110]:
pipeline = Pipeline([('scaler', StandardScaler()), ('lasso', model_lasso)])

## CV(Cross Validation) 정의

In [57]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

## 모델의 하이퍼파라미터 맵 정의
- dict로 정의하기 => 하이퍼파라미터이름 : [값1, 값2]

In [92]:
param_grid = dict(
    alpha =[0.01, 0.1, 1, 10, 100],
    fit_intercept =[True, False] 
)
# False : X와 Y값이 원점에서 시작한다고 가정

## GridSearch CV 객체생성

In [93]:
grid_search = GridSearchCV(model_lasso, # 학습모델
             param_grid=param_grid, # 하이퍼 파라미터 맵
             cv=cv, # cross-validation
             scoring='neg_mean_squared_error', # 성능평가
             refit=True, # 베스트 파라미터로 학습모델을 리턴
             return_train_score=True # 매 학습의 score 리턴
            )

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2024)

In [95]:
X_train.shape, y_train.shape

((404, 13), (404,))

In [96]:
X_test.shape, y_test.shape

((102, 13), (102,))

In [97]:
grid_search.fit(X_train, y_train)

## 학습된 결과 확인

In [99]:
print("1. 학습모델 best_estimator_ : ", grid_search.best_estimator_)
print("2. 학습모델 best_params_ : ", grid_search.best_params_)
print("3. 학습모델 best_score_MSE : ", -1 * grid_search.best_score_)

1. 학습모델 best_estimator_ :  Lasso(alpha=0.01)
2. 학습모델 best_params_ :  {'alpha': 0.01, 'fit_intercept': True}
3. 학습모델 best_score_MSE :  24.872481115853716


## GridSearch CV 객체생성2

In [100]:
grid_search = GridSearchCV(model_lasso, # 학습모델
             param_grid=param_grid, # 하이퍼 파라미터 맵
             cv=cv, # cross-validation
             scoring='neg_mean_squared_error', # 성능평가
             refit=True, # 베스트 파라미터로 학습모델을 리턴
             return_train_score=True # 매 학습의 score 리턴
            )

In [101]:
grid_search.fit(X_train, y_train)
print("1. 학습모델 best_estimator_ : ", grid_search.best_estimator_)
print("2. 학습모델 best_params_ : ", grid_search.best_params_)
print("3. 학습모델 best_score_MSE : ", -1 * grid_search.best_score_)

1. 학습모델 best_estimator_ :  Lasso(alpha=0.01)
2. 학습모델 best_params_ :  {'alpha': 0.01, 'fit_intercept': True}
3. 학습모델 best_score_MSE :  24.872481115853716


## 데이터 준비

In [103]:
scores_df = pd.DataFrame(grid_search.cv_results_)
df_score = scores_df.sort_values(by='mean_test_score', ascending=False)
df_score.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_fit_intercept,params,split0_test_score,split1_test_score,split2_test_score,...,split22_train_score,split23_train_score,split24_train_score,split25_train_score,split26_train_score,split27_train_score,split28_train_score,split29_train_score,mean_train_score,std_train_score
0,0.001112,0.000388,0.000666,0.00054,0.01,True,"{'alpha': 0.01, 'fit_intercept': True}",-18.600987,-20.374728,-13.533069,...,-22.991667,-21.175529,-21.66007,-23.509854,-23.312303,-22.198193,-20.002994,-23.647554,-22.528651,1.202361
2,0.001011,4.4e-05,0.000166,0.000371,0.1,True,"{'alpha': 0.1, 'fit_intercept': True}",-23.221732,-21.223776,-15.527738,...,-26.833597,-24.578193,-25.356443,-27.519864,-27.283125,-25.600572,-23.132705,-27.444338,-26.188061,1.341309
1,0.001487,0.000497,0.000465,0.000497,0.01,False,"{'alpha': 0.01, 'fit_intercept': False}",-20.735089,-28.993183,-16.33746,...,-29.017629,-25.451868,-28.038604,-29.726382,-29.685925,-27.849579,-25.184885,-30.308555,-28.490834,1.634227
3,0.001075,0.000265,0.000465,0.000497,0.1,False,"{'alpha': 0.1, 'fit_intercept': False}",-22.163309,-30.764163,-14.651209,...,-30.644165,-26.692239,-29.930068,-31.816974,-32.068176,-29.908486,-27.413434,-32.428696,-30.414245,1.782827
5,0.001025,5.7e-05,0.000864,0.000339,1.0,False,"{'alpha': 1, 'fit_intercept': False}",-47.405455,-44.887479,-31.931029,...,-51.753007,-47.350865,-50.222354,-53.110519,-52.959775,-52.683886,-47.686137,-53.318612,-51.481697,2.120708


In [113]:
X = hpd_df.drop('price', axis=1)
X.head()
y = hpd_df['price']

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=400)

## GridSearch CV 객체 생성3

In [123]:
# lasso__alpha : pipeline 객체로 실행할 대, 
# 하이퍼파라미터를 구분하기위해 접두어 Lasso를 붙여야 함
param_grid = dict()
param_grid['lasso__alpha'] = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08]
param_grid['lasso__fit_intercept'] = [True, False]

In [124]:
grid_search = GridSearchCV(pipeline, # 학습모델
             param_grid=param_grid, # 하이퍼 파라미터 맵
             cv=cv, # cross-validation
             scoring='neg_mean_squared_error', # 성능평가
             refit=True, # 베스트 파라미터로 학습모델을 리턴
             return_train_score=True # 매 학습의 score 리턴
            )

In [125]:
grid_search.fit(X_train, y_train)
print("1. 학습모델 best_estimator_ : ", grid_search.best_estimator_)
print("2. 학습모델 best_params_ : ", grid_search.best_params_)
print("3. 학습모델 best_score_MSE : ", -1 * grid_search.best_score_)

1. 학습모델 best_estimator_ :  Pipeline(steps=[('scaler', StandardScaler()), ('lasso', Lasso(alpha=0.03))])
2. 학습모델 best_params_ :  {'lasso__alpha': 0.03, 'lasso__fit_intercept': True}
3. 학습모델 best_score_MSE :  24.554357257463224


## [문제] ElasticNet 모델의 최적의 하이퍼파라미터 찾기

In [135]:
param_grid = dict()
param_grid['elasticnet__alpha'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
param_grid['elasticnet__l1_ratio'] = [0.1, 0.3, 0.5, 0.7, 0.9]
param_grid['elasticnet__fit_intercept'] = [True, False]

In [136]:
pipeline = Pipeline([('scaler', StandardScaler()), ('elasticnet', ElasticNet())])

In [137]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [147]:
grid_search = GridSearchCV(pipeline,
                           param_grid=param_grid,
                           cv=cv,
                           scoring='neg_mean_squared_error',
                           refit=True,
                           return_train_score=True)

In [148]:
grid_search.fit(X_train, y_train)

In [154]:
y_pred = grid_search.predict(X_test)

In [155]:
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)

In [156]:
print("1. 학습모델 best_estimator_ : ", grid_search.best_estimator_)
print("2. 학습모델 best_params_ : ", grid_search.best_params_)
print("3. 학습모델 best_score_ : ", -1 * grid_search.best_score_)
print("4. 테스트 데이터 : MSE :", MSE, 'RMSE :', RMSE)

1. 학습모델 best_estimator_ :  Pipeline(steps=[('scaler', StandardScaler()),
                ('elasticnet', ElasticNet(alpha=0.01, l1_ratio=0.1))])
2. 학습모델 best_params_ :  {'elasticnet__alpha': 0.01, 'elasticnet__fit_intercept': True, 'elasticnet__l1_ratio': 0.1}
3. 학습모델 best_score_ :  25.218434047290216
4. 테스트 데이터 : MSE : 21.16835866154253 RMSE : 4.6009084604611


In [157]:
score_df = pd.DataFrame(grid_search.cv_results_)

In [158]:
score_df = score_df.sort_values(by='rank_test_score', ascending=False)
score_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_elasticnet__alpha,param_elasticnet__fit_intercept,param_elasticnet__l1_ratio,params,split0_test_score,split1_test_score,...,split22_train_score,split23_train_score,split24_train_score,split25_train_score,split26_train_score,split27_train_score,split28_train_score,split29_train_score,mean_train_score,std_train_score
77,0.002897,0.00031,0.001296,0.000452,100.0,False,0.5,"{'elasticnet__alpha': 100, 'elasticnet__fit_in...",-626.912368,-630.100526,...,-618.427419,-594.617038,-573.660147,-598.54654,-597.289267,-593.636657,-603.437185,-592.950409,-597.139526,9.8957
69,0.002942,0.000285,0.001255,0.000414,10.0,False,0.9,"{'elasticnet__alpha': 10, 'elasticnet__fit_int...",-626.912368,-630.100526,...,-618.427419,-594.617038,-573.660147,-598.54654,-597.289267,-593.636657,-603.437185,-592.950409,-597.139526,9.8957
79,0.002925,0.000253,0.001238,0.000411,100.0,False,0.9,"{'elasticnet__alpha': 100, 'elasticnet__fit_in...",-626.912368,-630.100526,...,-618.427419,-594.617038,-573.660147,-598.54654,-597.289267,-593.636657,-603.437185,-592.950409,-597.139526,9.8957
78,0.002865,0.000334,0.001335,0.000461,100.0,False,0.7,"{'elasticnet__alpha': 100, 'elasticnet__fit_in...",-626.912368,-630.100526,...,-618.427419,-594.617038,-573.660147,-598.54654,-597.289267,-593.636657,-603.437185,-592.950409,-597.139526,9.8957
75,0.002936,0.000235,0.001123,0.000347,100.0,False,0.1,"{'elasticnet__alpha': 100, 'elasticnet__fit_in...",-626.912368,-630.100526,...,-618.427419,-594.617038,-573.660147,-598.54654,-597.289267,-593.636657,-603.437185,-592.950409,-597.139526,9.8957


In [159]:
score_df[['params', 'mean_train_score', 'mean_test_score', 'rank_test_score',
           'split0_test_score', 'split1_test_score', 'split2_test_score',
           'split3_test_score','split4_test_score']].sort_values('rank_test_score')

Unnamed: 0,params,mean_train_score,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
30,"{'elasticnet__alpha': 0.01, 'elasticnet__fit_i...",-22.399037,-25.218434,1,-22.667990,-16.265316,-66.091583,-18.780725,-25.901608
31,"{'elasticnet__alpha': 0.01, 'elasticnet__fit_i...",-22.395436,-25.228180,2,-22.688971,-16.305236,-66.049798,-18.764248,-25.912780
32,"{'elasticnet__alpha': 0.01, 'elasticnet__fit_i...",-22.392168,-25.238580,3,-22.711486,-16.346950,-66.006470,-18.747465,-25.924696
33,"{'elasticnet__alpha': 0.01, 'elasticnet__fit_i...",-22.389265,-25.249867,4,-22.735766,-16.390514,-65.965353,-18.730416,-25.937426
20,"{'elasticnet__alpha': 0.001, 'elasticnet__fit_...",-22.378664,-25.259535,5,-22.788667,-16.598335,-65.613554,-18.777186,-26.065587
...,...,...,...,...,...,...,...,...,...
75,"{'elasticnet__alpha': 100, 'elasticnet__fit_in...",-597.139526,-597.061799,75,-626.912368,-630.100526,-513.453421,-564.560263,-728.122368
77,"{'elasticnet__alpha': 100, 'elasticnet__fit_in...",-597.139526,-597.061799,75,-626.912368,-630.100526,-513.453421,-564.560263,-728.122368
69,"{'elasticnet__alpha': 10, 'elasticnet__fit_int...",-597.139526,-597.061799,75,-626.912368,-630.100526,-513.453421,-564.560263,-728.122368
78,"{'elasticnet__alpha': 100, 'elasticnet__fit_in...",-597.139526,-597.061799,75,-626.912368,-630.100526,-513.453421,-564.560263,-728.122368
