### 규제
* 학습이 과대적합되는 것을 방지하고자 하는 알고리즘
* 라쏘(Lasso)
    - L1규제를 추가한 모형
    - 영향력이 크지 않은 회귀계수 값을 0으로 만드는 특성이 있다.
        * 회귀계수 : 독립변수의 값이 변화함에 따라 종속변수에 미치는 영향력 크기
    - alpha를 이용하여 규제 강도(가중치에 대한 페널티)를 제어. alpha가 너무 작으면 과대적합, 너무 크면 과소적합될 우려가 있다.
    - 영향력이 작은 회귀계수를 0으로 만듦으로써 모델에서 가장 중요한 특성이 무엇인지 알 수 있다
* 릿지(Ridge)
    - L2규제를 추가한 모형
    - 계수값을 0으로 만들지는 않고, 0에 가깝게 작게 만드는 특성이 있다.
    - alpha를 이용하여 규제 강도(가중치에 대한 페널티)를 제어. alpha가 너무 작으면 과대적합, 너무 크면 과소적합될 우려가 있다.
* 엘라스틱넷(ElasticNet)
    - L1, L2를 함께 결합한 모형
    - 피처가 많은 데이터세트에 적용
    - L1 규제로 feature의 수를 줄이고 L2규제로 계수값의 크기를 조정
    - 파라미터
        * alpha : L1규제의 alpha(a) + L2규제의 alpha(b). L1과 L2의 alpha를 합쳐 놓은 것이다.
        * l1_ratio = 0 : L2규제(릿지)와 동일. 0에 가까워질수록 L2규제의 비중이 커진다.
        * l1_ratio = 1 : L1규제(라쏘)와 동일. 1에 가까워질수록 L1규제의 비중이 커진다.
        * 0 < l1_ratio < 1 : L1과 L2규제를 적절히 적용
* 계수 : 계산해서 얻은 값

In [1]:
import pandas as pd
import numpy as np
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides any warnings raised while fitting the models
# in later cells — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
# Load the housing data from a local CSV, then echo the frame as the cell output.
df = pd.read_csv('data/boston.csv')
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'PRICE'],
      dtype='object')

In [9]:
# Predictor columns fed to the models; PRICE is the regression target.
f = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]
label = 'PRICE'

# Split the frame into the feature matrix and the target series.
X = df[f]
y = df[label]

# Echo both as a tuple for a quick visual check.
X, y

(        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
 0    0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
 1    0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
 2    0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
 3    0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
 4    0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   
 ..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...   
 501  0.06263   0.0  11.93   0.0  0.573  6.593  69.1  2.4786  1.0  273.0   
 502  0.04527   0.0  11.93   0.0  0.573  6.120  76.7  2.2875  1.0  273.0   
 503  0.06076   0.0  11.93   0.0  0.573  6.976  91.0  2.1675  1.0  273.0   
 504  0.10959   0.0  11.93   0.0  0.573  6.794  89.3  2.3889  1.0  273.0   
 505  0.04741   0.0  11.93   0.0  0.573  6.030  80.8  2.5050  1.0  273.0   
 
      PTRATIO       B  LSTAT  
 0       15.3  396.90   4.98  
 1       17.8  396.90   

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set.
# NOTE(review): no random_state is passed, so the split (and every score
# computed from it) changes from run to run.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
# Show the shapes of the two feature partitions.
X_train.shape,X_test.shape

((354, 13), (152, 13))

In [14]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso

# alpha is the regularization strength: the larger the value, the stronger
# the penalty, and a strong penalty drives even influential coefficients to 0.
lasso = Lasso(alpha=0.07)
lasso.fit(X_train, y_train)

train_pred = lasso.predict(X_train)
test_pred = lasso.predict(X_test)

# First line reports the training split, second line the test split.
# mean_squared_error is documented as (y_true, y_pred), so the targets go first.
print("score : ", lasso.score(X_train, y_train), "mse : ", mean_squared_error(y_train, train_pred))
print("score : ", lasso.score(X_test, y_test), "mse : ", mean_squared_error(y_test, test_pred))

score :  0.7344158553791544 mse :  21.13006589903343
score :  0.7042544410804825 mse :  28.1587834739383


In [16]:
# Regularization strengths to compare for Lasso.
alphas = [0.07, 0.1, 0.5, 1.3, 2]

for a in alphas:
    # Fit a fresh model per alpha so the runs do not influence each other.
    lasso = Lasso(alpha=a)
    lasso.fit(X_train, y_train)

    train_pred = lasso.predict(X_train)
    test_pred = lasso.predict(X_test)

    # For each alpha: first score line is the training split, second the test
    # split. mean_squared_error takes (y_true, y_pred), so targets go first.
    print("alphas : ", a)
    print("score : ", lasso.score(X_train, y_train), "mse : ", mean_squared_error(y_train, train_pred))
    print("score : ", lasso.score(X_test, y_test), "mse : ", mean_squared_error(y_test, test_pred))
    print("=" * 50)

alphas :  0.07
score :  0.7344158553791544 mse :  21.13006589903343
score :  0.7042544410804825 mse :  28.1587834739383
alphas :  0.1
score :  0.7331803930340965 mse :  21.228360173355107
score :  0.7056100386685371 mse :  28.02971313692537
alphas :  0.5
score :  0.7102946359623115 mse :  23.049167495141788
score :  0.6905553087110174 mse :  29.463117184246236
alphas :  1.3
score :  0.6583257394846544 mse :  27.18385034242028
score :  0.6163152117105888 mse :  36.531729893619655
alphas :  2
score :  0.6322103341805496 mse :  29.261610804527283
score :  0.586549392039758 mse :  39.36581901434018


In [19]:
from sklearn.model_selection import GridSearchCV

# Candidate regularization strengths for the 5-fold search.
params = {
    "alpha": [0.07, 0.1, 0.5, 1.3, 2],
}

lasso = Lasso()
grid_cv = GridSearchCV(estimator=lasso, param_grid=params, cv=5)
grid_cv.fit(X_train, y_train)

# Report the selected setting, then the score on the train and test splits.
print("최적의 하이퍼 파라미터 : ", grid_cv.best_params_)
for tag, features, target in (
    ("train : ", X_train, y_train),
    ("test : ", X_test, y_test),
):
    print(tag, grid_cv.score(features, target))

최적의 하이퍼 파라미터 :  {'alpha': 0.07}
train :  0.7344158553791544
test :  0.7042544410804825


In [20]:
# Refit Lasso with alpha=0.07, print the feature names, then display the
# learned coefficients (same order as the printed columns).
lasso = Lasso(alpha=0.07)
lasso.fit( X_train, y_train )
print(X_train.columns )
lasso.coef_

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
      dtype='object')


array([-0.11943443,  0.04418213, -0.01114422,  3.04792601, -0.        ,
        3.41936841, -0.01299172, -1.04558175,  0.22643742, -0.00909006,
       -0.8608476 ,  0.01123677, -0.59246739])

In [21]:
# Echo the alpha values currently bound to `alphas`.
alphas

[0.07, 0.1, 0.5, 1.3, 2]

In [24]:
# One row per feature; a column is appended for every alpha tried below.
coeff_df = pd.DataFrame(index=X_train.columns)

for idx, alpha in enumerate(alphas):
    print(f"{idx} : {alpha}")
    # fit() returns the estimator itself, so construction and fitting chain.
    lasso = Lasso(alpha=alpha).fit(X_train, y_train)
    col_name = f"alpha : {alpha}"
    coeff_df[col_name] = lasso.coef_

# Display the coefficient table as the cell output.
coeff_df

0 : 0.07
1 : 0.1
2 : 0.5
3 : 1.3
4 : 2


Unnamed: 0,alpha : 0.07,alpha : 0.1,alpha : 0.5,alpha : 1.3,alpha : 2
CRIM,-0.119434,-0.119369,-0.110564,-0.080829,-0.056671
ZN,0.044182,0.044096,0.040685,0.029888,0.017442
INDUS,-0.011144,-0.007335,-0.0,-0.0,-0.0
CHAS,3.047926,2.575042,0.0,0.0,0.0
NOX,-0.0,-0.0,-0.0,-0.0,-0.0
RM,3.419368,3.322642,1.995547,0.0,0.0
AGE,-0.012992,-0.011233,0.004418,0.027001,0.036084
DIS,-1.045582,-1.02821,-0.826583,-0.353024,-0.0
RAD,0.226437,0.230453,0.240271,0.19304,0.108225
TAX,-0.00909,-0.009335,-0.009952,-0.008015,-0.004797


In [25]:
from sklearn.linear_model import Ridge

# Regularization strengths to try for Ridge, spaced by factors of ten.
alphas = [0.01, 0.1, 1, 10, 100]
alphas

[0.01, 0.1, 1, 10, 100]

In [27]:
for alpha in alphas:
    # Fit a fresh Ridge model for this regularization strength.
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)

    train_pred = ridge.predict(X_train)
    test_pred = ridge.predict(X_test)

    train_score = ridge.score(X_train, y_train)
    test_score = ridge.score(X_test, y_test)

    # mean_squared_error is documented as (y_true, y_pred): targets go first.
    train_mse = mean_squared_error(y_train, train_pred)
    test_mse = mean_squared_error(y_test, test_pred)

    print("alpha : ", alpha)
    print("train : ", train_score, "mse : ", train_mse)
    print("test : ", test_score, "mse : ", test_mse)
    print("-" * 50)

alpha :  0.01
train :  0.7453555184821212 mse :  20.25969842054966
test :  0.715131120091479 mse :  27.123183648517003
--------------------------------------------------
alpha :  0.1
train :  0.7452919640124198 mse :  20.26475485995002
test :  0.7145502061035897 mse :  27.178494136565302
--------------------------------------------------
alpha :  1
train :  0.7431907334850172 mse :  20.431930274643783
test :  0.7105305900957528 mse :  27.561213313233846
--------------------------------------------------
alpha :  10
train :  0.7358051256853939 mse :  21.019534552501742
test :  0.7068291008440499 mse :  27.91364272840615
--------------------------------------------------
alpha :  100
train :  0.7151289701445298 mse :  22.66454438447359
test :  0.6901926776237923 mse :  29.497644330836017
--------------------------------------------------


In [28]:
# Echo the alpha values currently bound to `alphas`.
alphas

[0.01, 0.1, 1, 10, 100]

In [30]:
# One row per feature; a column is appended for every Ridge alpha below.
coeff_df = pd.DataFrame(index=X_train.columns)

for alpha in alphas:
    # fit() returns the estimator itself, so construction and fitting chain.
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    col_name = f"alpha : {alpha}"
    coeff_df[col_name] = ridge.coef_

# Display the coefficient table as the cell output.
coeff_df

Unnamed: 0,alpha : 0.01,alpha : 0.1,alpha : 1,alpha : 10,alpha : 100
CRIM,-0.127957,-0.127382,-0.124455,-0.123137,-0.124222
ZN,0.04424,0.044263,0.044442,0.045537,0.047699
INDUS,0.04941,0.044382,0.017585,-0.011177,-0.019814
CHAS,3.99632,3.991182,3.892423,2.84727,0.767134
NOX,-15.443189,-14.317347,-8.287912,-1.606827,-0.169688
RM,3.534019,3.538598,3.546727,3.272363,1.758808
AGE,-0.002433,-0.003456,-0.008739,-0.011523,0.002045
DIS,-1.305555,-1.289587,-1.204365,-1.111602,-1.000094
RAD,0.262909,0.259804,0.244216,0.241866,0.282859
TAX,-0.00756,-0.00764,-0.008119,-0.009307,-0.011283


In [31]:
from sklearn.linear_model import ElasticNet

# l1_ratio: the closer to 0, the closer the penalty is to L2 (Ridge)
# l1_ratio: the closer to 1, the closer the penalty is to L1 (Lasso)


ratios = [0.2, 0.5, 0.8] # candidate l1_ratio values (L1/L2 mix)
alphas = [0.1, 0.7, 1.5] # regularization strength

In [32]:
# Baseline ElasticNet with a fixed alpha and an L2-leaning mix.
el = ElasticNet(l1_ratio=0.2, alpha=0.7)
el.fit(X_train, y_train)

# Print the score on the training split, then on the held-out split.
for tag, features, target in (
    ("train : ", X_train, y_train),
    ("test : ", X_test, y_test),
):
    print(tag, el.score(features, target))

train :  0.6985616425424137
test :  0.6706913987638031


In [33]:
# Search every (alpha, l1_ratio) pair from the grids defined earlier, 5-fold.
params = dict(alpha=alphas, l1_ratio=ratios)

grid_cv = GridSearchCV(estimator=el, param_grid=params, cv=5)
grid_cv.fit(X_train, y_train)

# Report the selected setting, then the score on the train and test splits.
print("최적의 하이퍼 파라미터 : ", grid_cv.best_params_)
for tag, features, target in (
    ("train : ", X_train, y_train),
    ("test : ", X_test, y_test),
):
    print(tag, grid_cv.score(features, target))

최적의 하이퍼 파라미터 :  {'alpha': 0.1, 'l1_ratio': 0.8}
train :  0.7316692796003413
test :  0.7058618245421977
