# RandomizedSearch

총 조합수 1400개 중 `n_iter=60` 에 따라 60개 조합을 무작위로 뽑아 테스트한 결과 값이기 때문에 <u>(`random_state`를 고정하지 않으면) 매번 학습하고 확인할 때마다 최적의 파라미터와 정확도 값이 달라질 수 있습니다.</u><br>따라서, 이를 실행했을 때 <u>나온 파라미터 값들을 그 주위의 값들과 함께 다시 넣어서</u><br> `GridSearchCV`를 통해 **세분화해서 조금 더 최적에 가까운 하이퍼 파라미터 값들을 찾을 수 있습니다.**

In [13]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset and separate the feature matrix from the labels.
data = load_breast_cancer()
X = data.data
y = data.target

# Hold out a test set. stratify=y keeps the class ratio of y in both parts;
# test_size is left at its default and the split is seeded with random_state=0.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

In [9]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Estimator whose hyper-parameters are searched (random_state fixed to 0).
tree = DecisionTreeClassifier(random_state=0)

# Candidate values for each hyper-parameter.
params = {
    "max_depth": range(1, 6),  # 5 candidates: 1..5
    "max_leaf_nodes": range(3, 31),  # 28 candidates: 3..30
    # 10 candidates: 0.1, 0.2, ..., 1.0 -- the fraction of columns used when training.
    # Built from integers instead of a float step so the candidates are the
    # plain decimals (no 0.30000000000000004-style values in best_params_).
    "max_features": np.arange(1, 11) / 10,
}

# Size of the full grid, derived from the candidates so it cannot drift
# out of sync with `params`.
n_combinations = np.prod([len(candidates) for candidates in params.values()])
print("총 조합수:", n_combinations)

# Try only n_iter randomly drawn combinations instead of the full grid.
# random_state is not passed to the search, so the drawn combinations (and
# therefore the best score / parameters) can differ between runs.
rs = RandomizedSearchCV(
    tree,  # model
    params,  # hyper-parameter candidates
    n_iter=60,  # number of combinations to test
    scoring="accuracy",
    cv=4,
    n_jobs=-1,
)
rs.fit(X_train, y_train)

총 조합수: 1400


In [7]:
# Report the best score found by the randomized search and the
# hyper-parameter combination that produced it.
for label, value in (
    ("best score:", rs.best_score_),
    ("best parameter:", rs.best_params_),
):
    print(label, value)

best score: 0.9507362017280903
best parameter: {'max_leaf_nodes': 25, 'max_features': 0.4, 'max_depth': 2}


In [10]:
# Refine the search: starting from the hyper-parameters found by the
# randomized search, sweep a finer, exhaustive grid of nearby values.
params2 = {
    "max_leaf_nodes": list(range(5, 12)),  # 5..11
    "max_depth": list(range(1, 6)),  # 1..5
    "max_features": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
}

# Every combination in params2 is evaluated with 4-fold cross-validation.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=params2,
    scoring="accuracy",
    cv=4,
    n_jobs=-1,
)
gs.fit(X_train, y_train)

In [11]:
### Check the result: best score first, then the winning parameter combination
for result in (gs.best_score_, gs.best_params_):
    print(result)

0.9577455475224828
{'max_depth': 4, 'max_features': 0.4, 'max_leaf_nodes': 6}


In [14]:
# Take the best estimator found by the grid search and score it on the
# held-out test set.
bm = gs.best_estimator_
pred_test = bm.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred_test)

0.9300699300699301

In [15]:
# Predict through the fitted search object itself rather than through `bm`.
# NOTE(review): presumably this delegates to the refit best estimator
# (GridSearchCV's default refit=True) -- confirm.
gs.predict(X_test)

array([1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0])

In [16]:
# Importance score of each input feature in the best tree (one value per column).
bm.feature_importances_

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.02151488, 0.        , 0.        ,
       0.        , 0.03561708, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.7917904 , 0.        , 0.        , 0.        , 0.02031961,
       0.        , 0.        , 0.13075803, 0.        , 0.        ])

# 파이프라인 (Pipeline)

In [17]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# return_X_y=True returns the (features, labels) pair directly.
X, y = load_breast_cancer(return_X_y=True)

# Stratified, seeded train/test split (test_size left at its default).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

In [18]:
# Define the pipeline.
# Steps are listed in execution order; each one is a ("name", object) tuple.
steps = [
    ("scaler", StandardScaler()),  # first process
    ("svm", SVC(random_state=0)),  # second process
]

# Create it. verbose=True prints an execution log: which step is running,
# how long it took, and so on.
pl = Pipeline(steps=steps, verbose=True)

# Show the registered steps and the container type that holds them.
for shown in (pl.steps, type(pl.steps)):
    print(shown)

[('scaler', StandardScaler()), ('svm', SVC(random_state=0))]
<class 'list'>


In [19]:
# The first (name, object) step registered in the pipeline.
pl.steps[0]

('scaler', StandardScaler())

In [20]:
# One call replaces the manual chain:
#   X_train_scaled = scaler.fit_transform(X_train) --> svm.fit(X_train_scaled, y_train)
pl.fit(X_train, y_train)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing svm, total=   0.0s


In [21]:
# One call replaces the manual chain:
#   X_test_scaled = scaler.transform(X_test) --> pred = svm.predict(X_test_scaled)
# Predictions are produced for the train set first, then for the test set.
pred_train, pred_test = (pl.predict(features) for features in (X_train, X_test))

In [22]:
from sklearn.metrics import accuracy_score
# (train accuracy, test accuracy) of the fitted pipeline.
tuple(
    accuracy_score(actual, predicted)
    for actual, predicted in ((y_train, pred_train), (y_test, pred_test))
)

(0.9929577464788732, 0.958041958041958)

In [24]:
# Estimate y for new data (the serving scenario).
# The first five test rows stand in for the new inputs.
new_x = X_test[:5]

# Raw rows go straight into the pipeline; no separate scaling call is made here.
pred_new = pl.predict(new_x)
pred_new

array([1, 0, 0, 1, 0])

## make_pipeline() 함수를 이용 파이프라인 생성

In [29]:
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA 

# make_pipeline() takes only the objects, in execution order; no step names
# are supplied here.
# NOTE(review): DecisionTreeClassifier is placed as an intermediate step.
# Intermediate pipeline steps are expected to provide fit/transform, so this
# pipeline looks suitable only for inspecting its steps -- fitting it is
# expected to fail. Confirm the intent.
pl2 = make_pipeline(
    StandardScaler(),
    PCA(n_components=5),
    DecisionTreeClassifier(),
    SVC(random_state=0, C=1, gamma=0.1),
)

In [33]:
# List the (name, object) steps of the pipeline built by make_pipeline().
pl2.steps

[('standardscaler', StandardScaler()),
 ('pca', PCA(n_components=5)),
 ('decisiontreeclassifier', DecisionTreeClassifier()),
 ('svc', SVC(C=1, gamma=0.1, random_state=0))]

In [34]:
# Steps can be picked out by position: the third step and the first step.
pl2.steps[2], pl2.steps[0]

(('decisiontreeclassifier', DecisionTreeClassifier()),
 ('standardscaler', StandardScaler()))

# TODO Adult dataset

- 전처리
    - 범주형
        - 결측치는 최빈값으로 대체한다.
        - 원핫인코딩 처리한다.
    - 연속형
        - 결측치는 중앙값으로 대체한다.
        - StandardScaling을 한다.
- Model: `sklearn.linear_model.LogisticRegression(max_iter=2000)` 를 사용
- Pipeline을 이용해 전처리와 모델을 묶어준다.

In [37]:
# Column names applied, in this order, to the header-less Adult data file.
cols = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

# Categorical features.
category_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native-country',
]

# Continuous (numeric) features.
continuous_columns = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Name of the label column.
target = 'income'

In [38]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [39]:
import pandas as pd
# Read the Adult data file. It is read without a header row (header=None), so
# the column names come from `cols`. Whitespace after each delimiter is
# skipped and "?" entries are read as missing values.
df = pd.read_csv(
    "../data/adult.data",
    header=None,
    names=cols,
    skipinitialspace=True,
    na_values="?",
)

In [40]:
# Number of missing values in each column.
df.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
gender               0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

In [42]:
# Summary of the frame: column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   gender          32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  31978 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [43]:
# Separate X and y: every column except "income" is a feature, and the
# "income" labels are encoded as integers.
X = df.drop(columns=["income"])
y = LabelEncoder().fit_transform(df["income"])

In [44]:
# Split into train and test sets: 25% is held out, the class ratio of y is
# preserved (stratify) and the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

In [45]:
# Pipeline
# The pre-processing part is assembled per column (feature) type:
#   categorical: missing values (most frequent value) -> one-hot encoding
#   continuous:  missing values (median)              -> standard scaling

# Categorical branch.
cate_preprocess = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Continuous branch.
num_preprocess = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Route each group of columns to its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("category", cate_preprocess, category_columns),
        ("continuous", num_preprocess, continuous_columns),
    ]
)

# Full process: pre-processing followed by the classifier.
process = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", LogisticRegression(max_iter=2000, random_state=0)),
    ]
)

In [49]:
# Re-implementation of the process above with make_pipeline().

from sklearn.pipeline import make_pipeline

make_pl = make_pipeline(
    # Data pre-processing stage: one branch per column type.
    ColumnTransformer(
        transformers=[
            (
                "category",
                make_pipeline(
                    SimpleImputer(strategy="most_frequent"),
                    OneHotEncoder(handle_unknown="ignore"),
                ),
                category_columns,
            ),
            (
                "continuous",
                make_pipeline(
                    SimpleImputer(strategy="median"),
                    StandardScaler(),
                ),
                continuous_columns,
            ),
        ]
    ),
    # Learning model stage.
    LogisticRegression(max_iter=2000, random_state=0),
)

In [47]:
# Fit the explicitly named pipeline (pre-processing + model) on the training data.
process.fit(X_train, y_train)

In [50]:
# Fit the make_pipeline() version on the same training data.
make_pl.fit(X_train,y_train)

In [51]:
# Train-set and test-set predictions from the Pipeline-based model ...
pred_train, pred_test = (process.predict(part) for part in (X_train, X_test))

# ... and from the make_pipeline()-based model.
pred_train_pl, pred_test_pl = (make_pl.predict(part) for part in (X_train, X_test))

In [52]:
# Model built with `process`: (train accuracy, test accuracy)
tuple(
    accuracy_score(actual, predicted)
    for actual, predicted in ((y_train, pred_train), (y_test, pred_test))
)

(0.8524979524979525, 0.8478073946689596)

In [53]:
# Model built with `make_pl`: (train accuracy, test accuracy)
tuple(
    accuracy_score(actual, predicted)
    for actual, predicted in ((y_train, pred_train_pl), (y_test, pred_test_pl))
)

(0.8524979524979525, 0.8478073946689596)

### make_pipeline() 으로 `process`를 재구현한 `make_pl` 도 `process`와 동일한 결과를 낸다.

In [54]:
# Hyper-parameters of nested pipeline parts are addressed as
# "<step>__<sub-step>__...__<parameter>":
#   - the imputer strategy of the "continuous" branch inside the "preprocess" step
#   - C of the "model" step
params = {
    "preprocess__continuous__imputer__strategy": ["mean", "median"],
    "model__C": [0.01, 0.1, 1, 10],
}

# Exhaustive search over the whole pipeline with 5-fold cross-validation.
gs = GridSearchCV(
    estimator=process,
    param_grid=params,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

gs.fit(X_train, y_train)

In [55]:
# Best score of the pipeline grid search and the parameters that achieved it.
gs.best_score_, gs.best_params_

(0.8510237510237509,
 {'model__C': 1, 'preprocess__continuous__imputer__strategy': 'mean'})

In [56]:
# Keep the best pipeline found by the grid search and display it.
best_model_pipeline = gs.best_estimator_
best_model_pipeline

# SVM

## Linear SVM 
- 하이퍼 파라미터 'C' too ☝ : overfitting / too 👇 : underfitting

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [13]:
# Breast-cancer features and labels for the SVM section.
X, y = load_breast_cancer(return_X_y=True)

# Stratified split seeded with random_state=0; test_size is left at its default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

## Kernel SVM (비선형(Non Linear) SVM)
- 하이퍼 파라미터 'C', 'gamma' -> too ☝ : overfitting / too 👇 : underfitting


In [15]:
### gamma behaves the same way as C
# small value ================ moderate value ================ large value
# underfitting               generalization               overfitting
# <---soft margin (weak regulation)          -----> hard margin (strong regulation)


### C and gamma are separate hyper-parameters; each one is set on its own.
### NOTE(review): the original note said they do not influence each other at all.
### Their effects on the fitted model appear to interact, so they are normally
### tuned together (e.g. in one grid search) -- confirm against the SVM docs.

# K-최근접 이웃 (K-Nearest Neighbors, KNN)

### 위스콘신 유방암 데이터를 이용한 암환자분류

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

from sklearn.metrics import accuracy_score

In [18]:
# Breast-cancer features and labels for the KNN classification example.
X, y = load_breast_cancer(return_X_y=True)

# Hold out a test set, keeping the class ratio of y (stratify) and seeding the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

### Boston Housing Dataset 집값 예측
- 회귀문제

In [21]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor

import pandas as pd

In [23]:
# Load the Boston housing data.
# NOTE(review): the file name is spelled "boston_hosing" -- confirm it matches
# the file on disk before changing it.
boston = pd.read_csv("../data/boston_hosing.csv")

# MEDV is the house price, i.e. the regression target; all other columns are features.
y = boston["MEDV"]
X = boston.drop(columns=["MEDV"])

# Seeded split with no stratify argument.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)