# ModelSelection 모듈

- 훈련 데이터와 테스트 데이터를 분리하지 않고 학습·평가 수행 (학습에 사용한 데이터로 평가하므로 정확도가 과대평가됨)

In [15]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

In [16]:
# Load the iris dataset and fit a decision tree on ALL samples (no hold-out split).
iris = load_iris()
# Fixed random_state so the fitted tree is reproducible between runs.
dtc = DecisionTreeClassifier(random_state=1122)
# Last expression of the cell: the notebook echoes the fitted estimator's repr.
dtc.fit(X=iris.data, y=iris.target)

DecisionTreeClassifier(random_state=1122)

In [17]:
# Score the tree on the very same samples it was fitted on (no held-out data),
# which is why the result shown below is a perfect 1.0.
dtc.score(iris.data, iris.target)

1.0

- 교차 검증 + 평가 (여러 지표 지정 가능, 학습/평가 시간까지 dict로 반환) : cross_validate()
- 교차 검증 + 평가 (단일 지표, 폴드별 점수 배열 반환) : cross_val_score()

In [None]:
# Author's note: cross_val_score is not used very often in practice.
from sklearn.model_selection import cross_val_score

# Arguments: the model, X, y, the metric to score with, and cv = number of folds.
# The cell echoes one accuracy value per fold.
cross_val_score(
    estimator=dtc,
    X=iris.data,
    y=iris.target,
    scoring='accuracy',
    cv=5,
)

array([0.96666667, 0.96666667, 0.9       , 1.        , 1.        ])

In [None]:
# Average of the five per-fold accuracies. Note that this re-runs the whole
# cross-validation instead of reusing the array computed in the previous cell.
cross_val_score(dtc, iris.data, iris.target, scoring = 'accuracy', cv = 5).mean()

0.9666666666666668

#### GridSearchCV
- 교차검증과 최적 하이퍼 파라미터 튜닝을 한번에

## train test set 분리

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set. The split is stratified on the
# target labels and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    stratify=iris.target,
    random_state=1122,
)

In [None]:
# Start over with a fresh, unfitted tree (same seed as before) ...
dtc = DecisionTreeClassifier(random_state = 1122)
# ... and list its hyper-parameters to see which ones can be tuned.
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 1122,
 'splitter': 'best'}

In [19]:
# Candidate values per hyper-parameter for the decision-tree grid search
# (5 depths x 3 split thresholds = 15 combinations).
params = {
    'max_depth': list(range(2, 7)),
    'min_samples_split': [2, 3, 4],
}

In [22]:
from sklearn.model_selection import GridSearchCV

# Create the search object; nothing is fitted at this point.
# `estimator` is the model instance to tune (an estimator object, not a function).
grid_dt = GridSearchCV(
    estimator=dtc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)

In [23]:
# Run the grid search on the training split only; the test split stays untouched.
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1122),
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

### 베스트 파라미터

In [24]:
# Best parameter combination among the 15 candidates tried above
# (5 max_depth values x 3 min_samples_split values), each evaluated with cv = 5,
# i.e. 15 x 5 = 75 cross-validation fits.
grid_dt.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [26]:
# Score achieved by the best parameter combination above
# (per the scikit-learn docs: the mean cross-validated score, here accuracy).
grid_dt.best_score_

0.9583333333333334

In [28]:
# Select the best classifier found by the grid search.

# Manual alternative, for reference only (it would be unfitted and still need .fit):
#   DecisionTreeClassifier(max_depth = 2, min_samples_split = 2, random_state = 1122)
dtc_best = grid_dt.best_estimator_

### 예측 및 평가

In [29]:
# Evaluate the selected tree on the held-out test split.
dtc_best.score(X_test, y_test)

0.9333333333333333

## SVM

In [30]:
from sklearn.svm import SVC

# Support-vector classifier with default hyper-parameters and a fixed seed.
svc = SVC(random_state = 1122)

# List the hyper-parameters to see what can be tuned.
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 1122,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [31]:
# For SVMs the author mainly tunes only C.
# C is real-valued, so fractional values such as 0.1, 0.01, 0.001 are all valid.
# First pass: a coarse grid spanning several orders of magnitude.
params = {'C': [0.01, 0.1, 1, 10, 100]}

grid_sv = GridSearchCV(
    estimator=svc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)

In [34]:
# Run the coarse search on the training split.
grid_sv.fit(X_train, y_train)

# Only this last expression is echoed by the notebook: the best C of the coarse grid.
grid_sv.best_params_

{'C': 10}

- 범위 좁히기

In [35]:
# Second pass: a narrower set of C values between 0.3 and 8.
params = {'C': [0.3, 0.6, 1, 4, 8]}
grid_sv = GridSearchCV(
    estimator=svc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)

In [36]:
# Fit the narrowed search on the training split and show its best C.
grid_sv.fit(X_train, y_train)
grid_sv.best_params_

{'C': 4}

In [37]:
# Third pass: integer C values 2..6, centred on the previous best (C = 4).
params = {'C': list(range(2, 7))}
grid_sv = GridSearchCV(
    estimator=svc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_sv.fit(X_train, y_train)
# Echo the best C found in this final pass.
grid_sv.best_params_

{'C': 3}

In [40]:
# Take the best SVC from the final grid search ...
svc_best = grid_sv.best_estimator_
# ... and evaluate it on the held-out test split.
svc_best.score(X_test, y_test)

1.0