### sklearn 모델 추천 메서드 활용
---
- sklearn.utils 모듈의 all_estimators()
    * type_filter 파라미터 : 'classifier', 'regressor' 지정
    * 반환 : 해당 타입의 모델 리스트 => 모델 이름, 모델 객체

In [1]:
# 모듈 로딩
from sklearn.utils import all_estimators
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

### [1] 데이터 로딩 및 확인
---

In [2]:
# Bunch 데이터 타입 => dict와 유사한 형태
data = load_iris()

In [3]:
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [4]:
# data, target => numpy  타입
# target_names => 라벨 -> setosa, versicolor ...
# feature_names => 컬럼명 -> petal_length, ...

# 필요한 것은 data와 target
input_data = data['data']
input_target = data['target']

In [5]:
input_data.shape, input_target.shape

((150, 4), (150,))

### [2] 학습 데이터 분리
---

In [6]:
# 학습-테스트 데이터셋 구분 (80:20)
train_X, test_X, train_y, test_y = train_test_split(input_data,
                                                    input_target,
                                                    test_size=0.2)

In [9]:
# 한번 더 추가해서 학습-검증용 데이터셋으로 추가 구분 (80:20)
train_X, val_X, train_y, val_y = train_test_split(train_X,
                                                    train_y,
                                                    test_size=0.2)

In [10]:
# iris의 경우 150개 데이터 밖에 없어서 
# - 이렇게 쪼개면 훈련할 데이터가 너무 적음
# 

### [3] 학습
---
- 학습방법 선정 -> 분류 / 회귀
- 분류 => kNN, LogisticRegression, DecisionTree, SGDClassifier, SVC
- 회귀 => kNN, DecisionTree
- but, 이번에는 유틸을 써서 해보자

- 분류 모델 적용 후, 결과 -> all_estimators()

In [16]:
# 필터 타입에 해당하는 sklearn에 존재하는 모든 모델 이름과 객체 리스트로 반환
models_c = all_estimators(type_filter='classifier')

In [17]:
models_c

[('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
 ('BaggingClassifier', sklearn.ensemble._bagging.BaggingClassifier),
 ('BernoulliNB', sklearn.naive_bayes.BernoulliNB),
 ('CalibratedClassifierCV', sklearn.calibration.CalibratedClassifierCV),
 ('CategoricalNB', sklearn.naive_bayes.CategoricalNB),
 ('ClassifierChain', sklearn.multioutput.ClassifierChain),
 ('ComplementNB', sklearn.naive_bayes.ComplementNB),
 ('DecisionTreeClassifier', sklearn.tree._classes.DecisionTreeClassifier),
 ('DummyClassifier', sklearn.dummy.DummyClassifier),
 ('ExtraTreeClassifier', sklearn.tree._classes.ExtraTreeClassifier),
 ('ExtraTreesClassifier', sklearn.ensemble._forest.ExtraTreesClassifier),
 ('GaussianNB', sklearn.naive_bayes.GaussianNB),
 ('GaussianProcessClassifier',
  sklearn.gaussian_process._gpc.GaussianProcessClassifier),
 ('GradientBoostingClassifier',
  sklearn.ensemble._gb.GradientBoostingClassifier),
 ('HistGradientBoostingClassifier',
  sklearn.ensemble._hist_gradi

In [18]:
models_r = all_estimators(type_filter='regressor')

In [19]:
models_r

[('ARDRegression', sklearn.linear_model._bayes.ARDRegression),
 ('AdaBoostRegressor', sklearn.ensemble._weight_boosting.AdaBoostRegressor),
 ('BaggingRegressor', sklearn.ensemble._bagging.BaggingRegressor),
 ('BayesianRidge', sklearn.linear_model._bayes.BayesianRidge),
 ('CCA', sklearn.cross_decomposition._pls.CCA),
 ('DecisionTreeRegressor', sklearn.tree._classes.DecisionTreeRegressor),
 ('DummyRegressor', sklearn.dummy.DummyRegressor),
 ('ElasticNet', sklearn.linear_model._coordinate_descent.ElasticNet),
 ('ElasticNetCV', sklearn.linear_model._coordinate_descent.ElasticNetCV),
 ('ExtraTreeRegressor', sklearn.tree._classes.ExtraTreeRegressor),
 ('ExtraTreesRegressor', sklearn.ensemble._forest.ExtraTreesRegressor),
 ('GammaRegressor', sklearn.linear_model._glm.glm.GammaRegressor),
 ('GaussianProcessRegressor',
  sklearn.gaussian_process._gpr.GaussianProcessRegressor),
 ('GradientBoostingRegressor', sklearn.ensemble._gb.GradientBoostingRegressor),
 ('HistGradientBoostingRegressor',
  sk

In [None]:
# SGD를 하기에는 횟수가 적다는 것
=> 조건문으로 SGD인 것은 배제하거나
=> warningignore

In [14]:
scores=[]
for name, model in models:
    try :
        # 모델 객체 생성
        md=model()
        # 학습
        md.fit(train_X, train_y)
        #평가
        result = md.score(test_X, test_y)
        scores.append((name, np.round(result,3)))
        # 모델명과 결과를 따로 담아도 되고
    except:
        pass   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [15]:
# 이 중에서 하나의 모델을 골라서
# - 하이퍼 파라미터를 조절하면서 성능을 향상 시켜봐
scores

[('AdaBoostClassifier', 0.933),
 ('BaggingClassifier', 0.933),
 ('BernoulliNB', 0.3),
 ('CalibratedClassifierCV', 0.8),
 ('CategoricalNB', 1.0),
 ('ComplementNB', 0.7),
 ('DecisionTreeClassifier', 0.933),
 ('DummyClassifier', 0.3),
 ('ExtraTreeClassifier', 0.9),
 ('ExtraTreesClassifier', 0.933),
 ('GaussianNB', 0.933),
 ('GaussianProcessClassifier', 0.933),
 ('GradientBoostingClassifier', 0.933),
 ('HistGradientBoostingClassifier', 0.967),
 ('KNeighborsClassifier', 0.933),
 ('LabelPropagation', 0.933),
 ('LabelSpreading', 0.933),
 ('LinearDiscriminantAnalysis', 0.967),
 ('LinearSVC', 0.9),
 ('LogisticRegression', 0.933),
 ('LogisticRegressionCV', 0.933),
 ('MLPClassifier', 1.0),
 ('MultinomialNB', 0.767),
 ('NearestCentroid', 0.967),
 ('NuSVC', 0.933),
 ('PassiveAggressiveClassifier', 0.767),
 ('Perceptron', 0.533),
 ('QuadraticDiscriminantAnalysis', 0.967),
 ('RadiusNeighborsClassifier', 0.933),
 ('RandomForestClassifier', 0.933),
 ('RidgeClassifier', 0.767),
 ('RidgeClassifierCV', 0.