### Bagging
#### 위스콘신 유방암 데이터

- from sklearn.ensemble import BaggingClassifier

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [2]:
cancer = datasets.load_breast_cancer()

In [3]:
print(cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [4]:
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, train_size=0.8, test_size=0.2,
                                                    random_state=156)


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score

In [7]:
lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(x_train, y_train)

pred_lr = lr_clf.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
print(accuracy_score(y_test, pred_lr))
print(mean_squared_error(y_test, pred_lr))

0.956140350877193
0.043859649122807015


In [10]:
from sklearn.ensemble import BaggingClassifier

bag_clf = BaggingClassifier(base_estimator=lr_clf,
                            n_estimators=5,
                            verbose=1)

In [11]:
lr_clf_bag = bag_clf.fit(x_train, y_train)
pred_lr_bag = lr_clf_bag.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [12]:
pred_lr_bag

array([1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1])

In [13]:
print(accuracy_score(y_test, pred_lr_bag))
print(mean_squared_error(y_test, pred_lr_bag))

0.9473684210526315
0.05263157894736842


In [14]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()
dt_clf.fit(x_train, y_train)
pred_dt = dt_clf.predict(x_test)
print(accuracy_score(y_test,pred_dt))

0.9385964912280702


In [17]:

bag_dt_clf = BaggingClassifier(base_estimator=dt_clf,
                            n_estimators=100,
                            verbose=1)

In [18]:
bag_dt_clf.fit(x_train, y_train)
pred_dt_bag = bag_dt_clf.predict(x_test)

print(accuracy_score(y_test, pred_dt_bag))
print(mean_squared_error(y_test, pred_dt_bag))



0.9649122807017544
0.03508771929824561


### RandomForest

- from sklearn.ensemble import RandomForestClassifier
- from sklearn.model_selection import GridSearchCV

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf_clf = RandomForestClassifier(n_estimators=5,
                                max_depth=3,
                                random_state=103,
                                verbose=1)
rf_clf.fit(x_train, y_train)
pred = rf_clf.predict(x_test)
print(accuracy_score(y_test, pred))

0.9298245614035088


In [20]:
rf_clf = RandomForestClassifier(n_estimators=500,
                                max_depth=3,
                                random_state=103,
                                verbose=1)
rf_clf.fit(x_train, y_train)
pred = rf_clf.predict(x_test)
print(accuracy_score(y_test, pred))

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.6s


0.9385964912280702


[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.0s


In [21]:
rf_clf = RandomForestClassifier(n_estimators=500,
                                max_depth=10,
                                random_state=103,
                                verbose=1)
rf_clf.fit(x_train, y_train)
pred = rf_clf.predict(x_test)
print(accuracy_score(y_test, pred))

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s


0.9473684210526315


[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.0s


In [22]:
rf_clf4 = RandomForestClassifier()

In [23]:
params = {'n_estimators':[10, 100, 500, 1000],
          'max_depth' : [3, 5, 10, 15]}

rf_clf4 = RandomForestClassifier(random_state=103,
                                 n_jobs=-1,
                                 verbose=1)
grid_cv = GridSearchCV(rf_clf4,
                       param_grid=params,
                       )
grid_cv.fit(x_train, y_train)
print(grid_cv.best_params_)
print(grid_cv.best_score_)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_j

{'max_depth': 10, 'n_estimators': 1000}
0.9670329670329672


[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.2s finished
