## 정확도 경진대회
- load_digits
- load_breast_cancer

In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

### 1) load_digits

In [2]:
from sklearn.datasets import load_digits
digits = load_digits()

In [3]:
digits.data.shape

(1797, 64)

In [4]:
digits.target.shape

(1797,)

#### 1.1) 정규화 - StandardScaler

In [5]:
scaler = StandardScaler()
scaled_digits = scaler.fit_transform(digits.data)

#### train / test 세트 분리 - test_size=0.2, random_state=2021

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    scaled_digits, digits.target, test_size=0.2, random_state=2021
)

#### 1.1.1) 학습/예측/평가 - dt, lr, svc, knn

In [7]:
dt = DecisionTreeClassifier()
lr = LogisticRegression()
svc = SVC()
knn = KNeighborsClassifier()

In [8]:
classifiers = [lr, svc, knn]
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    acc = accuracy_score(y_test, pred)
    class_name = classifier.__class__.__name__
    print(f'{class_name} 정확도: {acc:.4f}')

LogisticRegression 정확도: 0.9639
SVC 정확도: 0.9806
KNeighborsClassifier 정확도: 0.9750


#### 1.1.2) 앙상블 학습/예측/평가 수행 - dt, lr, svc, knn

- voting='hard'

In [9]:
vo_clf = VotingClassifier(
    estimators=[('DT', dt), ('LR', lr), ('SVC', svc), ('KNN', knn)], voting='hard'
)
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'앙상블 학습 정확도: {acc:.4f}')

앙상블 학습 정확도: 0.9750


#### 1.1.3) 랜덤 포레스트 학습/예측/평가

In [10]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'랜덤 포레스트 정확도: {acc:.4f}')

랜덤 포레스트 정확도: 0.9694


In [11]:
rf_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [12]:
params = {
    'n_estimators': [180, 190, 200],
    'max_depth': [12, 14, 16],
    'min_samples_split': [2, 3, 4]
}

In [13]:
rf_clf = RandomForestClassifier(random_state=156, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9749
최적 파라미터: {'max_depth': 12, 'min_samples_split': 2, 'n_estimators': 200}


In [14]:
best_clf = grid_cv.best_estimator_
pred = best_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'최적 파라미터 랜덤 포레스트 정확도: {acc:.4f}')

최적 파라미터 랜덤 포레스트 정확도: 0.9778


#### 1.2) 정규화 - MinMaxScaler

In [15]:
scaler = MinMaxScaler()
scaled_digits = scaler.fit_transform(digits.data)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    scaled_digits, digits.target, test_size=0.2, random_state=2021
)

#### 1.2.1) 학습/예측/평가 - dt, lr, svc, knn

In [17]:
dt = DecisionTreeClassifier()
lr = LogisticRegression()
svc = SVC()
knn = KNeighborsClassifier()

In [18]:
classifiers = [lr, svc, knn]
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    acc = accuracy_score(y_test, pred)
    class_name = classifier.__class__.__name__
    print(f'{class_name} 정확도: {acc:.4f}')

LogisticRegression 정확도: 0.9528
SVC 정확도: 0.9778
KNeighborsClassifier 정확도: 0.9889


#### 1.2.2) 앙상블 학습/예측/평가 수행 - dt, lr, svc, knn

- voting='hard'

In [19]:
vo_clf = VotingClassifier(
    estimators=[('DT', dt), ('LR', lr), ('SVC', svc), ('KNN', knn)], voting='hard'
)
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'앙상블 학습 정확도: {acc:.4f}')

앙상블 학습 정확도: 0.9750


- voting='soft' (svc 제외)

In [20]:
vo_clf = VotingClassifier(
    estimators=[('DT', dt), ('LR', lr), ('KNN', knn)], voting='soft'
)
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'앙상블 학습 정확도: {acc:.4f}')

앙상블 학습 정확도: 0.9722


#### 1.2.3) 랜덤 포레스트 학습/예측/평가

In [21]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'랜덤 포레스트 정확도: {acc:.4f}')

랜덤 포레스트 정확도: 0.9611


In [22]:
params = {
    'n_estimators': [100, 120, 140],
    'max_depth': [8, 12, 16],
    'min_samples_split': [5, 10, 15]
}

In [23]:
rf_clf = RandomForestClassifier(n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9736
최적 파라미터: {'max_depth': 12, 'min_samples_split': 5, 'n_estimators': 120}


In [24]:
best_clf = grid_cv.best_estimator_
pred = best_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'최적 파라미터 랜덤 포레스트 정확도: {acc:.4f}')

최적 파라미터 랜덤 포레스트 정확도: 0.9583


#### 1.2.4) KNeighborsClassifier

In [25]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'KNeighborsClassifier 정확도: {acc:.4f}')

KNeighborsClassifier 정확도: 0.9889


In [26]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

#### load_digits 최고 정확도 결과값: 0.9889
- 정규화: MinMaxScaler
- 분류모델: KNeighborsClassifier 

### 2) load_breast_cancer

In [28]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [29]:
cancer.data.shape

(569, 30)

In [30]:
cancer.target.shape

(569,)

#### 2.1) 정규화 - StandardScaler

In [31]:
scaler = StandardScaler()
scaled_cancer = scaler.fit_transform(cancer.data)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    scaled_cancer, cancer.target, test_size=0.2, random_state=2021
)

#### 2.1.1) 학습/예측/평가 수행 - dt, lr, svc, knn

In [33]:
dt = DecisionTreeClassifier()
lr = LogisticRegression()
svc = SVC()
knn = KNeighborsClassifier()

In [34]:
classifiers = [dt, lr, svc, knn]
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    acc = accuracy_score(y_test, pred)
    class_name = classifier.__class__.__name__
    print(f'{class_name} 정확도: {acc:.4f}')

DecisionTreeClassifier 정확도: 0.9298
LogisticRegression 정확도: 0.9912
SVC 정확도: 0.9912
KNeighborsClassifier 정확도: 0.9649


#### 2.1.2) 앙상블 학습/예측/평가 수행 - dt, lr, svc, knn

- voting='hard'

In [35]:
vo_clf = VotingClassifier(
    estimators=[('DT', dt), ('LR', lr), ('SVC', svc), ('KNN', knn)], voting='hard'
)
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'앙상블 학습 정확도: {acc:.4f}')

앙상블 학습 정확도: 0.9912


- voting='soft' (svc 제외)

In [36]:
vo_clf = VotingClassifier(
    estimators=[('DT', dt), ('LR', lr), ('KNN', knn)], voting='soft'
)
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'앙상블 학습 정확도: {acc:.4f}')

앙상블 학습 정확도: 0.9737


#### 2.1.3) 랜덤 포레스트 학습/예측/평가

In [37]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'랜덤 포레스트 정확도: {acc:.4f}')

랜덤 포레스트 정확도: 0.9649


In [38]:
rf_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [39]:
params = {
    'n_estimators': [100, 120, 140],
    'max_depth': [2, 6, 10],
    'min_samples_split': [6, 8, 10]
}

In [40]:
rf_clf = RandomForestClassifier(n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9604
최적 파라미터: {'max_depth': 10, 'min_samples_split': 8, 'n_estimators': 100}


In [41]:
best_clf = grid_cv.best_estimator_
pred = best_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'최적 파라미터 랜덤 포레스트 정확도: {acc:.4f}')

최적 파라미터 랜덤 포레스트 정확도: 0.9649


#### 2.2) 정규화 - MinMaxScaler

In [42]:
scaler = MinMaxScaler()
scaled_cancer = scaler.fit_transform(cancer.data)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(
    scaled_cancer, cancer.target, test_size=0.2, random_state=2021
)

In [44]:
dt = DecisionTreeClassifier()
lr = LogisticRegression()
svc = SVC()
knn = KNeighborsClassifier()

In [45]:
classifiers = [dt, lr, svc, knn]
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    acc = accuracy_score(y_test, pred)
    class_name = classifier.__class__.__name__
    print(f'{class_name} 정확도: {acc:.4f}')

DecisionTreeClassifier 정확도: 0.9123
LogisticRegression 정확도: 0.9561
SVC 정확도: 0.9912
KNeighborsClassifier 정확도: 0.9825


#### 2.2.2) 앙상블 학습/예측/평가 수행 - dt, lr, svc, knn

- voting='hard'

In [46]:
vo_clf = VotingClassifier(
    estimators=[('DT', dt), ('LR', lr), ('SVC', svc), ('KNN', knn)], voting='hard'
)
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'앙상블 학습 정확도: {acc:.4f}')

앙상블 학습 정확도: 0.9912


- voting='soft' (svc 제외)

In [47]:
vo_clf = VotingClassifier(
    estimators=[('DT', dt), ('LR', lr), ('KNN', knn)], voting='soft'
)
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'앙상블 학습 정확도: {acc:.4f}')

앙상블 학습 정확도: 0.9649


#### 2.2.3) 랜덤 포레스트 학습/예측/평가

In [48]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'랜덤 포레스트 정확도: {acc:.4f}')

랜덤 포레스트 정확도: 0.9561


In [49]:
params = {
    'n_estimators': [100, 120, 140],
    'max_depth': [8, 12, 16],
    'min_samples_split': [5, 7, 9]
}

In [50]:
rf_clf = RandomForestClassifier(n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9604
최적 파라미터: {'max_depth': 8, 'min_samples_split': 5, 'n_estimators': 100}


In [51]:
best_clf = grid_cv.best_estimator_
pred = best_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'최적 파라미터 랜덤 포레스트 정확도: {acc:.4f}')

최적 파라미터 랜덤 포레스트 정확도: 0.9649


#### 2.2.4) SVC 학습/예측/평가

In [52]:
svc = SVC()
svc.fit(X_train, y_train)
pred = svc.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'SVC 정확도: {acc:.4f}')

SVC 정확도: 0.9912


In [53]:
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [54]:
params = {
    'degree': [2, 3, 4]
}
grid_clf = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=130)
grid_clf.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_clf.best_score_:.4f}')
print(f'최적 파라미터: {grid_clf.best_params_}')

최고 평균 정확도: 0.9776
최적 파라미터: {'degree': 2}


In [55]:
best_clf = grid_clf.best_estimator_
pred = best_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'최적 파라미터 정확도: {acc:.4f}')

최적 파라미터 정확도: 0.9912


#### load_breast_cancer 최고 정확도 결과값: 0.9912
- 정규화: StandardScaler
- 분류모델: LogisticRegression 또는 SVC
****************************************************
- 정규화: MinMaxScaler
- 분류모델: SVC