In [68]:
from sklearn.utils import all_estimators
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np

## [1] 데이터 가져오기 및 전처리

In [40]:
mushroom = pd.read_csv("../Data/mushroom.csv", header = None)

In [41]:
mushroom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       8124 non-null   object
 1   1       8124 non-null   object
 2   2       8124 non-null   object
 3   3       8124 non-null   object
 4   4       8124 non-null   object
 5   5       8124 non-null   object
 6   6       8124 non-null   object
 7   7       8124 non-null   object
 8   8       8124 non-null   object
 9   9       8124 non-null   object
 10  10      8124 non-null   object
 11  11      8124 non-null   object
 12  12      8124 non-null   object
 13  13      8124 non-null   object
 14  14      8124 non-null   object
 15  15      8124 non-null   object
 16  16      8124 non-null   object
 17  17      8124 non-null   object
 18  18      8124 non-null   object
 19  19      8124 non-null   object
 20  20      8124 non-null   object
 21  21      8124 non-null   object
 22  22      8124 non-null   

In [66]:
# 데이터와 타겟으로 분리
data = mushroom.iloc[:, 1:]
target = mushroom[0]

In [69]:
# labelencoding 사용
le = LabelEncoder()

for column_name in data.columns:
    if data[column_name].dtype == object:
        data[column_name] = le.fit_transform(data[column_name])
    else:
        pass


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [70]:
# 특징 배열 및 벡터 형태로 변경
X = data.to_numpy()
y = target.to_numpy()

X.shape, y.shape

((8124, 22), (8124,))

In [71]:
# train test 셋 분리
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2
)

## [2] 분류 모델 선택

In [75]:
def all_estimators_classifier(data, target):
    from collections import defaultdict
    # 필터 타입에 해당하는 sklearn에 존재하는 모든 모델 이름과 객체 리스트로 반환
    models = all_estimators(type_filter = "classifier")

    train_X, test_X, train_y, test_y = train_test_split(
    data, target, test_size=0.2, random_state=42
    )

    scores = defaultdict()

    for name, model in models:
        try:
            # 모델 객체 생성
            md = model()
            # 학습
            md.fit(train_X, train_y)
            # 평가
            score = md.score(test_X, test_y)
            scores[name] = score
        except:
            pass

    return dict(scores)

In [76]:
all_estimators_classifier(train_X, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

{'AdaBoostClassifier': 1.0,
 'BaggingClassifier': 1.0,
 'BernoulliNB': 0.8392307692307692,
 'CalibratedClassifierCV': 0.9323076923076923,
 'CategoricalNB': 0.9338461538461539,
 'ComplementNB': 0.7792307692307693,
 'DecisionTreeClassifier': 1.0,
 'DummyClassifier': 0.5130769230769231,
 'ExtraTreeClassifier': 1.0,
 'ExtraTreesClassifier': 1.0,
 'GaussianNB': 0.9138461538461539,
 'GaussianProcessClassifier': 1.0,
 'GradientBoostingClassifier': 1.0,
 'HistGradientBoostingClassifier': 1.0,
 'KNeighborsClassifier': 0.9976923076923077,
 'LabelPropagation': 1.0,
 'LabelSpreading': 1.0,
 'LinearDiscriminantAnalysis': 0.9323076923076923,
 'LinearSVC': 0.9253846153846154,
 'LogisticRegression': 0.9346153846153846,
 'LogisticRegressionCV': 0.9507692307692308,
 'MLPClassifier': 1.0,
 'MultinomialNB': 0.7769230769230769,
 'NearestCentroid': 0.7584615384615384,
 'NuSVC': 0.88,
 'PassiveAggressiveClassifier': 0.933076923076923,
 'Perceptron': 0.9407692307692308,
 'QuadraticDiscriminantAnalysis': 0.652

## [3] RandomForestClassifier 사용

In [73]:
# 모델 학습 및 평가
rf = RandomForestClassifier(
    n_jobs = 4
)

rf.fit(train_X, train_y)

rf.score(test_X, test_y)

1.0

In [77]:
# cross validate 사용
result = cross_validate(rf, X, y, return_train_score=True, cv=10)

In [78]:
result

{'fit_time': array([4.46918106, 0.57073569, 0.49513888, 0.61345172, 0.56470299,
        0.57488656, 0.62123084, 0.49336791, 0.48866487, 0.55358529]),
 'score_time': array([0.1054213 , 0.1048336 , 0.10460329, 0.10425591, 0.10448956,
        0.10429955, 0.10899591, 0.10377288, 0.10436201, 0.10434127]),
 'test_score': array([0.68511685, 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 0.95197044, 1.        ]),
 'train_score': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}