In [None]:
import pandas as pd

In [None]:
from sklearn.datasets import load_wine

In [None]:
# Load the wine dataset that ships with scikit-learn.
wine = load_wine()
# Feature values of the wine dataset.
wine_data = wine.data
# Target labels of the wine dataset.
wine_target = wine.target
# Build a DataFrame from the feature values. `feature_names` is passed as-is:
# wrapping it in another list makes pandas build a one-level MultiIndex whose
# labels are tuples such as ('alcohol',) instead of plain string column names.
df_wine = pd.DataFrame(data=wine_data, columns=wine.feature_names)
# Append the target labels as an extra column next to the features.
df_wine['target'] = wine_target
df_wine

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [None]:
# Inspect the column index of df_wine to check how the columns are labelled.
df_wine.columns

MultiIndex([(                     'alcohol',),
            (                  'malic_acid',),
            (                         'ash',),
            (           'alcalinity_of_ash',),
            (                   'magnesium',),
            (               'total_phenols',),
            (                  'flavanoids',),
            (        'nonflavanoid_phenols',),
            (             'proanthocyanins',),
            (             'color_intensity',),
            (                         'hue',),
            ('od280/od315_of_diluted_wines',),
            (                     'proline',),
            (                      'target',)],
           )

In [None]:
# Use three of the feature columns as model inputs.
feature_columns = ['alcohol', 'ash', 'hue']
data = df_wine[feature_columns].to_numpy()
# Class labels the model should predict.
target = df_wine['target'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed seed keeps the split repeatable.
train_test_parts = train_test_split(data, target, test_size=0.2, random_state=42)
train_input, test_input, train_target, test_target = train_test_parts

In [None]:
# Split the training portion once more: 20% of it becomes the validation set.
sub_val_parts = train_test_split(train_input, train_target, test_size=0.2, random_state=42)
sub_input, val_input, sub_target, val_target = sub_val_parts

In [None]:
# Show the shapes of the sub-training and validation inputs side by side.
print(*(part.shape for part in (sub_input, val_input)))

(113, 3) (29, 3)


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree (seeded, otherwise default settings) on the sub-training split.
dt = DecisionTreeClassifier(random_state=42).fit(sub_input, sub_target)

# Score on the data the tree was fitted on, then on the held-out validation split.
sub_score = dt.score(sub_input, sub_target)
val_score = dt.score(val_input, val_target)
print(sub_score)
print(val_score)

1.0
0.896551724137931


In [None]:
from sklearn.model_selection import cross_validate

# Cross-validate the tree on the full training split; no `cv` is given, so the library default applies.
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
print(scores)

{'fit_time': array([0.00188541, 0.00149846, 0.00142407, 0.00153327, 0.00142884]), 'score_time': array([0.00089025, 0.00077391, 0.00073695, 0.00075245, 0.0006969 ]), 'test_score': array([0.82758621, 0.75862069, 0.92857143, 0.75      , 0.96428571])}


In [None]:
import numpy as np

# Average the per-fold test scores.
print(scores['test_score'].mean())

0.8458128078817735


In [None]:
from sklearn.model_selection import StratifiedKFold

# Repeat the cross-validation with a StratifiedKFold splitter passed explicitly.
default_splitter = StratifiedKFold()
scores = cross_validate(estimator=dt, X=train_input, y=train_target, cv=default_splitter)
print(scores['test_score'].mean())

0.8458128078817735


In [None]:
# Ten shuffled stratified folds; the seed makes the shuffling repeatable.
splitter = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
scores = cross_validate(estimator=dt, X=train_input, y=train_target, cv=splitter)
mean_test_score = scores['test_score'].mean()
print(mean_test_score)

0.8323809523809524


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values of min_impurity_decrease for the grid search to try.
params = dict(
    min_impurity_decrease=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
)

In [None]:
# Grid search over `params` with a seeded decision tree; n_jobs=-1 requests all available cores.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split.
gs.fit(X=train_input, y=train_target)

In [None]:
# Take the best estimator found by the grid search and score it on the training split.
dt = gs.best_estimator_
train_score = dt.score(train_input, train_target)
print(train_score)

1.0


In [None]:
# Show the parameter combination selected by the grid search.
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [None]:
# Show the mean cross-validation test score recorded for each candidate in the grid.
print(gs.cv_results_['mean_test_score'])

[0.84581281 0.84581281 0.84581281 0.84581281 0.84581281]


In [None]:
# Recover the winning parameters by hand: find the position of the highest
# mean test score and look up the parameter set stored at that position.
mean_scores = gs.cv_results_['mean_test_score']
best_index = mean_scores.argmax()
print(gs.cv_results_['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [None]:
# Larger search grid combining three decision-tree hyperparameters.
params = dict(
    min_impurity_decrease=np.arange(0.0001, 0.001, 0.0001),
    max_depth=range(5, 20),
    min_samples_split=range(2, 100, 10),
)

In [None]:
# Rebuild the grid search with the current `params` grid and run it on the training split.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)
gs.fit(X=train_input, y=train_target)

In [None]:
# Show the parameter combination selected by the most recent grid search.
print(gs.best_params_)

{'max_depth': 5, 'min_impurity_decrease': 0.0001, 'min_samples_split': 2}


In [None]:
# Highest mean cross-validation test score among all candidates.
print(gs.cv_results_['mean_test_score'].max())

0.8458128078817735
