In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_breast_cancer
dataset = load_breast_cancer()

In [3]:
t = dataset.target
x = dataset.data

In [4]:
x.shape, t.shape

((569, 30), (569,))

In [6]:
from sklearn.model_selection import train_test_split
x_train_val, x_test, t_train_val, t_test = train_test_split(x, t, test_size = 0.2, random_state = 1)

In [7]:
x_train, x_val, t_train, t_val = train_test_split(x_train_val, t_train_val, test_size = 0.3, random_state = 1)

In [8]:
x_train.shape, x_val.shape, x_test.shape

((318, 30), (137, 30), (114, 30))

In [9]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(random_state = 0)

In [10]:
dtree.fit(x_train, t_train)

DecisionTreeClassifier(random_state=0)

In [11]:
print('train score : ', dtree.score(x_train, t_train))
print('validation score : ', dtree.score(x_val, t_val))

train score :  1.0
validation score :  0.927007299270073


In [13]:
dtree = DecisionTreeClassifier(max_depth = 10, min_samples_split = 30, random_state = 0)

dtree.fit(x_train, t_train)

DecisionTreeClassifier(max_depth=10, min_samples_split=30, random_state=0)

In [14]:
print('train score : ', dtree.score(x_train, t_train))
print('validation score : ', dtree.score(x_val, t_val))

train score :  0.9308176100628931
validation score :  0.9562043795620438


In [15]:
print('test score : ', dtree.score(x_test, t_test))

test score :  0.9298245614035088


In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
estimator = DecisionTreeClassifier(random_state = 0)

In [18]:
param_grid = [{
    'max_depth': [3, 20, 50],
    'min_samples_split': [3, 20, 30]
}]

In [19]:
cv = 5

In [20]:
tuned_model = GridSearchCV(estimator = estimator,
                          param_grid = param_grid,
                          cv = cv,
                          return_train_score = False)

In [21]:
tuned_model.fit(x_train_val, t_train_val)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=0),
             param_grid=[{'max_depth': [3, 20, 50],
                          'min_samples_split': [3, 20, 30]}])

In [22]:
pd.DataFrame(tuned_model.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,0.003629,0.004887,0.003493,0.004036,0.003613,0.004291,0.003635,0.003809,0.004108
std_fit_time,0.000149,0.001002,0.000081,0.000225,0.000283,0.000334,0.000425,0.000231,0.000395
mean_score_time,0.000403,0.000602,0.000354,0.000403,0.000379,0.000481,0.000356,0.00035,0.000454
std_score_time,0.000032,0.000206,0.000016,0.000102,0.000058,0.000087,0.000064,0.000018,0.00012
param_max_depth,3,3,3,20,20,20,50,50,50
param_min_samples_split,3,20,30,3,20,30,3,20,30
params,"{'max_depth': 3, 'min_samples_split': 3}","{'max_depth': 3, 'min_samples_split': 20}","{'max_depth': 3, 'min_samples_split': 30}","{'max_depth': 20, 'min_samples_split': 3}","{'max_depth': 20, 'min_samples_split': 20}","{'max_depth': 20, 'min_samples_split': 30}","{'max_depth': 50, 'min_samples_split': 3}","{'max_depth': 50, 'min_samples_split': 20}","{'max_depth': 50, 'min_samples_split': 30}"
split0_test_score,0.923077,0.912088,0.912088,0.956044,0.912088,0.912088,0.956044,0.912088,0.912088
split1_test_score,0.901099,0.901099,0.901099,0.912088,0.901099,0.901099,0.912088,0.901099,0.901099
split2_test_score,0.934066,0.934066,0.934066,0.923077,0.934066,0.934066,0.923077,0.934066,0.934066


In [23]:
estimator = DecisionTreeClassifier(random_state = 0)
cv = 5
param_grid = [{
    'max_depth': [5, 10, 15],
    'min_samples_split': [10, 12, 15]
}]

In [25]:
tuned_model = GridSearchCV(estimator = estimator,
                          param_grid = param_grid,
                          cv = cv,
                          return_train_score = False)

tuned_model.fit(x_train_val, t_train_val)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=0),
             param_grid=[{'max_depth': [5, 10, 15],
                          'min_samples_split': [10, 12, 15]}])

In [26]:
pd.DataFrame(tuned_model.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,0.004435,0.004302,0.004126,0.003521,0.003108,0.003081,0.003072,0.003102,0.002992
std_fit_time,0.000214,0.000202,0.000227,0.000346,0.000172,0.000161,0.000146,0.000127,0.000114
mean_score_time,0.000447,0.000421,0.000371,0.000299,0.000258,0.000254,0.000245,0.000274,0.000244
std_score_time,0.000048,0.00003,0.00005,0.000051,0.00001,0.000007,0.000005,0.000055,0.000001
param_max_depth,5,5,5,10,10,10,15,15,15
param_min_samples_split,10,12,15,10,12,15,10,12,15
params,"{'max_depth': 5, 'min_samples_split': 10}","{'max_depth': 5, 'min_samples_split': 12}","{'max_depth': 5, 'min_samples_split': 15}","{'max_depth': 10, 'min_samples_split': 10}","{'max_depth': 10, 'min_samples_split': 12}","{'max_depth': 10, 'min_samples_split': 15}","{'max_depth': 15, 'min_samples_split': 10}","{'max_depth': 15, 'min_samples_split': 12}","{'max_depth': 15, 'min_samples_split': 15}"
split0_test_score,0.967033,0.923077,0.912088,0.967033,0.923077,0.912088,0.967033,0.923077,0.912088
split1_test_score,0.912088,0.901099,0.901099,0.912088,0.901099,0.901099,0.912088,0.901099,0.901099
split2_test_score,0.923077,0.934066,0.934066,0.923077,0.934066,0.934066,0.923077,0.934066,0.934066


In [27]:
tuned_model.best_params_

{'max_depth': 5, 'min_samples_split': 10}

In [28]:
best_model = tuned_model.best_estimator_

print(best_model.score(x_train_val, t_train_val))
print(best_model.score(x_test, t_test))

0.9934065934065934
0.956140350877193


In [29]:
from sklearn.model_selection import RandomizedSearchCV

In [30]:
estimator = DecisionTreeClassifier(random_state = 0)

In [31]:
list(range(1, 10, 2))

[1, 3, 5, 7, 9]

In [32]:
param_distributions = {
    'max_depth': list(range(5, 100, 2)),
    'min_samples_split': list(range(2, 50, 1))
}

In [33]:
n_iter = 100
cv = 5

In [34]:
tuned_model = RandomizedSearchCV(estimator = estimator,
                                param_distributions = param_distributions,
                                n_iter = n_iter,
                                cv = cv,
                                random_state = 0,
                                return_train_score = False)

In [35]:
tuned_model.fit(x_train_val, t_train_val)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=0),
                   n_iter=100,
                   param_distributions={'max_depth': [5, 7, 9, 11, 13, 15, 17,
                                                      19, 21, 23, 25, 27, 29,
                                                      31, 33, 35, 37, 39, 41,
                                                      43, 45, 47, 49, 51, 53,
                                                      55, 57, 59, 61, 63, ...],
                                        'min_samples_split': [2, 3, 4, 5, 6, 7,
                                                              8, 9, 10, 11, 12,
                                                              13, 14, 15, 16,
                                                              17, 18, 19, 20,
                                                              21, 22, 23, 24,
                                                              25, 26, 27, 28,
                             

In [36]:
pd.DataFrame(tuned_model.cv_results_).sort_values('rank_test_score').T

Unnamed: 0,47,77,82,90,42,19,28,12,11,62,...,40,41,98,50,55,58,60,67,31,99
mean_fit_time,0.004102,0.0032,0.003187,0.003199,0.003182,0.003019,0.003045,0.003014,0.003055,0.003173,...,0.003045,0.003161,0.003076,0.00412,0.003014,0.003028,0.003065,0.003044,0.003034,0.003026
std_fit_time,0.000165,0.000279,0.00012,0.000105,0.00011,0.000124,0.000302,0.000139,0.00013,0.000108,...,0.00011,0.000257,0.000148,0.00019,0.00018,0.000185,0.000113,0.000196,0.000246,0.000129
mean_score_time,0.000364,0.000276,0.000282,0.000292,0.000265,0.000243,0.000238,0.000244,0.000254,0.000268,...,0.000257,0.000282,0.00029,0.000385,0.000255,0.000262,0.000267,0.000264,0.000267,0.00027
std_score_time,0.000025,0.00004,0.000022,0.000067,0.000011,0.000001,0.000005,0.000001,0.000014,0.000021,...,0.000006,0.00002,0.000033,0.000048,0.000007,0.000015,0.000018,0.000026,0.000049,0.000021
param_min_samples_split,10,10,4,4,7,9,11,2,8,7,...,49,31,45,27,43,36,36,47,44,39
param_max_depth,23,65,95,39,15,37,7,87,29,7,...,87,23,19,99,27,27,47,75,95,87
params,"{'min_samples_split': 10, 'max_depth': 23}","{'min_samples_split': 10, 'max_depth': 65}","{'min_samples_split': 4, 'max_depth': 95}","{'min_samples_split': 4, 'max_depth': 39}","{'min_samples_split': 7, 'max_depth': 15}","{'min_samples_split': 9, 'max_depth': 37}","{'min_samples_split': 11, 'max_depth': 7}","{'min_samples_split': 2, 'max_depth': 87}","{'min_samples_split': 8, 'max_depth': 29}","{'min_samples_split': 7, 'max_depth': 7}",...,"{'min_samples_split': 49, 'max_depth': 87}","{'min_samples_split': 31, 'max_depth': 23}","{'min_samples_split': 45, 'max_depth': 19}","{'min_samples_split': 27, 'max_depth': 99}","{'min_samples_split': 43, 'max_depth': 27}","{'min_samples_split': 36, 'max_depth': 27}","{'min_samples_split': 36, 'max_depth': 47}","{'min_samples_split': 47, 'max_depth': 75}","{'min_samples_split': 44, 'max_depth': 95}","{'min_samples_split': 39, 'max_depth': 87}"
split0_test_score,0.967033,0.967033,0.967033,0.967033,0.967033,0.967033,0.967033,0.956044,0.967033,0.967033,...,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088
split1_test_score,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.901099,0.912088,0.912088,0.912088,...,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099
split2_test_score,0.923077,0.923077,0.912088,0.912088,0.912088,0.912088,0.923077,0.923077,0.912088,0.912088,...,0.945055,0.934066,0.945055,0.934066,0.945055,0.945055,0.945055,0.945055,0.945055,0.945055


In [37]:
tuned_model.best_params_

{'min_samples_split': 10, 'max_depth': 23}

In [38]:
best_model = tuned_model.best_estimator_

In [39]:
print(best_model.score(x_train_val, t_train_val))
print(best_model.score(x_test, t_test))

0.9934065934065934
0.956140350877193


In [41]:
import optuna

In [42]:
from sklearn.model_selection import cross_val_score

In [43]:
def objective(trial, x, t, cv):
    max_depth = trial.suggest_int('max_depth', 2, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 100)
    
    estimator = DecisionTreeClassifier(
        max_depth = max_depth,
        min_samples_split = min_samples_split
    )
    
    print('Current_params : ', trial.params)
    accuracy = cross_val_score(estimator, x, t, cv = cv).mean()
    return accuracy

In [44]:
study = optuna.create_study(direction = 'maximize')

[32m[I 2021-04-17 22:51:33,084][0m A new study created in memory with name: no-name-16bc6e21-e7c6-48b6-8d17-c31bee248313[0m


In [46]:
cv = 5
study.optimize(lambda trial: objective(trial, x_train_val, t_train_val, cv), n_trials = 10)
print(study.best_trial)

[32m[I 2021-04-17 22:57:13,572][0m Trial 0 finished with value: 0.9186813186813187 and parameters: {'max_depth': 32, 'min_samples_split': 31}. Best is trial 0 with value: 0.9186813186813187.[0m
[32m[I 2021-04-17 22:57:13,603][0m Trial 1 finished with value: 0.9186813186813187 and parameters: {'max_depth': 40, 'min_samples_split': 62}. Best is trial 0 with value: 0.9186813186813187.[0m
[32m[I 2021-04-17 22:57:13,636][0m Trial 2 finished with value: 0.9208791208791209 and parameters: {'max_depth': 39, 'min_samples_split': 84}. Best is trial 2 with value: 0.9208791208791209.[0m
[32m[I 2021-04-17 22:57:13,675][0m Trial 3 finished with value: 0.9208791208791209 and parameters: {'max_depth': 75, 'min_samples_split': 37}. Best is trial 2 with value: 0.9208791208791209.[0m
[32m[I 2021-04-17 22:57:13,696][0m Trial 4 finished with value: 0.9186813186813187 and parameters: {'max_depth': 2, 'min_samples_split': 22}. Best is trial 2 with value: 0.9208791208791209.[0m
[32m[I 2021-04-

Current_params :  {'max_depth': 32, 'min_samples_split': 31}
Current_params :  {'max_depth': 40, 'min_samples_split': 62}
Current_params :  {'max_depth': 39, 'min_samples_split': 84}
Current_params :  {'max_depth': 75, 'min_samples_split': 37}
Current_params :  {'max_depth': 2, 'min_samples_split': 22}
Current_params :  {'max_depth': 6, 'min_samples_split': 11}
Current_params :  {'max_depth': 60, 'min_samples_split': 66}


[32m[I 2021-04-17 22:57:13,756][0m Trial 6 finished with value: 0.9208791208791209 and parameters: {'max_depth': 60, 'min_samples_split': 66}. Best is trial 5 with value: 0.945054945054945.[0m
[32m[I 2021-04-17 22:57:13,782][0m Trial 7 finished with value: 0.9186813186813187 and parameters: {'max_depth': 68, 'min_samples_split': 57}. Best is trial 5 with value: 0.945054945054945.[0m
[32m[I 2021-04-17 22:57:13,814][0m Trial 8 finished with value: 0.9208791208791209 and parameters: {'max_depth': 46, 'min_samples_split': 48}. Best is trial 5 with value: 0.945054945054945.[0m
[32m[I 2021-04-17 22:57:13,842][0m Trial 9 finished with value: 0.9186813186813187 and parameters: {'max_depth': 35, 'min_samples_split': 59}. Best is trial 5 with value: 0.945054945054945.[0m


Current_params :  {'max_depth': 68, 'min_samples_split': 57}
Current_params :  {'max_depth': 46, 'min_samples_split': 48}
Current_params :  {'max_depth': 35, 'min_samples_split': 59}
FrozenTrial(number=5, values=[0.945054945054945], datetime_start=datetime.datetime(2021, 4, 17, 22, 57, 13, 697358), datetime_complete=datetime.datetime(2021, 4, 17, 22, 57, 13, 726637), params={'max_depth': 6, 'min_samples_split': 11}, distributions={'max_depth': IntUniformDistribution(high=100, low=2, step=1), 'min_samples_split': IntUniformDistribution(high=100, low=2, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=5, state=TrialState.COMPLETE, value=None)


In [47]:
study.best_params

{'max_depth': 6, 'min_samples_split': 11}

In [48]:
best_model = DecisionTreeClassifier(**study.best_params)

best_model.fit(x_train_val, t_train_val)

print(best_model.score(x_train_val, t_train_val))
print(best_model.score(x_test, t_test))

0.9934065934065934
0.9473684210526315
