In [1]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

In [2]:
# Decision Tree (Split data randomly from 90~10% + different depth + different min sample leaf)
from sklearn import tree
X = cancer.data
y = cancer.target

ratio = 100
ratiovalues = [i for i in range(10, ratio, 10)]
depth = 21
depthvalues = [i for i in range(1, depth)]
leaf = 10
leafvalues = [i for i in range(1, leaf)]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_depth = 0
relative_best_leaf = 0

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k/100, random_state=71)
    for i in depthvalues:
        for j in leafvalues:
            clf = tree.DecisionTreeClassifier(random_state = 71, max_depth = i, min_samples_leaf = j)
            clf.fit(X_train, y_train)
            y_pred_train = clf.predict(X_train) #train
            train_acc = accuracy_score(y_pred_train, y_train)
            y_pred_test = clf.predict(X_test) #train
            test_acc = accuracy_score(y_pred_test, y_test)

            if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                relative_best_train_score = train_acc
                relative_best_test_score = test_acc
                relative_best_ratio = k
                relative_best_depth = i
                relative_best_leaf = j

print("best ratio of testing data:", relative_best_ratio, "best depth:", relative_best_depth, "best min_sample_leaf:", relative_best_leaf, \
      "\nTraining score:", relative_best_train_score, "Testing score:", relative_best_test_score)

best ratio of testing data: 10 best depth: 5 best min_sample_leaf: 1 
Training score: 0.994140625 Testing score: 0.9824561403508771


In [3]:
# Decision Tree RandomizedSearchCV
# (Split training and testing data randomly (80% vs. 20%) + 
# different depth + different min sample leaf)

from sklearn.model_selection import RandomizedSearchCV

from sklearn import tree
X = cancer.data
y = cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=71) #train_size = 0.8

depth = 21
depthvalues = [i for i in range(1, depth)]
leaf = 10
leafvalues = [i for i in range(1, leaf)]
randomvalue = [71]

tree_param = [{"max_depth": depthvalues},
              {"min_samples_leaf": leafvalues},
              {"random_state": randomvalue}]

clf = RandomizedSearchCV(tree.DecisionTreeClassifier(), tree_param, random_state = 10)

random_result = clf.fit(X_train, y_train)
print(random_result)

print("="*60)

print("best parameters' values:", random_result.best_params_, "best scoure:", random_result.best_score_)

print("="*60)

print("all combinations' results:\n", clf.cv_results_)

RandomizedSearchCV(estimator=DecisionTreeClassifier(),
                   param_distributions=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8,
                                                       9, 10, 11, 12, 13, 14,
                                                       15, 16, 17, 18, 19,
                                                       20]},
                                        {'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                              7, 8, 9]},
                                        {'random_state': [71]}],
                   random_state=10)
best parameters' values: {'max_depth': 2} best scoure: 0.9208791208791208
all combinations' results:
 {'mean_fit_time': array([0.00399361, 0.00438213, 0.00358405, 0.00240049, 0.0031816 ,
       0.0037972 , 0.00398898, 0.00298781, 0.00358243, 0.0021997 ]), 'std_fit_time': array([1.48545299e-05, 4.90538912e-04, 4.83393927e-04, 4.83907040e-04,
       4.03098702e-04, 3.87289670e-04, 5.00111031e-07, 

In [4]:
# Decision Tree GridSearchCV
# (Split training and testing data randomly (80% vs. 20%) + 
# different depth + different min sample leaf)

from sklearn.model_selection import GridSearchCV

from sklearn import tree
X = cancer.data
y = cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=71) #train_size = 0.8

depth = 21
depthvalues = [i for i in range(1, depth)]
leaf = 10
leafvalues = [i for i in range(1, leaf)]
randomvalue = [71]

tree_param = [{"max_depth": depthvalues},
              {"min_samples_leaf": leafvalues},
              {"random_state": randomvalue}]

clf = GridSearchCV(tree.DecisionTreeClassifier(), tree_param)

grid_result = clf.fit(X_train, y_train)
print(grid_result)

print("="*60)

print("best parameters' values:", grid_result.best_params_, "best scoure:", grid_result.best_score_)

print("="*60)

print("all combinations' results:\n", clf.cv_results_)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                        13, 14, 15, 16, 17, 18, 19, 20]},
                         {'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
                         {'random_state': [71]}])
best parameters' values: {'max_depth': 2} best scoure: 0.9252747252747252
all combinations' results:
 {'mean_fit_time': array([0.0016027 , 0.00199728, 0.00279212, 0.00318789, 0.00320392,
       0.00359254, 0.00359058, 0.00339131, 0.00378976, 0.00319104,
       0.00339737, 0.00358958, 0.00379047, 0.00339041, 0.00378981,
       0.0035902 , 0.00337992, 0.00369844, 0.00378981, 0.00398335,
       0.00398927, 0.00359097, 0.00439987, 0.00398312, 0.00339575,
       0.00338407, 0.00319128, 0.00319138, 0.00319157, 0.00418868]), 'std_fit_time': array([4.82075514e-04, 6.19044096e-06, 3.99829023e-04, 3.90753461e-04,
       4.08083063e-04, 4.90516834e-04, 4.89169018e-04, 4.7364