In [1]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

In [2]:
# Decision Tree (Split data randomly from 90~10% + different depth + different min sample leaf)
from sklearn import tree
X = cancer.data
y = cancer.target

ratio = 100
ratiovalues = [i for i in range(10, ratio, 10)]
depth = 21
depthvalues = [i for i in range(1, depth)]
leaf = 10
leafvalues = [i for i in range(1, leaf)]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_depth = 0
relative_best_leaf = 0

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k/100, random_state=71)
    for i in depthvalues:
        for j in leafvalues:
            clf = tree.DecisionTreeClassifier(random_state = 71, max_depth = i, min_samples_leaf = j)
            clf.fit(X_train, y_train)
            y_pred_train = clf.predict(X_train) #train
            train_acc = accuracy_score(y_pred_train, y_train)
            y_pred_test = clf.predict(X_test) #train
            test_acc = accuracy_score(y_pred_test, y_test)

            if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                relative_best_train_score = train_acc
                relative_best_test_score = test_acc
                relative_best_ratio = k
                relative_best_depth = i
                relative_best_leaf = j

print("best ratio of testing data:", relative_best_ratio, "best depth:", relative_best_depth, "best min_sample_leaf:", relative_best_leaf, \
      "\nTraining score:", relative_best_train_score, "Testing score:", relative_best_test_score)

best ratio of testing data: 10 best depth: 5 best min_sample_leaf: 1 
Training score: 0.994140625 Testing score: 0.9824561403508771


In [3]:
# Decision Tree RandomizedSearchCV
# (Split training and testing data randomly (80% vs. 20%) + 
# different depth + different min sample leaf)

from sklearn.model_selection import RandomizedSearchCV

from sklearn import tree
X = cancer.data
y = cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=71) #train_size = 0.8

depth = 21
depthvalues = [i for i in range(1, depth)]
leaf = 10
leafvalues = [i for i in range(1, leaf)]
randomvalue = [71]

tree_param = [{"max_depth": depthvalues},
              {"min_samples_leaf": leafvalues},
              {"random_state": randomvalue}]

clf = RandomizedSearchCV(tree.DecisionTreeClassifier(), tree_param, random_state = 10)

random_result = clf.fit(X_train, y_train)
print(random_result)

print("="*60)

print("best parameters' values:", random_result.best_params_, "best scoure:", random_result.best_score_)

print("="*60)

print("all combinations' results:\n", clf.cv_results_)

RandomizedSearchCV(estimator=DecisionTreeClassifier(),
                   param_distributions=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8,
                                                       9, 10, 11, 12, 13, 14,
                                                       15, 16, 17, 18, 19,
                                                       20]},
                                        {'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                              7, 8, 9]},
                                        {'random_state': [71]}],
                   random_state=10)
best parameters' values: {'max_depth': 2} best scoure: 0.9252747252747252
all combinations' results:
 {'mean_fit_time': array([0.00478754, 0.00438628, 0.00399685, 0.00277882, 0.00357637,
       0.00399199, 0.00419059, 0.00339203, 0.00437603, 0.00260448]), 'std_fit_time': array([7.48002697e-04, 7.84330490e-04, 6.41882665e-04, 7.34185143e-04,
       4.77833675e-04, 2.10914082e-05, 7.37964201e-04, 

In [4]:
# Decision Tree GridSearchCV
# (Split training and testing data randomly (80% vs. 20%) + 
# different depth + different min sample leaf)

from sklearn.model_selection import GridSearchCV

from sklearn import tree
X = cancer.data
y = cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=71) #train_size = 0.8

depth = 21
depthvalues = [i for i in range(1, depth)]
leaf = 10
leafvalues = [i for i in range(1, leaf)]
randomvalue = [71]

tree_param = [{"max_depth": depthvalues},
              {"min_samples_leaf": leafvalues},
              {"random_state": randomvalue}]

clf = GridSearchCV(tree.DecisionTreeClassifier(), tree_param)

grid_result = clf.fit(X_train, y_train)
print(grid_result)

print("="*60)

print("best parameters' values:", grid_result.best_params_, "best scoure:", grid_result.best_score_)

print("="*60)

print("all combinations' results:\n", clf.cv_results_)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                        13, 14, 15, 16, 17, 18, 19, 20]},
                         {'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
                         {'random_state': [71]}])
best parameters' values: {'max_depth': 2} best scoure: 0.923076923076923
all combinations' results:
 {'mean_fit_time': array([0.00219831, 0.00220327, 0.00319118, 0.00398278, 0.00400023,
       0.0044013 , 0.00439591, 0.0041822 , 0.00398922, 0.00398526,
       0.00418205, 0.00399175, 0.00458741, 0.00419507, 0.00398226,
       0.00399585, 0.00438724, 0.00398893, 0.00420237, 0.00457897,
       0.00479021, 0.00397859, 0.00419326, 0.00418682, 0.00387206,
       0.00380168, 0.0033843 , 0.00359335, 0.00319748, 0.00437465]), 'std_fit_time': array([3.90720658e-04, 3.95117104e-04, 4.12928105e-04, 3.16251259e-05,
       6.19265838e-04, 4.79804808e-04, 5.02927260e-04, 3.86852

In [5]:
# Decision Tree GridSearchCV
# (Split training and testing data randomly (from 90% ~ 10%) + 
# different depth + different min sample leaf)

from sklearn.model_selection import GridSearchCV

from sklearn import tree
X = cancer.data
y = cancer.target

ratio = 100
ratiovalues = [i for i in range(10, ratio, 10)]
relative_best_score = 0
relative_best_ratio = 0
relative_best_param = ""


for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k/100, random_state=71)

    depth = 21
    depthvalues = [i for i in range(1, depth)]
    leaf = 10
    leafvalues = [i for i in range(1, leaf)]
    randomvalue = [71]

    tree_param = [{"max_depth": depthvalues},
                  {"min_samples_leaf": leafvalues},
                  {"random_state": randomvalue}]

    clf = GridSearchCV(tree.DecisionTreeClassifier(), tree_param)

    grid_result = clf.fit(X_train, y_train)
    
    print(k, grid_result.best_score_, grid_result.best_params_)
    
    if (grid_result.best_score_ > relative_best_score):
        relative_best_score = grid_result.best_score_
        relative_best_ratio = k
        relative_best_param = grid_result.best_params_

print("\nbest ratio:", k, "best parameters' values:", relative_best_param, "best scoure:", relative_best_score)

10 0.9277174947648963 {'max_depth': 4}
20 0.9208791208791208 {'max_depth': 2}
30 0.934620253164557 {'max_depth': 2}
40 0.935464620630861 {'max_depth': 3}
50 0.9260025062656642 {'min_samples_leaf': 5}
60 0.9254106280193236 {'max_depth': 16}
70 0.9235294117647058 {'max_depth': 1}
80 0.9383399209486166 {'max_depth': 1}
90 0.9818181818181818 {'min_samples_leaf': 3}

best ratio: 90 best parameters' values: {'min_samples_leaf': 3} best scoure: 0.9818181818181818


In [6]:
# KNN + GridSearchCV
# (Split training and testing data randomly (from 90% ~ 10%) + 
# different depth + different min sample leaf)

from sklearn.model_selection import GridSearchCV

from sklearn import neighbors
X = cancer.data
y = cancer.target

ratio = 100
ratiovalues = [i for i in range(10, ratio, 10)]
relative_best_score = 0
relative_best_ratio = 0
relative_best_param = ""


for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k/100, random_state=71)

    neighbor = 10
    neighborvalues = [i for i in range(1, neighbor)]
    p = 3
    pvalues = [i for i in range(1, p)]

    knn_param = [{"n_neighbors": neighborvalues},
                 {"p": pvalues}]

    clf1 = GridSearchCV(neighbors.KNeighborsClassifier(), knn_param)
    clf2 = GridSearchCV(tree(), tree_param)

    
    grid_result = clf.fit(X_train, y_train)
    
    print(k, grid_result.best_score_, grid_result.best_params_)
    
    if (grid_result.best_score_ > relative_best_score):
        relative_best_score = grid_result.best_score_
        relative_best_ratio = k
        relative_best_param = grid_result.best_params_

print("\nbest ratio:", k, "best parameters' values:", relative_best_param, "best scoure:", relative_best_score)

10 0.9335427374833427 {'p': 1}
20 0.9296703296703296 {'p': 1}
30 0.9271835443037976 {'p': 1}
40 0.9177749360613812 {'n_neighbors': 7}
50 0.9156641604010025 {'n_neighbors': 9}
60 0.9121739130434781 {'n_neighbors': 5}
70 0.9411764705882353 {'n_neighbors': 3}
80 0.9474308300395258 {'n_neighbors': 4}
90 0.9818181818181818 {'n_neighbors': 1}

best ratio: 90 best parameters' values: {'n_neighbors': 1} best scoure: 0.9818181818181818
