In [2]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

In [3]:
# Decision Tree (Split data randomly from 90~10% + different depth + different min sample leaf)
from sklearn import tree
X = cancer.data
y = cancer.target

ratio = 100
ratiovalues = [i for i in range(10, ratio, 10)]
depth = 21
depthvalues = [i for i in range(1, depth)]
leaf = 10
leafvalues = [i for i in range(1, leaf)]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_depth = 0
relative_best_leaf = 0

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k/100, random_state=71)
    for i in depthvalues:
        for j in leafvalues:
            clf = tree.DecisionTreeClassifier(random_state = 71, max_depth = i, min_samples_leaf = j)
            clf.fit(X_train, y_train)
            y_pred_train = clf.predict(X_train) #train
            train_acc = accuracy_score(y_pred_train, y_train)
            y_pred_test = clf.predict(X_test) #train
            test_acc = accuracy_score(y_pred_test, y_test)

            if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                relative_best_train_score = train_acc
                relative_best_test_score = test_acc
                relative_best_ratio = k
                relative_best_depth = i
                relative_best_leaf = j

print("best ratio of testing data:", relative_best_ratio, "best depth:", relative_best_depth, "best min_sample_leaf:", relative_best_leaf, \
      "\nTraining score:", relative_best_train_score, "Testing score:", relative_best_test_score)

best ratio of testing data: 10 best depth: 5 best min_sample_leaf: 1 
Training score: 0.994140625 Testing score: 0.9824561403508771


In [1]:
# RandomForest
from sklearn.ensemble import RandomForestClassifier
X = cancer.data
y = cancer.target

ratio = 100
ratiovalues = [i for i in range(10, ratio, 10)]
n_estimator = 250
n_estimatorvalues = [i for i in range(100, n_estimator, 50)]
depth = 8
depthvalues = [i for i in range(1, depth)]
leaf = 5
leafvalues = [i for i in range(1, leaf)]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_estimators = 0
relative_best_depth = 0
relative_best_leaf = 0

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k/100, random_state=71)
    for l in n_estimatorvalues:
        for i in depthvalues:
            for j in leafvalues:
                clf = RandomForestClassifier(random_state = 71, n_estimators = l, max_depth = i, min_samples_leaf = j)
                clf.fit(X_train, y_train)
                y_pred_train = clf.predict(X_train) #train
                train_acc = accuracy_score(y_pred_train, y_train)
                y_pred_test = clf.predict(X_test) #train
                test_acc = accuracy_score(y_pred_test, y_test)

                if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                    relative_best_train_score = train_acc
                    relative_best_test_score = test_acc
                    relative_best_ratio = k
                    relative_best_estimators = l
                    relative_best_depth = i
                    relative_best_leaf = j

print("best ratio of testing data:", relative_best_ratio, "best no. of estimators:", relative_best_estimators, "best depth:", relative_best_depth, "best min_sample_leaf:", relative_best_leaf, \
      "\nTraining score:", relative_best_train_score, "Testing score:", relative_best_test_score)

NameError: name 'cancer' is not defined

In [4]:
# XGBoost
from xgboost.sklearn import XGBClassifier
X = cancer.data
y = cancer.target

ratio = 100
ratiovalues = [i for i in range(10, ratio, 10)]
n_estimator = 250
n_estimatorvalues = [i for i in range(100, n_estimator, 50)]
depth = 8
depthvalues = [i for i in range(1, depth)]
rate = 3
ratevalues = [i for i in range(1, rate)]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_estimators = 0
relative_best_depth = 0
relative_best_rate = 0

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k/100, random_state=71)
    for l in n_estimatorvalues:
        for i in depthvalues:
            for j in ratevalues:
                clf = XGBClassifier(random_state = 71, n_estimators = l, max_depth = i, learning_rate = j)
                clf.fit(X_train, y_train)
                y_pred_train = clf.predict(X_train) #train
                train_acc = accuracy_score(y_pred_train, y_train)
                y_pred_test = clf.predict(X_test) #train
                test_acc = accuracy_score(y_pred_test, y_test)

                if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                    relative_best_train_score = train_acc
                    relative_best_test_score = test_acc
                    relative_best_ratio = k
                    relative_best_estimators = l
                    relative_best_depth = i
                    relative_best_rate = j

print("best ratio of testing data:", relative_best_ratio, "best no. of estimators:", relative_best_estimators, "best depth:", relative_best_depth, "best learning_rate:", relative_best_rate, \
      "\nTraining score:", relative_best_train_score, "Testing score:", relative_best_test_score)

ModuleNotFoundError: No module named 'xgboost'

In [5]:
# SVC
from sklearn import svm
X = cancer.data
y = cancer.target

ratio = 100
ratiovalues = [i for i in range(10, ratio, 10)]
kernelvalues = ["rbf", "poly"]#, "sigmoid"]
gammavalues =["scale"]#, "auto"]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_kernel = ""
relative_best_gamma = ""

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k/100, random_state=71)
    for i in kernelvalues:
        for j in gammavalues:
            clf = svm.SVC(random_state = 71, kernel = i, gamma = j)
            clf.fit(X_train, y_train)
            y_pred_train = clf.predict(X_train) #train
            train_acc = accuracy_score(y_pred_train, y_train)
            y_pred_test = clf.predict(X_test) #train
            test_acc = accuracy_score(y_pred_test, y_test)

            if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                relative_best_train_score = train_acc
                relative_best_test_score = test_acc
                relative_best_ratio = k
                relative_best_kernel = i
                relative_best_gamma = j

print("best ratio of testing data:", relative_best_ratio, "best kernel:", relative_best_kernel, "best gamma:", relative_best_gamma, \
      "\nTraining score:", relative_best_train_score, "Testing score:", relative_best_test_score)

best ratio of testing data: 10 best kernel: rbf best gamma: scale 
Training score: 0.91796875 Testing score: 0.9473684210526315


In [6]:
# KNN
from sklearn import neighbors
X = cancer.data
y = cancer.target

ratio = 100
ratiovalues = [i for i in range(10, ratio, 10)]
neighbor = 10
neighborvalues = [i for i in range(1, neighbor)]
p = 3
pvalues = [i for i in range(1, p)]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_neighbor = 0
relative_best_p = 0

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k/100, random_state=71)
    for i in neighborvalues:
        for j in pvalues:
            clf = neighbors.KNeighborsClassifier(n_neighbors = i, p = j)
            clf.fit(X_train, y_train)
            y_pred_train = clf.predict(X_train) #train
            train_acc = accuracy_score(y_pred_train, y_train)
            y_pred_test = clf.predict(X_test) #train
            test_acc = accuracy_score(y_pred_test, y_test)

            if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
                relative_best_train_score = train_acc
                relative_best_test_score = test_acc
                relative_best_ratio = k
                relative_best_neighbor = i
                relative_best_p = j

print("best ratio of testing data:", relative_best_ratio, "best neighbors:", relative_best_neighbor, "best p:", relative_best_p, \
      "\nTraining score:", relative_best_train_score, "Testing score:", relative_best_test_score)

best ratio of testing data: 10 best neighbors: 1 best p: 1 
Training score: 1.0 Testing score: 0.9473684210526315


In [7]:
# Logistic regression
from sklearn.linear_model import LogisticRegression
X = cancer.data
y = cancer.target

ratio = 100
ratiovalues = [i for i in range(10, ratio, 10)]
penaltyvalues = ["l2", "none"]
relative_best_train_score = 0
relative_best_test_score = 0
relative_best_ratio = 0
relative_best_penalty = ""

for k in ratiovalues:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k/100, random_state=71)
    for i in penaltyvalues:
        clf = LogisticRegression(random_state = 71, penalty = i)
        clf.fit(X_train, y_train)
        y_pred_train = clf.predict(X_train) #train
        train_acc = accuracy_score(y_pred_train, y_train)
        y_pred_test = clf.predict(X_test) #train
        test_acc = accuracy_score(y_pred_test, y_test)

        if ((train_acc > relative_best_train_score) and (test_acc > relative_best_test_score)):
            relative_best_train_score = train_acc
            relative_best_test_score = test_acc
            relative_best_ratio = k
            relative_best_penalty = i

print("best ratio of testing data:", relative_best_ratio, "best penalty:", relative_best_penalty, \
      "\nTraining score:", relative_best_train_score, "Testing score:", relative_best_test_score)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

best ratio of testing data: 10 best penalty: none 
Training score: 0.953125 Testing score: 1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist