In [1]:
#準備
import numpy as np

from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

X = data.data
y = data.target

from sklearn import linear_model
clf = linear_model.LogisticRegression()

from sklearn.model_selection import ShuffleSplit
ss = ShuffleSplit(n_splits=1, 
                  train_size=0.5, 
                  test_size=0.5)

# trainとtestを分割：hold-out
train_index, test_index = next(ss.split(X, y))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [2]:
clf.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [3]:
clf.score(X_test,y_test)

0.9508771929824561

In [4]:
#################################################################################################

In [5]:
#シャッフルの設定を変えてみる。

ss = ShuffleSplit(n_splits=10, 
                  train_size=0.5, 
                  test_size=0.5)
score = 0
for train_index, test_index in ss.split(X, y):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    score +=  clf.score(X_test, y_test)
    print(clf.score(X_test, y_test))
print("###{}###".format(score/10))
print(np.unique(y,       return_counts=True))
print(np.unique(y,       return_counts=True)[1] / y.size)
print(np.unique(y_train, return_counts=True)[1] / y_train.size)
print(np.unique(y_test,  return_counts=True)[1] / y_test.size)

0.9263157894736842
0.9333333333333333
0.9508771929824561
0.9403508771929825
0.9508771929824561
0.9438596491228071
0.9438596491228071
0.9543859649122807
0.9473684210526315
0.9543859649122807
###0.9445614035087718###
(array([0, 1]), array([212, 357]))
[0.37258348 0.62741652]
[0.38028169 0.61971831]
[0.36491228 0.63508772]


In [6]:
#################################################################################################

In [7]:
#クラスバランスの変化を確認

ss = ShuffleSplit(n_splits=1, 
                  train_size=0.95, 
                  test_size=0.05, 
                  random_state=3)

train_index, test_index = next(ss.split(X, y))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [8]:
#学習データと訓練データにばらつきが生じる＝＞識別が雑になる可能性！！！
print(np.unique(y,       return_counts=True))
print(np.unique(y,       return_counts=True)[1] / y.size)
print(np.unique(y_train, return_counts=True)[1] / y_train.size)
print(np.unique(y_test,  return_counts=True)[1] / y_test.size)

(array([0, 1]), array([212, 357]))
[0.37258348 0.62741652]
[0.38333333 0.61666667]
[0.17241379 0.82758621]


In [9]:
#######################################################################################################

In [10]:
#{ランダムにデータをとるのではなく、各データの割合を保ってデータを分ける}
from sklearn.model_selection import StratifiedShuffleSplit

In [11]:
ss = StratifiedShuffleSplit(n_splits=1, 
                            train_size=0.95, 
                            test_size=0.05, 
                            random_state=0)

train_index, test_index = next(ss.split(X, y))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [12]:
#割合を確認
print(np.unique(y,       return_counts=True))
print(np.unique(y,       return_counts=True)[1] / y.size)
print(np.unique(y_train, return_counts=True)[1] / y_train.size)
print(np.unique(y_test,  return_counts=True)[1] / y_test.size)

(array([0, 1]), array([212, 357]))
[0.37258348 0.62741652]
[0.37222222 0.62777778]
[0.37931034 0.62068966]


In [13]:
#StratifiedShuffleSplit

ss =StratifiedShuffleSplit(n_splits=10, 
                  train_size=0.5, 
                  test_size=0.5)
score = 0
for train_index, test_index in ss.split(X, y):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    score += clf.score(X_test, y_test)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
print("###{}###".format(score/10))

0.9403508771929825
0.9368421052631579
0.9543859649122807
0.9859649122807017
0.9508771929824561
0.9508771929824561
0.9473684210526315
0.968421052631579
0.9473684210526315
0.9192982456140351
###0.9571929824561403###


In [14]:
#割合を確認
print(np.unique(y,       return_counts=True))
print(np.unique(y,       return_counts=True)[1] / y.size)
print(np.unique(y_train, return_counts=True)[1] / y_train.size)
print(np.unique(y_test,  return_counts=True)[1] / y_test.size)

(array([0, 1]), array([212, 357]))
[0.37258348 0.62741652]
[0.37323944 0.62676056]
[0.37192982 0.62807018]
