In [31]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import pandas as pd
cancer = load_breast_cancer()
X_train, X_test, Y_train, Y_test = train_test_split(cancer.data, cancer.target, stratify=cancer.target, random_state=0)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

model = LogisticRegression(max_iter=2500)
clf = model.fit(X_train, Y_train)
print("Train : ", clf.__class__.__name__, clf.score(X_train_std, Y_train))
print("Test : ", clf.__class__.__name__, clf.score(X_test_std, Y_test))

Y_pred = clf.predict(X_test_std)
confusion_M = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix : \n{}".format(confusion_M))

Train :  LogisticRegression 0.6267605633802817
Test :  LogisticRegression 0.6293706293706294
Confusion Matrix : 
[[ 0 53]
 [ 0 90]]


In [53]:
from sklearn.cluster import KMeans
kmeans_pp = KMeans(n_clusters=5)

kmeans_pp.fit(X_train_std)
Y_train_cl = kmeans_pp.fit_predict(X_train_std)

In [54]:
Y_test_cl = kmeans_pp.fit_predict(X_test_std)

In [55]:
cl_train_data =pd.DataFrame(Y_train_cl, columns=['cl_nm']).astype(str)
cl_train_data_dummy = pd.get_dummies(cl_train_data)
cl_train_data_dummy

Unnamed: 0,cl_nm_0,cl_nm_1,cl_nm_2,cl_nm_3,cl_nm_4
0,0,0,0,0,1
1,0,0,1,0,0
2,0,1,0,0,0
3,0,0,0,1,0
4,0,0,0,0,1
...,...,...,...,...,...
421,1,0,0,0,0
422,0,0,0,0,1
423,0,1,0,0,0
424,0,0,0,0,1


In [56]:
cl_test_data =pd.DataFrame(Y_test_cl, columns=['cl_nm']).astype(str)
cl_test_data_dummy = pd.get_dummies(cl_test_data)
cl_test_data_dummy

Unnamed: 0,cl_nm_0,cl_nm_1,cl_nm_2,cl_nm_3,cl_nm_4
0,1,0,0,0,0
1,0,0,1,0,0
2,0,0,1,0,0
3,1,0,0,0,0
4,0,0,1,0,0
...,...,...,...,...,...
138,0,0,0,0,1
139,1,0,0,0,0
140,0,0,1,0,0
141,1,0,0,0,0


In [57]:
merge_train_data = pd.concat([pd.DataFrame(X_train_std), cl_train_data_dummy, pd.DataFrame(Y_train, columns=['flg'])
                             ], axis=1)
merge_train_data
merge_test_data = pd.concat([pd.DataFrame(X_test_std), cl_test_data_dummy, pd.DataFrame(Y_test, columns=['flg'])
                            ], axis =1)

In [58]:
from sklearn.decomposition import PCA

model = LogisticRegression()
X_train_data =merge_train_data.drop('flg', axis=1)
X_test_data = merge_test_data.drop('flg', axis=1)

Y_train_data = merge_train_data['flg']
Y_test_data = merge_test_data['flg']

best_score = 0
best_num = 0

for num_com in range(20):
    pca = PCA(n_components=num_com+1)
    pca.fit(X_train_data)
    X_train_pca = pca.transform(X_train_data)
    X_test_pca = pca.transform(X_test_data)
    
    logsistic_model = model.fit(X_train_pca, Y_train_data)
    
    train_score = logsistic_model.score(X_train_pca, Y_train_data)
    test_score = logsistic_model.score(X_test_pca, Y_test_data)
    
    if best_score < test_score:
        best_score = test_score
        best_num = num_com+1
        
print('Best score : ', best_score)
print('Best num components : ', best_num)

Best score :  0.965034965034965
Best num components :  8


In [28]:
X_train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,cl_nm_0,cl_nm_1,cl_nm_2,cl_nm_3,cl_nm_4
0,-0.500746,-0.629604,-0.510598,-0.508655,-0.326770,-0.678037,-0.702917,-0.673290,-0.323201,-0.513532,...,-0.449967,-0.494471,-0.429224,-0.465020,-0.447715,0,0,0,0,1
1,0.948356,0.011070,0.931367,0.814498,-0.473158,0.297845,0.191520,0.649428,-1.114571,-1.117685,...,0.565426,0.387699,1.175397,0.053685,-0.302163,0,0,0,1,0
2,-1.005023,-0.151387,-1.005709,-0.884654,0.755356,-0.706644,-0.840513,-0.798055,-1.203323,0.466252,...,-0.964424,-0.915127,-0.748055,-1.142683,-0.316267,1,0,0,0,0
3,-1.634260,0.326831,-1.551415,-1.243587,-0.159571,0.500562,0.556308,-0.699663,1.533191,2.838587,...,1.138476,1.303103,-0.546019,0.712943,3.642956,0,0,1,0,0
4,-0.254149,-0.789772,-0.314642,-0.325885,-0.801097,-0.976997,-1.115819,-1.166748,-0.648624,-0.542097,...,-0.769974,-1.272052,-1.350424,-0.409803,-0.009932,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,3.076292,1.290130,3.196108,3.364465,0.719477,3.077148,3.139269,3.429264,0.079880,0.726195,...,2.275561,1.838270,2.563178,-0.905083,1.188892,0,1,0,0,0
422,-0.750114,-1.933833,-0.745505,-0.694727,-0.182534,-0.705318,-0.684997,-0.584280,-0.519195,-0.229309,...,-0.097767,-0.299465,-0.384660,0.200931,0.067921,0,0,0,0,1
423,-0.112841,-0.135370,-0.127519,-0.233124,0.207117,0.055343,-0.445261,-0.290118,-0.345389,0.813319,...,0.015556,-0.381084,0.195875,-0.078501,0.354510,0,0,0,0,1
424,-1.417864,-1.114686,-1.427336,-1.126879,-1.881787,-1.381483,-1.087404,-1.161397,-0.071738,0.799036,...,-1.299755,-1.254848,-1.516568,0.159099,-0.337705,1,0,0,0,0
