In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 데이터 로드 및 정규화(표준화)

In [2]:
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset and build a feature table with the label
# appended as the last column ('target').
bc = load_breast_cancer()
df = pd.DataFrame(
    data=bc.data,
    columns=bc.feature_names,
).assign(target=bc.target)
# Preview the first three rows.
df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


In [3]:
from sklearn.preprocessing import StandardScaler
# Standardize every feature column: learn the per-column statistics from the
# data, then apply the scaling to that same data.
bc_std = StandardScaler().fit(bc.data).transform(bc.data)

### 차원축소

In [4]:
from sklearn.decomposition import PCA
# One unfitted PCA estimator per target dimensionality (2, 5 and 10 components).
pca2, pca5, pca10 = (PCA(n_components=k) for k in (2, 5, 10))

In [5]:
# Fit each PCA estimator on the standardized data and keep its projection.
bc_pca2, bc_pca5, bc_pca10 = (
    reducer.fit_transform(bc_std) for reducer in (pca2, pca5, pca10)
)

### 분류 정확도

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [8]:
# Baseline: logistic regression on the standardized original features.
X_train, X_test, y_train, y_test = train_test_split(
    bc_std,
    bc.target,
    random_state=2021,
    stratify=bc.target,
)
# fit() returns the estimator itself, so lrc is the trained classifier.
lrc = LogisticRegression(random_state=2021).fit(X_train, y_train)
# Accuracy on the held-out split (displayed as the cell output).
lrc.score(X_test, y_test)

0.993006993006993

실전에서는 코드를 다음 순서로 구성한다.
1. 모듈 import
2. 함수 정의
3. 프로그램(메인) 코드
4. print()로 결과 출력

In [9]:
def pca_accuracy(X, y, random_state=2021):
    """Return the held-out accuracy of logistic regression trained on ``X``.

    The data is split once with ``train_test_split`` (its default test size),
    stratified on ``y`` so both splits keep the class proportions of the
    labels that were actually passed in. A ``LogisticRegression`` is fitted on
    the training part and scored on the test part.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample (e.g. a PCA projection).
    y : array-like
        Class labels aligned with the rows of ``X``.
    random_state : int, default 2021
        Seed used for both the split and the classifier, so repeated calls
        with the same inputs give the same score.

    Returns
    -------
    Test-set accuracy rounded to 4 decimal places (via ``np.round``).
    """
    # Stratify on the labels passed in, not on the notebook-level bc.target:
    # the function must also work for label vectors other than bc.target.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=random_state
    )
    lrc = LogisticRegression(random_state=random_state)
    lrc.fit(X_train, y_train)
    score = lrc.score(X_test, y_test)
    return np.round(score, 4)

In [10]:
# Report the held-out accuracy for each PCA projection, then for the full
# standardized feature set, in that order.
for caption, matrix in (
    ('PCA 2 정확도:', bc_pca2),
    ('PCA 5 정확도:', bc_pca5),
    ('PCA 10 정확도:', bc_pca10),
    ('원데이터 정확도:', bc_std),
):
    print(caption, pca_accuracy(matrix, bc.target))

PCA 2 정확도: 0.9441
PCA 5 정확도: 0.993
PCA 10 정확도: 0.986
원데이터 정확도: 0.993


- globals() : 현재 모듈의 전역 변수들이 담긴 딕셔너리를 반환한다. 이 딕셔너리에 키를 추가하면 문자열로 만든 이름(예: f'pca{dim}')의 전역 변수를 동적으로 만들 수 있다.

In [11]:
# Same experiment as the explicit cells above, but the per-dimension variables
# (pca2/bc_pca2, pca5/bc_pca5, pca10/bc_pca10) are bound dynamically by writing
# into the dict returned by globals(). This rebinds the names created earlier.
for dim in [2,5,10]:
    # Bind a fresh PCA estimator to the global name pca<dim>.
    globals()[f'pca{dim}'] = PCA(n_components=dim)
    # Fit that estimator on the standardized data; bind the projection to bc_pca<dim>.
    globals()[f'bc_pca{dim}'] = globals()[f'pca{dim}'].fit_transform(bc_std)
    # Score the projection with the shared helper and report it.
    acc = pca_accuracy(globals()[f'bc_pca{dim}'], bc.target)
    print(f'PCA {dim} 정확도: {acc}')
# Reference score on the full standardized feature set (no PCA applied).
print('원데이터 정확도:', pca_accuracy(bc_std, bc.target))

PCA 2 정확도: 0.9441
PCA 5 정확도: 0.993
PCA 10 정확도: 0.986
원데이터 정확도: 0.993
