In [137]:
import numpy as np
import pandas as pd
import matplotlib as pp
%matplotlib inline

In [138]:
from sklearn.decomposition import PCA
from sklearn.datasets import load_breast_cancer

In [139]:
cancer = load_breast_cancer()
cancer.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [140]:
df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [141]:
from sklearn.preprocessing import StandardScaler

In [142]:
scaler = StandardScaler()
scaler.fit(df)
df_scaled = scaler.transform(df)
df_scaled.shape

(569, 30)

In [144]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_scaled, cancer['target'], test_size=0.3, random_state=101)

In [145]:
pca = PCA() # run pca without n_components to get all the eigenvectors and then we can check the variance ratios
# to determine number of components to consider to achieve a certain desired variance.
pca.fit(X_train)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [146]:
x_pca = pca.transform(X_train)
x_pca.shape

(398, 30)

In [147]:
# if we want to retain 90 percent variance then
index = np.argmax(np.cumsum(pca.explained_variance_ratio_) > .90) 
index
np.cumsum(pca.explained_variance_ratio_[:index + 1])

array([ 0.4451123 ,  0.64923107,  0.73065293,  0.79744894,  0.85361861,
        0.89423853,  0.9145795 ])

In [148]:
index # This indicate the value of k-components to use for the expected variance 
# in this case we should take 7 components

6

In [149]:
# components to keep
evecs = pca.components_[:index +1]

In [150]:
evecs.shape

(7, 30)

In [151]:
vals = x_pca[:, :index+1]
vals.shape

(398, 7)

In [152]:
# Get back approximate values in original dimensions
df_approx = np.dot(vals, evecs)
df_approx.shape

(398, 30)

In [153]:
df_approx = scaler.inverse_transform(df_approx)

In [154]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [155]:
df_approx = pd.DataFrame(df_approx, columns=cancer['feature_names'])

In [156]:
df_approx.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,12.854797,22.011303,80.622153,517.895591,0.063109,0.01102,-0.030442,-0.006763,0.141342,0.050775,...,14.230656,28.416221,90.197073,648.04648,0.083932,0.055559,0.005329,0.026088,0.229343,0.058001
1,14.027096,15.080167,92.700349,688.004855,0.105967,0.144973,0.142552,0.067158,0.202869,0.07073,...,16.194965,18.635434,108.668475,941.874649,0.137887,0.302601,0.332974,0.127188,0.302513,0.094379
2,15.014173,21.489374,99.027728,722.874257,0.10384,0.145923,0.128836,0.068066,0.199519,0.066142,...,18.092024,30.168554,121.322003,1048.578489,0.152002,0.432624,0.455993,0.166877,0.366441,0.103511
3,15.329401,20.000053,99.175788,757.284911,0.090718,0.080491,0.075345,0.04727,0.156104,0.057059,...,17.567855,26.790681,114.808408,1011.458688,0.125602,0.194802,0.243939,0.114907,0.233331,0.073076
4,10.605573,19.177216,67.118755,322.285982,0.085944,0.055664,0.01017,0.005984,0.16556,0.062669,...,11.405942,24.898924,72.696017,350.740878,0.116204,0.116976,0.066259,0.036847,0.245547,0.076202


In [194]:
# We already found number of components needed for more than 90% variance retention
pca = PCA(n_components=2)
pca.fit(X_train)
pca.components_.shape
X_train_pca = pca.transform(X_train)
X_train_pca.shape

(398, 2)

In [195]:
# use the transform values to build a classification model
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()

In [196]:
reg.fit(X_train_pca, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [197]:
X_test_pca = pca.transform(X_test)
y_pred = reg.predict(X_test_pca)

In [198]:
X_test_pca.shape

(171, 2)

In [199]:
from sklearn.metrics import confusion_matrix, classification_report

In [200]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test, y_pred))

[[ 63   3]
 [  5 100]]
             precision    recall  f1-score   support

          0       0.93      0.95      0.94        66
          1       0.97      0.95      0.96       105

avg / total       0.95      0.95      0.95       171

