In [1]:
from sklearn import decomposition, linear_model, datasets
import numpy as np
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the breast-cancer dataset via sklearn.datasets; the features and labels
# are read from this object in the following cells.
breast_cancer = datasets.load_breast_cancer()

In [3]:
# Feature matrix of the dataset; the trailing expression displays its shape.
X = breast_cancer.data
X.shape

(569, 30)

In [4]:
# Standardize the features with StandardScaler.
# NOTE(review): the scaler is fitted on every row of X, and the train/test
# split only happens afterwards, so test rows contribute to the scaling
# statistics. Consider splitting first and fitting the scaler on the training
# rows only.
sc = StandardScaler()
X_std = sc.fit_transform(X)

In [9]:
# Hold out a test set from the standardized features; the fixed random_state
# keeps the partition repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X_std,
    breast_cancer.target,
    random_state=69,
)

In [10]:
# Reduce the feature space to 15 principal components. The projection is
# fitted on the training rows only and then applied unchanged to the test rows.
pca = decomposition.PCA(n_components = 15)
X_train_pca = pca.fit_transform(x_train)
X_test_pca = pca.transform(x_test)

In [12]:
# Baseline: logistic regression on the full standardized feature set (no PCA).
lr = linear_model.LogisticRegression()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can be too coarse for a
# fit that takes only a few hundredths of a second.
start = time.perf_counter()
lr.fit(x_train,y_train)
ending = time.perf_counter()
without_pca_time = ending-start  # seconds spent inside fit()
without_pca_score = lr.score(x_test,y_test)  # score on the held-out test split
print(without_pca_time)
print(without_pca_score)

0.05499911308288574
0.958041958041958


In [17]:
# Same logistic-regression model, now trained on the PCA-reduced features.
# Note that this rebinds `lr`, replacing the model fitted without PCA.
lr = linear_model.LogisticRegression()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can be too coarse for a
# fit that takes only a few hundredths of a second.
start = time.perf_counter()
lr.fit(X_train_pca,y_train)
ending = time.perf_counter()
with_pca_time = ending-start  # seconds spent inside fit()
with_pca_score = lr.score(X_test_pca,y_test)  # score on the held-out test split
print(with_pca_time)
print(with_pca_score)

0.024001121520996094
0.972027972027972


In [21]:
# Variance captured by each of the retained principal components.
pca.explained_variance_

array([13.22759625,  5.22297038,  2.97505234,  1.93759787,  1.56262571,
        1.12937496,  0.67547899,  0.50790502,  0.43686345,  0.37537896,
        0.27301707,  0.24182226,  0.20474057,  0.16292122,  0.09687194])

## Finding the Optimal Number of Components

In [22]:
from sklearn import decomposition, linear_model, datasets
import numpy as np
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [23]:
# Load the breast-cancer dataset again so this section can be read on its own
# (same call as at the top of the notebook).
breast_cancer = datasets.load_breast_cancer()

In [24]:
# Feature matrix of the dataset; the trailing expression displays its shape.
X = breast_cancer.data
X.shape

(569, 30)

In [25]:
# Standardize the features, then hold out a test set with a fixed seed.
# NOTE(review): the scaler is fitted on every row of X before the split, so
# test rows contribute to the scaling statistics. Consider splitting first and
# fitting the scaler on the training rows only.
sc = StandardScaler()
X_std = sc.fit_transform(X)
x_train, x_test, y_train, y_test = train_test_split(
    X_std,
    breast_cancer.target,
    random_state=69,
)

In [26]:
# PCA created without n_components so that the whole variance spectrum can be
# inspected before choosing how many components to keep.
pca = decomposition.PCA()

In [27]:
# Fit PCA on the training rows. The transformed array is not stored in a
# variable; it is only shown as the cell output.
pca.fit_transform(x_train)

array([[ 3.62479776e+00, -2.07099293e+00, -1.48156694e+00, ...,
        -1.12211305e-01, -7.30838883e-02, -9.49106482e-03],
       [-1.78289390e+00,  1.26604228e+00, -8.00430000e-01, ...,
        -2.83758321e-02,  1.39953311e-02, -4.98892081e-04],
       [-3.73935047e+00, -1.68757319e+00,  3.09520390e+00, ...,
        -7.32925087e-03, -1.10122785e-03, -3.68074906e-03],
       ...,
       [-2.94450500e+00, -2.32096728e+00, -3.95796639e-01, ...,
         1.23213121e-02,  1.29747769e-02, -5.97799132e-03],
       [ 4.59097400e+00,  3.39571566e+00, -1.93418778e+00, ...,
        -1.31619948e-02, -3.52460588e-02, -3.67727780e-03],
       [-4.25988995e-01, -1.96644116e+00, -1.63326116e+00, ...,
         1.42415713e-02,  1.32111210e-02,  6.49166974e-03]])

In [28]:
# Variance captured by each principal component of the PCA fitted above.
pca.explained_variance_

array([1.32275963e+01, 5.22297038e+00, 2.97505234e+00, 1.93759787e+00,
       1.56262571e+00, 1.12937496e+00, 6.75478985e-01, 5.07905021e-01,
       4.36863454e-01, 3.75378957e-01, 2.73017068e-01, 2.41822262e-01,
       2.04740567e-01, 1.62921221e-01, 9.68719418e-02, 8.11293485e-02,
       6.27269893e-02, 5.23785314e-02, 4.97193013e-02, 3.10022307e-02,
       3.05996675e-02, 2.60677979e-02, 2.29342280e-02, 1.76947134e-02,
       1.09961982e-02, 7.55639952e-03, 5.84339811e-03, 1.54424043e-03,
       7.08584154e-04, 1.13368075e-04])

In [31]:
# Pick k = the smallest number of leading components whose cumulative
# explained variance reaches the target fraction of the total variance.
# The threshold is 99% (the previous comment said 95%, which did not match
# the code).
# NOTE(review): the saved output of this cell (14) does not correspond to a
# 0.99 threshold on the variances printed in the previous cell (0.99 gives 17;
# 14 matches roughly 0.98) - re-run the cell to refresh the output.
target_variance_ratio = 0.99

explained = np.asarray(pca.explained_variance_)
cumulative = np.cumsum(explained)
total = cumulative[-1]
# searchsorted returns the first index whose cumulative ratio is >= the target;
# +1 converts that zero-based index into a component count.
k = int(np.searchsorted(cumulative / total, target_variance_ratio, side="left")) + 1
current_variance = cumulative[k - 1]  # variance retained by the first k components

k

14

We now have the value of k, so we can pass it directly to PCA as `n_components`.

In [32]:
# Reduce the feature space to the k components selected above. The projection
# is fitted on the training rows only and then applied unchanged to the test rows.
pca = decomposition.PCA(n_components = k)
X_train_pca = pca.fit_transform(x_train)
X_test_pca = pca.transform(x_test)