In [1]:
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import FactorAnalysis
import pandas as pd
import numpy as np

PCA’s approach to data/dimension reduction is to create one or more index variables from a larger set of measured variables. It does this using a linear combination (basically a weighted average) of a set of variables. The created index variables are called <b>components</b>. components maximize the total variance
<img src = "PCA.png">. image source = https://www.theanalysisfactor.com

In [2]:
iris = load_iris()

In [3]:
X = iris.data

In [4]:
Y = iris.target

In [5]:
cols = [s[:12].strip() for s in iris.feature_names]

In [6]:
cols

['sepal length', 'sepal width', 'petal length', 'petal width']

In [7]:
# Perform Scaling on the Data. This means that we need to center and scale the data. 
#This way the average value of each record would be 0 and the variance for each record would be 1
X = StandardScaler().fit_transform(X)

In [8]:
pca = PCA(n_components=2)

principalComponents = pca.fit_transform(X)

p_Dataframe = pd.DataFrame(data = principalComponents, columns = ['PC1', 'PC2'])

In [9]:
p_Dataframe.head(n=2)

Unnamed: 0,PC1,PC2
0,-2.264703,0.480027
1,-2.080961,-0.674134


<b>explained_variance_ratio_</b> attribute provides quantification (in percentage) of the informative value of each extracted component. Higher percentage indicates better retention

In [10]:

print('Explained variance by each component: %s'

      % pca.explained_variance_ratio_)

Explained variance by each component: [0.72962445 0.22850762]


In [11]:
np.sum([0.72962445, 0.22850762])

0.95813207

In [12]:
new_Df = pd.concat([p_Dataframe,pd.DataFrame(Y,columns=['target'])], axis=1)

In [13]:
new_Df.head()

Unnamed: 0,PC1,PC2,target
0,-2.264703,0.480027,0
1,-2.080961,-0.674134,0
2,-2.364229,-0.341908,0
3,-2.299384,-0.597395,0
4,-2.389842,0.646835,0


A <b>Factor Analysis</b> is a model of the measurement of a latent variable. This latent variable cannot be directly measured with a single variable. Instead, it is seen through the relationships it causes in a set of Y variables. The new variable are called <b>factors</b>. Factors maximize the shared portion of the variance. F - the factor is causing response on 4 variables. 
<img src = "factor.png"/>image source = https://www.theanalysisfactor.com

In [14]:
factor = FactorAnalysis(n_components=4).fit(X)

In [15]:
import pandas as pd

print(pd.DataFrame(factor.components_, columns=cols))

   sepal length  sepal width  petal length  petal width
0      0.839139    -0.364774      0.920334     0.901935
1      0.122953     0.339799     -0.019313    -0.009076
2     -0.000000     0.000000      0.000000     0.000000
3     -0.000000     0.000000      0.000000    -0.000000


Interpret the numbers as correlation. At the intersection of each factor and feature, a positive number indicates that a positive proportion exists between the two; a negative number points out that they diverge and that one is contrary to the other.

### Choosing between PCA or Factor analysis
1. If the objective is to just reduce the dimension then use PCA
2. Use factor analysis if the objective to uncover hidden factors in the data

### PCA Application

#### face classification with PCA

In [16]:
from sklearn.datasets import fetch_olivetti_faces

In [17]:
dataset = fetch_olivetti_faces(shuffle=True,random_state=101)

In [18]:
print(dataset.DESCR)

.. _olivetti_faces_dataset:

The Olivetti faces dataset
--------------------------

`This dataset contains a set of face images`_ taken between April 1992 and 
April 1994 at AT&T Laboratories Cambridge. The
:func:`sklearn.datasets.fetch_olivetti_faces` function is the data
fetching / caching function that downloads the data
archive from AT&T.

.. _This dataset contains a set of face images: http://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html

As described on the original website:

    There are ten different images of each of 40 distinct subjects. For some
    subjects, the images were taken at different times, varying the lighting,
    facial expressions (open / closed eyes, smiling / not smiling) and facial
    details (glasses / no glasses). All the images were taken against a dark
    homogeneous background with the subjects in an upright, frontal position 
    (with tolerance for some side movement).

**Data Set Characteristics:**

    Classes                        

In [19]:
X_train = dataset.data[:350,:]

X_test  = dataset.data[350:,:]

Y_train = dataset.target[:350]

Y_test = dataset.target[350:]

In [20]:
n_components = 25
pca = PCA(svd_solver='randomized', n_components=n_components,  whiten=True)

pca.fit(X_train)

PCA(copy=True, iterated_power='auto', n_components=25, random_state=None,
    svd_solver='randomized', tol=0.0, whiten=True)

In [21]:
# The resulted decomposition uses 25 components which is about 80% of information help in 4096 features 
print(f'Explained variance by {n_components} components: {np.sum(pca.explained_variance_ratio_)}')

Explained variance by 25 components: 0.7944266200065613


In [22]:
X_train_pca = pca.transform(X_train)
X_test_pca  = pca.transform(X_test)

In [23]:
print(X_train_pca.shape, X_test_pca.shape)

(350, 25) (50, 25)


#### Building the classifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.svm import SVC


In [25]:
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }

clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                   param_grid, cv=5, iid=False)

clf = clf.fit(X_train_pca, Y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator_)


Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [26]:
print("Predicting classes on the test set")

Y_pred = clf.predict(X_test_pca)

print(accuracy_score(Y_test, Y_pred))

Predicting classes on the test set
0.96
