**Principal Component Analysis**

You will implement dimensionality reduction with PCA.  

1). Read iris_dataset.csv (4 features, hence 4 PCs)

2). Find the principal components

3). Recontruct the dataset (X_hat)

4). Determine the accuracy of X_hat for 1 PC and 4 PCs using LDA classifier (provided below)


In [3]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from numpy import linalg as LA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score


# Load data - 150 observations, 4 features, 3 classes,
df = pd.read_csv("iris_dataset.csv", header=None)
print(df.describe())
data = df.values
print(np.shape(data))

                0           1           2           3           4
count  150.000000  150.000000  150.000000  150.000000  150.000000
mean     5.843333    3.057333    3.758000    1.199333    2.000000
std      0.828066    0.435866    1.765298    0.762238    0.819232
min      4.300000    2.000000    1.000000    0.100000    1.000000
25%      5.100000    2.800000    1.600000    0.300000    1.000000
50%      5.800000    3.000000    4.350000    1.300000    2.000000
75%      6.400000    3.300000    5.100000    1.800000    3.000000
max      7.900000    4.400000    6.900000    2.500000    3.000000
(150, 5)


In [10]:
## Setup

# Shuffle data randomly
shuffled_data = data;
np.random.shuffle(shuffled_data)
X = shuffled_data[:,0:4]  # 150x4
y = shuffled_data[:,4]
N=len(X)

# Classification accuracy with the original dataset using LDA
model_mean_scores = []
model = LinearDiscriminantAnalysis().fit(X, y)
scores = cross_val_score(model, X, y, cv=10)
model_mean_scores.append(np.mean(scores))
print('>> Average accuracy with the original dataset = {0:0.4f}'.format(model_mean_scores[0]))


>> Average accuracy with the original dataset = 0.9800


In [11]:
def evaluate_accuracy(X_hat, Num_PC, y):

  ###############################################
  # Evaluate classificatin accuracy with LDA
  ###############################################
  '''
    Inputs:
      X_hat: reconstructed dataset. dimension=150x4
      Num_PC: number of PC's used to recover X_hat
      y: class label vector. dimension=150x1

  '''

  from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
  from sklearn.model_selection import cross_val_score

  X_train = X_hat[:,0:Num_PC]        # dimensionally reduced dataset
  y_train = y

  model_mean_scores = []
  model = LinearDiscriminantAnalysis().fit(X_train, y_train)
  scores = cross_val_score(model, X_train, y_train, cv=10)
  model_mean_scores.append(np.mean(scores))

  print('Average accuracy = {0:0.4f} with {1:1d} PCs'
     .format(model_mean_scores[0], Num_PC))

In [12]:
### Your code goes here ...
def compute_reconstructed_matrix(X, Num_PC):

  M = np.mean(X, axis=0)
  M = np.repeat([M], repeats=len(X), axis=0)
  XM = X - M
  #print(XM)

  XMCovar = np.cov(np.transpose(XM))
  XMCovar = XMCovar/(N-1)
  values, vectors = LA.eig(XMCovar)
  print("==============================")
  print("Number of PC:", end=" ")
  print(Num_PC)
  print("Eigenvalues:", end= " ")
  print(values)
  print("All Eigenvectors :", end=" ")
  print(vectors)

  PC = np.matmul(XM, vectors)

  req_PC = PC[:,0:Num_PC]
  req_vect = vectors[:,0:Num_PC]
  print("Eigenvector corresponding to number of PC :", end=" ")
  print(req_vect)
  print("================================")

  Xhat = np.matmul(req_PC, np.transpose(req_vect)) + M
  return Xhat

## Use function evaluate_accuracy
X_hat= compute_reconstructed_matrix(X,1)
evaluate_accuracy(X_hat, 1, y)  # classification accuracy with 1 PC

X_hat= compute_reconstructed_matrix(X,4)
evaluate_accuracy(X_hat, 4, y)  # classification accuracy with 4 PCs


Number of PC: 1
Eigenvalues: [0.02837746 0.00162866 0.0005249  0.00015997]
All Eigenvectors : [[ 0.36138659 -0.65658877 -0.58202985  0.31548719]
 [-0.08452251 -0.73016143  0.59791083 -0.3197231 ]
 [ 0.85667061  0.17337266  0.07623608 -0.47983899]
 [ 0.3582892   0.07548102  0.54583143  0.75365743]]
Eigenvector corresponding to number of PC : [[ 0.36138659]
 [-0.08452251]
 [ 0.85667061]
 [ 0.3582892 ]]
Average accuracy = 0.9333 with 1 PCs
Number of PC: 4
Eigenvalues: [0.02837746 0.00162866 0.0005249  0.00015997]
All Eigenvectors : [[ 0.36138659 -0.65658877 -0.58202985  0.31548719]
 [-0.08452251 -0.73016143  0.59791083 -0.3197231 ]
 [ 0.85667061  0.17337266  0.07623608 -0.47983899]
 [ 0.3582892   0.07548102  0.54583143  0.75365743]]
Eigenvector corresponding to number of PC : [[ 0.36138659 -0.65658877 -0.58202985  0.31548719]
 [-0.08452251 -0.73016143  0.59791083 -0.3197231 ]
 [ 0.85667061  0.17337266  0.07623608 -0.47983899]
 [ 0.3582892   0.07548102  0.54583143  0.75365743]]
Average acc