<a href="https://colab.research.google.com/github/Igben-Nehemiah/ML-practice/blob/main/Classical_ML_Algorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dimension Reduction and Latent Variable Methods

## Principal Component Analysis (PCA)

### Dimensionality reduction for a Polymer Manufacturing Process

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [7]:
# Load the process data from the spreadsheet, then keep the first 69 rows
# as the training set.
data = pd.read_excel(
    'proc1a.xls',
    usecols='C:AI',
    skiprows=1,
)
data_train = data.iloc[:69]

In [9]:
# Standardise the training data: fit the scaler on it first, then apply
# that fitted scaler to the same data.
scaler = StandardScaler().fit(data_train)
data_train_normal = scaler.transform(data_train)

In [11]:
# Fit a PCA model that keeps every component (n_components=None is the
# default) and project the normalised training data onto the PCs in the
# same call.
pca = PCA(n_components=None)
score_train = pca.fit_transform(X=data_train_normal)

In [18]:
# confirm no correlation between the PC scores
# (rowvar=False: each column of score_train is treated as one variable)
corr_coef = np.corrcoef(score_train, rowvar=False)

# Show the largest absolute off-diagonal correlation so the check is
# actually visible; a value close to 0 means the scores are uncorrelated.
off_diag_mask = ~np.eye(corr_coef.shape[0], dtype=bool)
np.abs(corr_coef[off_diag_mask]).max()

In [26]:
# visualise explained variance
# Per-component share of the total variance, converted from a fraction to %.
explained_variance = 100*pca.explained_variance_ratio_ # in percentage
# Running total over components: % variance explained by the first k PCs.
cum_explained_variance = np.cumsum(explained_variance) # cumulative % variance explained

In [None]:
# Plot per-component and cumulative explained variance against the 1-based
# principal component number (PC 1 at x=1), which matches the 1-based count
# used when choosing how many PCs to retain.
pc_numbers = np.arange(1, len(explained_variance) + 1)

fig, ax = plt.subplots()
ax.plot(pc_numbers, cum_explained_variance, 'r+', label = 'cumulative % variance explained')
ax.plot(pc_numbers, explained_variance, 'b+', label = 'variance explained by each PC')
ax.set_ylabel('Explained variance (in %)')
ax.set_xlabel('Principal component number')
ax.legend();  # trailing ';' hides the Legend repr in the cell output


In [33]:
# Choose how many PCs to keep: find the position of the first cumulative
# value that reaches 90 %, convert that 0-based index to a count, then keep
# only those leading score columns.
reaches_target = cum_explained_variance >= 90
n_comp = reaches_target.argmax() + 1
score_train_reduced = score_train[:, :n_comp]

In [34]:
# (rows, retained PCs) of the reduced score matrix
np.shape(score_train_reduced)

(69, 13)

In [38]:
# confirm that only about 10% of original information is lost

# Loadings: transpose so each column holds one principal component, then
# keep only the columns for the retained PCs.
V_matrix = pca.components_.T
P_matrix = V_matrix[:, 0:n_comp]

# Map the reduced scores back to the normalised variable space and compare
# the reconstruction with the normalised training data.
data_train_normal_reconstruct = np.dot(score_train_reduced, P_matrix.T)
R2_score = r2_score(data_train_normal, data_train_normal_reconstruct)

print('% information lost = ', 100*(1-R2_score))

% information lost =  9.046972754471994
