## **PCA**  
  


In [1]:
import numpy as np # for numeric computation
import pandas as pd # for handling data in table format
pd.set_option('display.max_rows', 200)
import requests # for retrieving web addresses
import io # for storing data
import plotly.express as px # for visualization
import math
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import cm
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
plt.style.use('ggplot')

## **Import Data**

In [2]:
# Clone Git Repo
!git clone -l -s git://github.com/adamehayman/AMII.git cloned-repo
%cd cloned-repo
!ls

Cloning into 'cloned-repo'...
remote: Enumerating objects: 59, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 59 (delta 23), reused 40 (delta 11), pack-reused 0[K
Receiving objects: 100% (59/59), 14.70 MiB | 20.99 MiB/s, done.
Resolving deltas: 100% (23/23), done.
/content/cloned-repo
1_NASA_FD001_EDA.ipynb		  3_PCA.ipynb	NASA_TurboFan_Data
2_Decision_Tree_Classifier.ipynb  FIgures.pptx	README.md


In [3]:
# Read the FD001 training and test tables from the cloned repository checkout.
train_csv_path = '/content/cloned-repo/NASA_TurboFan_Data/FD001/train1_new.csv'
test_csv_path = '/content/cloned-repo/NASA_TurboFan_Data/FD001/test1.csv'
train1_PCA = pd.read_csv(train_csv_path)
test1_PCA = pd.read_csv(test_csv_path)

In [4]:
# Sensor channels retained from the raw data.
sensor2keep = [
    'T24', 'T30', 'T50', 'P15', 'P30', 'Ps30', 'phi',
    'NRf', 'NRc', 'BPR', 'htBleed', 'w31', 'w32',
]
# Full column selection: the 'Unit' column, every retained sensor, then the 'RUL' label.
col2keep = ['Unit', *sensor2keep, 'RUL']

In [5]:
# Restrict the test frame to the columns named in col2keep, in that order.
test1_PCA = test1_PCA.loc[:, col2keep]

### **Drop Columns**  
  
From the last notebook, 2_Decision_Tree_Classifier, we saw that two sensors, NRc (corrected core speed) and P15 (total pressure in bypass-duct), did not have ideal distributions, which may lead to problems later on. Looking back at the EDA, we can also see that neither feature correlates strongly with RUL (-0.31 and -0.13, respectively), so both are dropped here along with the Unit column.

In [6]:
# Drop the 'Unit' column and the 'P15' / 'NRc' sensors from the training frame.
# The result is rebound to the same name instead of mutating with inplace=True,
# and columns= states the axis explicitly.
train1_PCA = train1_PCA.drop(columns=['Unit', 'P15', 'NRc'])

In [7]:
# Drop the 'Unit' column and the 'P15' / 'NRc' sensors from the test frame.
# test1_PCA was itself produced by a column selection, so the result is rebound
# rather than mutated with inplace=True; columns= states the axis explicitly.
test1_PCA = test1_PCA.drop(columns=['Unit', 'P15', 'NRc'])

In [8]:
# Sensor columns that remain once 'P15' and 'NRc' have been dropped.
sensors = [
    'T24', 'T30', 'T50',
    'P30', 'Ps30', 'phi',
    'NRf', 'BPR', 'htBleed',
    'w31', 'w32',
]

In [9]:
# Display the training frame as it stands after the column drops.
train1_PCA

Unnamed: 0,T24,T30,T50,P30,Ps30,phi,NRf,BPR,htBleed,w31,w32,RUL
0,641.82,1589.70,1400.60,554.36,47.47,521.66,2388.02,8.4195,392,39.06,23.4190,191
1,642.15,1591.82,1403.14,553.75,47.49,522.28,2388.07,8.4318,392,39.00,23.4236,190
2,642.35,1587.99,1404.20,554.26,47.27,522.42,2388.03,8.4178,390,38.95,23.3442,189
3,642.35,1582.79,1401.87,554.45,47.13,522.86,2388.08,8.3682,392,38.88,23.3739,188
4,642.37,1582.85,1406.22,554.00,47.28,522.19,2388.04,8.4294,393,38.90,23.4044,187
...,...,...,...,...,...,...,...,...,...,...,...,...
20626,643.49,1597.98,1428.63,551.43,48.07,519.49,2388.26,8.4956,397,38.49,22.9735,4
20627,643.54,1604.50,1433.58,550.86,48.04,519.68,2388.22,8.5139,395,38.30,23.1594,3
20628,643.42,1602.46,1428.18,550.94,48.09,520.01,2388.24,8.5646,398,38.44,22.9333,2
20629,643.23,1605.26,1426.53,550.68,48.39,519.67,2388.23,8.5389,395,38.29,23.0640,1


In [10]:
# Display the test frame as it stands after the column selection and drops.
test1_PCA

Unnamed: 0,T24,T30,T50,P30,Ps30,phi,NRf,BPR,htBleed,w31,w32,RUL
0,643.02,1585.29,1398.21,553.90,47.20,521.72,2388.03,8.4052,392,38.86,23.3735,142
1,641.71,1588.45,1395.42,554.85,47.50,522.16,2388.06,8.3803,393,39.02,23.3916,141
2,642.46,1586.94,1401.34,554.11,47.50,521.97,2388.03,8.4441,393,39.08,23.4166,140
3,642.44,1584.12,1406.42,554.07,47.28,521.38,2388.05,8.3917,391,39.00,23.3737,139
4,642.51,1587.19,1401.92,554.16,47.31,522.15,2388.03,8.4031,390,38.99,23.4130,138
...,...,...,...,...,...,...,...,...,...,...,...,...
13091,643.24,1599.45,1415.79,553.41,47.69,520.69,2388.00,8.4715,394,38.65,23.1974,24
13092,643.22,1595.69,1422.05,553.22,47.60,521.05,2388.09,8.4512,395,38.57,23.2771,23
13093,643.44,1593.15,1406.82,553.04,47.57,521.18,2388.04,8.4569,395,38.62,23.2051,22
13094,643.26,1594.99,1419.36,553.37,47.61,521.33,2388.08,8.4711,395,38.66,23.2699,21


## **Feature Scaling**  
  
It is necessary to scale our data before performing PCA; here we use the MinMax scaler. PCA calculates a new projection of the data whose axes follow the directions of greatest variance, so a feature measured on a much larger numeric range would otherwise dominate the components. Scaling first puts all the features on a comparable footing.  
  
In this example we will not be splitting the data into training and validation sets like we did in the 2_Decision_Tree_Classifier notebook.

In [11]:
# Separate out label from training set
X_train = train1_PCA.drop('RUL', axis=1)
y_train = train1_PCA['RUL'].copy()

In [12]:
# Separate out label from test set
X_test = test1_PCA.drop('RUL', axis=1)
y_test = test1_PCA['RUL'].copy()

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Scaler that maps each feature onto [0, 1]; this is MinMaxScaler's default
# range, spelled out here for the reader.
scaler_minmax = MinMaxScaler(feature_range=(0, 1))

In [14]:
# Learn the per-feature minimum and maximum from the training features,
# then rescale those same features with the fitted scaler.
scaler_minmax.fit(X_train)
X_train = scaler_minmax.transform(X_train)

In [15]:
# Rescale the test features with the scaler already fitted on the training
# features; only transform() is called, so the scaler is not refitted on test data.
X_test = scaler_minmax.transform(X_test)

## **PCA Explained Variance Ratio**  
  
Here, we are going to determine the number of principal components needed to explain 95% of the variance.

In [16]:
from sklearn.decomposition import PCA

In [17]:
# A fractional n_components asks PCA to keep the smallest number of components
# whose cumulative explained-variance ratio reaches that fraction (here 0.95).
model_pca = PCA(n_components=0.95)
# Only the fitted attributes are inspected below, so fit() is enough: the
# projected array is not needed here, and PCA is unsupervised so no label is passed.
model_pca.fit(X_train)

# Per-component variance, its share of the total, and the running total of that share.
print(model_pca.explained_variance_)
print(model_pca.explained_variance_ratio_)
print(model_pca.explained_variance_ratio_.cumsum())

[0.16871202 0.00805035 0.00755918 0.00666273 0.00643207 0.00590817
 0.00576971 0.00457805]
[0.75117292 0.03584336 0.03365647 0.02966512 0.02863814 0.02630551
 0.02568905 0.02038329]
[0.75117292 0.78701628 0.82067275 0.85033787 0.87897601 0.90528152
 0.93097056 0.95135386]
