# Dimensionality Reduction by Extracting Principal Components

[Wine](https://archive.ics.uci.edu/ml/machine-learning-databases/wine/)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns
sns.set()

In [None]:
# Load the Wine data from the local CSV file.
df = pd.read_csv('wine.csv')
# Show the (rows, columns) shape as the cell output.
df.shape

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

In [None]:
# Feature matrix: every column after the first one (position 0 holds 'class').
X = df[df.columns[1:]]

In [None]:
# Target variable: the 'class' column.
y = df['class']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# PCA is quite sensitive to feature scaling, so the data is standardized first
from sklearn.preprocessing import StandardScaler    # scaler used for the standardization
# instantiate the standard scaler
sc = StandardScaler()

In [None]:
# Learn the scaling parameters from the training split only.
sc.fit(X_train)
# Apply that same training-derived scaling to both splits. Calling
# sc.fit_transform(X_test) instead would re-fit the scaler on the test data,
# which is the wrong approach.
X_train_sc, X_test_sc = (sc.transform(split) for split in (X_train, X_test))

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Initial guess: two principal components are enough.
# fit() returns the estimator itself, so construction and fitting can be chained.
pca = PCA(n_components=2).fit(X_train_sc)
pca

In [None]:
# Fraction of the total variance explained by each fitted component.
pca.explained_variance_ratio_

In [None]:
# Same ratios, rounded to three decimal places for readability.
print(pca.explained_variance_ratio_.round(3))

In [None]:
# pca.components_ on its own carries no feature names, so attach them as row
# labels. The array is transposed up front so each row is a feature and each
# column is a principal component; values are rounded to three decimals.
pd.DataFrame(pca.components_.round(3).T, index=X.columns)

No preconceived idea of the number of principal components we want

In [None]:
# n_components=None places no limit on the number of components, so all of
# them are kept. fit() returns the estimator, which is displayed as the output.
pca = PCA(n_components=None).fit(X_train_sc)
pca

In [None]:
# Project the scaled training data onto the fitted principal components
# (the result is only displayed here, not stored).
pca.transform(X_train_sc)

In [None]:
# Explained-variance ratio of every component, rounded to three decimals.
print(pca.explained_variance_ratio_.round(3))

In [None]:
# Running total of the explained-variance ratios: entry k is the share of
# variance captured by the first k + 1 components together.
pca.explained_variance_ratio_.cumsum()

In [None]:
# Cumulative share of variance, in percent, captured by the first k components.
cumulative_pct = np.cumsum(pca.explained_variance_ratio_) * 100.
# Give explicit x values 1..k: with y alone, plt.plot uses 0-based positions,
# which would draw the first component at x=0 on a "number of components" axis.
component_counts = np.arange(1, len(cumulative_pct) + 1)
plt.plot(component_counts, cumulative_pct)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance (%)');

***

In [None]:
# Scores of the scaled training data in principal-component space.
res = pca.transform(X_train_sc)
# One 'PCA_<k>' label per row of res, numbered from 0.
index_name = [f'PCA_{k}' for k in range(len(res))]

In [None]:
# Loadings table for the first four principal components.
# Each row of pca.components_ is one principal component and each column is one
# original feature, so the feature names go on the columns and the PCA_<k>
# labels on the rows. The output of pca.transform() holds per-sample scores
# and must not be labelled this way.
component_labels = ['PCA_' + str(k) for k in range(len(pca.components_))]
df1 = pd.DataFrame(pca.components_, columns=df.columns[1:],
                   index=component_labels)[0:4]
# Transpose to features x components and order the features by their weight
# on the first component.
df1.T.sort_values(by='PCA_0')

***