<a href="https://colab.research.google.com/github/bchabros/machine_learning_bootcamp/blob/main/unsupervised/02_dimensionality_reduction/03_pca_wine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

np.set_printoptions(precision=4, suppress=True, edgeitems=5, linewidth=200)

In [3]:
df_raw = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
df = df_raw.copy()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
data = df.iloc[:, 1:]
target = df.iloc[:, 0]
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [5]:
target.value_counts()

2    71
1    59
3    48
Name: 0, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target)


In [7]:
from sklearn.preprocessing import  StandardScaler

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.fit_transform(X_test)
X_train_std[:5]

array([[ 1.077 ,  2.4681, -0.4765,  0.1521, -1.3846, -2.0914, -1.7374,  0.3451, -1.6164, -0.0134, -1.7511, -1.8635, -1.0346],
       [ 0.7817,  0.717 ,  0.6639, -1.3212,  1.1086,  0.6308,  0.9565, -1.478 ,  0.0977,  0.0748, -0.0174,  1.0594,  0.2987],
       [ 0.4741, -0.4834,  0.8712, -1.0449, -0.4843,  0.8682,  0.8664, -0.1305, -0.2696, -0.0575, -0.1999,  0.8579,  1.384 ],
       [ 0.5971,  1.1593, -0.6148, -0.0014, -0.8306, -1.0785, -1.5972,  1.7718, -1.2666,  0.3394, -0.7017, -1.158 , -0.523 ],
       [-1.1379, -1.0611,  0.4911,  1.3798, -1.5231, -0.4771, -0.4956,  0.3451, -0.3571, -1.2041,  1.5795,  0.1379, -0.368 ]])

In [8]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.fit_transform(X_test_std)
X_train_pca.shape

(133, 3)

In [9]:
results = pd.DataFrame(data={'explained_variance_ratio': pca.explained_variance_ratio_})
results['cumulative'] = results['explained_variance_ratio'].cumsum()
results['component'] = results.index + 1
results

Unnamed: 0,explained_variance_ratio,cumulative,component
0,0.368072,0.368072,1
1,0.239194,0.607265,2
2,0.111035,0.7183,3


In [10]:
fig = go.Figure(data=[go.Bar(x=results['component'], y=results['explained_variance_ratio'],
                             name='explained_variance_ratio'),
                      go.Scatter(x=results['component'], y=results['cumulative'],
                                 name='cumulative')],
                layout=go.Layout(title=f'PCA - {pca.n_components_} components', width=900,
                                 template='plotly_dark'))
fig.show()

In [12]:
X_train_pca_df = pd.DataFrame(data=np.c_[X_train_pca, y_train], columns=['pca_1', 'pca_2', 'pca_3', 'target'])
X_train_pca_df.head()

Unnamed: 0,pca_1,pca_2,pca_3,target
0,4.352189,-0.756353,-1.313582,3.0
1,-2.040747,-0.908999,-0.118968,1.0
2,-1.7217,-0.723873,-0.05499,1.0
3,3.151122,-0.519172,-1.041023,3.0
4,0.605296,2.247894,1.155541,2.0


In [14]:
px.scatter_3d(X_train_pca_df, x='pca_1', y='pca_2', z='pca_3', color='target', template='plotly_dark', width=900)

In [16]:
X_train_pca[:5]

array([[ 4.3522, -0.7564, -1.3136],
       [-2.0407, -0.909 , -0.119 ],
       [-1.7217, -0.7239, -0.055 ],
       [ 3.1511, -0.5192, -1.041 ],
       [ 0.6053,  2.2479,  1.1555]])