# Chapter 10 - Unsupervised Learning 

In [None]:
# %load ../../_data/standard_import.txt
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from scipy.cluster import hierarchy

# 'seaborn-white' was renamed to 'seaborn-v0_8-white' in matplotlib 3.6 and the old
# name was later removed; prefer the new name and fall back on older versions.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available
              else 'seaborn-white')

## Lab 1: Principal Component Analysis

In [None]:
# In R, I exported the dataset to a csv file. It is part of the base R distribution.
# The first CSV column is used as the row index.
df = pd.read_csv('../../_data/USArrests.csv', index_col=0)
df.info()
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df.sample(3, random_state=0)

In [None]:
# Column-wise means of the raw (unscaled) data.
df.mean()

In [None]:
# Column-wise variances of the raw (unscaled) data (pandas default: sample variance, ddof=1).
df.var()

### Scaled dataset

In [None]:
# scale() centres each column and rescales it to unit variance; the row and
# column labels of df are carried over to the scaled frame.
X = pd.DataFrame(scale(df), index=df.index, columns=df.columns)
# Fixed seed so the preview shows the same rows on every Restart & Run All.
X.sample(3, random_state=0)

### Fit the PCA model and transform X to get the principal components

In [None]:
# PCA() with default arguments keeps all components.
pca = PCA()
# Fit on the scaled data; as the last expression, the fitted estimator is also displayed.
pca.fit(X)

### Loading Vectors - Eigenvectors

In [None]:
# Raw loading matrix: one row per principal component, one column per original variable.
pca.components_

In [None]:
# Transpose the component matrix so that each column (V1..V4) holds one loading
# vector and each row corresponds to one of the original variables.
pca_loadings = pd.DataFrame(
    data=pca.components_.T,
    index=df.columns,
    columns=['V1', 'V2', 'V3', 'V4'],
)
pca_loadings

### Principal Components

In [None]:
# Project the scaled observations onto the principal axes. The model has already
# been fitted on X, so transform() is enough; fit_transform() would silently
# re-fit (and overwrite) the estimator a second time.
df_plot = pd.DataFrame(pca.transform(X), columns=['PC1', 'PC2', 'PC3', 'PC4'], index=X.index)
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df_plot.sample(5, random_state=0)

In [None]:
# Biplot: observation scores on PC1/PC2 (black, bottom/left axes) overlaid with
# the loading vectors of the original variables (blue, top/right axes).
fig, ax1 = plt.subplots(figsize=(10, 8))

ax1.set_xlim(-3.5, 3.5)
ax1.set_ylim(-3.5, 3.5)

# Plot Principal Components 1 and 2: one text label per row of df_plot.
# NOTE(review): both coordinates are negated, here and for the loadings below.
# The sign of a principal component is arbitrary; presumably this mirrors the
# orientation of a reference figure — confirm.
for label, scores in df_plot[['PC1', 'PC2']].iterrows():
    ax1.annotate(label, (-scores['PC1'], -scores['PC2']), ha='center')

# Plot reference lines
ax1.hlines(0, -3.5, 3.5, linestyles='dotted', colors='grey')
ax1.vlines(0, -3.5, 3.5, linestyles='dotted', colors='grey')

ax1.set_xlabel('Principal Component 1')
ax1.set_ylabel('Principal Component 2')

# Plot Principal Component loading vectors, using a second pair of axes.
# NOTE(review): ax2 is the twiny() of an intermediate twinx() axes. matplotlib's
# twiny() hides the y-axis of the axes it returns, so the y-axis colour/label
# settings below may have no visible effect — confirm against the rendered figure.
ax2 = ax1.twinx().twiny()

ax2.set_ylim(-1, 1)
ax2.set_xlim(-1, 1)
ax2.tick_params(axis='x', colors='blue')
ax2.tick_params(axis='y', colors='blue')
ax2.spines['top'].set_color('blue')
ax2.spines['right'].set_color('blue')
ax2.set_xlabel('Principal Component Vector Loadings', color='blue')
ax2.set_ylabel('Principal Component Vector Loadings', color='blue')

# Draw one arrow and one label per original variable. Variable 'a' is a small
# offset parameter to separate arrow tip and text.
# Rows are accessed by label via iterrows(): integer keys such as
# pca_loadings.V1[0] on a string-indexed Series rely on positional fallback,
# which is deprecated in recent pandas.
a = 1.07
for variable, loading in pca_loadings[['V1', 'V2']].iterrows():
    tip_x, tip_y = -loading['V1'], -loading['V2']
    ax2.annotate(variable, (tip_x*a, tip_y*a), color='blue')
    ax2.arrow(0, 0, tip_x, tip_y, color='blue')

### Explained variance

In [None]:
# Variance of the data along each principal component.
pca.explained_variance_

In [None]:
# Fraction of the total variance explained by each principal component.
pca.explained_variance_ratio_

In [None]:
# Standard deviation of the four principal components:
# the square root of each component's explained variance.
np.sqrt(pca.explained_variance_)

In [None]:
# Scree plot: proportion of variance explained by each component, together
# with the running (cumulative) total.
fig_pve, ax_pve = plt.subplots(figsize=(7, 5))

pc_numbers = [1, 2, 3, 4]
variance_ratio = pca.explained_variance_ratio_

ax_pve.plot(pc_numbers, variance_ratio, '-o', label='Individual component')
ax_pve.plot(pc_numbers, np.cumsum(variance_ratio), '-s', label='Cumulative')

ax_pve.set_ylabel('Proportion of Variance Explained')
ax_pve.set_xlabel('Principal Component')
ax_pve.set_xlim(0.75, 4.25)
ax_pve.set_ylim(0, 1.05)
ax_pve.set_xticks(pc_numbers)
ax_pve.set_title('Variance explained')
ax_pve.legend(loc=2);