# Unsupervised Learning & dimension reduction

---

_You are currently looking at **version 1.1** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._

---

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import copy
import matplotlib.pyplot as plt

## Datasets

### Breast cancer dataset

In [None]:
#### Sanity check dataset format
def format_check(X, y):
    """Sanity-check that ``X`` and ``y`` are NumPy arrays laid out as this notebook expects.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; must be 2-D with at least one column.
    y : numpy.ndarray
        Label array, expected to be flat with shape ``(n,)``.

    Raises
    ------
    AssertionError
        If ``X`` or ``y`` is not a NumPy array, or if ``X`` is not 2-D with
        at least one column.

    Notes
    -----
    A ``y`` that is not 1-D only triggers a printed warning; the summary
    (shapes, types and the set of distinct labels) is printed in every case.
    """
    import numpy as np  # local import keeps the helper usable on its own

    assert isinstance(X, np.ndarray), 'X must be a numpy array, got {}'.format(type(X))
    assert X.ndim == 2 and X.shape[1] > 0, \
        'X must be 2-D with at least one column, got shape {}'.format(X.shape)
    assert isinstance(y, np.ndarray), 'y must be a numpy array, got {}'.format(type(y))

    # Explicit dimensionality test instead of probing y.shape[1] inside a bare try/except.
    if y.ndim != 1:
        print('{} must be of shape: (n,)'.format(y.shape))

    # Flatten before building the set: iterating a 2-D array yields row arrays,
    # which are unhashable and made the summary crash right after the warning.
    classes = set(y.ravel())
    print('X:\t {} {}\ny:\t {} {}\nclasses: {}\n'.format(X.shape, type(X), y.shape, type(y), classes))
    

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the dataset once: the Bunch keeps the metadata (e.g. feature_names),
# and its data/target attributes are the same arrays return_X_y=True would give.
cancer = load_breast_cancer()
X_cancer, y_cancer = cancer.data, cancer.target

format_check(X_cancer, y_cancer)

### Fruits dataset
 - The plot function needs `y` as a flat list/array of shape `(n,)`

In [None]:
# Load the fruits table; read_table parses tab-delimited text by default.
# The path is relative to the notebook's working directory.
fruits = pd.read_table('../../_data/fruit_data_with_colors.txt')

In [None]:
# Preview the labels as a flat array shifted down by one
# (presumably the file numbers the fruits from 1 — confirm against the data).
fruits.loc[:, 'fruit_label'].copy().values.ravel() - 1

In [None]:
# Feature matrix: the four selected columns as a 2-D NumPy array
X_fruits = fruits[['mass','width','height', 'color_score']].values

# Labels as a flat (n,) array, shifted by -1 so that they can be used
# directly as color-bin indices by the plotting helper
y_fruits = fruits.loc[:, 'fruit_label'].values.ravel() - 1

format_check(X_fruits, y_fruits)

### Blob dataset

In [None]:
from sklearn.datasets import make_blobs

# Synthetic 2-feature dataset: 500 samples around 4 centers.
# random_state=1 makes the generated points reproducible.
X_blob, y_blob = make_blobs(n_samples=500,
                              n_features=2, 
                              centers=4, 
                              cluster_std=1.0,
                              shuffle=True, 
                              random_state=1)

format_check(X_blob, y_blob)

### Plot datasets

In [None]:
def plot_labelled_scatter(X, y, class_labels, ax_pad=1):
    """Scatter-plot the first two columns of ``X``, colored by integer class label.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, >= 2)
        Only columns 0 and 1 are plotted.
    y : array-like, shape (n_samples,)
        Integer labels. Label ``k`` falls into color bin ``[k, k + 1)``, so
        labels are expected to lie in ``0 .. len(class_labels) - 1``.
    class_labels : sequence
        Legend text; ``class_labels[k]`` describes label ``k``.
    ax_pad : float, default 1
        Margin added on every side of the data range.

    Raises
    ------
    ValueError
        If ``class_labels`` is empty.

    Notes
    -----
    A new figure is created on every call and left open, so callers can add
    a title or axis labels through ``plt`` afterwards.
    """
    from matplotlib.colors import ListedColormap, BoundaryNorm
    import matplotlib.patches as mpatches
    import seaborn as sns

    num_labels = len(class_labels)
    if num_labels == 0:
        raise ValueError('class_labels must contain at least one label')

    # Colors
    flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71", '#000000']
    # Asking seaborn for more colors than the list holds cycles through it, so
    # any number of classes gets a color (the legend used to raise IndexError
    # past 7 classes). Never asking for fewer than len(flatui) keeps the
    # colormap identical to the fixed 7-color one for small class counts.
    col_pal = sns.color_palette(flatui, n_colors=max(num_labels, len(flatui))).as_hex()
    cmap = ListedColormap(col_pal)

    # BoundaryNorm maps data points (labels) to colors based on discrete intervals.
    # Boundaries define the bin edges; data falling within a bin is mapped to the
    # color with the same index: c=y, cmap=cmap, norm=bnorm => label k -> col_pal[k].
    boundaries = np.arange(num_labels + 1)
    bnorm = BoundaryNorm(boundaries=boundaries, ncolors=num_labels)

    plt.figure()
    plt.scatter(X[:, 0], X[:, 1], s=65, c=y, cmap=cmap, norm=bnorm,
                alpha=0.50, edgecolor='black', lw=1)

    plt.xlim(X[:, 0].min() - ax_pad, X[:, 0].max() + ax_pad)
    plt.ylim(X[:, 1].min() - ax_pad, X[:, 1].max() + ax_pad)

    # Legend: one patch per class, in the same color the scatter uses for it
    legend_handle = [mpatches.Patch(color=col_pal[c], label=class_labels[c])
                     for c in range(num_labels)]
    plt.legend(handles=legend_handle)

In [None]:
# Sort the distinct labels: legend entry k must describe label k, and set
# iteration order is not guaranteed. Only the first two features are drawn.
plot_labelled_scatter(X_cancer, y_cancer, sorted(set(y_cancer)))

In [None]:
# Sort the distinct labels: legend entry k must describe label k, and set
# iteration order is not guaranteed. Only the first two features are drawn.
plot_labelled_scatter(X_fruits, y_fruits, sorted(set(y_fruits)))

In [None]:
# Sort the distinct labels: legend entry k must describe label k, and set
# iteration order is not guaranteed.
plot_labelled_scatter(X_blob, y_blob, sorted(set(y_blob)))

## Dimensionality Reduction and Manifold Learning

### Principal Components Analysis (PCA)

#### Using PCA to find the first two principal components of the breast cancer dataset

In [None]:
from sklearn.preprocessing import StandardScaler

# Before applying PCA, each feature should be centered (zero mean) and with unit variance
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X_cancer)
# Bare last expression: the scaled array is shown as the cell output
X_normalized

In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the standardized cancer data, keeping two components,
# then project the same data onto them
pca = PCA(n_components=2).fit(X_normalized)
X_pca = pca.transform(X_normalized)
# Shapes before and after the projection
print(X_cancer.shape, X_pca.shape)

#### Plotting the PCA-transformed version of the breast cancer dataset

In [None]:
# Legend names are given in label order: entry 0 describes label 0, entry 1 label 1
plot_labelled_scatter(X_pca, y_cancer, ['malignant', 'benign'])

# The helper leaves its figure open, so these calls decorate that figure
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.title('Breast Cancer Dataset PCA (n_components = 2)')
plt.show();

#### Plotting the magnitude of each feature value for the first two principal components

In [None]:
# Heat map of pca.components_: one row per principal component,
# one column per original feature.
fig = plt.figure(figsize=(8, 4))
plt.imshow(pca.components_, interpolation = 'none', cmap = 'plasma')
feature_names = list(cancer.feature_names)

# imshow centers cell i on coordinate i, so i - 0.5 is the cell's left edge.
# Build exactly one tick per feature: the previous
# np.arange(-.5, len(feature_names)) produced one tick too many, and current
# Matplotlib raises ValueError when the tick and label counts differ.
plt.gca().set_xticks(np.arange(len(feature_names)) - 0.5)
plt.gca().set_yticks(np.arange(0.5, 2))
plt.gca().set_xticklabels(feature_names, rotation=90, ha='left', fontsize=12)
plt.gca().set_yticklabels(['First PC', 'Second PC'], va='bottom', fontsize=12)

# Colorbar ticks mark the smallest weight, zero and the largest weight
plt.colorbar(orientation='horizontal', 
             ticks=[pca.components_.min(), 0, pca.components_.max()], pad=0.65)
plt.show();

#### PCA on the fruit dataset (for comparison)

In [None]:
from sklearn.preprocessing import StandardScaler

# each feature should be centered (zero mean) and with unit variance
fr_scaler = StandardScaler()
# NOTE: this rebinds X_normalized, which held the scaled cancer data until now
X_normalized = fr_scaler.fit_transform(X_fruits) 

In [None]:
from sklearn.decomposition import PCA

# NOTE: pca and X_pca are rebound here; from this cell on they describe the
# fruits data rather than the cancer data
pca = PCA(n_components = 2).fit(X_normalized)
X_pca = pca.transform(X_normalized)
# Displayed as the cell output: projection and label shapes
X_pca.shape, y_fruits.shape

In [None]:
# Legend names are given in label order (entry k describes label k)
plot_labelled_scatter(X_pca, y_fruits, ['apple','mandarin','orange','lemon'])

plt.xlabel('First principal component')
plt.ylabel('Second principal component')
# Trailing semicolon suppresses the text repr of the returned Text object
plt.title('Fruits Dataset PCA (n_components = 2)');

### Manifold learning methods

#### Multidimensional scaling (MDS) on the fruit dataset

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import MDS

# each feature should be centered (zero mean) and with unit variance
X_fruits_normalized = StandardScaler().fit_transform(X_fruits)  

# MDS starts from a random configuration; a fixed seed makes the embedding
# (and therefore the plot) reproducible across Restart & Run All.
mds = MDS(n_components=2, random_state=0)

X_fruits_mds = mds.fit_transform(X_fruits_normalized)

plot_labelled_scatter(X_fruits_mds, y_fruits, ['apple', 'mandarin', 'orange', 'lemon'])
plt.xlabel('First MDS feature')
plt.ylabel('Second MDS feature')
plt.title('Fruit sample dataset MDS');

#### Multidimensional scaling (MDS) on the breast cancer dataset

(This example is not covered in the lecture video, but is included here so you can compare it to the results from PCA.)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import MDS
from sklearn.datasets import load_breast_cancer

# Load the dataset once and take data/target from the Bunch instead of
# calling the loader a second time with return_X_y=True.
cancer = load_breast_cancer()
X_cancer, y_cancer = cancer.data, cancer.target

# each feature should be centered (zero mean) and with unit variance
# NOTE: rebinds X_normalized to the scaled cancer data; the t-SNE cancer
# cell further down reads this variable.
X_normalized = StandardScaler().fit_transform(X_cancer)  

# MDS starts from a random configuration; a fixed seed makes the embedding
# (and therefore the plot) reproducible across Restart & Run All.
mds = MDS(n_components=2, random_state=0)

X_mds = mds.fit_transform(X_normalized)

plot_labelled_scatter(X_mds, y_cancer, ['malignant', 'benign'])

plt.xlabel('First MDS dimension')
plt.ylabel('Second MDS dimension')
plt.title('Breast Cancer Dataset MDS (n_components=2)');

#### t-SNE on the fruit dataset

(This example from the lecture video is included so that you can see how some dimensionality reduction methods may be less successful on some datasets. Here, it doesn't work as well at finding structure in the small fruits dataset, compared to other methods like MDS.)

In [None]:
from sklearn.manifold import TSNE

# random_state is fixed so the embedding is repeatable
tsne = TSNE(random_state = 0)

# X_fruits_normalized is the StandardScaler output assigned in the MDS fruits cell
X_tsne = tsne.fit_transform(X_fruits_normalized)

plot_labelled_scatter(X_tsne, y_fruits, ['apple', 'mandarin', 'orange', 'lemon'])
plt.xlabel('First t-SNE feature')
plt.ylabel('Second t-SNE feature')
plt.title('Fruits dataset t-SNE');

#### t-SNE on the breast cancer dataset

Although not shown in the lecture video, this example is included for comparison, showing the results of running t-SNE on the breast cancer dataset.  See the reading "How to Use t-SNE effectively" for further details on how the visualizations from t-SNE are affected by specific parameter settings.

In [None]:
# TSNE is imported in the t-SNE fruits cell above; tsne and X_tsne are rebound here
tsne = TSNE(random_state = 0)
# X_normalized must hold the scaled cancer data at this point, i.e. the MDS
# cancer cell has to have run after the fruits PCA cells
X_tsne = tsne.fit_transform(X_normalized)

plot_labelled_scatter(X_tsne, y_cancer, ['malignant', 'benign'])
plt.xlabel('First t-SNE feature')
plt.ylabel('Second t-SNE feature')
plt.title('Breast cancer dataset t-SNE');

## Clustering

### K-means

This example from the lecture video creates an artificial dataset with make_blobs, then applies k-means to find 3 clusters, and plots the points in each cluster identified by a corresponding color.

In [None]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Default-sized blob dataset; random_state fixes the generated points
X, y = make_blobs(random_state=10)

# Seed the centroid initialisation as well (as the fruits k-means cell does),
# so cluster numbering and colors are the same on every run.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X)

# Sorted labels: legend entry k must describe cluster k, and set order is not guaranteed
plot_labelled_scatter(X, kmeans.labels_, sorted(set(kmeans.labels_)))

Example showing k-means used to find 4 clusters in the fruits dataset.  Note that in general, it's important to scale the individual features before applying k-means clustering.

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

# Scale every feature to [0, 1] before clustering. The result gets its own
# name so that it does not overwrite X_fruits_normalized, which holds the
# StandardScaler version used by the MDS and t-SNE cells.
X_fruits_minmax = MinMaxScaler().fit_transform(X_fruits)

kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(X_fruits_minmax)

# Sorted labels: legend entry k must describe cluster k, and set order is not guaranteed.
# Only the first two scaled features are drawn.
plot_labelled_scatter(X_fruits_minmax, kmeans.labels_, sorted(set(kmeans.labels_)))

### Agglomerative clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Ask for 3 clusters; X_blob itself was generated around 4 centers
cls = AgglomerativeClustering(n_clusters=3)
# fit_predict fits the model and returns one cluster index per sample
cls_assignment = cls.fit_predict(X_blob)

plot_labelled_scatter(X_blob, cls_assignment, list(set(cls_assignment)))

#### Creating a dendrogram (using scipy)

This dendrogram plot is based on the dataset created in the previous step with make_blobs, but for clarity, only 10 samples have been selected for this example, as plotted here:

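And here's the dendrogram corresponding to agglomerative clustering of the 10 points above using Ward's method.  The index 0..9 of the points corresponds to the index of the points in the X array above.  For example, point 0 (5.69, -9.47) and point 9 (5.43, -9.76) are the closest two points and are clustered first.

**Note:** the code cell below calls `ward` on the full `X_blob` array (500 samples), so the dendrogram it draws has 500 leaves and its leaf indices refer to rows of `X_blob`, not to the 10-point example described above.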

In [None]:
from scipy.cluster.hierarchy import ward, dendrogram

plt.figure(figsize=(10,10))
# ward() builds the linkage with Ward's method; dendrogram() draws it.
# The whole X_blob array is clustered here, so there is one leaf per sample.
dendrogram(ward(X_blob))
plt.show();

### DBSCAN clustering

In [None]:
from sklearn.cluster import DBSCAN
# http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
# NOTE: returns more labels than expected?!
# NOTE(review): possibly because eps=.4 is small compared with the blobs'
# cluster_std=1.0, which would split each blob into several dense regions —
# TODO confirm by trying a larger eps.

# Hyperparameters
max_dist_within_cluster = .4  # passed to DBSCAN as eps
min_samples_per_cluster = 5  # passed to DBSCAN as min_samples

dbscan = DBSCAN(eps=max_dist_within_cluster, min_samples=min_samples_per_cluster)
# Shift every label up by one so that noise (-1) becomes 0 and all labels are
# non-negative, matching the color bins starting at 0 in plot_labelled_scatter.
# NOTE: cls is rebound here; in the agglomerative cell it named the estimator.
cls = dbscan.fit_predict(X_blob) +1  # -1 is noise
print(cls)
# print("Cluster membership values:\n{}".format(cls))
plot_labelled_scatter(X_blob, cls, list(set(cls)))