# Data Preprocessing

In [14]:
%matplotlib notebook
from sklearn import datasets
from pylab import *
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from sklearn.decomposition import KernelPCA, PCA
from sklearn.manifold import LocallyLinearEmbedding, Isomap
from amltlearn.feature_selection import LaplacianScore


# Load the Iris dataset shipped with scikit-learn; later cells read
# iris['data'] for the features and iris['target'] for the labels.
iris = datasets.load_iris()

# Translate each sample's integer target into a matplotlib colour code so the
# scatter plots below can colour points by class.
col = ['r', 'g', 'b']
lc = [col[label] for label in iris['target']]

## Dimensionality Reduction - Linear: PCA

In [15]:
# Fit PCA with its default settings and project the Iris samples.
pca = PCA()
pdata = pca.fit_transform(iris['data'])


# The slider limits come from the number of columns in the projection rather
# than a hard-coded 3, so they stay valid if the input data changes.
@interact(i=(0, pdata.shape[1] - 1), j=(0, pdata.shape[1] - 1))
def g(i=0, j=1):
    """Scatter the PCA-projected samples on two selectable components.

    Parameters
    ----------
    i : int
        Column of ``pdata`` drawn on the x axis.
    j : int
        Column of ``pdata`` drawn on the y axis.
    """
    _fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(pdata[:, i], pdata[:, j], c=lc, s=100)
    # Name the plotted components so the figure can be read on its own.
    ax.set_xlabel('Principal component {}'.format(i))
    ax.set_ylabel('Principal component {}'.format(j))

Looking at the variance explained by each component, we can see that the first one accounts for most of the variance and the first two components together explain almost 98%, so only these two components would be necessary.

In [16]:
# Show the explained-variance ratio of each component of the PCA fitted above.
print(pca.explained_variance_ratio_)

[ 0.92461621  0.05301557  0.01718514  0.00518309]


## Dimensionality Reduction - Non-linear: Kernel PCA

In [17]:
@interact(kernel=['rbf', 'poly'], degree=[2, 3, 4, 5])
def g(kernel='rbf', degree=2):
    """Embed the Iris data in two dimensions with Kernel PCA and scatter it.

    Parameters
    ----------
    kernel : str
        Kernel name forwarded to ``KernelPCA``; chosen from the list given
        to ``interact``.
    degree : int
        Degree forwarded to ``KernelPCA``.
        NOTE(review): scikit-learn documents ``degree`` as used by the
        'poly' kernel only - confirm it is meant to be inert for 'rbf'.
    """
    embedding = KernelPCA(
        n_components=2, kernel=kernel, degree=degree
    ).fit_transform(iris['data'])
    _fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(embedding[:, 0], embedding[:, 1], c=lc, s=100)

## Dimensionality Reduction - Non-linear: ISOMAP

In [18]:
@interact(nn=(1, 15, 2))
def g(nn=3):
    """Plot a two-component Isomap embedding of the Iris data.

    Parameters
    ----------
    nn : int
        Value passed to ``Isomap`` as ``n_neighbors``; the slider runs from
        1 to 15 in steps of 2.
        NOTE(review): the slider starts at 1 neighbour - confirm that this
        lower bound is intended.
    """
    embedder = Isomap(n_neighbors=nn, n_components=2)
    coords = embedder.fit_transform(iris['data'])
    ax = plt.figure(figsize=(8, 8)).add_subplot(111)
    ax.scatter(coords[:, 0], coords[:, 1], c=lc, s=100)

In [19]:
@interact(nn=(1, 15, 2))
def g(nn=3):
    """Plot a three-component Isomap embedding of the Iris data in 3-D.

    Parameters
    ----------
    nn : int
        Value passed to ``Isomap`` as ``n_neighbors``; the slider runs from
        1 to 15 in steps of 2.
    """
    iso = Isomap(n_components=3, n_neighbors=nn)
    is3data = iso.fit_transform(iris['data'])
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, projection='3d')
    # Fixed camera: elevation 75, azimuth 120.
    ax.view_init(75, 120)
    # Draw on the 3-D axes directly. pyplot's scatter is a 2-D wrapper that
    # only reaches these axes through the implicit "current axes" and
    # keyword forwarding, which is easy to break.
    ax.scatter(is3data[:, 0], is3data[:, 1], is3data[:, 2],
               depthshade=False, c=lc, s=100)

## Dimensionality Reduction - Non-linear: LLE

In [20]:
@interact(nn=(5, 50, 5))
def g(nn=5):
    """Plot a two-component standard LLE embedding of the Iris data.

    Prints the fitted model's ``reconstruction_error_`` before drawing.

    Parameters
    ----------
    nn : int
        Value passed to ``LocallyLinearEmbedding`` as ``n_neighbors``; the
        slider runs from 5 to 50 in steps of 5.
    """
    embedder = LocallyLinearEmbedding(
        method='standard', n_components=2, n_neighbors=nn
    )
    coords = embedder.fit_transform(iris['data'])
    print(embedder.reconstruction_error_)
    _fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(coords[:, 0], coords[:, 1], c=lc, s=100)

In [21]:
@interact(nn=(5, 25, 5))
def g(nn=5):
    """Plot a three-component standard LLE embedding of the Iris data in 3-D.

    Prints the fitted model's ``reconstruction_error_`` before drawing.

    Parameters
    ----------
    nn : int
        Value passed to ``LocallyLinearEmbedding`` as ``n_neighbors``; the
        slider runs from 5 to 25 in steps of 5.
    """
    lle = LocallyLinearEmbedding(n_neighbors=nn, n_components=3, method='standard')
    ll3data = lle.fit_transform(iris['data'])
    print(lle.reconstruction_error_)
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, projection='3d')
    # Fixed camera: elevation 240, azimuth 90.
    ax.view_init(240, 90)
    # Draw on the 3-D axes directly. pyplot's scatter is a 2-D wrapper that
    # only reaches these axes through the implicit "current axes" and
    # keyword forwarding, which is easy to break.
    ax.scatter(ll3data[:, 0], ll3data[:, 1], ll3data[:, 2],
               depthshade=False, c=lc, s=100)

## Feature Selection: Laplacian Score

In [22]:
@interact(nn=(5, 25, 5))
def g(nn=5):
    """Run Laplacian Score feature selection on Iris and scatter the result.

    Prints the selector's ``scores_`` and then plots the first two columns
    of the transformed data.

    Parameters
    ----------
    nn : int
        Value passed to ``LaplacianScore`` as ``n_neighbors``; the slider
        runs from 5 to 25 in steps of 5.

    NOTE(review): presumably ``k=2`` keeps the two best-scoring features and
    ``bandwidth`` sets the similarity kernel width - confirm against the
    amltlearn implementation.
    """
    selector = LaplacianScore(n_neighbors=nn, bandwidth=0.1, k=2)
    selected = selector.fit_transform(iris['data'])
    print(selector.scores_)
    _fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(selected[:, 0], selected[:, 1], c=lc, s=100)