In [None]:
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 25 2017

@author: Nicolas Loffreda

Module to read all the images located in a particular folder, convert them to
NumPy arrays, flatten them and stack them into a single NumPy array to use as
a data matrix (feature vectors).
"""

import os
import numpy as np
from PIL import Image
import pylab as pl

def readImages(path, chk_shape=(50, 50)):
    '''
    Read all images from a directory and return the data matrix X and the
    label vector y.

    Every file in ``path`` whose name ends in ".png" or ".jpg" is opened,
    converted to grey-scale (PIL mode 'L'), checked against ``chk_shape`` and
    flattened into one row of X. The class label is parsed from the file name:
    the text before the first underscore, with leading 'c' characters
    stripped, is read as an integer (e.g. "c3_0001.png" -> 3).

    Files are processed in sorted name order so the row order of X and y is
    the same on every run and machine.

    Parameters & Return
    ----------
    :param path: Path where the images are located
    :param chk_shape: (rows, cols) that every grey-scale image must have
    :return X: Data matrix with one flattened image per row
    :return y: Array with the integer class id of each row of X
    :raises ValueError: if an image does not have shape ``chk_shape`` or its
        file name does not start with a parsable class id

    Example
    -----------
    :Example:

    >> X, y = readImages("./imgs", chk_shape=(145,148))
    >> X, y = readImages("./imgs")
    '''
    # ndarray.shape is a tuple; a list such as [50, 50] would never compare equal
    chk_shape = tuple(chk_shape)

    flatimages = list()
    labels = list()
    # os.listdir gives entries in arbitrary order, so sort for reproducibility
    for img in sorted(os.listdir(path)):
        if not img.endswith((".png", ".jpg")):
            continue

        # The context manager closes the image file once the pixels are copied
        with Image.open(os.path.join(path, img)) as im:
            im_array = np.array(im.convert('L'))

        if im_array.shape != chk_shape:
            raise ValueError('{} doesnt have the expected dim {}, {} instead'.format(img, chk_shape, im_array.shape))

        # obtain class value from the file name prefix
        try:
            class_id = int(img.split('_')[0].lstrip('c'))
        except ValueError:
            raise ValueError('Cannot parse a class id from file name {!r}; '
                             'expected a name like "c3_xxx.png"'.format(img))

        flatimages.append(im_array.ravel())
        labels.append(class_id)

    X = np.asarray(flatimages)
    y = np.asarray(labels)
    return X, y

In [None]:
# Directory holding the labelled images. Set the IMAGE_DATA_DIR environment
# variable to point at another location; the original local folder is kept as
# the fallback so existing runs behave exactly as before.
path = os.environ.get(
    'IMAGE_DATA_DIR',
    r'C:\Users\Vicky\Documents\berkeley_ext_course\intro_to_ML_using_python\final_project_test\all_data',
)

In [None]:
# Load every image under `path` as one flattened grey-scale row of X,
# together with the class label parsed from its file name (y)
X, y = readImages(path, (50, 50))

In [None]:
# split test and training data
def train_test_split(X, y, test_size=None, random_state=None):
    """
    Randomly split a data matrix and its labels into train and test subsets.

    Input:
    -X: data, with shape (N, d)
    -y: label, with shape (N, )
    -test_size: float in [0, 1], fraction of the samples put in the test set
                (default 0.25). The test set gets int(N * test_size) samples.
    -random_state: int seed for the shuffle, for reproducing the same random
                   sequence (default 42)

    Output:
    -X_train, y_train, X_test, y_test (in this order)

    Raises:
    -ValueError if test_size is outside [0, 1] or X and y differ in length
    """
    if random_state is None:
        random_state = 42
    if test_size is None:
        test_size = 0.25

    if not 0 <= test_size <= 1:
        raise ValueError('test_size must be between 0 and 1, got {}'.format(test_size))
    if len(X) != len(y):
        raise ValueError('X and y must have the same number of samples, '
                         'got {} and {}'.format(len(X), len(y)))

    # A private RandomState produces the same permutation as seeding the
    # global generator with np.random.seed, but leaves the global random
    # state of the rest of the notebook untouched.
    rng = np.random.RandomState(random_state)
    indices = rng.permutation(len(X))

    n_test = int(len(X) * test_size)
    test_ind = indices[:n_test]
    train_ind = indices[n_test:]

    X_train, y_train = X[train_ind], y[train_ind]
    X_test, y_test = X[test_ind], y[test_ind]
    return X_train, y_train, X_test, y_test

# Hold out 25% of the samples as the test set (fixed seed for a repeatable split)
X_train, y_train, X_test, y_test = train_test_split(X, y, 0.25, 42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# How many training samples fall in each class
from collections import Counter
Counter(y_train)

In [None]:
"""
@author: Nicolas Loffreda
@date: 6/24/2017

Code to apply Principal Component Analysis (PCA) to a feature matrix
"""
import numpy as np
import matplotlib.pyplot as plt

def applyPCA(X, components=2, return_code='P'):
    '''
    Calculate the PCA of a feature matrix.

    The features are mean-centred, the covariance matrix of the features is
    eigendecomposed, and the data is projected on the eigenvectors with the
    largest eigenvalues.

    Parameters
    ----------
    X : np.ndarray of shape (N, d), one sample per row
    components : number of principal components to keep, 1 <= components <= d
    return_code : selects what is returned
        'P'   -> P
        'PX'  -> (P, X_rec)
        'all' -> (mean_vec, Z, C, P, V, X_rec, eigvals)

    Returns (depending on return_code)
    -------
    mean_vec : per-feature mean, shape (d,)
    Z        : mean-centred data, shape (N, d)
    C        : covariance matrix of the features, shape (d, d)
    P        : principal components matrix, shape (N, components)
    V        : kept eigenvectors as rows, largest eigenvalue first,
               shape (components, d)
    X_rec    : data reconstructed from P, shape (N, d)
    eigvals  : ALL d eigenvalues in descending order (not truncated)

    Raises
    ------
    TypeError  : X is not a numpy array
    ValueError : X is not 2-D, components is out of range, or return_code is
                 not one of 'P', 'PX', 'all'
    '''
    if not isinstance(X, np.ndarray):
        raise TypeError('Type of X must be a numpy array')
    if X.ndim != 2:
        raise ValueError('X must be a 2-D array of shape (N, d), got shape ' + str(X.shape))
    if components > X.shape[1]:
        raise ValueError("Number of components can't be more than the number of features: " + str(X.shape[1]))
    if components < 1:
        # 0 would give an empty projection and a negative value would silently
        # slice from the end of the eigenvector list
        raise ValueError('Number of components must be at least 1, got ' + str(components))
    # Reject a bad return code before paying for the eigendecomposition
    if return_code not in ('P', 'PX', 'all'):
        raise ValueError('Invalid return code: P, PX, all')

    # Normalize Feature Vector: Z
    mean_vec = X.mean(axis=0)
    Z = X - mean_vec

    # Covariance Matrix: C
    # np.cov squeezes a single-feature result down to 0-d, which eigh rejects
    C = np.atleast_2d(np.cov(X, rowvar=False))

    # EigenVectors: V
    # eigh returns eigenvalues in ascending order with eigenvectors as
    # columns; flip so the largest comes first and vectors become rows
    eigvals, V = np.linalg.eigh(C)
    eigvals = np.flipud(eigvals)
    V = np.flipud(V.T)[:components]

    # Principal Components matrix: P
    P = np.dot(Z, V.T)

    if return_code == 'P':
        return P

    # Recuperated Matrix: X_rec (only built when the caller asks for it)
    X_rec = np.dot(P, V) + mean_vec

    if return_code == 'PX':
        return P, X_rec
    return mean_vec, Z, C, P, V, X_rec, eigvals

In [None]:
# Project the training set on its first two principal components; keep the
# mean vector and eigenvectors so other data can be projected the same way
mean_vec, _, _, P, V, _, eigvals = applyPCA(X_train, 2, 'all')

In [None]:
class_ids = [0,1,2,3,4,5]
colors = ['cyan', 'red', 'green', 'orange', 'blue', 'magenta']

# Scatter of the first 2 principal components, one colour per class
plt.figure(figsize=(10,8))
ax = plt.gca()

# Hide the whole frame first ...
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_color('none')
# ... then bring the bottom/left spines back as grey axes through data value 0
for axis, side in ((ax.xaxis, 'bottom'), (ax.yaxis, 'left')):
    axis.set_ticks_position(side)
    ax.spines[side].set_position(('data', 0))
    ax.spines[side].set_color('gray')

for cid, color in zip(class_ids, colors):
    members = (y_train == cid)
    plt.scatter(P[members, 0], P[members, 1], c=color, s=10, label="class %i" % cid, alpha=0.7)
ax.legend()

In [None]:
class Ensemble_linear_classifier:
    """
    Implementation of an ensemble of random linear classifiers, with a
    non-linear transformation using tanh.

    fit() draws n_estimators random unit-norm weight vectors, gives each one a
    bias so that its hyperplane passes through a randomly chosen training
    sample, maps the data through tanh of the hyperplane responses, and solves
    for the output weights with a pseudo-inverse against a Kesler
    (+1 / -1 per class) target matrix.

    Input:
    -X: training data, with shape (N, d)
    -y: training labels, with shape (N, ); any dtype np.unique can sort
    -query: query data, with shape (N, d)

    """
    def __init__(self, n_estimators=30, random_state=42):
        """
        n_estimators: number of linear classifier
        random_state: for generating random seeding
        Keslerlabel: dictionary to show corresponding class value in Kesler construction
        w: weight
        e: error weight for ensembled classifier
        """
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.Keslerlabel = None
        self.w = None
        self.e = None

    def _Kesler_construction(self, X, y):
        # convert y to Kesler constructions, positive class +1, negative class -1
        # return_inverse gives the column index of every sample directly as an
        # integer array, so labels may be ints, floats or strings
        class_values, yc = np.unique(y, return_inverse=True)
        yc = yc.reshape(-1)

        KC = np.full((len(X), len(class_values)), -1.0)
        KC[np.arange(len(X)), yc] = 1

        # column index -> original class value, used by predict()
        self.Keslerlabel = {ind: value for ind, value in enumerate(class_values)}

        return KC

    def _Xa_construction(self, X):
        # add x0=1
        Xa = np.hstack((np.ones((len(X), 1)), X))
        return Xa

    def fit(self, X, y):
        # construct weight
        M = self.n_estimators
        N, d = X.shape

        # generate random weights and normalize
        # A private RandomState gives the same draws as seeding the global
        # generator with np.random.seed, without disturbing the global stream
        w = np.random.RandomState(self.random_state).uniform(-1, 1, (M, d))
        w = w / np.linalg.norm(w, axis=1, keepdims=True)

        # random select M samples from X (generator restarted from the same
        # seed, as in the original implementation)
        ind = np.random.RandomState(self.random_state).choice(len(X), size=M)
        Xk = X[ind]
        # bias chosen so that hyperplane k passes through sample Xk[k]
        w0 = -np.sum(Xk * w, axis=1, keepdims=True)
        w = np.hstack((w0, w))
        self.w = w

        Xa = self._Xa_construction(X)
        KC = self._Kesler_construction(X, y)

        # transform X from d-dim feature to M-dim feature
        C = np.tanh(np.dot(Xa, w.T))
        Ca = self._Xa_construction(C)

        # calculate error weights for ensembled classifier
        e = np.dot(np.linalg.pinv(Ca), KC)
        self.e = e

    def predict(self, query):
        if self.w is None or self.e is None or self.Keslerlabel is None:
            raise AttributeError('Ensemble_linear_classifier is not fitted yet; call fit() first')

        labels = self.Keslerlabel
        query_a = self._Xa_construction(query)

        C = np.tanh(np.dot(query_a, self.w.T))
        Ca = self._Xa_construction(C)

        # winning Kesler column for every query row
        y = np.argmax(np.dot(Ca, self.e), axis=1)

        # map column indices back to class values, keeping the label dtype
        # (writing into an integer array would truncate or reject
        # non-integer labels)
        class_values = np.asarray([labels[k] for k in range(len(labels))])
        pred = class_values[y]

        return pred


In [None]:
# confusion matrix (6 classes by default)
def confusion_matrix(pred, y, n_classes=6):
    """
    Count how often each true class was predicted as each class.

    Input:
    -pred: predicted class ids, array-like with shape (N, )
    -y: true class ids, array-like with shape (N, )
    -n_classes: number of classes; class ids are expected to be the integers
                0 .. n_classes-1 (default 6). Samples with other ids are not
                counted.

    Output:
    -int64 array of shape (n_classes, n_classes); entry [i][j] is the number
     of samples whose true class is i and predicted class is j

    Raises:
    -ValueError if pred and y do not have the same shape
    """
    # Plain Python lists compare to an int as a single False, which would
    # silently produce an all-zero matrix, so convert to arrays first
    pred = np.asarray(pred)
    y = np.asarray(y)
    if pred.shape != y.shape:
        raise ValueError('pred and y must have the same shape, '
                         'got {} and {}'.format(pred.shape, y.shape))

    conf_mat = np.zeros((n_classes, n_classes), dtype='int64')

    for i in range(n_classes):
        is_true_i = (y == i)
        for j in range(n_classes):
            conf_mat[i][j] = np.sum(is_true_i & (pred == j))

    return conf_mat

In [None]:
# Train a 30-estimator ensemble on the 2-D PCA projection of the training set
clf = Ensemble_linear_classifier(30, 42)
clf.fit(P, y_train)

# Confusion matrix on the training set
pred_tr = clf.predict(P)
conf_mat_tr = confusion_matrix(pred_tr, y_train)

# Project the test set with the training mean and eigenvectors
P_test = (X_test - mean_vec).dot(V.T)

# Confusion matrix on the test set
pred_test = clf.predict(P_test)
conf_mat_test = confusion_matrix(pred_test, y_test)

print(conf_mat_tr, "\n\n", conf_mat_test)

In [None]:
# Same experiment with a 60-estimator ensemble
clf = Ensemble_linear_classifier(60, 42)
clf.fit(P, y_train)

# Confusion matrix on the training set
pred_tr = clf.predict(P)
conf_mat_tr = confusion_matrix(pred_tr, y_train)

# Project the test set with the training mean and eigenvectors
P_test = (X_test - mean_vec).dot(V.T)

# Confusion matrix on the test set
pred_test = clf.predict(P_test)
conf_mat_test = confusion_matrix(pred_test, y_test)

print(conf_mat_tr, "\n\n", conf_mat_test)

In [None]:
# Decision regions of the current classifier over the 2-D PCA plane
h = 50 # step size in the mesh

# Bounding box of the training projection, padded by 100 on every side
x_min = P[:, 0].min() - 100
x_max = P[:, 0].max() + 100
y_min = P[:, 1].min() - 100
y_max = P[:, 1].max() + 100
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Classify every mesh node, then fold the predictions back onto the mesh
q = np.vstack((xx.ravel(), yy.ravel())).T
pred_mesh = clf.predict(q)
zz = pred_mesh.reshape(xx.shape)

class_ids = [0,1,2,3,4,5]
colors = ['cyan', 'red', 'green', 'orange', 'blue', 'magenta']

plt.figure(figsize=(10,8))
ax = plt.gca()

# Hide the whole frame first ...
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_color('none')
# ... then bring the bottom/left spines back as grey axes through data value 0
for axis, side in ((ax.xaxis, 'bottom'), (ax.yaxis, 'left')):
    axis.set_ticks_position(side)
    ax.spines[side].set_position(('data', 0))
    ax.spines[side].set_color('gray')

# Training points, one colour per class
for cid, color in zip(class_ids, colors):
    members = (y_train == cid)
    plt.scatter(P[members, 0], P[members, 1], c=color, s=10, label="class %i" % cid, alpha=0.7)
ax.legend()

# Semi-transparent predicted regions drawn over the points
plt.contourf(xx, yy, zz, cmap=plt.cm.nipy_spectral, alpha=0.2)
ax.legend()
#plt.savefig("decision_boundary_2PC_ensemble.png")

In [None]:
# Collect the fitted weights of the current classifier and show their shapes
params = dict(w=clf.w, e=clf.e)
params['w'].shape, params['e'].shape

In [None]:
# params is a dict, so np.save stores it as a pickled object array; reading it
# back needs np.load("Ensemble_2D.npy", allow_pickle=True).item()
# NOTE(review): only 'w' and 'e' are saved - the classifier's Keslerlabel
# mapping used by predict() is not in the file; confirm how it is consumed.
np.save(file="Ensemble_2D.npy", arr=params)