# Apply PCA to MNIST Data

In [17]:
import os
%pylab inline
import numpy as np
from time import time
from __future__ import print_function
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
os.environ["MKL_THREADING_LAYER"] = "GNU"

from keras.datasets import mnist

# PCA Implementation
def pca(X):
    """Compute PCA scores of ``X`` through an SVD of the mean-centered data.

    Parameters
    ----------
    X : array-like, 2-D (rows are samples, columns are features)
        Data to decompose. It is converted to a float array and not modified.

    Returns
    -------
    mean : ndarray
        Per-column mean that was subtracted from ``X``.
    s : ndarray
        Singular values of the centered data, in descending order. These are
        NOT eigenvalues; the covariance eigenvalues are ``s**2 / (n_rows - 1)``.
    PCs : ndarray
        Scores of every row on each principal axis, i.e. ``U * s``. All
        components are returned; callers slice the columns they want to keep.
    """
    X = np.asarray(X, dtype=float)
    mean = X.mean(axis=0) # column-wise mean, vectorized

    Xsvd = X - mean # center the data
    # vh (the principal axes) is not needed to build the scores.
    u, s, _ = np.linalg.svd(Xsvd, full_matrices=False)
    # Scaling the columns of u by s equals u @ np.diag(s) without building
    # the dense diagonal matrix (and without relying on pylab's bare `diag`).
    PCs = u * s
    return mean,s,PCs

def loadData():
    """Load the MNIST training split and flatten each image into a scaled row.

    Only the training part returned by ``mnist.load_data()`` is used; the test
    part is discarded. Every image is flattened to one row and divided by 255.

    Returns
    -------
    X_train : ndarray of float
        One flattened, scaled image per row.
    y_train : the training labels exactly as returned by ``mnist.load_data()``.
    """
    (x_train, y_train), _ = mnist.load_data() # images arrive as 2-D arrays per sample

    print("Shape of train features is:",x_train.shape)
    t0 = time()
    # One reshape + divide replaces the per-image Python loop; x_train itself
    # is left untouched.
    X_train = np.asarray(x_train).reshape(len(x_train), -1) / 255
    print("Flattened train in",time()-t0,"s")
    print("Flattened train shape is:",X_train.shape)
    return X_train,y_train

def pcaperform(features,labels,count):
    """Run ``pca`` on the first ``count`` rows and return scores with their labels.

    Parameters
    ----------
    features : 2-D array of samples (rows) by features (columns).
    labels : sequence aligned with ``features``.
    count : number of leading rows to use; may be at most ``len(features)``.

    Returns
    -------
    (X, Y) where ``X`` holds the PCA scores of the first ``count`` rows and
    ``Y`` the matching ``labels[:count]``. If ``count`` exceeds the number of
    rows, prints "error" and returns ``-1`` instead of a tuple.
    """
    # count == len(features) is a valid request (use every row); only a
    # request for more rows than exist is rejected.
    if count > len(features):
        print ("error")
        return -1

    t1 = time()
    mean,values,PCs = pca(features[:count]) # PCA on the first `count` rows
    print("Pca performed in",time()-t1,"s")

    # PCs already has exactly `count` rows, so no further slicing is needed.
    X = PCs
    Y = labels[:count]

    return X,Y

# Tunable sizes for this experiment (previously inline magic numbers).
N_SAMPLES = 2000     # number of leading training samples fed to PCA
N_COMPONENTS = 45    # number of leading principal components kept

features, labels = loadData()
print(features.shape,labels.size)
features, labels = pcaperform(features,labels,N_SAMPLES)
features = features[:, :N_COMPONENTS] # keep only the leading components
print(features.shape,labels.size)

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


Shape of train features is: (60000, 28, 28)
Flattened train in 0.8100388050079346 s
Flattened train shape is: (60000, 784)
(60000, 784) 60000
Pca performed in 1.1720056533813477 s
(2000, 45) 2000


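I applied PCA to the first 2000 samples of the dataset; as the output above shows, 2000 samples were selected. The decomposition yields 784 principal components, of which I kept the first 45 — the number chosen in the previous homework.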

# Helper Functions

In [18]:
from sklearn.cluster import KMeans

def performCluster(features,labels,random_state=None):
    """Cluster ``features`` into 10 groups with k-means and tabulate labels vs clusters.

    Parameters
    ----------
    features : sequence of feature vectors to cluster.
    labels : sequence of integer class labels in ``range(10)``, aligned with
        ``features``.
    random_state : optional seed forwarded to ``KMeans``. The default ``None``
        keeps the original unseeded behavior; pass an int for repeatable runs.

    Returns
    -------
    cluster_matrix : ndarray of shape (10, 10)
        ``cluster_matrix[label, cluster]`` counts the samples with that true
        label that were assigned to that cluster.
    kmeans : the fitted ``KMeans`` estimator.
    """
    n_groups = 10 # ten digit classes and ten clusters

    # do the clustering
    kmeans = KMeans(n_clusters=n_groups, random_state=random_state).fit(features)
    cluster_labels = np.asarray(kmeans.labels_, dtype=int)

    cluster_matrix = np.zeros(shape=(n_groups, n_groups)) # label x cluster counts

    # np.add.at accumulates repeated (label, cluster) pairs correctly, unlike
    # plain fancy-index "+=" which would count each distinct pair only once.
    label_idx = np.asarray(labels, dtype=int)[:len(cluster_labels)]
    np.add.at(cluster_matrix, (label_idx, cluster_labels), 1)

    return cluster_matrix,kmeans # Return cluster matrix and fitted model

def findLabelsToClusters(cluster_matrix):
    """Greedily pair each label with one cluster, largest counts first.

    Cells of ``cluster_matrix`` are visited from the largest count to the
    smallest. A cell ``(label, cluster)`` is accepted when neither its label
    nor its cluster has been paired yet. This is a greedy heuristic, not an
    optimal assignment.

    Parameters
    ----------
    cluster_matrix : 2-D array-like, rows are labels and columns are clusters
        ``cluster_matrix[label][cluster]`` is the number of samples with that
        label that landed in that cluster. Any shape is accepted; the notebook
        passes a 10x10 matrix.

    Returns
    -------
    label_cluster : ndarray of float, length = number of rows
        ``label_cluster[label]`` is the cluster paired with that label, or
        ``-1`` if the label stayed unpaired (only possible when there are
        more labels than clusters).
    correct : float
        Sum of the counts of all accepted cells.
    """
    counts = np.asarray(cluster_matrix, dtype=float)
    n_labels, n_clusters = counts.shape

    # Row-major flattening: flat index = label * n_clusters + cluster.
    flat = counts.ravel()
    descending = flat.argsort()[::-1]

    label_cluster = np.full(n_labels, -1.0) # -1 marks "label not paired yet"
    takenClusters = set() # clusters already paired with a label
    correct = 0
    wanted = min(n_labels, n_clusters) # no more pairs than this can exist

    for flat_index in descending:
        if len(takenClusters) == wanted:
            break
        label, cluster = divmod(int(flat_index), n_clusters)
        if label_cluster[label] == -1 and cluster not in takenClusters:
            label_cluster[label] = cluster
            takenClusters.add(cluster)
            correct += flat[flat_index]
    return(label_cluster,correct)

def split(labels,features,n,k):
    """Partition the data into train and test parts for fold ``n`` of ``k``.

    The held-out (test) fold is the contiguous run of positions ``i`` with
    ``fold_size*(n-1) <= i < fold_size*(n-1) + fold_size`` where
    ``fold_size = len(labels)/k``; ``n`` is 1-based. Everything else is
    training data.

    Returns four lists: train_labels, train_features, test_labels,
    test_features.
    """
    train_labels, train_features = [], []
    test_labels, test_features = [], []

    # Nothing to iterate over: return the empty buckets without touching k.
    if len(labels) == 0:
        return train_labels,train_features,test_labels,test_features

    # Fold boundaries do not depend on the position, so compute them once.
    fold_size = len(labels)/k
    fold_start = fold_size*(n-1)
    fold_stop = fold_start+fold_size

    for position, label in enumerate(labels):
        held_out = position >= fold_start and position < fold_stop
        if held_out:
            label_bucket, feature_bucket = test_labels, test_features
        else:
            label_bucket, feature_bucket = train_labels, train_features
        label_bucket.append(label)
        feature_bucket.append(features[position])

    return train_labels,train_features,test_labels,test_features

# K-means with Euclidean distance

In [19]:
print("Euclidian Distance")
for i in range(1,6): # 5-fold cross-validation; folds are numbered from 1
    print("Doing cross validation for",i,"th")
    train_labels,train_features,test_labels,test_features = split(labels,features,i,5)
    cluster_matrix,cluster = performCluster(train_features,train_labels)
    print("Cluster matrix labels/clusters")
    print(cluster_matrix,"\n")
    label_cluster,correct = findLabelsToClusters(cluster_matrix)
    print("Labels to clusters indexes are labels, contents are clusters")
    # NOTE: the printed value is the fraction of training samples that fall in
    # the cluster paired with their label (an accuracy), despite the "error"
    # wording of the message.
    print(label_cluster,"with training error",correct/len(train_features))
    predicts = cluster.predict(test_features)
    # Invert label -> cluster once instead of searching label_cluster for
    # every prediction (and without relying on pylab's bare `nonzero`).
    cluster_to_label = {int(c): lbl for lbl, c in enumerate(label_cluster)}
    correct_predicts = int(sum(
        cluster_to_label[int(predict)] == true_label
        for predict, true_label in zip(predicts, test_labels)
    ))
    print("Test accuracy is",correct_predicts/len(predicts),"\n")

Euclidian Distance
Doing cross validation for 1 th
Cluster matrix labels/clusters
[[  0.   2.   1.   0.   0.   3.   7.  71.  68.   0.]
 [  0.   0.  72.   0.   3.   0.   0.   0.   0.  93.]
 [  0.  28.  21.   2.  12.  53.  13.   0.  11.  16.]
 [  6.   1.  19.   0.  12.   6. 101.   1.   4.   0.]
 [ 91.   3.  13.  13.   0.  45.   0.   0.   0.   5.]
 [ 15.   3.   4.   0.   6.  53.  58.   9.   2.   0.]
 [  3. 125.  19.   0.   0.   9.   2.   5.   0.   1.]
 [ 27.   1.  18. 112.   0.  15.   0.   0.   1.   8.]
 [  7.   2.  19.   1.  89.   7.  16.   0.   0.   1.]
 [ 65.   0.  20.  51.   1.  25.   1.   2.   0.   1.]] 

Labels to clusters indexes are labels, contents are clusters
[7. 9. 5. 6. 0. 8. 1. 3. 4. 2.] with training error 0.473125
Test accuracy is 0.4925 

Doing cross validation for 2 th
Cluster matrix labels/clusters
[[  6.   0.   1.   2. 114.   0.   0.  14.  14.   0.]
 [  1.   0.   0.  71.   0. 101.   0.   0.   1.   0.]
 [  7.   3.   1.  16.   1.   7.   3.  16.  13.  92.]
 [ 23.   1.   1

In [20]:
from nltk.cluster.kmeans import KMeansClusterer
import nltk
def performCluster(features,labels,rng=None):
    """Cluster ``features`` into 10 groups with NLTK k-means under cosine distance.

    NOTE: this definition replaces the earlier scikit-learn ``performCluster``
    for every cell executed after it.

    Parameters
    ----------
    features : sequence of feature vectors to cluster.
    labels : sequence of integer class labels in ``range(10)``, aligned with
        ``features``.
    rng : optional random number generator forwarded to ``KMeansClusterer``.
        The default ``None`` keeps the original unseeded behavior.

    Returns
    -------
    cluster_matrix : ndarray of shape (10, 10)
        ``cluster_matrix[label, cluster]`` counts the samples with that true
        label that were assigned to that cluster.
    kclusterer : the trained ``KMeansClusterer``.
    """
    n_groups = 10 # ten digit classes and ten clusters

    # do the clustering
    kclusterer = KMeansClusterer(n_groups, distance=nltk.cluster.util.cosine_distance, repeats=25, rng=rng)
    cluster_labels = np.asarray(kclusterer.cluster(features,assign_clusters=True), dtype=int)

    cluster_matrix = np.zeros(shape=(n_groups, n_groups)) # label x cluster counts

    # np.add.at accumulates repeated (label, cluster) pairs correctly, unlike
    # plain fancy-index "+=" which would count each distinct pair only once.
    label_idx = np.asarray(labels, dtype=int)[:len(cluster_labels)]
    np.add.at(cluster_matrix, (label_idx, cluster_labels), 1)

    return cluster_matrix,kclusterer # Return cluster matrix and clusterer

print("Cosine distance")
for i in range(1,6): # 5-fold cross-validation; folds are numbered from 1
    print("Doing cross validation for",i,"th")
    train_labels,train_features,test_labels,test_features = split(labels,features,i,5)
    cluster_matrix,cluster = performCluster(train_features,train_labels)
    print("Cluster matrix labels/clusters")
    print(cluster_matrix,"\n")
    label_cluster,correct = findLabelsToClusters(cluster_matrix)
    print("Labels to clusters indexes are labels, contents are clusters")
    # NOTE: the printed value is the fraction of training samples that fall in
    # the cluster paired with their label (an accuracy), despite the "error"
    # wording of the message.
    print(label_cluster,"with training error",correct/len(train_features))
    predicts = [cluster.classify(feature) for feature in test_features]
    # Invert label -> cluster once instead of searching label_cluster for
    # every prediction (and without relying on pylab's bare `nonzero`).
    cluster_to_label = {int(c): lbl for lbl, c in enumerate(label_cluster)}
    correct_predicts = int(sum(
        cluster_to_label[int(predict)] == true_label
        for predict, true_label in zip(predicts, test_labels)
    ))
    print("Test accuracy is",correct_predicts/len(predicts),"\n")

Cosine distance
Doing cross validation for 1 th
Cluster matrix labels/clusters
[[  0.   3.   4.   2.  86.  57.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.  72.  94.   2.]
 [ 14.   3.  16.  23.   0.  50.   2.  20.  19.   9.]
 [  2.  41.  77.   0.   1.   9.   0.  11.   4.   5.]
 [117.   1.   0.   3.   1.   2.  21.  13.  12.   0.]
 [  3.  38.  35.   4.   7.  10.   1.   5.  35.  12.]
 [ 15.   1.   0. 122.  10.   4.   0.   7.   5.   0.]
 [ 25.   0.   0.   0.   2.   1. 119.  19.  16.   0.]
 [  1.  33.   3.   2.   0.   0.   1.   8.   8.  86.]
 [ 62.   8.   0.   1.   2.   1.  56.  19.  13.   4.]] 

Labels to clusters indexes are labels, contents are clusters
[4. 8. 5. 2. 0. 1. 3. 6. 9. 7.] with training error 0.505
Test accuracy is 0.5375 

Doing cross validation for 2 th
Cluster matrix labels/clusters
[[  1.   1.   3.   3.   0. 138.   0.   4.   1.   0.]
 [  1.   0.   0.   0.   0.   0.   0.   1.  81.  91.]
 [  4.  17.  93.   4.   0.   2.   4.   5.  18.  12.]
 [  3. 108.   5.   1.