In [None]:
# table of contents
# 1. PCA on digits for visualization 
# 2. PCA on digits for compression
# 3. PCA on digits to improve classification
# 4. K-means clustering on digits

import numpy as np
import pylab as py

%matplotlib inline


In [None]:
# digit recognition setup...

from sklearn.datasets import load_digits 
# Load the digits dataset and expose the feature matrix / label vector
# under the short names used throughout the notebook.
digits = load_digits()
X = digits.data
y = digits.target

# Report the array shapes and the distinct labels found in the target.
shape_report = "data shape: %r, target shape: %r" % (X.shape, y.shape)
print(shape_report)
print("classes: %r" % list(np.unique(y)))

# X is two-dimensional: one row per sample, one column per feature.
n_samples, n_features = X.shape
for template, count in (("n_samples=%d", n_samples), ("n_features=%d", n_features)):
    print(template % count)


## standardization of the features

In [None]:

def plot_gallery(data, labels, shape, interpolation='nearest'):
    """Draw each row of ``data`` as an image in a single row of subplots.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute
        ``data.shape[0]`` panels are drawn; ``data[i]`` is reshaped to
        ``shape`` before being shown.
    labels : indexable
        ``labels[i]`` becomes the title of the i-th panel.
    shape : tuple
        Target shape handed to ``reshape`` for every row.
    interpolation : str
        Forwarded unchanged to ``py.imshow``.
    """
    n_panels = data.shape[0]
    for position in range(1, n_panels + 1):
        row = position - 1
        py.subplot(1, n_panels, position)
        py.imshow(data[row].reshape(shape), interpolation=interpolation)
        py.title(labels[row])
        # Empty tick sequences remove the tick marks on both axes.
        py.xticks(())
        py.yticks(())
        py.gray()

# Show five randomly chosen digits together with their true labels.
# A dedicated, seeded generator keeps the gallery identical across
# "Restart & Run All" without touching NumPy's global random state.
gallery_rng = np.random.RandomState(0)
subsample = gallery_rng.permutation(X.shape[0])[:5]
images = X[subsample]
labels = ['True class: %d' % l for l in y[subsample]]
plot_gallery(images, labels, shape=(8, 8))


In [None]:
!pip install RandomizedPCA


In [None]:
# 1. PCA on digits for visualization
from sklearn.decomposition import PCA as RandomizedPCA
from itertools import cycle

# Project the digits onto five principal components.
pca = RandomizedPCA(n_components=5)
X_pca = pca.fit_transform(X)
X_pca.shape  # mid-cell expression: evaluated but not displayed

# Scatter the first two components, one colour/marker pair per class.
# cycle() repeats the palettes if there are more classes than entries.
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
markers = ['+', 'o', '^', 'v', '<', '>', 'D', 'h', 's']
for digit, color, marker in zip(np.unique(y), cycle(colors), cycle(markers)):
    members = (y == digit)
    py.scatter(X_pca[members, 0], X_pca[members, 1],
               c=color, marker=marker, label=digit, alpha=0.5)

_ = py.legend(loc='best')



In [None]:
# Print the string representation of the PCA estimator currently bound to `pca`.
print(pca)

In [None]:
# Title every principal axis by its index and draw the axes as 8x8 images.
component_count = len(pca.components_)
labels = ['Component #%d' % idx for idx in range(component_count)]
plot_gallery(pca.components_, labels, shape=(8, 8))

#task 1 part a 


In [None]:
# Specified way in the assignment: sum of the explained-variance ratios of
# the components held by `pca`.
# (The statement must start in column 0; a stray leading space after an
# unindented comment raises "IndentationError: unexpected indent".)
print(np.sum(pca.explained_variance_ratio_))

In [None]:
#another way to calculate the explained variance 
import numpy

import matplotlib.pyplot as plt
# Variance of each projected column of X_pca.
explained_variance = X_pca.var(axis=0)
# Each column's share of the summed variance of the projected columns;
# because the denominator only covers the retained components, these
# shares add up to 1 by construction.
explained_variance_ratio = explained_variance / explained_variance.sum()
# Running total of the shares (displayed as the cell output).
explained_variance_ratio.cumsum()


In [None]:
# Show the per-component variances followed by their normalised shares.
for caption, values in (
    ('Explained Variance ', explained_variance),
    ('Explained Variance ratio ', explained_variance_ratio),
):
    print(caption, values)


#task 2 

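#The Explained Variance ratio printed above is normalised over the 5 retained components only,
#so it tells us each component's share of the variance kept by the projection (not of all of X).
#so the 1st component accounts for ~29% of the retained variance
# second component accounts for ~23% of it
#third component accounts for ~20%
#4th component accounts for ~15%
#5th component accounts for ~11%
#By construction these shares add up to 100% (the ~98% obtained by adding the rounded figures is rounding error).
#How much of the variance of X the 5 components explain together is given by np.sum(pca.explained_variance_ratio_).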

In [None]:
#So if we only needed 11% of the variance, we would need just one component, let's verify
# (per scikit-learn's PCA docs, a float n_components in (0, 1) is a variance
# target rather than a component count — the second entry of the printed
# shape is the number of components that were kept)
# NOTE: this rebinds the notebook-level `pca` used by earlier cells.
pca=RandomizedPCA(n_components=0.11)
X_new=pca.fit_transform(X)
print (X_new.shape)

In [None]:
#So if we only needed 15% of the variance, we would need two components, let's verify
# (same fractional n_components form as the 11% check; the second entry of
# the printed shape is the number of components that were kept)
# NOTE: this rebinds the notebook-level `pca` and `X_new` again.
pca=RandomizedPCA(n_components=0.15)
X_new=pca.fit_transform(X)
print (X_new.shape) # and so on for larger variance targets

In [None]:
# Fit a fresh 5-component PCA on the original data X and report how much of
# its variance the five components explain together. (The first two columns
# of X_proj are the ones to use for a 2-D plot.)
# The model has to be fitted on X, not on the already-projected X_pca:
# X_pca was produced by a 5-component PCA, so a 5-component PCA of it
# trivially "explains" 100% of the variance.
from sklearn.decomposition import PCA

pca_digits = PCA(n_components=5)
X_proj = pca_digits.fit_transform(X)
print(np.sum(pca_digits.explained_variance_ratio_))


In [None]:
!apt install tabulate


In [None]:
# 2. PCA on digits for compression
import pandas as pd 
from tabulate import tabulate
n = 8  # number of digits for demonstration
dims = [1, 2, 3, 5, 10, 20, 40, 64]  # PCA dimensionalities to compare
# Both lists get one entry per element of dims; the values are fractions in
# [0, 1] (they are not multiplied by 100 despite the names).
image_compression_percent = []  # kept components / original feature count
percent_variance = []           # summed explained-variance ratio of the fitted model
print('compressed images of first', n, 'digits')
print('with this many PCA components:', dims)
for d in dims:  # dimensionality for compressed signal
    pca = RandomizedPCA(n_components=d)
    X_pca1 = pca.fit_transform(X)
    # Record the variance explained by *this* d-component model, and only
    # after it has been fitted (the ratios do not exist before the fit).
    percent_variance.append(np.sum(pca.explained_variance_ratio_))
    image_compression_percent.append(d / X.shape[1])
    reduced_X = pca.transform(X[0:n])  # the reduced dimensionality
    recovered_X = pca.inverse_transform(reduced_X)  # back to pixel space
    py.figure()
    plot_gallery(recovered_X, y[0:n], shape=(8, 8))


#task 3

In [None]:
# One table row per tested dimensionality, built from the three lists
# filled by the compression loop.
summary_columns = ['no_of_pca_componentes', 'image_compression_percent', 'percent_variance']
summary_rows = list(zip(dims, image_compression_percent, percent_variance))
df = pd.DataFrame(summary_rows, columns=summary_columns)
print(df)

#task 1 b 

In [None]:
#specified way in assignment
# NOTE: `pca` is whichever estimator was fitted last. When the cells run top
# to bottom that is the final model of the compression loop
# (n_components=64, the last entry of dims).
print(np.sum(pca.explained_variance_ratio_))

In [None]:
#pca=PCA().fit(X)
import numpy

import matplotlib.pyplot as plt
# Per-column variance of X_pca1 and each column's share of the summed variance.
explained_variance = X_pca1.var(axis=0)
explained_variance_ratio = explained_variance / explained_variance.sum()


In [None]:
# Show the per-component variances followed by their normalised shares.
for caption, values in (
    ('Explained Variance ', explained_variance),
    ('Explained Variance ratio ', explained_variance_ratio),
):
    print(caption, values)

# all of these variance ratios are interpreted in the same way as in the previous example

In [None]:
# 3. PCA on digits improve classification
from sklearn.model_selection import train_test_split
# Hold out half of the samples for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

train_shapes = (X_train.shape, y_train.shape)
test_shapes = (X_test.shape, y_test.shape)
print("train data shape: %r, train target shape: %r" % train_shapes)
print("test data shape: %r, test target shape: %r" % test_shapes)

from sklearn.naive_bayes import GaussianNB

# Baseline: Gaussian naive Bayes trained directly on the original features.
model = GaussianNB()
model.fit(X_train, y_train)

# Score on the data the model was trained on, then on the held-out half.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('training score (overfitting!):', train_score)
print('test score:', test_score)


In [None]:
# but now using PCA features instead of pixels directly!

# Learn the 10-component projection on the training split only, then apply
# that same projection to both splits.
pca = RandomizedPCA(n_components=10)
pca.fit(X_train)

tX_train = pca.transform(X_train)
tX_test = pca.transform(X_test)

# Same classifier as the baseline, now trained on the projected features.
model = GaussianNB().fit(tX_train, y_train)
train_score = model.score(tX_train, y_train)
print('training score (overfitting!):', train_score)

test_score = model.score(tX_test, y_test)
print('test score:', test_score)

from sklearn import metrics

# Predict once and reuse the result for the confusion matrix (the test set
# was previously predicted twice, with the first result left unused).
y_test_pred = model.predict(tX_test)
expected = y_test
predicted = y_test_pred
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))


In [None]:
# let's plot accuracy vs number of components!

accuracy = []
# Sweep every possible number of components, 1 .. n_features inclusive.
# (range(1, 64) stopped at 63, so the full 64-dimensional case was never
# evaluated or plotted.)
n_comp = range(1, X_train.shape[1] + 1)
for i in n_comp:
    # Fit the projection on the training split and apply it to both splits.
    pca = RandomizedPCA(n_components=i)
    pca.fit(X_train)
    tX_train = pca.transform(X_train)
    tX_test = pca.transform(X_test)
    # Train on the projected features and record held-out accuracy.
    model = GaussianNB().fit(tX_train, y_train)
    test_score = model.score(tX_test, y_test)
    accuracy.append(test_score)

py.plot(n_comp, accuracy)
py.xlabel('number of PCA components')
py.ylabel('digit recognition accuracy')


In [None]:
# 4. K-means clustering on digits

# identify 10 clusters (which should correspond to digits) 
from sklearn import cluster

# Cluster the digits into 10 groups. A fixed random_state makes the centroid
# initialisation, and therefore the labels and the score below, repeatable
# across runs.
k_means = cluster.KMeans(n_clusters=10, random_state=0)
k_means.fit(digits.data)

# Every 50th true label next to the cluster id assigned to the same sample.
# Cluster ids are arbitrary, so only the grouping is comparable, not the number.
print('true	:', digits.target[::50])
print('kmeans:', k_means.labels_[::50])

# Adjusted Rand index between the true labels and the cluster assignment
# (displayed as the cell output).
metrics.adjusted_rand_score(digits.target, k_means.labels_)


In [None]:
# Cluster the digits with DBSCAN using fixed eps / min_samples.
dbscan = cluster.DBSCAN(min_samples=20, eps=24).fit(digits.data)

# Every 50th true label next to the DBSCAN label of the same sample.
true_sample = digits.target[::50]
dbscan_sample = dbscan.labels_[::50]
print('true	:', true_sample)
print('dbscan:', dbscan_sample)

# Adjusted Rand index between the true labels and the DBSCAN labels
# (displayed as the cell output).
metrics.adjusted_rand_score(digits.target, dbscan.labels_)


#task 4


In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Candidate neighbourhood sizes k = 1 .. 63.
neighbors = np.arange(1, 64)

# Fit one k-NN classifier per k on the training split and collect its
# (training accuracy, testing accuracy) pair.
scores = []
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    scores.append((knn.score(X_train, y_train), knn.score(X_test, y_test)))

# Split the collected pairs into one accuracy array per data split.
train_accuracy, test_accuracy = np.array(scores).T

# Plot both accuracy curves against k (testing curve first, as before).
plt.title('k-NN: Varying Number of Neighbors')
for curve, caption in ((test_accuracy, 'Testing Accuracy'),
                       (train_accuracy, 'Training Accuracy')):
    plt.plot(neighbors, curve, label=caption)
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()

#No, it is not possible to get the maximum accuracy at 64 dimensions.