**Imports and fetching data from the MNIST handwritten-digits dataset**

In [None]:
from keras.datasets import mnist
from numpy import mean
from numpy import std
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the MNIST train/test splits; each split is an (images, labels) pair.
train_split, test_split = mnist.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

**Flatten the images and standardise X_train to zero mean and unit variance (the same fitted scaler is then applied to X_test). PCA (Principal Component Analysis) is fitted on X_train to reduce the number of variables to a smaller set that is easier and faster to work with; whitening is used to make the input less redundant.**

In [None]:
# performing preprocessing part
# Step 1 - flatten: turn every 2-D image into one row vector so that the data
# becomes a (n_samples, n_pixels) matrix that scikit-learn estimators accept.
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

# Step 2 - standardise: the scaler learns the per-pixel mean/variance from the
# training split only; the test split is transformed with those same statistics.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Step 3 - PCA.
# n_components -> principal components used in dimensionality reduction
# whiten -> it is needed for some algorithms. If we are training on images, the raw input is redundant,
#           since adjacent pixel values are highly correlated. The goal of whitening is to make the
#           input less redundant.
# random_state -> fixed so that repeated runs give the same projection (and therefore the same
#           downstream scores) whenever scikit-learn picks a randomised SVD solver for this input.
n_components = 75
pca = PCA(n_components=n_components, whiten=True, random_state=0).fit(X_train)

# apply PCA transformation (fitted on the training split, applied to both splits)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

**A KNN (k-nearest neighbours) classifier is used, which gave an accuracy of approximately 0.95**

In [None]:
# Train a k-nearest-neighbours classifier (k = 6) on the PCA-projected training data.
# KNeighborsClassifier is already imported in the notebook's import cell.
print("Fitting the classifier to the training set")
clf = KNeighborsClassifier(n_neighbors=6)
clf.fit(X_train_pca, y_train)

# Author's note from an earlier experiment: GaussianNB was also tried, fitted on
# X_train and on X_train_pca. 1e-9 is the default variance (presumably GaussianNB's
# variance-smoothing setting); decreasing it decreased the accuracy.

Fitting the classifier to the training set


**Predicting on the test set and printing a classification report and an accuracy score**

In [None]:
# Predict labels for the PCA-projected test split with the fitted classifier.
y_pred = clf.predict(X_test_pca)

# Per-class precision / recall / f1 table, followed by the overall accuracy.
report = metrics.classification_report(y_test, y_pred)
print(report)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       980
           1       0.96      0.99      0.98      1135
           2       0.95      0.94      0.94      1032
           3       0.93      0.95      0.94      1010
           4       0.96      0.95      0.96       982
           5       0.94      0.94      0.94       892
           6       0.97      0.97      0.97       958
           7       0.95      0.94      0.95      1028
           8       0.95      0.92      0.93       974
           9       0.95      0.92      0.93      1009

    accuracy                           0.95     10000
   macro avg       0.95      0.95      0.95     10000
weighted avg       0.95      0.95      0.95     10000

Accuracy: 0.9513


**KFold cross-validation is run on the training data with 5 to 19 folds, using the same KNN classifier; the mean accuracy (and its standard deviation across folds) is printed for each number of folds**

In [None]:
# Cross-validate the whole chain (scaler -> PCA -> KNN) instead of only the classifier.
# X_train_pca was produced by a scaler and a PCA fitted on the *entire* training set, so
# scoring clf on it would let every validation fold influence its own pre-processing
# (data leakage). Bundling the steps makes cross_val_score re-fit all of them on the
# training part of each fold only.
# sc, pca and clf act purely as templates here: cross_val_score clones the estimator it
# is given, so the already-fitted objects are left untouched.
# X_train is the flattened, globally standardised matrix; the scaler step re-centres and
# re-scales it inside every fold.
cv_model = make_pipeline(sc, pca, clf)

# Try every fold count from 5 to 19 (consecutive, unshuffled folds).
for i in range(5, 20):
  cv = KFold(n_splits=i)
  scores = cross_val_score(cv_model, X_train, y_train, scoring='accuracy', cv=cv)
  # report performance: mean accuracy over the folds (standard deviation in parentheses)
  print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))


Accuracy: 0.948 (0.002)
Accuracy: 0.948 (0.004)
Accuracy: 0.949 (0.005)
Accuracy: 0.949 (0.005)
Accuracy: 0.949 (0.004)
Accuracy: 0.949 (0.005)
Accuracy: 0.949 (0.006)
Accuracy: 0.949 (0.007)
Accuracy: 0.949 (0.005)
Accuracy: 0.949 (0.005)
Accuracy: 0.950 (0.005)
Accuracy: 0.949 (0.006)
Accuracy: 0.950 (0.006)
Accuracy: 0.949 (0.006)
Accuracy: 0.950 (0.006)
