**Imports and fetching the data from the LFW (Labeled Faces in the Wild) dataset**

In [None]:
from numpy import mean
from numpy import std
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# Load the Labeled Faces in the Wild people dataset, keeping only the people
# that have at least 70 pictures (scikit-learn downloads it on first use).
dataSet = datasets.fetch_lfw_people(min_faces_per_person=70)
# Alternative dataset tried during development:
# dataSet = datasets.load_digits()

**Printing target**

In [None]:
# Integer class label of every image; each value is an index into dataSet.target_names
print(dataSet.target)

[5 6 3 ... 5 3 5]


**Printing target names**

In [None]:
# Names of the people in the dataset, in label order
print(dataSet.target_names)

['Ariel Sharon' 'Colin Powell' 'Donald Rumsfeld' 'George W Bush'
 'Gerhard Schroeder' 'Hugo Chavez' 'Tony Blair']


**Printing the shape of data**

In [None]:
# (number of images, number of pixel features per image)
print(dataSet.data.shape)

(1288, 2914)


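**X holds the image data (one flattened pixel vector per image) and y holds the target (the index of the person shown in the image, see `target_names`); the data is then split into a training set (80%) and a test set (20%)**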

In [None]:
# `dataSet.data` already holds one flattened pixel vector per image
# (shape (1288, 2914) above), so it can be used as the feature matrix directly.
X = dataSet.data
y = dataSet.target  # integer label of the person shown in each image
target_names = dataSet.target_names

# Hold out 20% of the images for testing.
# - random_state fixes the shuffle, so the split (and every score computed from it)
#   is the same on each Restart & Run All.
# - stratify=y keeps the share of each person equal in both parts; the classes are
#   strongly imbalanced, so a plain random split can leave some people with very
#   few test images.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

**Standardise X_train to zero mean and unit variance with StandardScaler (fitted on the training data and then applied to X_test as well), then fit PCA (Principal Component Analysis) on X_train to reduce the number of variables to a smaller number of components, which is easier and faster to work with; `whiten` is used to make the input less redundant**

In [None]:
# performing preprocessing part
from sklearn.preprocessing import StandardScaler

# Standardise every pixel feature to zero mean and unit variance.
# The scaler is fitted on the training split only and then reused on the test split,
# so no statistics of the test images influence the preprocessing.
# NOTE: X_train and X_test are overwritten with their standardised versions here.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Compute a PCA
# n_components -> number of principal components kept by the dimensionality reduction
n_components = 75
# whiten -> rescales the projected components to unit variance. With images the raw
# input is redundant because adjacent pixel values are highly correlated; whitening
# makes the input less redundant, which some algorithms need.
# random_state -> for a matrix of this size the default solver may be the randomized
# SVD; fixing the seed makes the fitted components reproducible between runs.
pca = PCA(n_components=n_components, whiten=True, random_state=42).fit(X_train)

# apply PCA transformation (fitted on the training split only) to both splits
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

**Gaussian Naive Bayes is used, which gave an accuracy between 0.7 and 0.8**

In [None]:
from sklearn.naive_bayes import GaussianNB

print("Fitting the classifier to the training set")

# Gaussian Naive Bayes on the PCA features of the training split.
# var_smoothing is left at its default (1e-9); GaussianNB(var_smoothing=2e-5) and
# KNeighborsClassifier(n_neighbors=7) were the alternatives tried during development.
clf = GaussianNB()
clf.fit(X_train_pca, y_train)

Fitting the classifier to the training set


**Predicting on the test set and printing a classification report and an accuracy score**

In [None]:
# Predict the person for every held-out image from its PCA features
y_pred = clf.predict(X_test_pca)

# Per-person precision / recall / f1, followed by the overall accuracy
test_report = metrics.classification_report(y_test, y_pred, target_names=target_names)
print(test_report)

test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

                   precision    recall  f1-score   support

     Ariel Sharon       0.56      0.36      0.43        14
     Colin Powell       0.90      0.76      0.83        50
  Donald Rumsfeld       0.65      0.76      0.70        17
    George W Bush       0.76      0.94      0.84       109
Gerhard Schroeder       0.83      0.60      0.70        25
      Hugo Chavez       0.77      0.77      0.77        13
       Tony Blair       0.90      0.60      0.72        30

         accuracy                           0.78       258
        macro avg       0.77      0.69      0.71       258
     weighted avg       0.79      0.78      0.78       258

Accuracy: 0.7829457364341085


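**K-fold cross-validation of the Naive Bayes model on the training data: for every k from 5 to 19 the training set is split into k folds, each fold is used once for validation while the model is trained on the remaining folds, and the mean accuracy is printed with its standard deviation in parentheses**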

In [None]:
# Local imports, matching the per-cell import style used in this notebook.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scaling and PCA are part of the cross-validated model, so in every fold they are
# fitted on that fold's training part only. Cross-validating on X_train_pca instead
# would score each validation fold with a scaler/PCA that had already seen it
# (data leakage), which makes the estimate optimistic.
# cross_val_score clones the estimator for every fold, so the already fitted `clf`
# only serves as a template here and is left untouched.
cv_model = make_pipeline(
    StandardScaler(),
    PCA(n_components=n_components, whiten=True, random_state=42),
    clf,
)

# Try every number of folds from 5 to 19; X_train holds the pixel features (before PCA).
for i in range(5, 20):
  cv = KFold(n_splits=i)
  scores = cross_val_score(cv_model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
  # report performance: mean accuracy over the folds (standard deviation in parentheses)
  print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))


Accuracy: 0.736 (0.010)
Accuracy: 0.745 (0.007)
Accuracy: 0.746 (0.033)
Accuracy: 0.758 (0.023)
Accuracy: 0.755 (0.029)
Accuracy: 0.758 (0.035)
Accuracy: 0.759 (0.034)
Accuracy: 0.763 (0.027)
Accuracy: 0.762 (0.033)
Accuracy: 0.765 (0.050)
Accuracy: 0.766 (0.044)
Accuracy: 0.761 (0.046)
Accuracy: 0.767 (0.045)
Accuracy: 0.767 (0.038)
Accuracy: 0.761 (0.049)
