# Face Recognition
#### 11-785, Spring 2022, Homework 2 Part 2 (hw2p2-classification) Kaggle Challenge
#### https://www.kaggle.com/c/11-785-s22-hw2p2-classification/overview

### Made By: Gaurav Baweja

### Import Libraries

In [1]:
import os
import numpy as np
import pandas as pd 
import cv2
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import ZeroPadding2D, Convolution2D, MaxPooling2D, Dropout, Flatten, Activation
from tqdm import tqdm,trange,tqdm_notebook
from time import sleep
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score

### Fetch Data

In [2]:
#Class for loading images from direcotry
class DirData():
    def __init__(self, base, name, file):
        self.base = base
        # identity name
        self.name = name
        # image file name
        self.file = file

    def __repr__(self):
        return self.image_path()

    def image_path(self):
        return os.path.join(self.base, self.name, self.file) 


In [3]:
#Loading Train data from Classifiaction folder
source_dir=os.path.join('/kaggle/input/11-785-s22-hw2p2-classification/classification/classification/train')
classesDir = []
cnt = 0
for i in os.listdir(source_dir):
    if(cnt==100): #For testing purpose else comment this
        break
    cnt += 1
    for f in os.listdir(os.path.join(source_dir, i)):
        classesDir.append(DirData(source_dir, i, f))
classesDir = np.array(classesDir)

print('Classes Dir shape :', classesDir.shape)

Classes Dir shape : (2000,)


In [4]:
def load_image(path):
    img = cv2.imread(path, 1)
    # BGR to RGB
    return img[...,::-1]

### Load Model

In [5]:
def vgg_face():
    model = Sequential()
    model.add(ZeroPadding2D((1,1),input_shape=(224,224, 3)))
    model.add(Convolution2D(64, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))
    
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(128, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(128, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))
    
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))
    
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))
    
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))
    
    model.add(Convolution2D(4096, (7, 7), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Convolution2D(4096, (1, 1), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Convolution2D(2622, (1, 1)))
    model.add(Flatten())
    model.add(Activation('softmax'))
    return model

In [6]:
#Craeting CNN model and loading weights from VGG_face model as a trasfer learning
model = vgg_face()
model.load_weights('../input/vgg-face-weights/vgg_face_weights.h5')
vgg_face_descriptor = Model(inputs=model.layers[0].input, outputs=model.layers[-2].output)

2022-03-16 17:44:43.070599: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-16 17:44:43.169896: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-16 17:44:43.170639: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-16 17:44:43.172606: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

### Preprocessing & Training

In [7]:
#Encoding Images directory name to integer
targets = np.array([m.name for m in classesDir])
targets=[int(name[1:])for name in targets]

In [8]:
#Embedding the images in the model
embeddings = np.zeros((classesDir.shape[0], 2622))
for i, m in tqdm(enumerate(classesDir)):
    img_path = classesDir[i].image_path()
    img = load_image(img_path)
    img = (img / 255.).astype(np.float32)
    img = cv2.resize(img, dsize = (224,224))
    embedding_vector = vgg_face_descriptor.predict(np.expand_dims(img, axis=0))[0]
    embeddings[i]=embedding_vector

0it [00:00, ?it/s]2022-03-16 17:44:51.178481: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-03-16 17:44:52.113026: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005
2000it [03:06, 10.71it/s]


In [9]:
#Convering embedded output of images to data frame
df_train=pd.DataFrame(embeddings)
df_train['label']=targets
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2613,2614,2615,2616,2617,2618,2619,2620,2621,label
0,0.013631,0.006611,0.001769,0.006816,0.017156,0.021896,-0.002151,0.004057,-0.011011,-0.002412,...,0.002127,-0.014581,0.002920,0.018414,-0.026867,-0.004893,-0.032110,0.003280,-0.009990,8215
1,0.001027,0.009659,-0.012803,0.007168,0.014574,0.029039,0.021830,0.023140,0.004749,-0.002216,...,-0.012684,-0.007423,0.017242,0.021423,-0.022279,-0.003210,-0.029162,0.008088,0.021325,8215
2,0.022090,0.010806,-0.009623,0.006210,0.015311,0.027780,0.002337,0.012288,-0.006372,0.019720,...,-0.001646,-0.023101,0.020786,0.018775,-0.037134,-0.009516,-0.028309,0.005284,0.010383,8215
3,0.019124,0.004529,-0.013105,0.013217,0.017883,0.015584,0.006625,0.032401,-0.000201,0.023196,...,-0.009858,-0.006466,0.030965,0.020019,-0.024566,-0.010479,-0.017863,0.009475,0.019993,8215
4,0.010663,0.011862,-0.020526,0.014488,0.019959,-0.002666,-0.019209,0.016727,-0.013771,0.009205,...,-0.030093,-0.014066,0.022290,0.035661,-0.019932,-0.018376,0.016488,0.012769,0.022522,8215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.004430,-0.007365,0.000537,0.021709,0.004793,0.021291,-0.005495,0.005130,0.017735,-0.007102,...,0.005892,0.002860,0.008258,0.008365,0.000243,-0.018445,-0.014606,0.002733,-0.006109,1905
1996,0.012511,0.016096,-0.005551,0.007447,0.001542,0.009078,0.002903,0.006447,0.004962,0.003241,...,0.018543,0.001671,0.018054,0.000586,0.001234,-0.012925,-0.015562,0.014368,0.000974,1905
1997,0.009202,0.004181,0.010670,0.016573,0.009098,0.007065,0.005246,0.007817,0.026373,0.011137,...,-0.001456,-0.000383,0.000351,-0.005544,-0.014237,-0.026565,-0.007928,0.005718,0.004673,1905
1998,0.012931,-0.014421,0.009904,0.025936,0.004444,-0.000863,-0.005330,0.020063,0.018949,0.000314,...,-0.005652,-0.002921,0.003979,0.000119,-0.017577,-0.028429,-0.014860,0.000281,0.012611,1905


### Test-Train Split Data

In [10]:
#Splting dataframe to feature and labels
X = df_train.iloc[:,:-1]
y = df_train.iloc[:,-1]
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2612,2613,2614,2615,2616,2617,2618,2619,2620,2621
0,0.013631,0.006611,0.001769,0.006816,0.017156,0.021896,-0.002151,0.004057,-0.011011,-0.002412,...,-0.014792,0.002127,-0.014581,0.002920,0.018414,-0.026867,-0.004893,-0.032110,0.003280,-0.009990
1,0.001027,0.009659,-0.012803,0.007168,0.014574,0.029039,0.021830,0.023140,0.004749,-0.002216,...,-0.026003,-0.012684,-0.007423,0.017242,0.021423,-0.022279,-0.003210,-0.029162,0.008088,0.021325
2,0.022090,0.010806,-0.009623,0.006210,0.015311,0.027780,0.002337,0.012288,-0.006372,0.019720,...,-0.006503,-0.001646,-0.023101,0.020786,0.018775,-0.037134,-0.009516,-0.028309,0.005284,0.010383
3,0.019124,0.004529,-0.013105,0.013217,0.017883,0.015584,0.006625,0.032401,-0.000201,0.023196,...,-0.011800,-0.009858,-0.006466,0.030965,0.020019,-0.024566,-0.010479,-0.017863,0.009475,0.019993
4,0.010663,0.011862,-0.020526,0.014488,0.019959,-0.002666,-0.019209,0.016727,-0.013771,0.009205,...,-0.021748,-0.030093,-0.014066,0.022290,0.035661,-0.019932,-0.018376,0.016488,0.012769,0.022522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.004430,-0.007365,0.000537,0.021709,0.004793,0.021291,-0.005495,0.005130,0.017735,-0.007102,...,-0.012751,0.005892,0.002860,0.008258,0.008365,0.000243,-0.018445,-0.014606,0.002733,-0.006109
1996,0.012511,0.016096,-0.005551,0.007447,0.001542,0.009078,0.002903,0.006447,0.004962,0.003241,...,-0.009292,0.018543,0.001671,0.018054,0.000586,0.001234,-0.012925,-0.015562,0.014368,0.000974
1997,0.009202,0.004181,0.010670,0.016573,0.009098,0.007065,0.005246,0.007817,0.026373,0.011137,...,0.001841,-0.001456,-0.000383,0.000351,-0.005544,-0.014237,-0.026565,-0.007928,0.005718,0.004673
1998,0.012931,-0.014421,0.009904,0.025936,0.004444,-0.000863,-0.005330,0.020063,0.018949,0.000314,...,-0.004055,-0.005652,-0.002921,0.003979,0.000119,-0.017577,-0.028429,-0.014860,0.000281,0.012611


In [11]:
#Test-Train split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [12]:
print('X_train shape : ({0},{1})'.format(X_train.shape[0], X_train.shape[1]))
print('y_train shape : ({0},)'.format(y_train.shape[0]))
print('X_test shape : ({0},{1})'.format(X_test.shape[0], X_test.shape[1]))
print('y_test shape : ({0},)'.format(y_test.shape[0]))

X_train shape : (1800,2622)
y_train shape : (1800,)
X_test shape : (200,2622)
y_test shape : (200,)


In [13]:
#Scalling the input feature
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

### PCA Processing

In [14]:
#PCA Calculation
# Covariance matrix
cov_matrix = np.cov(X_train_std.T)

# Eigen values and vector
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)

# Cumulative variance explained
tot = sum(eig_vals)
var_exp = [(i /tot) * 100 for i in sorted(eig_vals, reverse = True)]
cum_var_exp = np.cumsum(var_exp)

print('Cumulative Variance Explained', cum_var_exp)

Cumulative Variance Explained [ 12.83202781+0.00000000e+00j  19.53975362+0.00000000e+00j
  25.14059773+0.00000000e+00j ... 100.        +0.00000000e+00j
 100.        +4.62439037e-17j 100.        +0.00000000e+00j]


In [15]:
# Get index where cumulative variance explained is > threshold
thres = 95
res = list(filter(lambda i: i > thres, cum_var_exp))[0]
index = (cum_var_exp.tolist().index(res))

print(f'Index of element just greater than {thres}: {str(index)}')

Index of element just greater than 95: 301


In [16]:
#Implementing PCA to reduce dimentions
pca = PCA(n_components=index)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

### SVM Classifier

In [17]:
#Training the data using SVM classfier with tuneable Hyper-paramenters
clf = SVC(C=1., gamma=0.0001,kernel = 'rbf', class_weight = 'balanced')
clf.fit(X_train_pca, y_train)

SVC(class_weight='balanced', gamma=0.0001)

In [18]:
#Predecting the output
y_predict = clf.predict(X_test_pca)

In [19]:
#Accuracy of model
print("Model Accuracy: ", accuracy_score(y_test, y_predict)*100, "%")

Model Accuracy:  90.5 %


In [20]:
#Classification Report
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

          17       1.00      1.00      1.00         3
          54       1.00      1.00      1.00         5
         137       1.00      1.00      1.00         2
         208       1.00      1.00      1.00         4
         319       1.00      1.00      1.00         3
         463       1.00      0.50      0.67         2
         570       1.00      0.33      0.50         3
         585       0.67      1.00      0.80         2
         615       1.00      1.00      1.00         1
         641       0.75      1.00      0.86         3
         681       1.00      0.50      0.67         2
         752       0.00      0.00      0.00         1
         962       1.00      1.00      1.00         2
         985       1.00      0.67      0.80         3
        1054       1.00      1.00      1.00         2
        1149       1.00      0.80      0.89         5
        1158       0.33      1.00      0.50         1
        1470       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Final Test Data Processing

In [21]:
#Test Data Directory
test_dir=os.path.join('/kaggle/input/11-785-s22-hw2p2-classification/classification/classification/test')

In [22]:
#Class for loading Test images from direcotry
class TestdataDir():
    def __init__(self, base, file):
        self.base = base
        self.file = file

    def __repr__(self):
        return self.image_path()

    def image_path(self):
        return os.path.join(self.base, self.file) 
    

In [23]:
#Loading Test data from Classifiaction folder
test_data = []
k = 0
for i in os.listdir(test_dir):
     k+=1
     if k == 500: #For testing purpose else comment this
      break
     test_data.append(TestdataDir(test_dir, i))
test_data = np.array(test_data)

In [24]:
#Embedding the Test images in the model
embeddings_test = np.zeros((test_data.shape[0], 2622))
for i, m in tqdm(enumerate(test_data)):
    img_path = test_data[i].image_path()
    img = load_image(img_path)
    img = (img / 255.).astype(np.float32)
    img = cv2.resize(img, dsize = (224,224))
    embedding_vector = vgg_face_descriptor.predict(np.expand_dims(img, axis=0))[0]
    embeddings_test[i]=embedding_vector

499it [00:44, 11.15it/s]


In [25]:
#Preprocessing the test images adn predicting output
res_test = scaler.transform(embeddings_test)
res_test_pca = pca.fit_transform(res_test)
res_predict = clf.predict(res_test_pca)

In [26]:
#Read the sample output format
result = pd.read_csv('../input/11-785-s22-hw2p2-classification/classification_sample_submission.csv')

In [27]:
#Storeing the output in dataframe
for i in range(len(res_predict)):
    result.iloc[i,1]=res_predict[i]

In [28]:
#Exporting the result in for submission in csv file
result.to_csv("submission.csv",index=False)
print("Result Exported!")

Result Exported!
