**Dataset Link:** https://www.kaggle.com/datasets/sushilyadav1998/bollywood-celeb-localized-face-dataset

In [None]:
# Install these packages (uncomment and run once)

# !pip install mtcnn==0.1.0
# !pip install tensorflow==2.3.1
# !pip install keras==2.4.3
# !pip install keras-vggface==0.6
# !pip install keras_applications==1.0.8

## Fetch the dataset from Kaggle

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [None]:
# Download the dataset archive using the API command copied from its Kaggle page

!kaggle datasets download -d sushilyadav1998/bollywood-celeb-localized-face-dataset

Downloading bollywood-celeb-localized-face-dataset.zip to /content
 98% 27.0M/27.6M [00:01<00:00, 24.1MB/s]
100% 27.6M/27.6M [00:01<00:00, 16.0MB/s]


In [None]:
# Unzip the downloaded dataset archive into /content

import zipfile

# The context manager closes the archive even if extraction fails part-way.
with zipfile.ZipFile(file="/content/bollywood-celeb-localized-face-dataset.zip", mode='r') as zip_ref:
    zip_ref.extractall(path='/content')

## Import the necessary libraries

In [None]:
import os
import pickle
from tqdm import tqdm

import numpy as np
from tensorflow.keras.preprocessing import image
from keras_vggface.vggface import VGGFace
from keras_vggface.utils import preprocess_input

In [None]:
# Sanity check: list the top-level folders inside the extracted dataset directory

os.listdir(path="/content/Bollywood_celeb_face_localized")

['bollywood_celeb_faces_0',
 'bollywood_celeb_faces2',
 'bollywood_celeb_faces_1']

In [None]:
# Collect the path of every image file in the extracted dataset.
# The directory is walked three levels deep:
#   <main_path>/<dataset part>/<celebrity folder>/<image file>
# This traversal must actually run: if it is left commented out,
# `filename_path` stays empty and the cells below would pickle an empty list.

filename_path = []

main_path = r"/content/Bollywood_celeb_face_localized"

for part_dir in os.listdir(path=main_path):
    for celeb_dir in os.listdir(path=os.path.join(main_path, part_dir)):
        for image_name in os.listdir(path=os.path.join(main_path, part_dir, celeb_dir)):
            filename_path.append(os.path.join(main_path, part_dir, celeb_dir, image_name))

In [None]:
# Number of image file paths collected in filename_path

len(filename_path)

8664

In [None]:
# Cache the collected file paths in a pickle file so the directory traversal
# does not have to be repeated in later sessions.

drive_path = "/content/drive/MyDrive/Deep Learning/Supervised Learning/CNN With Keras/Projects/Which Bollywood Celebrity Are You/Files"

# `with` guarantees the file is flushed and closed once the dump finishes.
with open(file=f"{drive_path}/filename_paths.pkl", mode="wb") as paths_file:
    pickle.dump(obj=filename_path, file=paths_file)

In [None]:
# Load the cached file paths back from the pickle file.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# pickle files that you created yourself.

path = "/content/drive/MyDrive/Deep Learning/Supervised Learning/CNN With Keras/Projects/Which Bollywood Celebrity Are You/Files/filename_paths.pkl"

# `with` closes the file handle as soon as the data has been read.
with open(file=path, mode="rb") as paths_file:
    myfiles_path = pickle.load(file=paths_file)


In [None]:
# Show the first path loaded from the pickle file
myfiles_path[0]

'/content/Bollywood_celeb_face_localized/bollywood_celeb_faces_0/Arjun_Kapoor/Arjun_Kapoor.88.jpg'

## Create our Model

In [None]:
# Build the network used as a feature extractor.
#
# VGGFace's default architecture is "vgg16"; "resnet50" is selected
# explicitly here.
#
# pooling="avg" means that global average pooling
# will be applied to the output of the
# last convolutional layer, and thus
# the output of the model will be a 2D tensor
# (one row of features per input image).

model = VGGFace(include_top=False, model="resnet50", input_shape=(224,224,3), pooling="avg")

model.summary()

Downloading data from https://github.com/rcmalli/keras-vggface/releases/download/v2.0/rcmalli_vggface_tf_notop_resnet50.h5
Model: "vggface_resnet50"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
conv1/7x7_s2 (Conv2D)           (None, 112, 112, 64) 9408        input_1[0][0]                    
__________________________________________________________________________________________________
conv1/7x7_s2/bn (BatchNormaliza (None, 112, 112, 64) 256         conv1/7x7_s2[0][0]               
__________________________________________________________________________________________________
activation (Activation)         (None, 112, 112, 64) 0     

In [None]:
# Create a feature extractor function

def feature_extractor(img_path, model, version=2):
    """Return the feature vector that `model` produces for one image file.

    Parameters
    ----------
    img_path : str
        Path of the image file to load.
    model :
        Model whose ``predict`` output is used as the feature vector. It is
        fed a single image resized to 224x224.
    version : int, default 2
        Forwarded to ``keras_vggface.utils.preprocess_input``. Per the
        keras-vggface documentation, use 1 for the VGG16 model and 2 for
        RESNET50 / SENET50. The previous code relied on the library default
        (1), which does not match the ResNet50 model built in this notebook.
        Any code that later compares new images against the stored features
        must preprocess them with the same version.

    Returns
    -------
    numpy.ndarray
        The flattened (1-D) prediction for the image.
    """
    img = image.load_img(path=img_path, target_size=(224,224))
    # Convert the loaded image into an array
    img_array = image.img_to_array(img=img)
    # The model expects a batch dimension, so wrap the image as a batch of one
    expanded_img_array = np.expand_dims(img_array, axis=0)
    # Apply the preprocessing that matches the chosen VGGFace architecture
    preprocessed_img = preprocess_input(x=expanded_img_array, version=version)
    # Predict and flatten the batch-of-one output into a 1-D vector
    return model.predict(x=preprocessed_img).flatten()

In [None]:
# Run every image path through the extractor, collecting one feature vector
# per image; tqdm shows progress over the list of paths.
all_features = [
    feature_extractor(img_path=img_file, model=model)
    for img_file in tqdm(myfiles_path)
]

100%|██████████| 8664/8664 [31:53<00:00,  4.53it/s]


In [None]:
# Number of feature vectors extracted (one per image path processed above)
len(all_features)

8664

In [None]:
# all_features is a list of 1-D arrays; display the first one

all_features[0]

array([0.        , 0.        , 1.8680837 , ..., 0.        , 1.2708895 ,
       0.10167735], dtype=float32)

In [None]:
# Each image is represented by a feature vector of length 2048

all_features[0].shape

(2048,)

In [None]:
# Persist the extracted feature vectors so the long extraction run does not
# have to be repeated. `with` guarantees the file is flushed and closed.
with open(file=f"{drive_path}/all_features.pkl", mode="wb") as features_file:
    pickle.dump(obj=all_features, file=features_file)