In [1]:
import os
import sys
import cv2
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook

# Library for mobilenet
from tensorflow.keras.applications.mobilenet import MobileNet, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras import layers
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.models import Model

In [2]:
# Project root that holds the pickled inputs and outputs used by this notebook.
# Set the PROJECT_ROOT environment variable to run on another machine; the
# original location is kept as the fallback so existing runs behave the same.
#root = os.getcwd()
root = os.environ.get("PROJECT_ROOT") or r"D:\adham-till-code"

In [11]:
# Read the table of image paths (indexed by image_id) stored under the project root.
# NOTE: loading a pickle can execute arbitrary code; only read files you trust.
file_path_pickle = os.path.join(root, 'file_path.pkl')
data = pd.read_pickle(file_path_pickle)
data.head()

Unnamed: 0_level_0,file_path
image_id,Unnamed: 1_level_1
0000000_,\images\coco2017_train\train2017\000000147328.jpg
0000001_,\images\coco2017_train\train2017\000000414738.jpg
0000002_,\images\coco2017_train\train2017\000000281563.jpg
0000003_,\images\coco2017_train\train2017\000000063879.jpg
0000004_,\images\coco2017_train\train2017\000000531349.jpg


In [12]:
# number of rows (one per image path) in the table
len(data)

214005

In [13]:
def get_path_from_drive(path, pc="windows", drive="None",):
    r"""
    Prefix a drive-relative path with the drive (Windows) or volume (macOS) it lives on.

    Parameters
    ----------
    path : str or pandas.Series of str
        Path relative to the drive root, e.g. ``\images\a.jpg``. The prefix is
        attached with plain ``+`` concatenation, so a Series is prefixed
        element-wise.
    pc : str
        Either ``"windows"`` or ``"apple"``.
    drive : str
        Drive letter including the colon (``"D:"``) on Windows, or the volume
        name on macOS. The string ``"None"`` (the default), ``None`` and ``""``
        all mean "no drive given".

    Returns
    -------
    str or pandas.Series
        ``drive + path`` for Windows, ``"/Volumes/" + drive + path`` for macOS.

    Raises
    ------
    ValueError
        If ``pc`` is not a supported platform or no drive was given. Earlier
        versions returned the message as a normal string, which silently ended
        up in the path column; raising makes the mistake visible immediately.
    """
    if pc not in ("windows", "apple"):
        raise ValueError("Please choose correct PC. Either windows or apple")

    # "None" is the historical sentinel for a missing drive; also accept None and "".
    if drive is None or drive in ("None", ""):
        if pc == "windows":
            raise ValueError("Please check your drive path again")
        raise ValueError("Please check your drive name")

    prefix = drive if pc == "windows" else "/Volumes/" + drive
    return prefix + path

In [14]:
# Prefix only the rows that do not already start with the drive, so re-running
# this cell cannot produce doubled prefixes such as "D:D:\images\...".
DRIVE = "D:"
needs_drive = ~data["file_path"].str.startswith(DRIVE, na=False)
data.loc[needs_drive, "file_path"] = get_path_from_drive(
    data.loc[needs_drive, "file_path"], pc="windows", drive=DRIVE
)

In [15]:
# confirm the drive prefix now appears in file_path
data.head()

Unnamed: 0_level_0,file_path
image_id,Unnamed: 1_level_1
0000000_,D:\images\coco2017_train\train2017\00000014732...
0000001_,D:\images\coco2017_train\train2017\00000041473...
0000002_,D:\images\coco2017_train\train2017\00000028156...
0000003_,D:\images\coco2017_train\train2017\00000006387...
0000004_,D:\images\coco2017_train\train2017\00000053134...


### MobileNet

In [16]:
# Pre-trained MobileNet without its classification head (include_top=False),
# with ImageNet weights and a fixed 224x224 RGB input.
base_model = MobileNet(
    input_shape=(224, 224, 3),
    include_top=False,
    weights='imagenet',
)

# Global average pooling over the base model's output; this pooled tensor is
# what the wrapped model returns for each image.
x = base_model.output
embeddings = GlobalAveragePooling2D()(x)

model = Model(inputs=base_model.input, outputs=embeddings)

## Strategy for processing the images in batches during embedding extraction

In [17]:
def index_generator(dataframe, batch_size=100):
    """
    Yield the index labels of ``dataframe`` in consecutive chunks.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose ``index`` is split into batches; the frame is not modified.
    batch_size : int
        Maximum number of labels per chunk. Must be positive.

    Yields
    ------
    pandas.Index
        Slices of ``dataframe.index`` of length ``batch_size``; the last slice
        holds the remaining labels and may be shorter. Nothing is yielded for
        an empty frame.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive (a zero or negative step would never
        reach the end of the index). Because this is a generator, the error is
        raised when iteration starts.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total_rows = len(dataframe)
    for start in range(0, total_rows, batch_size):
        # Slicing past the end is clipped, so the final chunk is simply shorter.
        yield dataframe.index[start:start + batch_size]

In [18]:
# Number of images pushed through the network per predict call.
BATCH_SIZE = 100

# load the generator to load every batch of images
batch_gen = index_generator(data, batch_size=BATCH_SIZE) #data[:5000]

# Ceiling division so the progress bar gets an integer batch count
# (len(data)/100 produced a float total such as 50.0).
n_batches = (len(data) + BATCH_SIZE - 1) // BATCH_SIZE

# One embedding array per batch; joined into a single array after the loop.
batch_embedding_list = []

# Process images in batches
for batch in tqdm_notebook(batch_gen, desc='get embeddings', total=n_batches):
    batch_images = []

    # Look up all paths of the batch at once instead of one .loc call per image.
    for image_path in data.loc[batch, 'file_path']:
        # Load the image at the network's input size and convert it to an array
        img = image.load_img(image_path, target_size=(224, 224))
        batch_images.append(image.img_to_array(img))

    # Stack into (n_images, 224, 224, 3) and apply the MobileNet preprocessing
    batch_images = preprocess_input(np.stack(batch_images, axis=0))

    # Extract the embeddings for the batch; verbose=0 keeps Keras from printing
    # its own progress bar for every batch underneath the tqdm bar.
    batch_embeddings = model.predict(batch_images, verbose=0)
    # Flatten to one row per image (keeps the shape 2-D whatever the model returns)
    batch_embeddings = batch_embeddings.reshape(batch_embeddings.shape[0], -1)

    batch_embedding_list.append(batch_embeddings)

# Concatenate embeddings from all batches
embeddings = np.concatenate(batch_embedding_list, axis=0)

get embeddings:   0%|          | 0/50.0 [00:00<?, ?it/s]



In [25]:
# sanity-check the value range of the embeddings array
# (the recorded run below reported a minimum of 0.0 and a maximum of 6.0)
embeddings.min(), embeddings.max()

(0.0, 6.0)

In [26]:
# check the array shape: (number of images, embedding width);
# the recorded run below shows 1024 columns
embeddings.shape

(46901, 1024)

In [27]:
# save the results as dataframe
# There must be exactly one embedding row per row of `data`; fail with a clear
# message if the two have drifted apart (e.g. `data` was re-sliced afterwards).
assert len(embeddings) == len(data), (
    f"{len(embeddings)} embeddings but {len(data)} rows in data"
)
# Build the frame directly on the image ids (index named 'id') instead of
# copying the array and adding/removing a temporary 'id' column.
df = pd.DataFrame(embeddings, index=data.index.rename('id'))
df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0167104_,1.440818,0.557482,0.651697,0.096436,0.749209,0.204965,1.126261,0.141098,1.709153,0.693212,...,0.924774,0.40203,0.629059,0.140267,0.499459,0.0,1.09227,3.707262,0.158702,0.725723
0167105_,0.529445,0.970562,0.223171,0.181106,0.09062,0.415827,0.548535,0.051425,1.604833,0.599037,...,0.262144,0.017002,0.079247,0.079368,0.103532,0.010405,1.679781,1.912656,0.123663,0.1995
0167106_,0.009057,0.371146,0.777233,0.710081,0.067593,0.307698,1.848584,0.026385,0.805457,1.512421,...,1.042889,0.178006,0.034477,0.472321,0.055696,0.0,0.349344,1.529537,0.556076,0.433715
0167107_,0.64254,1.120525,1.37955,0.417988,1.190873,0.280976,0.710613,0.039459,0.0,0.698568,...,0.02178,0.0,0.988695,0.054276,0.241893,0.0,0.822231,1.861334,0.038657,0.125096
0167108_,1.747567,0.222893,0.822371,0.704125,0.034584,0.238657,3.011187,0.042029,0.029861,1.424665,...,0.624335,0.150882,1.139399,1.029908,2.666231,0.071277,0.278832,1.260383,0.799217,1.090482


In [28]:
# rows x embedding columns of the frame that is about to be saved
df.shape 

(46901, 1024)

In [29]:
# switch the working directory to the project root
os.chdir(root)
# earlier save target, kept disabled; note this writes a pickle, not a CSV
#df.to_pickle('embeddings.pkl')

In [30]:
# write the new embeddings to their own file under the project root,
# separate from embeddings.pkl
df.to_pickle(os.path.join(root, 'extra_embeddings.pkl'))

In [34]:
# load the embeddings stored in embeddings.pkl and show their shape
# NOTE: loading a pickle can execute arbitrary code; only read files you trust
old_embeddings = pd.read_pickle(os.path.join(root, 'embeddings.pkl'))
old_embeddings.shape

(214005, 1024)

In [32]:
#pd.concat([old_embeddings, df])

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000000_,1.642078,0.000000,0.743942,0.234420,0.952527,0.116200,1.152540,0.809756,0.051322,0.098664,...,1.006713,0.394465,0.862353,0.153266,0.378229,1.600909,0.144255,0.000000,0.456547,0.318457
0000001_,0.237834,0.743523,1.569128,0.230996,2.245258,0.542455,0.055316,0.201157,0.553351,0.213024,...,0.024717,0.111527,0.622623,0.046947,0.000000,0.000000,0.366749,1.473074,1.539011,1.055535
0000002_,0.965902,1.545597,0.072742,0.000000,0.775761,0.031038,0.260257,0.156828,0.068140,0.194489,...,0.654415,0.473969,3.427306,0.000000,0.305507,0.078855,0.722235,1.329341,0.106226,0.865138
0000003_,0.000000,0.143117,0.109090,0.000000,1.323703,0.000000,0.015345,2.345302,0.021467,2.062124,...,0.151082,0.000000,0.079314,0.457159,0.567558,0.006322,0.000000,1.220526,0.156362,1.563209
0000004_,0.000000,1.236826,0.553544,0.548288,0.167068,0.000000,1.144588,0.106205,0.046213,0.000000,...,0.047160,0.129139,0.075926,0.261244,0.000000,3.620260,0.000000,0.424422,2.139266,0.140564
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0214000_,0.045783,0.078676,0.123697,0.227050,0.000000,0.000000,1.030815,0.877860,1.087611,0.643945,...,0.459925,0.000000,0.000000,0.330818,0.193070,0.000000,0.634940,0.971530,0.028711,0.126134
0214001_,0.064208,2.356112,0.353290,1.411432,0.167538,0.586643,0.608950,0.000000,0.390384,2.591943,...,1.890183,0.151756,0.411211,0.742950,0.454994,0.000000,1.242854,1.177235,0.396452,0.297668
0214002_,0.346088,0.951804,1.837962,1.425539,0.377529,0.015019,0.613205,0.251669,0.459700,1.113725,...,1.246030,0.090693,0.409448,0.167805,0.662108,0.000000,0.317693,3.697030,0.017683,0.109411
0214003_,0.083929,0.866478,0.217225,0.000000,0.143375,0.036708,1.291276,0.142972,0.115150,3.068969,...,0.083625,0.306661,0.043193,0.989364,0.104672,0.000000,1.817742,0.911448,0.045229,0.030999


In [33]:
# WARNING: this overwrites embeddings.pkl. Only run it once you have confirmed that the old and new embeddings should be saved together.
#concat_df = pd.concat([old_embeddings, df])
#concat_df.to_pickle(os.path.join(root, 'embeddings.pkl'))