In [1]:
import os
import sys
import cv2
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook

In [2]:
# Remember the directory the notebook was started in; the final cell changes
# back to it before writing the embedding files.
root = os.getcwd()

In [16]:
# Load the image_id -> file_path lookup table, indexed by image_id.
# As the preview shows, the stored paths start at "\Users\..." without a drive
# letter; the drive is prepended in a later cell.
data = pd.read_csv('file_path.csv', index_col='image_id')
data.head()

Unnamed: 0_level_0,file_path
image_id,Unnamed: 1_level_1
0000000_,\Users\Asus\Desktop\Project\ImageRecommender\w...
0000001_,\Users\Asus\Desktop\Project\ImageRecommender\w...
0000002_,\Users\Asus\Desktop\Project\ImageRecommender\w...
0000003_,\Users\Asus\Desktop\Project\ImageRecommender\w...
0000004_,\Users\Asus\Desktop\Project\ImageRecommender\w...


In [4]:
# Number of rows (one per image) in the lookup table.
len(data)

31940

In [17]:
def get_path_from_drive(path, pc="windows", drive="None",):
    """
    Prefix a drive-less path with the drive it lives on (Windows or macOS).

    Parameters
    ----------
    path : str or pandas.Series of str
        Path without its drive part. Anything that supports ``str + path``
        works, so a whole DataFrame column can be converted in one call.
    pc : str
        Target platform: ``"windows"`` or ``"apple"``.
    drive : str
        On Windows, the drive designator that is prepended as-is
        (e.g. ``"C:"``). On macOS, the volume name, which is placed under
        ``/Volumes/``. The default ``"None"`` (or a real ``None``) means
        "not supplied".

    Returns
    -------
    Same type as ``path``
        ``drive + path`` on Windows, ``"/Volumes/" + drive + path`` on macOS.

    Raises
    ------
    ValueError
        If ``pc`` is not a supported platform or no drive was supplied.
        Raising (rather than returning the message) prevents the error text
        from being mistaken for a path, e.g. silently broadcast into every
        row of a DataFrame column.
    """
    drive_missing = drive is None or drive == "None"

    if pc == "windows":
        if drive_missing:
            raise ValueError("Please check your drive path again")
        return drive + path

    if pc == "apple":
        if drive_missing:
            raise ValueError("Please check your drive name")
        return "/Volumes/" + drive + path

    raise ValueError("Please choose correct PC. Either windows or apple")

In [19]:
# Prefix every stored path with the Windows drive letter.
# The guard keeps this cell idempotent: the result is written back into the
# same column, so without it a second run would turn "C:\..." into "C:C:\...".
DRIVE = "C:"
if not data["file_path"].str.startswith(DRIVE).all():
    data["file_path"] = get_path_from_drive(data["file_path"], pc="windows", drive=DRIVE)

In [20]:
# Preview the table again to confirm the drive prefix was applied.
data.head()

Unnamed: 0_level_0,file_path
image_id,Unnamed: 1_level_1
0000000_,C:\Users\Asus\Desktop\Project\ImageRecommender...
0000001_,C:\Users\Asus\Desktop\Project\ImageRecommender...
0000002_,C:\Users\Asus\Desktop\Project\ImageRecommender...
0000003_,C:\Users\Asus\Desktop\Project\ImageRecommender...
0000004_,C:\Users\Asus\Desktop\Project\ImageRecommender...


### MobileNet

In [5]:
import numpy as np
from tensorflow.keras.applications.mobilenet import MobileNet, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras import layers
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.models import Model

# Load MobileNet pre-trained on ImageNet without its classification head
# (include_top=False), so the network ends at the last convolutional block.
base_model = MobileNet(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Average each feature map over its spatial dimensions to get one flat feature
# vector per image. The tensor gets its own name so it is not confused with
# the `embeddings` array that a later cell computes.
pooled_features = layers.GlobalAveragePooling2D()(base_model.output)

# Feature extractor: image batch in, pooled feature vectors out.
model = Model(inputs=base_model.input, outputs=pooled_features)

In [6]:
# Print the layer-by-layer architecture of the feature extractor.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 conv1 (Conv2D)              (None, 112, 112, 32)      864       
                                                                 
 conv1_bn (BatchNormalizatio  (None, 112, 112, 32)     128       
 n)                                                              
                                                                 
 conv1_relu (ReLU)           (None, 112, 112, 32)      0         
                                                                 
 conv_dw_1 (DepthwiseConv2D)  (None, 112, 112, 32)     288       
                                                                 
 conv_dw_1_bn (BatchNormaliz  (None, 112, 112, 32)     128       
 ation)                                                      

                                                                 
 conv_pw_8_bn (BatchNormaliz  (None, 14, 14, 512)      2048      
 ation)                                                          
                                                                 
 conv_pw_8_relu (ReLU)       (None, 14, 14, 512)       0         
                                                                 
 conv_dw_9 (DepthwiseConv2D)  (None, 14, 14, 512)      4608      
                                                                 
 conv_dw_9_bn (BatchNormaliz  (None, 14, 14, 512)      2048      
 ation)                                                          
                                                                 
 conv_dw_9_relu (ReLU)       (None, 14, 14, 512)       0         
                                                                 
 conv_pw_9 (Conv2D)          (None, 14, 14, 512)       262144    
                                                                 
 conv_pw_9

## Strategy: process the images in batches when extracting embeddings

In [7]:
# Show the current working directory; relative paths resolve against it.
os.getcwd()

'C:\\Users\\Asus\\Desktop\\Project\\ImageRecommender'

In [21]:
def index_generator(dataframe, batch_size=100):
    """
    Iterate over a DataFrame's index in consecutive batches.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose index labels are to be batched.
    batch_size : int
        Maximum number of labels per batch; must be at least 1.

    Returns
    -------
    generator of pandas.Index
        Consecutive slices of ``dataframe.index``. Every slice holds
        ``batch_size`` labels except possibly the last, which holds the
        remainder. An empty frame yields nothing.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1. A zero or negative step would
        never reach the end of the index and loop forever.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    total_rows = len(dataframe)
    # Slicing clamps at the end of the index, so the final, shorter batch
    # needs no special handling.
    return (
        dataframe.index[start:start + batch_size]
        for start in range(0, total_rows, batch_size)
    )

In [23]:
# Number of images pushed through the network per forward pass. The same value
# drives the batch generator and the progress-bar total so they cannot drift.
BATCH_SIZE = 100

# Spatial size the network expects, read from the model itself instead of
# being repeated as a literal.
target_size = model.input_shape[1:3]

# Generator yielding one slice of `data.index` per batch.
batch_gen = index_generator(data, batch_size=BATCH_SIZE)

# tqdm needs an integer total; plain division gives a fractional batch count
# (e.g. 319.4), so use ceil-division to count the final partial batch too.
n_batches = -(-len(data) // BATCH_SIZE)

# Per-batch feature arrays; concatenated once after the loop.
embedding_batches = []

# Process images in batches
for batch in tqdm_notebook(batch_gen, desc='get embeddings', total=n_batches):
    batch_images = []

    # One label-based lookup per batch instead of one per image.
    for image_path in data.loc[batch, "file_path"]:
        # Load the image at the network's input size and preprocess it.
        img = image.load_img(image_path, target_size=target_size)
        pixels = np.expand_dims(image.img_to_array(img), axis=0)
        batch_images.append(preprocess_input(pixels))

    # Stack the single-image arrays into one (batch, height, width, channels) array.
    batch_images = np.concatenate(batch_images, axis=0)

    # Extract the embeddings for the batch. verbose=0 keeps Keras from
    # printing its own progress output on every call; tqdm reports progress.
    batch_embeddings = model.predict(batch_images, verbose=0)
    batch_embeddings = batch_embeddings.reshape(batch_embeddings.shape[0], -1)

    embedding_batches.append(batch_embeddings)

# Concatenate embeddings from all batches into one (n_images, n_features) array.
embeddings = np.concatenate(embedding_batches, axis=0)

# Print the shape of the embeddings array
print(embeddings.shape)

get embeddings:   0%|          | 0/319.4 [00:00<?, ?it/s]



(31940, 1024)


In [10]:
# Sanity-check the value range of the extracted features.
# NOTE(review): a maximum of exactly 6.0 presumably reflects MobileNet's
# ReLU6 activation clipping — confirm.
embeddings.min(), embeddings.max()

(0.0, 6.0)

In [11]:
# (number of images, feature-vector length)
embeddings.shape

(31940, 1024)

In [12]:
# Check the working directory before the results are written to disk.
os.getcwd()

'C:\\Users\\Asus\\Desktop\\Project\\ImageRecommender'

In [13]:
# Wrap the feature matrix in a DataFrame keyed by image id: one row per image,
# one column per feature, with the ids from `data` as an index named 'id'.
embeddings = np.array(embeddings)
df = (
    pd.DataFrame(embeddings)
    .assign(id=data.index)
    .set_index('id')
)
df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000000_,1.702602,0.280345,0.002875,0.392063,0.058732,0.072598,0.823098,0.085063,0.271215,0.0,...,0.001516,0.0,0.916635,0.434451,0.0,2.207779,0.078891,0.004087,0.022139,0.79007
0000001_,1.082599,0.056388,0.0,0.305775,0.001646,0.084357,0.024238,0.0,0.245861,2.627593,...,1.6364,0.074678,0.247777,0.0,0.092174,0.165596,6.0,1.257296,0.0,0.559314
0000002_,2.344831,0.095258,0.929714,0.007168,0.393723,0.0,1.015506,0.055767,1.104065,0.234005,...,0.115671,0.005713,1.477797,1.545036,0.764558,0.96121,0.254579,0.242481,0.213318,0.218131
0000003_,1.923374,0.469801,0.0,0.0,0.034059,0.032183,0.0,0.208176,2.159894,0.041754,...,2.693473,0.07469,0.710116,0.019694,0.221126,0.0,5.245052,0.109389,0.0,0.0
0000004_,1.212457,0.0,0.73053,0.260026,0.692328,1.183607,0.0,0.0,0.023676,1.795062,...,1.602074,0.009075,0.021297,0.241891,0.560821,1.275403,0.321218,0.0,0.0,1.439156


In [14]:
# Confirm the frame kept one row per image and one column per feature.
df.shape 

(31940, 1024)

In [15]:
# Return to the project root recorded at the top of the notebook so the
# relative output paths below land there.
os.chdir(root)

# Persist the embeddings twice: CSV for portability, pickle for fast reloads.
for write_to, filename in ((df.to_csv, 'embeddings.csv'), (df.to_pickle, 'embeddings.pkl')):
    write_to(filename)