##### `Image Search with Pinecone and ConvBase for Feature Extraction`

In [1]:
## Import Libraries
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from dotenv import load_dotenv
import pinecone
import timm    ## PyTorch Image Models (timm)
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image

  from tqdm.autonotebook import tqdm


In [2]:
## Read the .env file (its values override variables already set) and pick up the Pinecone credentials
load_dotenv(override=True)
pinecone_key = os.environ.get('PINECONE_API_KEY')
pinecone_env = os.environ.get('PINECONE_ENV')

In [3]:
## Get the images paths
FOLDER_PATH = os.path.join('datasets', 'dataset-images')
## os.listdir returns entries in arbitrary order --> sort them so that the
## path-to-id mapping below is the same on every run and on every machine
images_paths = [os.path.join(FOLDER_PATH, img) for img in sorted(os.listdir(FOLDER_PATH))]

## Create a DF that contains the images paths and give each image a sequential ID starting at 3054
df = pd.DataFrame({'paths': images_paths})
df['id'] = np.arange(3054, 3054+len(df), 1)

## Take only the first 500 images --> for simplicity
df_use = df.iloc[:500]
df_use

Unnamed: 0,paths,id
0,datasets\dataset-images\0009fc27d9.jpg,3054
1,datasets\dataset-images\0014c2d720.jpg,3055
2,datasets\dataset-images\00196e8fac.jpg,3056
3,datasets\dataset-images\001fc748e6.jpg,3057
4,datasets\dataset-images\002bb8e03b.jpg,3058
...,...,...
495,datasets\dataset-images\16af889d9d.jpg,3549
496,datasets\dataset-images\16b44ef03b.jpg,3550
497,datasets\dataset-images\16b501e949.jpg,3551
498,datasets\dataset-images\16bbc4b4dc.jpg,3552


* `ConvBase for Feature Extraction`

In [4]:
## Here, I will use the pretrained VGG19 Model from the timm library as a ConvBase for feature extraction
## After flattening, the output vector of this ConvBase is of length 4096.

## Load the pretrained network, then rebuild it from all of its child modules except the last one
## (presumably the classification head --> verify against the timm VGG definition)
model = timm.create_model('vgg19', pretrained=True)
model = nn.Sequential(*list(model.children())[:-1])
## Switch to evaluation mode; assigning to `_` keeps the returned module from being echoed as cell output
_ = model.eval()

In [15]:
def extract_images_features(images_paths: list):
    ''' Extract one flat feature vector per image using the global VGG19 ConvBase (`model`).

    Args:
        images_paths: paths of the image files to embed.

    Returns:
        A list with one entry per input path, in the same order. Each entry is a
        1D list of floats: the flattened model output for that image.
        An empty input returns an empty list.
    '''

    ## Transformation before extraction
    transform = transforms.Compose([
                            ## VGG required images (224, 224)
                            transforms.Resize((224, 224)),
                            transforms.ToTensor(),
                            ## Per-channel normalization (the standard ImageNet mean/std)
                            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                                   ])

    batch_features = []

    ## Inference only --> enter no_grad once for the whole loop instead of once per image
    with torch.no_grad():
        for image_path in images_paths:
            ## The context manager guarantees that the underlying file handle is released,
            ## convert('RGB') forces 3 channels, unsqueeze(0) adds the batch dimension
            with Image.open(image_path) as image_file:
                image_tensor = transform(image_file.convert('RGB')).unsqueeze(0)

            ## Pass the Image and get the Feature Extraction
            conv_features = model(image_tensor)

            ## Flatten --> I want a vector as a list in 1D
            ## (flatten, unlike view, also works when the output is not contiguous)
            image_features = conv_features.flatten(start_dim=1)[0].tolist()
            batch_features.append(image_features)

    return batch_features

## Test the above function on one known image and record the vector length (used later as the index dimension)
## Build the path with os.path.join --> a hard-coded backslash path only works on Windows
sample_image_path = os.path.join(FOLDER_PATH, '0009fc27d9.jpg')
vgg19_vect_length = len(extract_images_features(images_paths=[sample_image_path])[0])
print(f'Vector Lenght using VGG19 Model is: {vgg19_vect_length}')

Vector Lenght using VGG19 Model is: 4096


* `Upserting to Pinecone`

In [16]:
## Connect to pinecone
pinecone.init(
        api_key=pinecone_key,
        environment=pinecone_env
            )

## For Free tier, Only one index is accepted --> So removing any other indexes firstly
## WARNING: this deletes EVERY index in the project, together with all stored vectors
print('Deleting existing indexes for free tier ..')
try:
    for existing_name in pinecone.list_indexes():
        pinecone.delete_index(name=existing_name)
except Exception as e:
    ## Stay best-effort, but report the real failure instead of claiming there was nothing to delete
    print(f'Error Deleting existing indexes: {e}')

## Create the index
index_name = 'image-vgg19-course'
if index_name not in pinecone.list_indexes():
    print(f'Creating New Index: {index_name} ...')
    ## The dimension must equal the length of the vectors that will be upserted
    pinecone.create_index(name=index_name, dimension=vgg19_vect_length, metric='cosine') ## and more like (pods=1, pod_type='p1.x1')
    print('Done ...')

## Index Now is Created, But we want to connect it to upsert vectors to it
index = pinecone.Index(index_name=index_name)

Deleting existing indexes for free tier ..
Creating New Index: image-vgg19-course ...
Done ...


In [17]:
## Create a Function for Upserting to Pinecone
def upsert_to_pinecone(df_images, batch_size=32):
    ''' Extract features for the images in `df_images` and upsert them to the global Pinecone `index` in batches.

    Args:
        df_images: DataFrame with a 'paths' column (image file paths) and an 'id' column.
        batch_size: number of images that are embedded and upserted per request.

    Returns:
        A list with one entry per failed batch; each entry is the list of ids of that batch.
        It is empty when every batch was upserted successfully.
    '''

    failed_ids = []

    for batch_start in tqdm(range(0, len(df_images), batch_size)):
        ## Prepare Batches --> done before the try, so that ids_batch always belongs
        ## to the current batch when the except branch records a failure
        batch_end = min(batch_start+batch_size, len(df_images))
        batch_df = df_images.iloc[batch_start: batch_end]      ## Positional slice, independent of the DF index labels
        paths_batch = batch_df['paths'].tolist()
        ids_batch = batch_df['id'].tolist()

        try:
            ids_batch_str = [str(image_id) for image_id in ids_batch]    ## Pinecone ids are strings

            ## Call the function (extract_images_features) for getting features for each batch
            batch_features = extract_images_features(images_paths=paths_batch)

            ## Prepare to pinecone --> (id, vector) pairs
            to_upsert = list(zip(ids_batch_str, batch_features))

            ## Insert to pinecone
            _ = index.upsert(vectors=to_upsert)

        except Exception as e:
            ## Best-effort: report the failing batch and continue with the next one
            print(f'Error Upserting: {e}')
            failed_ids.append(ids_batch)

    return failed_ids


## Apply the Function to the 500-image subset and keep whatever it reports as failed
failed_ids = upsert_to_pinecone(df_images=df_use)

100%|██████████| 16/16 [06:04<00:00, 22.79s/it]


In [29]:
## Get similarity in real-time --> embed the last image of the full DataFrame and use it as the query
image_new_path = df['paths'].iloc[-1]
image_feats_new = extract_images_features(images_paths=[image_new_path])[0]

## Search the Vector Store for the 15 closest vectors
results = index.query(vector=image_feats_new, top_k=15)
results['matches']
## Alternative: keep only the ids of the matches
# [record['id'] for record in results['matches']]

[{'id': '3263', 'score': 0.706210434, 'values': []},
 {'id': '3439', 'score': 0.553301871, 'values': []},
 {'id': '3080', 'score': 0.499896258, 'values': []},
 {'id': '3491', 'score': 0.496369809, 'values': []},
 {'id': '3124', 'score': 0.469109565, 'values': []},
 {'id': '3167', 'score': 0.466286302, 'values': []},
 {'id': '3204', 'score': 0.419103652, 'values': []},
 {'id': '3527', 'score': 0.410927862, 'values': []},
 {'id': '3177', 'score': 0.40235883, 'values': []},
 {'id': '3364', 'score': 0.3941544, 'values': []},
 {'id': '3524', 'score': 0.379548311, 'values': []},
 {'id': '3135', 'score': 0.367767125, 'values': []},
 {'id': '3310', 'score': 0.362493187, 'values': []},
 {'id': '3381', 'score': 0.358663797, 'values': []},
 {'id': '3096', 'score': 0.34970957, 'values': []}]

In [30]:
## You can delete vectors from the index by passing their (string) ids
_ = index.delete(ids=['3328', '3152'])

In [33]:
## To update the embeddings of any id --> first compute the new vector (here: from the last image of the full DataFrame)
image_path_update = df['paths'].iloc[-1]
image_feats_update = extract_images_features(images_paths=[image_path_update])[0]
print(f'The First 10 values of the above vector is : \n {image_feats_update[:10]}')

## Update, or you can use upsert with the same id and a different image --> Pinecone will update the vector
_ = index.update(id='3096', values=image_feats_update)

The First 10 values of the above vector is : 
 [0.0, 1.3136866092681885, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [38]:
## You can Fetch Vectors using ids; only the first 10 values are shown here
index.fetch(ids=['3096'])['vectors']['3096']['values'][:10] ## Compare with the values printed by the update cell (they match --> the vector was updated)

[0.0, 1.31368661, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

---