##### `Image Search with Pinecone and ConvBase for Feature Extraction`

In [10]:
## Import Libraries
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from dotenv import load_dotenv
import pinecone
import timm    ## PyTorch Image Models (timm)
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image

import warnings
warnings.filterwarnings('ignore')

In [11]:
## Read the Pinecone credentials from the .env file
## (override=True: values from .env replace variables already set in the process)
_ = load_dotenv(override=True)
pinecone_key, pinecone_env = (
    os.environ.get(var_name) for var_name in ('PINECONE_API_KEY', 'PINECONE_ENV')
)

In [12]:
## Get the images paths (sorted, because os.listdir returns entries in arbitrary order
## and the ids below are assigned by position)
images_dir = 'dataset-images'
images_paths = [os.path.join(images_dir, img) for img in sorted(os.listdir(images_dir))]
imgaes_paths = images_paths  ## original (misspelled) name kept so any later use still works

## Create a DF that contains the images paths and a sequential ID starting at 3054
df = pd.DataFrame({'paths': images_paths})
df['id'] = np.arange(3054, 3054 + len(df), 1)

## Take only the first 500 images --> for simplicity
## .copy() gives an independent frame, so adding a column does not write into a slice of df
df_use = df.iloc[:500].copy()

## Alternate the two dummy labels (class-a, class-b, class-a, ...);
## sized from df_use itself so it also works when there are fewer than 500 images
df_use['class'] = np.where(np.arange(len(df_use)) % 2 == 0, 'class-a', 'class-b')

df_use

Unnamed: 0,paths,id,class
0,dataset-images\0009fc27d9.jpg,3054,class-a
1,dataset-images\0014c2d720.jpg,3055,class-b
2,dataset-images\00196e8fac.jpg,3056,class-a
3,dataset-images\001fc748e6.jpg,3057,class-b
4,dataset-images\002bb8e03b.jpg,3058,class-a
...,...,...,...
495,dataset-images\16af889d9d.jpg,3549,class-b
496,dataset-images\16b44ef03b.jpg,3550,class-a
497,dataset-images\16b501e949.jpg,3551,class-b
498,dataset-images\16bbc4b4dc.jpg,3552,class-a


* `ConvBase for Feature Extraction`

In [13]:
## Feature extractor: the pretrained VGG19 from the timm library with its last child module removed.
## Flattening the output for a single image gives a vector of length 4096.

vgg19_children = list(timm.create_model('vgg19', pretrained=True).children())

## .eval() returns the module itself, so it can be chained onto the constructor
model = nn.Sequential(*vgg19_children[:-1]).eval()

In [14]:
def extract_images_features(images_paths: list) -> list:
    ''' Extract one flat feature vector per image using the module-level `model`.

    Args:
        images_paths: paths of the image files to process.

    Returns:
        A list with one entry per path, in input order. Each entry is the model
        output for that image flattened to a 1D list of floats.
        An empty `images_paths` gives an empty list.
    '''

    ## Transformation before extraction
    transform = transforms.Compose([
                            ## VGG required images (224, 224)
                            transforms.Resize((224, 224)),
                            transforms.ToTensor(),
                            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                                   ])

    batch_features = []

    ## Inference only --> no gradients needed for any of the images
    with torch.no_grad():
        for image_path in images_paths:
            ## Open inside a context manager so the file handle is closed after each image
            with Image.open(image_path) as image_file:
                ## Force 3 channels, apply the transform and add the batch dimension
                image_tensor = transform(image_file.convert('RGB')).unsqueeze(0)

            ## Pass the Image and get the Feature Extraction
            conv_features = model(image_tensor)

            ## Flatten everything after the batch dim and take the single row as a 1D list
            batch_features.append(conv_features.flatten(start_dim=1)[0].tolist())

    return batch_features

## Test the above function on one known image and record the feature-vector length
## (os.path.join instead of a hard-coded backslash path, so it also resolves outside Windows)
sample_image_path = os.path.join('dataset-images', '0009fc27d9.jpg')
vgg19_vect_length = len(extract_images_features(images_paths=[sample_image_path])[0])
print(f'Vector Lenght using VGG19 Model is: {vgg19_vect_length}')

Vector Lenght using VGG19 Model is: 4096


* `Upserting to Pinecone`

In [15]:
## Connect to pinecone
pinecone.init(
        api_key=pinecone_key,
        environment=pinecone_env
            )

## For Free tier, Only one index is accepted --> So removing any other indexes firstly
## NOTE: this deletes EVERY index returned by list_indexes(), including an earlier
## copy of the index created below.
try:
    existing_indexes = list(pinecone.list_indexes())
    if existing_indexes:
        print('Deleting existing indexes for free tier ..')
        for name in existing_indexes:
            pinecone.delete_index(name=name)
    else:
        print('No existing indexes ..')
except Exception as e:
    ## Best effort: report the real failure instead of hiding it, then carry on
    print(f'Could not delete existing indexes: {e}')

## Create the index
index_name = 'image-vgg19model-course'
if index_name not in pinecone.list_indexes():
    print(f'Creating New Index: {index_name} ...')
    ## Dimension must equal the feature-vector length measured above
    pinecone.create_index(name=index_name, dimension=vgg19_vect_length, metric='cosine') ## and more like (pods=1, pod_type='p1.x1')
    print('Done ...')

## Index Now is Created, But we want to connect it to upsert vectors to it
index = pinecone.Index(index_name=index_name)

Deleting existing indexes for free tier ..
Creating New Index: image-vgg19model-course ...
Done ...


In [16]:
## Create a Function for Upserting to Pinecone
def upsert_to_pinecone(df_images, batch_size=32, namespace='image-vgg19'):
    ''' Extract features for the images in `df_images` and upsert them to the module-level `index` in batches.

    Args:
        df_images: DataFrame with the columns 'paths', 'id' and 'class'.
        batch_size: number of rows processed and upserted per request.
        namespace: Pinecone namespace the vectors are written to.

    Returns:
        A list with one entry per failed batch; each entry is the list of ids (as strings)
        of that batch. Empty when every batch succeeded.
    '''

    failed_ids = []

    for batch_start in tqdm(range(0, len(df_images), batch_size)):
        ## Slice the DF by position; iloc clips the last batch at the end of the frame
        batch = df_images.iloc[batch_start: batch_start + batch_size]

        ## Built before the try so the except branch always reports the ids of THIS batch
        ids_batch = batch['id'].astype(str).tolist()

        try:
            ## Call the function (extract_images_features) for getting features for each batch
            batch_features = extract_images_features(images_paths=batch['paths'].tolist())

            ## Prepare to pinecone: (id, values, metadata); the metadata is used in filtering
            to_upsert = [
                (vector_id, feats, {'class': cls})
                for vector_id, feats, cls in zip(ids_batch, batch_features, batch['class'].tolist())
            ]

            ## Insert to pinecone
            index.upsert(vectors=to_upsert, namespace=namespace)

        except Exception as e:
            ## Keep going with the next batches, but remember which ids were not written
            print(f'Error Upserting: {e}')
            failed_ids.append(ids_batch)

    return failed_ids


## Apply the Function
failed_ids = upsert_to_pinecone(df_images=df_use)

100%|██████████| 16/16 [05:17<00:00, 19.87s/it]


In [17]:
## Get similarity in real-time: embed the last image listed in df and use it as the query
image_new_path = df['paths'].iloc[-1]
image_feats_new = extract_images_features(images_paths=[image_new_path])[0]

## Search the Vector Store: 5 nearest vectors whose metadata has class == 'class-b'
query_kwargs = dict(
    queries=[image_feats_new],
    top_k=5,
    include_metadata=True,
    filter={'class': 'class-b'},
    namespace='image-vgg19',
)
results = index.query(**query_kwargs)

## One result entry per query vector --> show the matches of the only query
results['results'][0]['matches']

[{'id': '3263',
  'metadata': {'class': 'class-b'},
  'score': 0.706210434,
  'values': []},
 {'id': '3439',
  'metadata': {'class': 'class-b'},
  'score': 0.553301871,
  'values': []},
 {'id': '3491',
  'metadata': {'class': 'class-b'},
  'score': 0.496369809,
  'values': []},
 {'id': '3167',
  'metadata': {'class': 'class-b'},
  'score': 0.466286302,
  'values': []},
 {'id': '3527',
  'metadata': {'class': 'class-b'},
  'score': 0.410927862,
  'values': []}]

In [18]:
## Vectors can be removed from a namespace by their ids
ids_to_delete = ['3328', '3152']
_ = index.delete(ids=ids_to_delete, namespace='image-vgg19')

In [19]:
## Replace the stored embedding of an existing id with the features of another image
image_path_update = df['paths'].iloc[-1]
image_feats_update = extract_images_features(images_paths=[image_path_update])[0]
first_ten_values = image_feats_update[:10]
print(f'The First 10 values of the above vector is : \n {first_ten_values}')

## Update (upserting the same id with a different image would also overwrite the vector)
_ = index.update(
    id='3096',
    values=image_feats_update,
    namespace='image-vgg19',
)

The First 10 values of the above vector is : 
 [0.0, 1.3136866092681885, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [20]:
## Vectors can be fetched by id; show the first 10 stored values of id '3096'
## to compare with the values printed in the update cell
fetched = index.fetch(ids=['3096'], namespace='image-vgg19')
fetched['vectors']['3096']['values'][:10]

[0.0, 1.31368661, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

---

In [34]:
## Scratch cell: a parenthesised list literal, so the cell simply echoes the list.
## NOTE(review): presumably a try-out of the (id, values, metadata) record shape used for upserting -- confirm or remove.
(['5', [3, 4, 5], {'class': 'a'}])

['5', [3, 4, 5], {'class': 'a'}]

In [37]:
## Scratch cell: a list holding one tuple, echoed as-is.
## NOTE(review): presumably a try-out of the list-of-(id, values, metadata) shape used for upserting -- confirm or remove.
[('5', [3, 4, 5], {'class': 'a'})]

[('5', [3, 4, 5], {'class': 'a'})]