##### `Semantic Search with Pinecone and embeddings using HuggingFace Model`

In [1]:
## Import Libraries
import pandas as pd
import os
from tqdm import tqdm
from dotenv import load_dotenv
import pinecone
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm


In [2]:
## Load the variables defined in the .env file, then read the Pinecone credentials
_ = load_dotenv(override=True)
pinecone_key, pinecone_env = (
    os.getenv(var_name) for var_name in ('PINECONE_API_KEY', 'PINECONE_ENV')
)

In [3]:
# ## Reading the big file once to slice it, then commenting this code
# FOLDER_PATH = os.path.join(os.getcwd(), 'datasets', 'dataset-semantic')
# df = pd.read_csv(os.path.join(FOLDER_PATH, 'medium_articles.csv'))
# ## The data is very big; for simplicity keep only the first 500 records, save them to a new CSV, and read that file back later.
# ## Make sure that the title is string
# df['title'] = df['title'].astype(str)

# df = df.loc[:499, 'title']

# df = pd.DataFrame(df, columns=['title'])
# ## Add an id column to the DF: sequential ids starting at 3054 (requires `import numpy as np`)
# df['id'] = np.arange(3054, 3054+len(df), 1)


# ## Let's consider this is our stock that we want to build semantic search for it.
# df.to_csv(os.path.join(FOLDER_PATH, 'articles_new.csv'), index=False)

In [4]:
## Read the new CSV file
## NOTE(review): the commented-out export cell writes under 'datasets/dataset-semantic',
## while this cell reads from 'dataset-semantic' -- confirm which folder is correct.
FILE_PATH = os.path.join(os.getcwd(), 'dataset-semantic', 'articles_new.csv')
df = pd.read_csv(FILE_PATH)

## Add another column (just an example) --> to be used as metadata.
## Labels alternate 'class-a' / 'class-b' row by row. The list is sized from len(df)
## instead of a hard-coded `* 250`, so the cell works for any number of rows
## (a fixed 500-item list raises a length-mismatch error for any other frame size).
class_labels = ('class-a', 'class-b')
df['class'] = [class_labels[row_pos % 2] for row_pos in range(len(df))]
df

Unnamed: 0,title,id,class
0,Mental Note Vol. 24,3054,class-a
1,Your Brain On Coronavirus,3055,class-b
2,Mind Your Nose,3056,class-a
3,The 4 Purposes of Dreams,3057,class-b
4,Surviving a Rod Through the Head,3058,class-a
...,...,...,...
495,Is It Worth to Invest In Mobile E-commerce App...,3549,class-b
496,Let go of these things for a happier 2021,3550,class-a
497,Not Everyone Will like Your Writing,3551,class-b
498,Is Technology Neutral?,3552,class-a


* `Embeddings using HuggingFace Model`

In [5]:
## Sentence-embedding model used in this notebook: all-MiniLM-L6-v2
## Model card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

## Instantiate the model on the CPU and display its module layout
model_hugging = SentenceTransformer(
    model_name_or_path='all-MiniLM-L6-v2',
    device='cpu',
)
model_hugging

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [6]:
## Test the model: embed the first title once and reuse that vector for both checks
## (avoids running the model twice on the same text).
sample_embedding = model_hugging.encode(df['title'].iloc[0])

## Length of one embedding vector; the Pinecone index is created with this dimension
vect_length_hugging = len(sample_embedding)
print('Length of Hugging Face Model is:', vect_length_hugging)

## First 10 values
sample_embedding[:10]

Length of Hugging Face Model is: 384


array([-0.0131355 ,  0.06555433, -0.01977038, -0.03618841, -0.07723403,
        0.10825918,  0.0849674 ,  0.02290434,  0.02159446,  0.00035316],
      dtype=float32)

* `Pinecone in Code`

In [7]:
## Connect to pinecone
pinecone.init(
    api_key=pinecone_key,
    environment=pinecone_env,
)

## For the free tier only one index is accepted --> remove any existing indexes first.
## WARNING: this deletes every index returned by pinecone.list_indexes().
print('Deleting existing indexes for free tier ..')
try:
    for existing_name in pinecone.list_indexes():
        pinecone.delete_index(name=existing_name)
except Exception as exc:
    ## Best-effort cleanup: keep going, but report the real error instead of hiding
    ## every failure (even Ctrl-C) behind a bare `except` and a "no indexes" message.
    print(f'Could not delete existing indexes: {exc}')

## Create the index
index_name = 'semantic-huggingface-model-course'
if index_name not in pinecone.list_indexes():
    print(f'Creating New Index: {index_name} ...')
    ## The index dimension must equal the embedding length produced by the model
    pinecone.create_index(name=index_name, dimension=vect_length_hugging, metric='cosine') ## and more like (pods=1, pod_type='p1.x1')
    print('Done ...')

## The index now exists; connect to it so vectors can be upserted
index = pinecone.Index(index_name=index_name)

Deleting existing indexes for free tier ..
Creating New Index: semantic-huggingface-model-course ...
Done ...


In [8]:
## Loop over the dataset and upsert it to Pinecone in batches
batch_size = 16
failed_ids = []   ## one entry per failed batch: the list of ids in that batch

for batch_start in tqdm(range(0, len(df), batch_size)):
    ## Slice the batch BEFORE the try block, so `ids_batch` always belongs to the
    ## current batch when a failure is recorded. Built inside the try, it could be
    ## undefined on the first batch or still hold the previous batch's ids.
    batch_end = min(batch_start + batch_size, len(df))   ## the last batch may be shorter
    batch_rows = df.iloc[batch_start:batch_end]
    titles_batch = batch_rows['title'].tolist()
    ids_batch = batch_rows['id'].astype(str).tolist()    ## ids are sent to Pinecone as strings
    metadata_batch = batch_rows['class'].tolist()

    try:
        ## Get embeddings using the HuggingFace model
        embeds_batch = model_hugging.encode(titles_batch).tolist()

        ## Prepare data for the Pinecone upsert: (id, embedding, metadata) tuples.
        ## Without metadata this would simply be list(zip(ids_batch, embeds_batch)).
        to_upsert = [
            (vector_id, embedding, {'class': label})
            for vector_id, embedding, label in zip(ids_batch, embeds_batch, metadata_batch)
        ]

        ## Insert to pinecone
        _ = index.upsert(vectors=to_upsert, namespace='semantic-huggingface')

    except Exception as exc:
        ## Best effort: report the failure, remember the batch, continue with the next one
        print(f'Error Upserting: {exc}')
        failed_ids.append(ids_batch)

100%|██████████| 32/32 [00:28<00:00,  1.13it/s]


In [22]:
## Inference: query in real time (several queries can be sent at once as a list)
query_text = 'Neutral Technology'

## Embed the query text with the same model that embedded the stored titles
query_embedding = model_hugging.encode(query_text).tolist()

## Search in pinecone: top 5 matches in the namespace, restricted to class-b vectors
## NOTE(review): `queries=[...]` looks like the legacy batch form of the v2 client;
## newer clients take `vector=` and return a different response shape -- confirm before upgrading.
results = index.query(
    queries=[query_embedding],
    top_k=5,
    include_metadata=True,
    namespace='semantic-huggingface',
    filter={'class': 'class-b'},
)

## Matches for the first (and only) query in the list
results['results'][0]['matches']
# [record['id'] for record in results['results'][0]['matches']]

[{'id': '3393',
  'metadata': {'class': 'class-b'},
  'score': 0.345691055,
  'values': []},
 {'id': '3107',
  'metadata': {'class': 'class-b'},
  'score': 0.330204219,
  'values': []},
 {'id': '3311',
  'metadata': {'class': 'class-b'},
  'score': 0.225352362,
  'values': []},
 {'id': '3277',
  'metadata': {'class': 'class-b'},
  'score': 0.219006956,
  'values': []},
 {'id': '3463',
  'metadata': {'class': 'class-b'},
  'score': 0.206147298,
  'values': []}]

In [None]:
## You can delete vectors using ids.
## The vectors were upserted into the 'semantic-huggingface' namespace, so the same
## namespace must be passed here; without it the call targets the default namespace.
_ = index.delete(ids=['3328', '3152'], namespace='semantic-huggingface')

In [None]:
## To update the embeddings of any id
text_update = 'This is for updating the Id and change embeddings'
embeds_update = model_hugging.encode(text_update).tolist()

## Update (or you can use upsert).
## The namespace must match the one used for the upsert, otherwise the update
## is addressed to the default namespace instead of the stored vector.
_ = index.update(id='3191', values=embeds_update, namespace='semantic-huggingface')

In [None]:
## Fetch vectors by id from the namespace they were upserted into
## (without `namespace` the lookup runs against the default namespace).
index.fetch(ids=['3191', '3292'], namespace='semantic-huggingface')

-----

[{'id': '3393',
  'metadata': {'class': 'class-b'},
  'score': 0.345691055,
  'values': []},
 {'id': '3107',
  'metadata': {'class': 'class-b'},
  'score': 0.330204219,
  'values': []},
 {'id': '3311',
  'metadata': {'class': 'class-b'},
  'score': 0.225352362,
  'values': []},
 {'id': '3277',
  'metadata': {'class': 'class-b'},
  'score': 0.219006956,
  'values': []},
 {'id': '3463',
  'metadata': {'class': 'class-b'},
  'score': 0.206147298,
  'values': []}]

In [25]:
## Display the complete raw response object assigned to `results` by index.query
results

{'results': [{'matches': [{'id': '3393',
                           'metadata': {'class': 'class-b'},
                           'score': 0.345691055,
                           'values': []},
                          {'id': '3107',
                           'metadata': {'class': 'class-b'},
                           'score': 0.330204219,
                           'values': []},
                          {'id': '3311',
                           'metadata': {'class': 'class-b'},
                           'score': 0.225352362,
                           'values': []},
                          {'id': '3277',
                           'metadata': {'class': 'class-b'},
                           'score': 0.219006956,
                           'values': []},
                          {'id': '3463',
                           'metadata': {'class': 'class-b'},
                           'score': 0.206147298,
                           'values': []}],
              'namespace': 'semant

In [31]:
## Flatten each match of the first query into a plain dict:
## integer id, float score, and the class label taken from the metadata
def _flatten_match(record):
    """Return the id, score and class of one match as a flat dict."""
    return {
        'id': int(record['id']),
        'score': float(record['score']),
        'class': record['metadata']['class'],
    }

e = list(map(_flatten_match, results['results'][0]['matches']))
e

[{'id': 3393, 'score': 0.345691055, 'class': 'class-b'},
 {'id': 3107, 'score': 0.330204219, 'class': 'class-b'},
 {'id': 3311, 'score': 0.225352362, 'class': 'class-b'},
 {'id': 3277, 'score': 0.219006956, 'class': 'class-b'},
 {'id': 3463, 'score': 0.206147298, 'class': 'class-b'}]