# Import Required Libraries

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os

  from .autonotebook import tqdm as notebook_tqdm


### Set paths and load API keys

In [3]:
# Project root. Defaults to the original author's checkout, but can be overridden by
# setting VECTOR_DB_PROJECT_DIR in the process environment (it is read before the
# .env file is loaded, so it cannot come from .env) to run the notebook elsewhere.
PROJECT_DIR = os.getenv(
    'VECTOR_DB_PROJECT_DIR',
    "/home/mango/Coding/Projects/ Mastering Vector Databases for AI Applications | Arabic",
)
os.chdir(PROJECT_DIR)

# Dataset locations, resolved relative to the project root.
FOLDER_DATA_PATH = os.path.join(os.getcwd(), 'Data', 'medium-articles')
FILE_FULL_DATA_PATH = os.path.join(FOLDER_DATA_PATH, 'medium_articles.csv')
# FILE_TINY_DATA_PATH = os.path.join(os.getcwd(),'medium-articles','tiny_medium_articles.csv')


# Load variables from a .env file; override=True lets .env values replace variables
# that are already set in the process environment. The boolean result is discarded.
_ = load_dotenv(override = True)

# Credentials read from the environment; each is None when the variable is not set.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
HF_ACCESS_TOKEN = os.getenv('HF_ACCESS_TOKEN')
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
NGROK_AUTH_TOKEN = os.getenv('NGROK_AUTH_TOKEN')

### Read Full CSV data file

In [4]:
# Load the full articles CSV and append a sequential integer `id` column
# that starts at 3040 (one id per row, in file order).
df = pd.read_csv(FILE_FULL_DATA_PATH).assign(
    id=lambda frame: np.arange(3040, 3040 + len(frame))
)
df.head()

Unnamed: 0,title,text,url,authors,timestamp,tags,id
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci...",3040
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P...",3041
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We...",3042
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P...",3043
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology...",3044


# Embeddings using HuggingFace **Model**

- Uses the widely used sentence-embedding model `all-MiniLM-L6-v2`, available here: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

In [5]:
# !nvidia-smi

In [6]:
# Load the sentence-embedding model used for every embedding in this notebook.
# device=None lets sentence-transformers pick the device itself (a GPU when one is
# usable, otherwise the CPU), so the notebook no longer fails on machines without CUDA.
model = SentenceTransformer(
    model_name_or_path = 'sentence-transformers/all-MiniLM-L6-v2',
    device=None,
    # cache_folder = os.getcwd()
)

### Test model for embedding

In [7]:
# Encode one sample title a single time and reuse the vector for both checks
# (the original encoded the same title twice).
sample_embedding = model.encode(df['title'].iloc[0])

# Embedding dimensionality; reused later as the Pinecone index dimension.
vect_length_hugging = len(sample_embedding)
print('Length of Hugging Face Model is:', vect_length_hugging)

# First 10 values
sample_embedding[:10].tolist()

Length of Hugging Face Model is: 384


[-0.013135506771504879,
 0.06555432081222534,
 -0.019770393148064613,
 -0.03618846833705902,
 -0.07723397761583328,
 0.10825920850038528,
 0.0849674716591835,
 0.0229044072329998,
 0.021594461053609848,
 0.0003531991387717426]

# Pinecone in Code

In [8]:
# Initialize a Pinecone client with your API key
pc = Pinecone(api_key=PINECONE_API_KEY)

## For the free tier only one index is assumed to be allowed, so every existing index
## is removed first. WARNING: this deletes ALL indexes visible to this API key.
print('Deleting existing indexes for free tier ..')
try:
    for existing_name in pc.list_indexes().names():
        pc.delete_index(existing_name)
except Exception as exc:
    # Best-effort cleanup: report the real failure instead of hiding it behind a
    # misleading "no indexes" message, then carry on (an empty project raises nothing).
    print(f'Failed to delete existing indexes: {exc}')

## Create New Index
index_name = 'semantic-search'
if index_name not in pc.list_indexes().names():
    print(f'Creating New Index: {index_name} ...')
    pc.create_index(
        name=index_name,
        dimension=vect_length_hugging,      # must match the embedding model's vector length
        metric='cosine',
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print('Done ...')

## The index now exists; open a handle to it so vectors can be upserted and queried
index = pc.Index(name = index_name)
print(f"Connected to index: {index_name}")

Deleting existing indexes for free tier ..
Creating New Index: semantic-search ...
Done ...
Connected to index: semantic-search


### Upserting Dataset in Pinecone DB

In [9]:
batch_size = 16
# One entry per failed batch; each entry is that batch's list of string ids.
failed_ids = []

## Loop over the dataset and upsert it in batches
for batch_start in tqdm(range(0, len(df), batch_size)):
    # Slice the batch OUTSIDE the try block so that `ids_batch` always refers to the
    # current batch when a failure is recorded (previously a failure could record the
    # previous batch's ids, or raise NameError on the very first iteration).
    batch_end = min(batch_start + batch_size, len(df))          # handle the end of the dataframe
    titles_batch = df['title'].iloc[batch_start:batch_end].tolist()
    ids_batch = df['id'].iloc[batch_start:batch_end].astype(str).tolist()

    try:
        # Get embeddings using the HuggingFace model
        embeds_batch = model.encode(titles_batch).tolist()

        # Pair each id with its embedding: (id, values) tuples
        to_upsert = list(zip(ids_batch, embeds_batch))

        # Upsert the records into a namespace (named after the index)
        index.upsert(vectors = to_upsert, namespace = index_name)

    except Exception as e:
        # Best-effort: log the error, remember the batch, and continue with the next one
        print(f'Error Upserting: {e}')
        failed_ids.append(ids_batch)

100%|██████████| 12023/12023 [2:24:28<00:00,  1.39it/s]   


### Inference (Query in real-time)

In [10]:
## Inference (query in real time): embed one query text and search the index
query_text = 'Mind Your Nose'

## Generate the embedding for query_text as a flat list of floats
query_embeds = model.encode(query_text).tolist()

## Search in Pinecone DB
# `vector` takes a single flat list of floats, so the embedding is passed as-is
# rather than wrapped in another list.
results = index.query(
    vector=query_embeds,
    top_k = 5,
    namespace = index_name
)

# print(results)
for match in results['matches']:
    print(f"ID: {match['id']}, Score: {match['score']}")

ID: 3042, Score: 0.999845505
ID: 118746, Score: 0.723469734
ID: 114499, Score: 0.59822464
ID: 15900, Score: 0.597365379
ID: 189496, Score: 0.571017265


### Delete vectors using ids

In [11]:
# index.delete(ids = ['3050','3070'], namespace = index_name)

### Fetch vectors using ids

In [12]:
# Look up the stored record for id '3080' in the namespace and display the
# first 10 components of its vector.
fetched = index.fetch(ids=['3080'], namespace=index_name)
fetched.vectors['3080'].values[:10]

[-0.0321247429,
 -0.0892664716,
 -0.0991047397,
 0.026135914,
 -0.0152404457,
 0.0753452927,
 -0.0302256718,
 0.00802560151,
 -0.0815024599,
 0.0602753684]

### Update the embeddings of any id

In [13]:
# # update id = 3080
# text_update = 'Youssef Taha Badawi'
# embeds_update = model.encode(text_update).tolist()

# # Update
# index.update(id='3080', values=embeds_update, namespace = index_name)

In [14]:
# # Fetch id = 3080 after update
# index.fetch(ids=['3080'], namespace = index_name).vectors['3080'].values[:10]