## Embed and populate Pinecone

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import pandas as pd
from tqdm import tqdm

In [None]:
# Embedding model, selected by its Hugging Face model name. The same instance
# is used to embed the dictionary words and the query text further down.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")


In [None]:
# Empty placeholder frame with the two columns used by this notebook; it is
# rebuilt from the actual words and embeddings once those are computed.
df = pd.DataFrame(
    columns=["word", "embedding"],
)

In [None]:
# Load the dictionary, one word per line. The encoding is pinned explicitly so
# the result does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 (plain ASCII qualifies) -- confirm.
with open("../data/raw/dictionary.txt", "r", encoding="utf-8") as f:
    # Skip blank/whitespace-only lines: each word is later used as a Pinecone
    # vector ID, and an empty ID is not valid there.
    words = [line for line in f.read().splitlines() if line.strip()]

# Embed the words one at a time; `embeddings[i]` corresponds to `words[i]`.
embeddings = [
    embed_model.get_text_embedding(word)
    for word in tqdm(words, desc="Computing embeddings")
]



In [None]:
# Tabulate the results: one row per dictionary word, with the word in the
# 'word' column and its embedding vector in the 'embedding' column.
df = pd.DataFrame(data=dict(word=words, embedding=embeddings))

## Pinecone

In [None]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv (presumably from a local .env
# file) so the API key does not have to be hard-coded in the notebook.
load_dotenv()

# Look up the key in the environment (None when unset) and build the client.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
# Sanity check: show the length of the first embedding vector. The same value
# is passed as the index dimension when the Pinecone index is created.
print(len(embeddings[0]))

In [None]:
index_name = "similarity-game"

# Create the index only on the first run; if an index with this name is
# already listed, it is reused as-is.
index_exists = index_name in pc.list_indexes().names()
if not index_exists:
    pc.create_index(
        name=index_name,
        # The index dimension must match the embedding vector length.
        dimension=len(embeddings[0]),
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [None]:
# Handle to the index named by `index_name`; the upsert, stats and query
# calls below all go through it.
index = pc.Index(index_name)

In [None]:

batch_size = 100

# Build one upsert record per row, using the word itself as the vector ID.
records = [
    {"id": word, "values": embedding}
    for word, embedding in zip(df["word"], df["embedding"])
]

# Slice the records into consecutive chunks of at most `batch_size` items.
batches = [
    records[start:start + batch_size]
    for start in range(0, len(records), batch_size)
]

# Send each chunk to the "words" namespace, with a progress bar over chunks.
for batch in tqdm(batches, desc="Indexing words (batches)"):
    index.upsert(vectors=batch, namespace="words")

In [None]:
# Print the statistics Pinecone reports for the index, as a check on the
# upsert above.
print(index.describe_index_stats())

In [None]:
# Smoke test: embed the probe word "take" with the same model and request the
# top 3 matches from the "words" namespace, without returning vector values.
query_results1 = index.query(
    vector=embed_model.get_text_embedding("take"),
    namespace="words",
    top_k=3,
    include_values=False,
)
print(query_results1)

In [None]:
# Cleanup (destructive, intentionally disabled): uncomment to delete the index.
# pc.delete_index(index_name)