In [3]:
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import pandas as pd
import os
from dotenv import load_dotenv
import time

# Read key=value pairs from a local .env file into the process environment
# so the os.getenv lookups in the next cell can see them.
load_dotenv()

True

In [4]:
# --- RunPod / OpenAI-compatible endpoint settings (each is None when the variable is unset) ---
runpod_token = os.environ.get("RUNPOD_TOKEN")
openai_base_url = os.environ.get("RUNPOD_EMBEDDING_URL")
model_name = os.environ.get("MODEL_NAME")
embedding_model_name = os.environ.get("EMBEDDING_MODEL_NAME")

# --- Pinecone settings ---
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pinecone_index_name = os.environ.get("PINECONE_INDEX_NAME")

In [5]:
# OpenAI-compatible client aimed at the URL read from RUNPOD_EMBEDDING_URL,
# authenticated with the RunPod token.
client = OpenAI(base_url=openai_base_url, api_key=runpod_token)

# Pinecone handle used further down to create, open and query the index.
pc = Pinecone(api_key=pinecone_api_key)

# Test embedding

In [None]:
# Optional smoke test, left disabled: embed a single string and print the
# resulting vector to confirm the endpoint responds.
# output = client.embeddings.create(input=["hello, world!"], model=model_name)
# embedding = output.data[0].embedding
# print(embedding)

# Data wrangling

In [6]:
# Load the products file; lines=True parses one JSON object per line (JSON Lines).
df = pd.read_json(path_or_buf="products/products.jsonl", lines=True)

In [7]:
# Build one descriptive string per product row; this column is what gets
# turned into a list and embedded further down.
# Non-string columns are cast with astype(str) so "+" concatenates instead of failing.
df["text"] = (
    df["name"]
    + " : " + df["description"]
    + " -- ingredients: " + df["ingredients"].astype(str)
    + " -- Price: " + df["price"].astype(str)
    + " -- Rating: " + df["rating"].astype(str)
)

# Preview goes last: only a cell's final expression is displayed, and placing
# it here means the preview includes the freshly built "text" column.
df.head()

In [8]:
# Plain Python list of the per-product strings; extra documents are appended to it below.
texts = df["text"].to_list()

In [9]:
# Add the shop's "about us" page as one more document, prefixed with a label.
about_us_label = "Coffee shop Abdalla's way about section: "
with open("products/Merry's_way_about_us.txt") as about_us_file:
    about_us_text = about_us_label + about_us_file.read()

texts.append(about_us_text)

In [10]:
# Add the plain-text menu as one more document, prefixed with a label.
menu_label = "Menu text: "
with open("products/menu_items_text.txt") as menu_file:
    menu_text = menu_label + menu_file.read()

texts.append(menu_text)

# Generate embeddings

In [15]:
# Embed every document with the dedicated embedding model. The query in the
# "Get closest documents" section is embedded with embedding_model_name, and
# stored vectors are only comparable to query vectors produced by the same model.
coffee_shop_output = client.embeddings.create(input=texts, model=embedding_model_name)


In [None]:
# One embedding item per input text, in the response's `data` list.
embedding = coffee_shop_output.data

# Show a compact summary instead of echoing every vector:
# (number of embeddings, length of the first vector).
# The second number is the dimension the Pinecone index must be created with.
len(embedding), len(embedding[0].embedding)

# Push embeddings to Pinecone

In [None]:
# Create the serverless index only when it does not exist yet, so this cell can
# be re-run (e.g. Restart & Run All) without failing on an already-existing index.
existing_index_names = pc.list_indexes().names()
if pinecone_index_name not in existing_index_names:
    pc.create_index(
        name=pinecone_index_name,
        # NOTE(review): 384 has to equal the length of the vectors returned by
        # the embedding model; confirm against the model configured in .env.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [19]:
# while not pc.describe_index(pinecone_index_name).status.ready:
#     time.sleep(1)
index = pc.Index(pinecone_index_name)

# Read the document embeddings straight from the API response rather than from
# the global `embedding`: that name is rebound to a single query vector later in
# the notebook, which would break this cell if it were re-run afterwards.
document_embeddings = coffee_shop_output.data

# zip() silently truncates to the shorter input, so fail loudly on a mismatch.
assert len(document_embeddings) == len(texts), (
    f"expected {len(texts)} embeddings, got {len(document_embeddings)}"
)

# One record per document: the id is everything before the first ":" in the
# text, and the original text is kept as metadata so query results are readable.
# NOTE(review): ids are not stripped or de-duplicated; two texts sharing the
# same prefix would overwrite each other on upsert.
vectors = [
    {
        "id": text.split(":")[0],
        "values": e.embedding,
        "metadata": {"text": text},
    }
    for text, e in zip(texts, document_embeddings)
]

index.upsert(vectors=vectors, namespace="ns1")

{'upserted_count': 20}

## Get closest documents

In [20]:
# Embed the example question so it can be compared against the stored vectors.
question = "is Cappuccino lactose-free?"
output = client.embeddings.create(
    model=embedding_model_name,
    input=[question],
)

In [21]:
# Exactly one input was sent, so the query vector is the first item of the response.
first_item = output.data[0]
embedding = first_item.embedding


In [22]:
# Fetch the 3 nearest stored vectors from namespace "ns1"; return their metadata
# (which holds the original text) but not the raw vector values.
results = index.query(
    vector=embedding,
    top_k=3,
    namespace="ns1",
    include_metadata=True,
    include_values=False,
)

In [None]:
# Display the matches returned by the query.
results