In [3]:
%pip install --upgrade --quiet devtools

Note: you may need to restart the kernel to use updated packages.


In [19]:
from dotenv import load_dotenv
import json
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

load_dotenv()

True

In [20]:
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device="mps")

In [23]:
filename = "./chatbot_data_filtered_sa-V3.json"

pc = Pinecone()
index = pc.Index("catalog-v3-768")
i = j = 0
batch_size = 100
upserted_count = 0
texts, ids, embeds, metadata = [], [], [], []
with open(filename) as file:
    for line in file:
        if j % 1000 == 0:
            print(f"Iteration: {j}. Upserted: ~{upserted_count}")
        if i % batch_size == 0 and texts:
            embeds = model.encode(texts)
            upserted_count += len(embeds)
            index.upsert(vectors=zip(ids, embeds, metadata))
            texts, ids, embeds, metadata = [], [], [], []
        doc = json.loads(line.rstrip())
        if (
            doc.get("title_en")
            and doc.get("description")
            and doc.get("seller_price")
            and doc.get("image_url")
            and doc.get("product_url")
            and doc.get("is_returnable")
        ):
            specs = doc.get("specifications") or []
            if not specs:
                continue
            specs_text = ""
            specs = json.loads(specs)
            for spec in specs:
                k = spec.get("name")
                v = spec.get("value")
                specs_text += f"{k} : {v}" + "\n"
            ids.append(doc["offer_code"])
            title = doc.get("title_en", "Unavailable")
            offer_price = doc.get("seller_price", "Unavailable")
            description = doc.get("description", "Unavailable")
            highlights = doc.get("highlights", "Unavailable")
            is_returnable = doc.get("is_returnable", "Unavailable")
            rating = doc.get("product_rating", "Unavailable")
            warranty = doc.get("warranty", "Unavailable")
            on_deal = doc.get('deal_flag') or 0
            m = {
                "sku": doc.get("sku_config"),
                "image_url": doc.get("image_url") + "?width=240",
                "product_url": f"https://www.{doc['product_url']}",
                "title": title
            }
            text = f"""
                Title: {title}. 
                Price: {offer_price}. 
                Description: {description}. 
                Highlights: {highlights}. 
                is_returnable: {is_returnable}. 
                rating: {rating}. 
                warranty: {warranty}. 
                is_on_deal: {on_deal}. 
                specs: {specs_text}.
            """.strip().replace("\n", " ")
            m["text"] = text
            metadata.append(m)
            texts.append(text)
            i += 1
        j += 1

Iteration: 0. Upserted: ~0
Iteration: 1000. Upserted: ~400
Iteration: 2000. Upserted: ~700
Iteration: 3000. Upserted: ~1200
Iteration: 4000. Upserted: ~1800
Iteration: 5000. Upserted: ~2200
Iteration: 6000. Upserted: ~2700
Iteration: 7000. Upserted: ~3300
Iteration: 8000. Upserted: ~3700
Iteration: 9000. Upserted: ~4200
Iteration: 10000. Upserted: ~4600
Iteration: 11000. Upserted: ~5200
