In [3]:
%pip install --upgrade --quiet devtools

Note: you may need to restart the kernel to use updated packages.


In [9]:
import json

import torch
from dotenv import load_dotenv
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

load_dotenv()

True

In [10]:
# Pick the best available accelerator instead of assuming Apple-silicon "mps",
# so the notebook also runs on CUDA or CPU-only machines. On a machine where
# only MPS is available this still resolves to "mps".
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# NOTE(review): the target index name used later ends in "768"; presumably that
# is this model's embedding width -- confirm before swapping models.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)

In [11]:
filename = "./Chatbot_data_filtered.json"
# No credentials are passed explicitly; presumably the client picks up its API
# key from the environment populated by load_dotenv() -- confirm.
pc = Pinecone()
index = pc.Index("catalog-v2-768")

batch_size = 100

# A product is indexed only when every one of these fields is present and truthy.
# NOTE(review): requiring a truthy "is_returnable" also drops products whose
# value is False -- confirm that excluding non-returnable items is intended.
required_fields = (
    "title_en",
    "description",
    "offer_price",
    "image_url",
    "product_url",
    "is_returnable",
)


def _specs_to_text(specs):
    """Render specification entries as one "name : value" line per entry.

    Each entry is read with ``.get("name")`` / ``.get("value")``, so entries
    are assumed to be dicts -- TODO confirm against the data file.
    """
    return "".join(f"{spec.get('name')} : {spec.get('value')}\n" for spec in specs)


def _product_text(doc):
    """Build the newline-separated description that gets embedded for a product."""
    title = doc.get("title_en", "Unavailable")
    offer_price = doc.get("offer_price", doc.get("msrp", "Unavailable"))
    description = doc.get("description", "Unavailable")
    highlights = doc.get("highlights", "Unavailable")
    is_returnable = doc.get("is_returnable", "Unavailable")
    rating = doc.get("product_rating", "Unavailable")
    warranty = doc.get("warranty", "Unavailable")
    return (
        f"Title: {title}\n"
        f"Price: {offer_price}\n"
        f"Description: {description}.\n"
        f"Highlights: {highlights}.\n"
        f"is_returnable: {is_returnable}\n"
        f"rating: {rating}\n"
        f"warranty: {warranty}\n"
        # The fallback must live inside the braces to be evaluated.
        f"is_on_deal: {doc.get('deal_flag') or 'False'}"
    )


def _product_metadata(doc, text):
    """Build the metadata dict stored next to the product's vector."""
    return {
        "sku": doc.get("sku_config"),
        "title": doc.get("title_en"),
        # NOTE(review): the raw image URL is stored; add a size query string
        # here if a resized thumbnail is what consumers should receive.
        "image_url": doc.get("image_url"),
        "product_url": f"https://www.{doc['product_url']}",
        "specs": _specs_to_text(doc.get("specifications") or []),
        "text": text,
    }


def _upsert_batch(ids, texts, metadata):
    """Embed ``texts`` with ``model`` and upsert them into ``index``.

    Returns the number of vectors sent.
    """
    embeds = model.encode(texts)
    # Send a concrete list of (id, values, metadata) tuples with plain-float
    # values rather than a lazy zip over array rows.
    index.upsert(vectors=list(zip(ids, embeds.tolist(), metadata)))
    return len(ids)


upserted_count = 0
ids, texts, metadata = [], [], []
with open(filename, encoding="utf-8") as file:
    # The file is read as JSON Lines: one product document per line.
    for line_no, line in enumerate(file):
        if line_no % 1000 == 0:
            print(f"Iteration: {line_no}. Upserted: ~{upserted_count}")

        line = line.strip()
        if not line:
            continue  # tolerate blank lines instead of failing in json.loads

        doc = json.loads(line)
        if not all(doc.get(field) for field in required_fields):
            continue

        text = _product_text(doc)
        ids.append(doc["offer_code"])
        texts.append(text)
        metadata.append(_product_metadata(doc, text))

        if len(texts) >= batch_size:
            upserted_count += _upsert_batch(ids, texts, metadata)
            ids, texts, metadata = [], [], []

# Flush the last, partially filled batch; without this the trailing
# (< batch_size) products would never reach the index.
if texts:
    upserted_count += _upsert_batch(ids, texts, metadata)
    ids, texts, metadata = [], [], []

print(f"Done. Upserted: {upserted_count}")

Iteration: 0. Upserted: ~0
Iteration: 1000. Upserted: ~400
Iteration: 2000. Upserted: ~700
Iteration: 3000. Upserted: ~1100
Iteration: 4000. Upserted: ~1600
Iteration: 5000. Upserted: ~1900
Iteration: 6000. Upserted: ~2400
Iteration: 7000. Upserted: ~2800
Iteration: 8000. Upserted: ~3100
Iteration: 9000. Upserted: ~3500
Iteration: 10000. Upserted: ~4000
Iteration: 11000. Upserted: ~4700
Iteration: 12000. Upserted: ~5200
Iteration: 13000. Upserted: ~6000
Iteration: 14000. Upserted: ~6500
Iteration: 15000. Upserted: ~6900
Iteration: 16000. Upserted: ~7300
Iteration: 17000. Upserted: ~7800
Iteration: 18000. Upserted: ~8200
Iteration: 19000. Upserted: ~8700
Iteration: 20000. Upserted: ~9200
Iteration: 21000. Upserted: ~9700
Iteration: 22000. Upserted: ~10300
Iteration: 23000. Upserted: ~10800
Iteration: 24000. Upserted: ~11200
Iteration: 25000. Upserted: ~11500
Iteration: 26000. Upserted: ~11800
Iteration: 27000. Upserted: ~12000
Iteration: 28000. Upserted: ~12500
Iteration: 29000. Upserte

KeyboardInterrupt: 