In [None]:
# %pip install --upgrade --quiet devtools

In [13]:
from dotenv import load_dotenv
import json
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
import re

# Load variables from a local .env file into the process environment before
# any client is constructed.
load_dotenv()

# Sentence-embedding model used to vectorise the product texts.
# NOTE(review): device is hard-coded to "mps" — presumably this notebook is run
# on Apple-silicon hardware; confirm before running on another machine.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME, device="mps")

In [14]:
# Catalog ingestion.
#
# Reads `filename` line by line (one JSON document per line), turns every
# usable product into a text blob plus a metadata dict, embeds the texts with
# `model` (the SentenceTransformer created in an earlier cell) and upserts
# (id, vector, metadata) triples into the "catalog-v2" Pinecone index in
# batches of `batch_size`.
filename = "./Chatbot_data_filtered-V2.json"

# No explicit credentials are passed here.
# NOTE(review): presumably the client picks them up from the environment
# populated by load_dotenv() — confirm.
pc = Pinecone()
index = pc.Index("catalog-v2")

i = j = 0  # i: records accepted for indexing, j: lines read from the file
batch_size = 100
upserted_count = 0
# Parallel buffers for the batch currently being assembled.
texts, ids, embeds, metadata = [], [], [], []


def _build_record(doc):
    """Turn one catalog document into an ``(id, text, metadata)`` triple.

    Args:
        doc: Decoded JSON object for a single product.

    Returns:
        ``None`` when the document lacks a title, price, SKU or
        specifications; otherwise a tuple ``(offer_code, text, metadata)``
        where ``text`` is the whitespace-normalised string that gets embedded
        and ``metadata`` is the dict stored next to the vector.

    Raises:
        KeyError: if an otherwise usable document has no ``offer_code``.
    """
    # product_url and description are optional; these three are mandatory.
    if not (doc.get("title_en") and doc.get("seller_price") and doc.get("sku_config")):
        return None

    specs = doc.get("specifications") or []
    if not specs:
        return None
    # Specifications are decoded from a JSON string; an already-decoded list
    # is accepted as-is instead of crashing in json.loads.
    if isinstance(specs, str):
        specs = json.loads(specs)

    spec_lines = []
    model_year = None
    for spec in specs:
        spec_name = spec.get("name")
        spec_value = spec.get("value")
        spec_lines.append(f"{spec_name} : {spec_value}" + "\n")
        if spec_name == "Model Year":
            model_year = spec_value
    specs_text = "".join(spec_lines)

    sku = doc["sku_config"]
    title = doc["title_en"]
    offer_price = doc["seller_price"]
    description = doc.get("description", "Unavailable")
    highlights = doc.get("highlights", "Unavailable")
    is_returnable = doc.get("is_returnable", "Unavailable")
    rating = doc.get("product_rating", "Unavailable")
    warranty = doc.get("warranty", "Unavailable")
    on_deal = doc.get("deal_flag") or 0
    product_url = doc.get("product_url")
    image_url = doc.get("image_url")
    # str() keeps this working when the flag is a JSON boolean or null rather
    # than the string "true"/"false".
    instock = (
        1
        if str(doc.get("instock_flag", "Unavailable")).strip().lower() == "true"
        else 0
    )

    m = {
        "image_url": (image_url + "?width=240") if image_url else "Unavailable",
        "product_url": f"https://www.{product_url}" if product_url else "Unavailable",
        "title": title,
        "price": offer_price,
        "rating": rating,
        "in_stock": instock,
        "sku": sku,
    }

    text = " ".join(
        [
            f"Title: {title}.",
            f"Price: {offer_price}.",
            f"Available: {instock}.",
            f"Description: {description}.",
            f"Highlights: {highlights}.",
            f"Returnable: {is_returnable}.",
            f"Rating: {rating}.",
            f"Warranty: {warranty}.",
            f"On deal: {on_deal}.",
            f"Specifications: {specs_text}.",
        ]
    )
    if model_year:
        m["model_year"] = model_year
        text += f" model_year: {model_year} "
    # Collapse every run of whitespace (including newlines coming from the
    # source fields and from specs_text) into a single space.
    text = re.sub(r"\s+", " ", text)
    m["text"] = text

    return doc["offer_code"], text, m


def _flush_batch():
    """Embed and upsert the buffered records, then reset the buffers.

    Does nothing when the buffers are empty, so it is safe to call
    unconditionally after the read loop.
    """
    global texts, ids, embeds, metadata, upserted_count
    if not texts:
        return
    embeds = model.encode(texts)
    # Materialise the triples so the client receives a concrete list rather
    # than a one-shot iterator.
    index.upsert(vectors=list(zip(ids, embeds, metadata)))
    upserted_count += len(embeds)
    texts, ids, embeds, metadata = [], [], [], []


with open(filename, encoding="utf-8") as file:
    for line in file:
        if j % 1000 == 0:
            print(f"Iteration: {j}. Upserted: ~{upserted_count}")
        # Count the line up front so skipped documents are counted too and
        # the progress message cannot repeat for the same value of j.
        j += 1

        record = _build_record(json.loads(line.rstrip()))
        if record is None:
            continue

        # Append to all three buffers together so they always stay aligned.
        offer_code, text, m = record
        ids.append(offer_code)
        texts.append(text)
        metadata.append(m)
        i += 1

        if len(texts) >= batch_size:
            _flush_batch()

# Upsert whatever is left over: the last batch is usually smaller than
# batch_size and would otherwise never reach the index.
_flush_batch()
print(f"Done. Lines read: {j}. Records accepted: {i}. Upserted: {upserted_count}")

Iteration: 0. Upserted: ~0
Iteration: 1000. Upserted: ~900
Iteration: 2000. Upserted: ~1900
Iteration: 3000. Upserted: ~2900
Iteration: 4000. Upserted: ~3800
Iteration: 5000. Upserted: ~4800
Iteration: 6000. Upserted: ~5800
Iteration: 7000. Upserted: ~6700
Iteration: 8000. Upserted: ~7700
Iteration: 9000. Upserted: ~8700
Iteration: 10000. Upserted: ~9700
Iteration: 11000. Upserted: ~10700
Iteration: 12000. Upserted: ~11600
Iteration: 13000. Upserted: ~12600
Iteration: 14000. Upserted: ~13600
Iteration: 15000. Upserted: ~14600
Iteration: 16000. Upserted: ~15600
Iteration: 17000. Upserted: ~16600
Iteration: 18000. Upserted: ~17500
Iteration: 19000. Upserted: ~18500
Iteration: 20000. Upserted: ~19500
Iteration: 21000. Upserted: ~20500
Iteration: 22000. Upserted: ~21400
Iteration: 23000. Upserted: ~22400
Iteration: 24000. Upserted: ~23400
Iteration: 25000. Upserted: ~24400
Iteration: 26000. Upserted: ~25300
Iteration: 27000. Upserted: ~26300
Iteration: 28000. Upserted: ~27300
Iteration: 29