In [None]:
# %pip install --upgrade --quiet devtools

In [None]:
from dotenv import load_dotenv
import json
from pinecone import Pinecone

# from sentence_transformers import SentenceTransformer
import re
from langchain_openai import OpenAIEmbeddings

load_dotenv()

# model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device="mps")
embeddings = OpenAIEmbeddings()

In [10]:
filename = "../bot/sa-catalog-v3.json"

pc = Pinecone()
index = pc.Index("catalog-v2")

i = j = 0
batch_size = 100
upserted_count = 0
texts, ids, embeds, metadata = [], [], [], []

with open(filename) as file:
    for line in file:
        if j % 1000 == 0:
            print(f"Iteration: {j}. Upserted: ~{upserted_count}")
        if i % batch_size == 0 and texts:
            embeds = embeddings.embed_documents(texts)
            upserted_count += len(embeds)
            index.upsert(vectors=zip(ids, embeds, metadata))
            texts, ids, embeds, metadata = [], [], [], []
        doc = json.loads(line.rstrip())
        if (
            doc.get("title_en")
            and doc.get("seller_price")
            and doc.get("sku_config")
        ):
            specs = doc.get("specifications") or []
            if not specs:
                continue
            specs_text = ""
            specs = json.loads(specs)
            model_year = None
            for spec in specs:
                k = spec.get("name")
                v = spec.get("value")
                specs_text += f"{k} : {v}" + "\n"
                if k == "Model Year":
                    model_year = v

            ids.append(doc["sku_config"])

            sku = doc["sku_config"]
            title = f"{doc.get('brand_name', ' ')} {doc['title_en']}"
            offer_price = doc["seller_price"]
            description = doc.get("description", "")
            highlights = doc.get("highlights", "")
            image_url = f"{doc.get('image_url')}?width=240" if doc.get("image_url") else "Unavailable"
            product_url = f"https://www.{doc.get('product_url')}" if doc.get('product_url') else "Unavailable"
            rating = doc.get("product_rating") or "Unavailable"
            warranty = doc.get("warranty", "Unavailable")
            on_deal = doc.get("deal_flag") or 0
            is_returnable_text = doc.get("is_returnable") or 0
            is_returnable = "Yes"
            if is_returnable_text in ("0", False, "false", "False", 0):
                is_returnable = "No"
            if bool(is_returnable_text):
                is_returnable = "Yes"
            minutes_flag_text = doc.get("minutes_flag") or 0
            available_on_minutes = "Yes"
            if minutes_flag_text in ("0", False, "false", "False", 0):
                available_on_minutes = "No"
            rocket_flag_text = doc.get("rocket_flag") or 0
            available_on_rocket = "Yes"
            if rocket_flag_text in ("0", False, "false", "False", 0):
                available_on_rocket = "No"
            available_on_emi_text = doc.get("bnpl_flag") or 0
            available_on_emi = "Yes"
            if available_on_emi_text in ("0", False, "false", "False", 0):
                available_on_emi = "No"
            is_locker_eligible_text = doc.get("is_locker_eligible") or 0
            is_locker_eligible = "Yes"
            if is_locker_eligible_text in ("0", False, "false", "False", 0):
                is_locker_eligible = "No"
            installation_flag = doc.get("installation_flag") or 0
            product_url = f"https://www.noon.com/saudi-en/product-detail/{sku}/p" 
            discount_perc = 0
            try:
                discount_perc = round(doc.get("discount", 0) * 100)
            except:
                pass

            m = {
                "image_url": image_url,
                "product_url": product_url,
                "title": title,
                "price": offer_price,
                "rating": rating,
                "sku": sku,
                "warranty": warranty,
                "is_on_deal": on_deal,
                "is_returnable": is_returnable,
                "installation_provided": installation_flag,
                "discount_percentage": discount_perc,
                "available_on_minutes": available_on_minutes,
                "available_on_rocket": available_on_rocket,
                "bnpl": available_on_emi,
                "available_on_emi": available_on_emi,
                "is_locker_eligible": is_locker_eligible
            }
            text = f"""
                Title: {title}.
                Highlights: {highlights}.
                """.strip()
            if model_year:
                m["model_year"] = model_year
                m["launch_year"] = model_year
            text = re.sub(r"\s+", " ", text)
            m["text"] = text
            metadata.append(m)
            texts.append(text)
            i += 1
        j += 1



Iteration: 0. Upserted: ~0
Iteration: 1000. Upserted: ~900
Iteration: 2000. Upserted: ~1900
Iteration: 3000. Upserted: ~2900
Iteration: 4000. Upserted: ~3900
Iteration: 5000. Upserted: ~4900
Iteration: 6000. Upserted: ~5900
Iteration: 7000. Upserted: ~6900
Iteration: 8000. Upserted: ~7900
Iteration: 9000. Upserted: ~8900
Iteration: 10000. Upserted: ~9900
