In [None]:
%pip install --upgrade --quiet devtools

In [1]:
from dotenv import load_dotenv
import json
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

load_dotenv()

  from tqdm.autonotebook import tqdm


True

In [4]:
# Load the sentence-embedding model used to vectorise product descriptions.
# Prefer Apple's Metal (MPS) backend when this machine supports it and fall
# back to the CPU otherwise, so the cell does not fail on hosts without MPS.
import torch

embedding_device = "mps" if torch.backends.mps.is_available() else "cpu"
model = SentenceTransformer(
    "sentence-transformers/all-mpnet-base-v2", device=embedding_device
)

In [6]:
# Ingest the product catalog into Pinecone.
#
# The input file is read as one JSON document per line. For every product that
# has all required fields and a non-empty "specifications" value, a text
# description is built, embedded with `model`, and upserted into the index
# together with a small metadata dict, `batch_size` records at a time.
filename = "./Chatbot_data_filtered-V2.json"

pc = Pinecone()
index = pc.Index("catalog-v2")
batch_size = 100
# A product is indexed only when each of these fields is present and truthy.
required_fields = (
    "title_en",
    "description",
    "seller_price",
    "image_url",
    "product_url",
)


def _upsert_batch(batch_ids, batch_texts, batch_metadata):
    """Embed `batch_texts` and upsert them into `index` as one request.

    The three lists are parallel: element n of each describes the same product.
    Returns the number of embeddings produced for the batch.
    """
    batch_embeds = model.encode(batch_texts)
    index.upsert(vectors=zip(batch_ids, batch_embeds, batch_metadata))
    return len(batch_embeds)


# i counts products accepted for indexing; j is the index of the current line.
i = j = 0
upserted_count = 0
texts, ids, metadata = [], [], []
with open(filename, encoding="utf-8") as file:
    # enumerate() keeps the line counter correct even when a line is skipped
    # with `continue` further down.
    for j, line in enumerate(file):
        if j % 1000 == 0:
            print(f"Iteration: {j}. Upserted: ~{upserted_count}")
        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing inside json.loads.
            continue
        doc = json.loads(line)
        if all(doc.get(field) for field in required_fields):
            specs = doc.get("specifications")
            if not specs:
                continue
            # The value is parsed with json.loads, i.e. it is expected to be a
            # JSON-encoded string; an already-decoded value is used as is.
            if isinstance(specs, str):
                specs = json.loads(specs)
            spec_lines = []
            model_year = None
            for spec in specs:
                k = spec.get("name")
                v = spec.get("value")
                spec_lines.append(f"{k} : {v}" + "\n")
                if k == "Model Year":
                    model_year = v
            specs_text = "".join(spec_lines)
            # Resolve the id first so a missing key fails before any of the
            # parallel batch lists has been extended.
            offer_code = doc["offer_code"]
            title = doc.get("title_en", "Unavailable")
            offer_price = doc.get("seller_price", "Unavailable")
            description = doc.get("description", "Unavailable")
            highlights = doc.get("highlights", "Unavailable")
            is_returnable = doc.get("is_returnable", "Unavailable")
            rating = doc.get("product_rating", "Unavailable")
            warranty = doc.get("warranty", "Unavailable")
            on_deal = doc.get('deal_flag') or 0
            m = {
                "sku": doc.get("sku_config"),
                "image_url": doc.get("image_url") + "?width=240",
                "product_url": f"https://www.noon.com/uae-en/xyz/{doc.get('sku_config')}/p/",
                "title": title,
                "price": offer_price,
                "rating": rating,
            }
            # NOTE: the whitespace inside the two literals below ends up in the
            # embedded/stored text, so their layout is kept exactly as it was.
            text = f"""
                Title: {title}. 
                Price: {offer_price}. 
                Description: {description}. 
                Highlights: {highlights}. 
                is_returnable: {is_returnable}. 
                rating: {rating}. 
                warranty: {warranty}. 
                is_on_deal: {on_deal}.
                specifications: {specs_text}.
            """.strip().replace("\n", " ")
            if model_year:
                m["model_year"] = model_year
                text += f"""
                model_year: {model_year}
                """
            m["text"] = text
            ids.append(offer_code)
            metadata.append(m)
            texts.append(text)
            i += 1
            if len(texts) >= batch_size:
                upserted_count += _upsert_batch(ids, texts, metadata)
                texts, ids, metadata = [], [], []

# Flush the final partial batch; without this the last (< batch_size) products
# read from the file would never reach the index.
if texts:
    upserted_count += _upsert_batch(ids, texts, metadata)
    texts, ids, metadata = [], [], []
print(f"Done. Upserted: {upserted_count}")

Iteration: 0. Upserted: ~0
Iteration: 1000. Upserted: ~400
Iteration: 2000. Upserted: ~900
Iteration: 3000. Upserted: ~1200
Iteration: 4000. Upserted: ~1700
Iteration: 5000. Upserted: ~2400
Iteration: 6000. Upserted: ~2700
Iteration: 7000. Upserted: ~3200
Iteration: 8000. Upserted: ~3700
Iteration: 9000. Upserted: ~4200
Iteration: 10000. Upserted: ~4700
Iteration: 11000. Upserted: ~5400
Iteration: 12000. Upserted: ~6000
Iteration: 13000. Upserted: ~6900
Iteration: 14000. Upserted: ~7600
Iteration: 15000. Upserted: ~8300
Iteration: 16000. Upserted: ~9100
Iteration: 17000. Upserted: ~9900
Iteration: 18000. Upserted: ~10300
Iteration: 19000. Upserted: ~10800
Iteration: 20000. Upserted: ~11500
Iteration: 21000. Upserted: ~11900
Iteration: 22000. Upserted: ~12500
Iteration: 23000. Upserted: ~13200
Iteration: 24000. Upserted: ~13800
Iteration: 25000. Upserted: ~14400
Iteration: 26000. Upserted: ~15200
Iteration: 27000. Upserted: ~15800
Iteration: 28000. Upserted: ~16400
Iteration: 29000. Ups