In [1]:
import pandas as pd
df = pd.read_csv("walmart-products.csv")

In [2]:
df.head()

Unnamed: 0,timestamp,url,final_price,sku,currency,gtin,specifications,image_urls,top_reviews,rating_stars,...,sizes,colors,seller,other_attributes,customer_reviews,ingredients,initial_price,discount,ingredients_full,categories
0,2024-08-24 00:00:00.000,https://www.walmart.com/ip/Laura-Mercier-Cavia...,22.9,173530386,USD,736150100000.0,"[{""name"":""Brand"",""value"":""Laura Mercier""},{""na...","[""https://i5.walmartimages.com/seo/Laura-Merci...","{""negative"":{},""positive"":{}}","{""five_stars"":2,""four_stars"":3,""two_stars"":1}",...,[],"[""Sugar Frost"",""Tuxedo""]",Wal███t.c███,"[{""name"":""Instructions"",""value"":""Apply directl...","[{""name"":""Jac███"",""rating"":5,""review"":""My only...","Cyclopentasiloxane, trimethylsiloxysilicate, s...",,,"[{""type"":""Ingredients"",""values"":""Cyclopentasil...","[""Beauty"",""Makeup"",""Eye Makeup"",""Eye Shadow"",""..."
1,2024-08-24 00:00:00.000,https://www.walmart.com/ip/Exultantex-Grey-Bla...,47.88,430528189,USD,771077900000.0,"[{""name"":""Brand"",""value"":""Exultantex""},{""name""...","[""https://i5.walmartimages.com/seo/Exultantex-...","{""negative"":{""rating"":1,""review"":""Color not ac...","{""five_stars"":47,""four_stars"":4,""one_star"":4,""...",...,"[""50\"" x 54\"""",""50\"" x 63\"""",""50\"" x 84\"""",""50...","[""Black"",""Blue"",""Green"",""Gray"",""Natural(Ivory)...",Exu███nte███ome███,"[{""name"":""Fabric Care Instructions"",""value"":""M...","[{""name"":""Dana"",""rating"":5,""review"":""I love th...",,70.8,$22.92,,"[""Home"",""Decor"",""Curtains & Window Treatments""..."
2,2024-08-24 00:00:00.000,https://www.walmart.com/ip/Jessica-London-Wome...,33.24,6013308220,USD,465192000000.0,"[{""name"":""Features"",""value"":""Easy Care""},{""nam...","[""https://i5.walmartimages.com/seo/Jessica-Lon...","{""negative"":{},""positive"":{}}","{""five_stars"":2}",...,"[""S"",""M"",""L"",""1X"",""2X"",""3X""]","[""Aqua Sea"",""Dark Olive Green"",""Dark Sapphire""...",Ful███aut███ran███,"[{""name"":""Fabric Care Instructions"",""value"":""M...","[{""name"":""Bev███y F███"",""rating"":5,""review"":""V...",,37.99,$4.75,,"[""Clothing"",""Womens Plus"",""Plus Size Tops"",""Pl..."
3,2024-08-24 00:00:00.000,https://www.walmart.com/ip/100-Cotton-King-Per...,49.99,161657830,USD,840708100000.0,"[{""name"":""Brand"",""value"":""Simply Put""},{""name""...","[""https://i5.walmartimages.com/asr/d88fe658-b9...","{""negative"":{""rating"":1,""review"":""[This review...","{""five_stars"":78,""four_stars"":17,""one_star"":2,...",...,"[""Queen"",""King""]","[""Beige"",""Blue"",""Gray"",""Spa Blue""]",IC ███bal███c,"[{""name"":""Fabric Care Instructions"",""value"":""M...","[{""name"":""Jes███oge███"",""rating"":5,""review"":""[...",,71.99,$22.00,,"[""Home"",""Bedding"",""Duvet Covers"",""King Duvet C..."
4,2024-08-25 00:00:00.000,https://www.walmart.com/ip/Disney-Boys-Graphic...,12.99,5397071399,USD,460008600000.0,"[{""name"":""Country of Origin - Textiles"",""value...","[""https://i5.walmartimages.com/seo/Disney-Boys...","{""negative"":{""rating"":1,""review"":""This size sh...","{""five_stars"":12,""one_star"":1,""three_stars"":1,...",...,"[""2T"",""3T"",""3T-4T"",""3-4 Years"",""4-5 Years"",""4-...","[""1#Yellow Micky"",""2#Pink Minnie"",""3#Blue Dona...",Pat███,,"[{""name"":""Mary"",""rating"":5,""review"":""Love thes...",,29.0,$16.01,,"[""Clothing"",""Kids Clothing"",""Boys Clothing"",""B..."


In [4]:
import cohere
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import pandas as pd
import json
from dotenv import load_dotenv
import os
from tqdm import tqdm

# === CONFIG ===
load_dotenv()
COHERE_API_KEY = os.environ.get("Walmart-Text-Embedding")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY_FINAL")
INDEX_NAME = "walmart-text"
EMBED_MODEL = "embed-v4.0"

# === INIT ===
co = cohere.ClientV2(COHERE_API_KEY)

pc = Pinecone(api_key=PINECONE_API_KEY)
print("Existing indexes:", pc.list_indexes().names())
pc.list_indexes()
index = pc.Index(INDEX_NAME)
print(index)


# === LOAD DATA ===
df = pd.read_csv("walmart-products.csv")

# === EMBED + UPSERT LOOP ===
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        specs = json.loads(row['specifications']) if pd.notna(row['specifications']) else []
        spec_text = " | ".join([f"{s['name']}: {s['value']}" for s in specs])

        review_data = json.loads(row['customer_reviews']) if pd.notna(row['customer_reviews']) else []
        review_text = " | ".join([f"{r.get('name', '')}: {r.get('review', '')}" for r in review_data])

        other_attrs = row['other_attributes'] if pd.notna(row['other_attributes']) else ""
        ingredients = row['ingredients'] if pd.notna(row['ingredients']) else ""
        sizes = row['sizes'] if pd.notna(row['sizes']) else ""
        colors = row['colors'] if pd.notna(row['colors']) else ""
        top_reviews = row['top_reviews'] if pd.notna(row['top_reviews']) else ""
        rating = row['rating'] if pd.notna(row['rating']) else ""
        main_image = row['main_image'] if pd.notna(row['main_image']) else ""
        description = row['description'] if pd.notna(row['description']) else ""
        category_name = row['category_name'] if pd.notna(row['category_name']) else ""
        currency = row['currency'] if pd.notna(row['currency']) else ""

        text = f"""
        Product SKU: {row['sku']}
        Title/Description: {description}
        Brand Info: {spec_text}
        Categories: {row['categories']}
        Category Name: {category_name}
        Price: {row['final_price']} {currency}
        Currency: {currency}
        Top Reviews: {top_reviews}
        Ratings (out of 5): {rating}
        Sizes: {sizes}
        Colours: {colors}
        Main Image: {main_image}
        Other Attributes: {other_attrs}
        Customer Reviews: {review_text}
        Ingredients: {ingredients}
        """

        # === Get Embedding from Cohere ===
        response = co.v2.embed(
            texts=[text],
            model=EMBED_MODEL,
            input_type="search_document",  # use "search_query" when embedding queries
            embedding_types=["float"]  # specify embedding type
        )
        vector = response.embeddings.float[0]



        # === Upsert to Pinecone ===
        index.upsert([{
            "id": str(row["sku"]),
            "values": vector,
            "metadata": {
                "url": row["url"],
                "price": row["final_price"],
                "brand": [s["value"] for s in specs if s["name"].lower() == "brand"][0] if specs else None,
                "categories": row["categories"]
            }
        }])


    except Exception as e:
        print(f"Failed on row {row['sku']}: {e}")

Existing indexes: ['walmart-text', 'walmart-images']
<pinecone.grpc.index_grpc.GRPCIndex object at 0x11799bed0>


  4%|▍         | 45/1000 [00:36<10:41,  1.49it/s]

Failed on row 5702523237: list index out of range


  9%|▉         | 90/1000 [01:57<11:05,  1.37it/s]  

Failed on row 422458903: list index out of range


  9%|▉         | 94/1000 [02:00<09:54,  1.52it/s]

Failed on row 247655086: list index out of range


 16%|█▌        | 155/1000 [03:20<20:16,  1.44s/it]  

Failed on row 689096319: list index out of range


 17%|█▋        | 168/1000 [03:40<12:35,  1.10it/s]

Failed on row 3808974443: list index out of range


 23%|██▎       | 234/1000 [04:56<13:03,  1.02s/it]

Failed on row 28926816: list index out of range


 25%|██▌       | 253/1000 [05:23<19:20,  1.55s/it]

Failed on row 1624985447: list index out of range


 29%|██▉       | 292/1000 [06:22<29:22,  2.49s/it]

Failed on row 10294756: list index out of range


 30%|███       | 303/1000 [06:32<10:38,  1.09it/s]

Failed on row 393061304: list index out of range


 33%|███▎      | 329/1000 [06:55<07:32,  1.48it/s]

Failed on row 1577779507: list index out of range


 35%|███▌      | 351/1000 [07:38<11:15,  1.04s/it]

Failed on row 832929384: list index out of range


 38%|███▊      | 384/1000 [08:09<07:18,  1.40it/s]

Failed on row 5642502722: list index out of range


 40%|███▉      | 398/1000 [08:37<10:42,  1.07s/it]

Failed on row 5118743904: list index out of range


 42%|████▏     | 421/1000 [08:59<08:13,  1.17it/s]

Failed on row 10801127: list index out of range


 44%|████▍     | 439/1000 [09:42<12:09,  1.30s/it]

Failed on row 189455189: list index out of range


 45%|████▌     | 452/1000 [09:53<06:06,  1.50it/s]

Failed on row 702951316: list index out of range


 50%|█████     | 505/1000 [10:52<07:31,  1.10it/s]

Failed on row 5495892018: list index out of range


 57%|█████▋    | 567/1000 [12:00<05:14,  1.38it/s]

Failed on row 366367982: list index out of range


 58%|█████▊    | 576/1000 [12:07<05:28,  1.29it/s]

Failed on row 284718781: list index out of range


 62%|██████▏   | 622/1000 [13:04<08:42,  1.38s/it]

Failed on row 1850447630: list index out of range


 67%|██████▋   | 674/1000 [14:11<04:40,  1.16it/s]

Failed on row 5491200472: list index out of range


 68%|██████▊   | 675/1000 [14:13<05:58,  1.10s/it]

Failed on row 132075558: list index out of range


 70%|██████▉   | 697/1000 [14:31<03:51,  1.31it/s]

Failed on row 3331024751: list index out of range


 72%|███████▏  | 718/1000 [15:04<04:49,  1.03s/it]

Failed on row 990631587: list index out of range


 75%|███████▍  | 749/1000 [15:52<20:39,  4.94s/it]

Failed on row 5492294359: list index out of range


 84%|████████▍ | 841/1000 [17:40<02:32,  1.04it/s]

Failed on row 199409627: list index out of range


 84%|████████▍ | 844/1000 [17:42<02:03,  1.27it/s]

Failed on row 887305646: list index out of range


 86%|████████▌ | 859/1000 [17:55<01:45,  1.34it/s]

Failed on row 162709782: list index out of range


 87%|████████▋ | 869/1000 [18:03<01:27,  1.50it/s]

Failed on row 1639387353: list index out of range


100%|█████████▉| 995/1000 [20:36<00:03,  1.54it/s]

Failed on row 537359758: status_code: 429, body: {'id': '21d7e93b-3af4-48cb-bd54-dbd86767f250', 'message': "You are using a Trial key, which is limited to 1000 API calls / month. You can continue to use the Trial key for free or upgrade to a Production key with higher rate limits at 'https://dashboard.cohere.com/api-keys'. Contact us on 'https://discord.gg/XW44jPfYJu' or email us at support@cohere.com with any questions"}


100%|█████████▉| 996/1000 [20:36<00:02,  1.72it/s]

Failed on row 637751827: status_code: 429, body: {'id': '20e7b7b5-7a93-4fb4-b691-096e1445cce7', 'message': "You are using a Trial key, which is limited to 1000 API calls / month. You can continue to use the Trial key for free or upgrade to a Production key with higher rate limits at 'https://dashboard.cohere.com/api-keys'. Contact us on 'https://discord.gg/XW44jPfYJu' or email us at support@cohere.com with any questions"}


100%|█████████▉| 997/1000 [20:37<00:01,  1.97it/s]

Failed on row 5494180543: status_code: 429, body: {'id': '95ca5b8e-20e5-4f99-8910-c1f568902979', 'message': "You are using a Trial key, which is limited to 1000 API calls / month. You can continue to use the Trial key for free or upgrade to a Production key with higher rate limits at 'https://dashboard.cohere.com/api-keys'. Contact us on 'https://discord.gg/XW44jPfYJu' or email us at support@cohere.com with any questions"}


100%|█████████▉| 998/1000 [20:37<00:00,  2.20it/s]

Failed on row 578943868: status_code: 429, body: {'id': '99d2e396-3251-4034-b0f1-54d34e8f9427', 'message': "You are using a Trial key, which is limited to 1000 API calls / month. You can continue to use the Trial key for free or upgrade to a Production key with higher rate limits at 'https://dashboard.cohere.com/api-keys'. Contact us on 'https://discord.gg/XW44jPfYJu' or email us at support@cohere.com with any questions"}


100%|█████████▉| 999/1000 [20:37<00:00,  2.34it/s]

Failed on row 5286825403: status_code: 429, body: {'id': '1c29b0f2-dc4d-4985-baae-02453a09a67d', 'message': "You are using a Trial key, which is limited to 1000 API calls / month. You can continue to use the Trial key for free or upgrade to a Production key with higher rate limits at 'https://dashboard.cohere.com/api-keys'. Contact us on 'https://discord.gg/XW44jPfYJu' or email us at support@cohere.com with any questions"}


100%|██████████| 1000/1000 [20:38<00:00,  1.24s/it]

Failed on row 669112285: status_code: 429, body: {'id': '798673fd-4818-4227-94d4-080b782a1fe0', 'message': "You are using a Trial key, which is limited to 1000 API calls / month. You can continue to use the Trial key for free or upgrade to a Production key with higher rate limits at 'https://dashboard.cohere.com/api-keys'. Contact us on 'https://discord.gg/XW44jPfYJu' or email us at support@cohere.com with any questions"}





In [13]:
response = co.v2.embed(
            texts=["100% ORGANIC COTTON Queen size bedsheet pink color"],
            model=EMBED_MODEL,
            output_dimension=1024,
            input_type="search_document",
            embedding_types=["float"]  # specify embedding type
        )
vector = response.embeddings.float[0]

In [14]:
import pyperclip 
pyperclip.copy(vector)

In [None]:
import cohere
from PIL import Image
from io import BytesIO
import base64
import requests
import os
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import pandas as pd
import json
from dotenv import load_dotenv
from tqdm import tqdm

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY_FINAL")
pc = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = "walmart-images"
EMBED_MODEL = "embed-v4.0"
index = pc.Index(INDEX_NAME)

COHERE_API_KEY = os.getenv("COHERE_API_KEY")
co = cohere.ClientV2(COHERE_API_KEY)

df = pd.read_csv("walmart-products.csv")

def url_to_base64(image_url):
    response = requests.get(image_url)
    img = Image.open(BytesIO(response.content))
    image_format = img.format.lower()
    buffered = BytesIO()
    img.save(buffered, format=img.format)
    img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f"data:image/{image_format};base64,{img_base64}"

for _, row in tqdm(df.iterrows(), total=len(df)):
    try:
        specs = json.loads(row['specifications']) if pd.notna(row['specifications']) else []
        spec_text = " | ".join([f"{s['name']}: {s['value']}" for s in specs])

        review_data = json.loads(row['customer_reviews']) if pd.notna(row['customer_reviews']) else []
        review_text = " | ".join([f"{r.get('name', '')}: {r.get('review', '')}" for r in review_data])
        other_attrs = row['other_attributes'] if pd.notna(row['other_attributes']) else ""
        sizes = row['sizes'] if pd.notna(row['sizes']) else ""
        image_urls = row['image_urls'] if pd.notna(row['image_urls']) else []
        image_urls = image_urls[1:-1]
        for images in image_urls.split(","):
            input = {
                "content": [
                {"type": "image_url", "image_url": {"url": url_to_base64(images[1:-1])}}
                ]
            }

            #=== Get Embedding from Cohere ===
            response = co.v2.embed(
                inputs=[input],
                model=EMBED_MODEL,
                input_type="search_document",
                embedding_types=["float"]  # specify embedding type
            )
            image_vector = response.embeddings.float[0]
            
            # === Upsert to Pinecone ===
            index.upsert([{
                "id": str(row["sku"])+str(i),
                "values": image_vector,
                "metadata": {
                    "sku": row["sku"],
                    "url": row["url"],
                    "price": row["final_price"],
                    "brand": [s["value"] for s in specs if s["name"].lower() == "brand"][0] if specs else None,
                    "categories": row["categories"]
                }
            }])

    except Exception as e:
        print(f"Failed on row {row['sku']}: {e}")


In [16]:
import pyperclip 
pyperclip.copy(image_vector)

In [15]:
response = co.v2.embed(
        texts=["Black bottle with horse"],
        model=EMBED_MODEL,
        output_dimension=1024,
        input_type="search_document",  # use "search_query" when embedding queries
        embedding_types=["float"]  # specify embedding type
    )
vector = response.embeddings.float[0]
import pyperclip 
pyperclip.copy(vector)

In [None]:
def url_to_base64(image_url):
    response = requests.get(image_url)
    img = Image.open(BytesIO(response.content))
    image_format = img.format.lower()
    buffered = BytesIO()
    img.save(buffered, format=img.format)
    img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f"data:image/{image_format};base64,{img_base64}"

input = {
    "content": [
    {"type": "image_url", "image_url": {"url": url_to_base64("<testing-link>")}}  # replace with your image URL
    ]
}

# === Get Embedding from Cohere ===
response = co.v2.embed(
    inputs=[input],
    model=EMBED_MODEL,
    output_dimension=1024,
    input_type="search_document",
    embedding_types=["float"]  # specify embedding type
)
    
image_vector =response.embeddings.float[0]

# image_vector = response.embeddings.float[0]
import pyperclip 
pyperclip.copy(image_vector)