# Load ENV

In [23]:
import os
from dotenv import load_dotenv
# Python Built-Ins:
import os
from typing import Optional
import sys
import json
import time

# External Dependencies:
import boto3
from botocore.config import Config
import botocore
import requests
from bs4 import BeautifulSoup

# Load environment variables from .env file
load_dotenv()
# Now you can access the environment variable
AWS_ACCESS_KEY_ID = os.getenv('aws_access_key_id')
AWS_SECRET_ACCESS_KEY = os.getenv('aws_secret_access_key')

# Provided Bedrock setup
def get_bedrock_client(
    runtime: Optional[bool] = True,
    aws_access_key_id: Optional[str] = None,
    aws_secret_access_key: Optional[str] = None,
    aws_session_token: Optional[str] = None
):
    service_name = 'bedrock-runtime' if runtime else 'bedrock'
    bedrock_runtime = boto3.client(
        service_name=service_name,
        region_name="us-west-2",
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        aws_session_token=aws_session_token
    )
    print("boto3 Bedrock client successfully created!")
    return bedrock_runtime

bedrock_runtime = get_bedrock_client()

def invoke_model(body, model_id, accept, content_type):
    """
    Invokes Amazon bedrock model to run an inference
    using the input provided in the request body.
    
    Args:
        body (dict): The invokation body to send to bedrock
        model_id (str): the model to query
        accept (str): input accept type
        content_type (str): content type
    Returns:
        Inference response from the model.
    """

    try:
        start_time = time.time()
        response = bedrock_runtime.invoke_model(
            body=json.dumps(body), 
            modelId=model_id, 
            accept=accept, 
            contentType=content_type
        )
        elapsed_time = time.time() - start_time
        print(f"Model invocation took {elapsed_time:.3f} seconds.")

        return response

    except Exception as e:
        print(f"Couldn't invoke {model_id}")
        raise e

boto3 Bedrock client successfully created!


# Web Scraping Old Navy Clothes

In [16]:
import os
import time
import csv
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re

# Setup headless browser
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
driver = webdriver.Chrome(options=options)

# ✅ Load page (even though URL includes a hash, we'll use what browser renders)
url = "https://oldnavy.gap.com/browse/men/shop-all-mens?cid=1031099"
driver.get(url)
time.sleep(4)

# Scroll to load products
for _ in range(10):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)

# Parse page
soup = BeautifulSoup(driver.page_source, "html.parser")

# ✅ Only look at images inside product cards
product_cards = soup.find_all("div", class_="product-card")

# Setup output
output_dir = "filtered_tshirts"
os.makedirs(output_dir, exist_ok=True)
csv_file = open("filtered_tshirts.csv", "w", newline="", encoding="utf-8")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["Image Filename", "Alt Text", "Image URL", "Product URL"])

# Extract and save product grid images only
downloaded = 0
for idx, card in enumerate(product_cards):
    try:
        img = card.find("img", src=True)
        if not img:
            continue

        src = img["src"]
        if not src.startswith("http"):
            src = "https://oldnavy.gap.com" + src

        alt = img.get("alt", "").strip()

        # Get product link
        a_tag = card.find("a", href=True)
        href = a_tag["href"] if a_tag else ""
        if href and not href.startswith("http"):
            href = "https://oldnavy.gap.com" + href

        # Sanitize filename
        safe_alt = re.sub(r'[\\/*?:"<>|]', "_", alt)[:50] or f"tshirt_{idx}"
        filename = f"{safe_alt}_{idx}.jpg"
        filepath = os.path.join(output_dir, filename)

        # Download image
        img_data = requests.get(src).content
        with open(filepath, "wb") as f:
            f.write(img_data)

        csv_writer.writerow([filename, alt, src, href])
        print(f"[{idx}] ✅ Saved: {filename}")
        downloaded += 1

    except Exception as e:
        print(f"[{idx}] ❌ Failed to process image: {e}")

csv_file.close()
driver.quit()
print(f"\n✅ Done! Downloaded {downloaded} product grid images and saved metadata to filtered_tshirts.csv")


[0] ✅ Saved: Essential Woven Workout Shorts -- 7-inch inseam_0.jpg
[1] ✅ Saved: Dynamic Fleece Textured Half Zip_1.jpg
[2] ✅ Saved: Dynamic Fleece Textured Half Zip_2.jpg
[3] ✅ Saved: Tech Hybrid Chino Shorts -- 8-inch inseam_3.jpg
[4] ✅ Saved: Essential Woven Workout Shorts -- 9-inch inseam_4.jpg
[5] ✅ Saved: Essential Woven Workout Shorts -- 9-inch inseam_5.jpg
[6] ✅ Saved: Essential Woven Workout Shorts -- 9-inch inseam_6.jpg
[7] ✅ Saved: ProTrain  Shorts -- 7-inch inseam_7.jpg
[8] ✅ Saved: Dynamic Fleece Textured Hoodie_8.jpg
[9] ✅ Saved: CloudMotion T-Shirt 2-Pack_9.jpg
[10] ✅ Saved: Dynamic Fleece 4.0 Zip Hoodie_10.jpg
[11] ✅ Saved: Dynamic Fleece 4.0 Zip Hoodie_11.jpg
[12] ✅ Saved: Dynamic Fleece 4.0 Zip Hoodie_12.jpg
[13] ✅ Saved: Dynamic Fleece 4.0 Zip Hoodie_13.jpg
[14] ✅ Saved: Dynamic Fleece 4.0 Zip Hoodie_14.jpg
[15] ✅ Saved: Dynamic Fleece 4.0 Hoodie_15.jpg
[16] ✅ Saved: Dynamic Fleece 4.0 Hoodie_16.jpg
[17] ✅ Saved: Dynamic Fleece 4.0 Hoodie_17.jpg
[18] ✅ Saved: Dynamic 

In [20]:
import os
import time
import csv
import requests
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# Setup headless browser
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# Output setup
output_dir = "mens_all_pages"
os.makedirs(output_dir, exist_ok=True)
csv_file = open("mens_all_pages.csv", "w", newline="", encoding="utf-8")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["Image Filename", "Alt Text", "Image URL", "Product URL"])

downloaded = 0

# Loop through pageId=0 to pageId=8
for page_id in range(9):
    print(f"\n📄 Scraping Page {page_id + 1}...")

    # Construct page URL (hash is used by JavaScript in browser)
    url = f"https://oldnavy.gap.com/browse/men/shop-all-mens?cid=1031099#pageId={page_id}"
    driver.get(url)
    time.sleep(6)

    # Scroll to bottom to trigger lazy-loading
    for _ in range(5):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.5)

    # Parse
    soup = BeautifulSoup(driver.page_source, "html.parser")
    product_cards = soup.find_all("div", class_="product-card")

    if not product_cards:
        print("⚠️ No product cards found on this page.")
        continue

    for idx, card in enumerate(product_cards):
        try:
            img = card.find("img", src=True)
            if not img:
                continue

            src = img["src"]
            if not src.startswith("http"):
                src = "https://oldnavy.gap.com" + src

            alt = img.get("alt", "").strip()

            a_tag = card.find("a", href=True)
            href = a_tag["href"] if a_tag else ""
            if href and not href.startswith("http"):
                href = "https://oldnavy.gap.com" + href

            safe_alt = re.sub(r'[\\/*?:"<>|]', "_", alt)[:50] or f"product_{page_id}_{idx}"
            filename = f"{safe_alt}_p{page_id}_{idx}.jpg"
            filepath = os.path.join(output_dir, filename)

            img_data = requests.get(src).content
            with open(filepath, "wb") as f:
                f.write(img_data)

            csv_writer.writerow([filename, alt, src, href])
            print(f"[{downloaded}] ✅ Saved: {filename}")
            downloaded += 1

        except Exception as e:
            print(f"[{downloaded}] ❌ Error: {e}")

driver.quit()
csv_file.close()
print(f"\n✅ Done! Downloaded {downloaded} product images across 9 pages.")




📄 Scraping Page 1...
[0] ✅ Saved: Essential Woven Workout Shorts -- 7-inch inseam_p0_0.jpg
[1] ✅ Saved: Dynamic Fleece Textured Half Zip_p0_1.jpg
[2] ✅ Saved: Dynamic Fleece Textured Half Zip_p0_2.jpg
[3] ✅ Saved: Tech Hybrid Chino Shorts -- 8-inch inseam_p0_3.jpg
[4] ✅ Saved: Essential Woven Workout Shorts -- 9-inch inseam_p0_4.jpg
[5] ✅ Saved: Essential Woven Workout Shorts -- 9-inch inseam_p0_5.jpg
[6] ✅ Saved: Essential Woven Workout Shorts -- 9-inch inseam_p0_6.jpg
[7] ✅ Saved: ProTrain  Shorts -- 7-inch inseam_p0_7.jpg
[8] ✅ Saved: Dynamic Fleece Textured Hoodie_p0_8.jpg
[9] ✅ Saved: CloudMotion T-Shirt 2-Pack_p0_9.jpg
[10] ✅ Saved: Dynamic Fleece 4.0 Zip Hoodie_p0_10.jpg
[11] ✅ Saved: Dynamic Fleece 4.0 Zip Hoodie_p0_11.jpg
[12] ✅ Saved: Dynamic Fleece 4.0 Zip Hoodie_p0_12.jpg
[13] ✅ Saved: Dynamic Fleece 4.0 Zip Hoodie_p0_13.jpg
[14] ✅ Saved: Dynamic Fleece 4.0 Zip Hoodie_p0_14.jpg
[15] ✅ Saved: Dynamic Fleece 4.0 Hoodie_p0_15.jpg
[16] ✅ Saved: Dynamic Fleece 4.0 Hoodie_p0_16.

In [27]:
import os
import time
import csv
import requests
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# ---- CONFIG ----
page_id = 2  # 👈 Change this for pages 1–8
output_dir = f"mens_products_pageid_{page_id}"
csv_filename = f"mens_products_pageid_{page_id}.csv"
# ----------------

# Setup headless browser
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# Load base page and set hash
url = "https://oldnavy.gap.com/browse/men/shop-all-mens?cid=1031099"
driver.get(url)
time.sleep(4)
driver.execute_script(f"window.location.hash = 'pageId={page_id}'")
time.sleep(5)

# Scroll to load all items
for _ in range(6):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1.5)

# Parse HTML
soup = BeautifulSoup(driver.page_source, "html.parser")
product_cards = soup.find_all("div", class_="product-card")

# Setup output
os.makedirs(output_dir, exist_ok=True)
csv_file = open(csv_filename, "w", newline="", encoding="utf-8")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["Image Filename", "Alt Text", "Image URL", "Product URL"])

downloaded = 0
for idx, card in enumerate(product_cards):
    try:
        img = card.find("img", src=True)
        if not img:
            continue

        src = img["src"]
        if not src.startswith("http"):
            src = "https://oldnavy.gap.com" + src

        alt = img.get("alt", "").strip()

        a_tag = card.find("a", href=True)
        href = a_tag["href"] if a_tag else ""
        if href and not href.startswith("http"):
            href = "https://oldnavy.gap.com" + href

        # Sanitize filename
        safe_alt = re.sub(r'[\\/*?:"<>|]', "_", alt)[:50] or f"product_{page_id}_{idx}"
        filename = f"{safe_alt}_pid{page_id}_{idx}.jpg"
        filepath = os.path.join(output_dir, filename)

        # Download image
        img_data = requests.get(src).content
        with open(filepath, "wb") as f:
            f.write(img_data)

        csv_writer.writerow([filename, alt, src, href])
        print(f"[{downloaded}] ✅ Saved: {filename}")
        downloaded += 1

    except Exception as e:
        print(f"[{downloaded}] ❌ Error: {e}")

csv_file.close()
driver.quit()
print(f"\n✅ Done! Downloaded {downloaded} product images from pageId={page_id}.")


[0] ✅ Saved: Straight Trouser Pants_pid2_0.jpg
[1] ✅ Saved: Straight Pinstripe Trouser Pants_pid2_1.jpg
[2] ✅ Saved: Baggy Built-In Flex Rotation Chino Pants_pid2_2.jpg
[3] ✅ Saved: Baggy Built-In Flex Rotation Chino Pants_pid2_3.jpg
[4] ✅ Saved: Baggy Built-In Flex Rotation Chino Pants_pid2_4.jpg
[5] ✅ Saved: Slim Rotation Chino Pants_pid2_5.jpg
[6] ✅ Saved: Slim Rotation Chino Pants_pid2_6.jpg
[7] ✅ Saved: Straight Carpenter Pants_pid2_7.jpg
[8] ✅ Saved: Heavyweight Crew-Neck Graphic T-Shirt_pid2_8.jpg
[9] ✅ Saved: 2025 Flag Graphic T-Shirt_pid2_9.jpg
[10] ✅ Saved: 2025 Flag Graphic T-Shirt_pid2_10.jpg
[11] ✅ Saved: 2025 Flag Graphic T-Shirt_pid2_11.jpg
[12] ✅ Saved: 2025 Flag Graphic T-Shirt_pid2_12.jpg
[13] ✅ Saved: 2025 Flag Graphic T-Shirt_pid2_13.jpg
[14] ✅ Saved: Graphic Tank Top_pid2_14.jpg
[15] ✅ Saved: Graphic Tank Top_pid2_15.jpg
[16] ✅ Saved: Crew-Neck Graphic T-Shirt_pid2_16.jpg
[17] ✅ Saved: Crew-Neck Graphic T-Shirt_pid2_17.jpg
[18] ✅ Saved: Crew-Neck Graphic T-Shirt_pi

In [28]:
import os
import time
import csv
import requests
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# ---- CONFIG ----
page_id = 3  # 👈 Change this for pages 1–8
output_dir = f"mens_products_pageid_{page_id}"
csv_filename = f"mens_products_pageid_{page_id}.csv"
# ----------------

# Setup headless browser
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# Load base page and set hash
url = "https://oldnavy.gap.com/browse/men/shop-all-mens?cid=1031099"
driver.get(url)
time.sleep(4)
driver.execute_script(f"window.location.hash = 'pageId={page_id}'")
time.sleep(5)

# Scroll to load all items
for _ in range(6):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1.5)

# Parse HTML
soup = BeautifulSoup(driver.page_source, "html.parser")
product_cards = soup.find_all("div", class_="product-card")

# Setup output
os.makedirs(output_dir, exist_ok=True)
csv_file = open(csv_filename, "w", newline="", encoding="utf-8")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["Image Filename", "Alt Text", "Image URL", "Product URL"])

downloaded = 0
for idx, card in enumerate(product_cards):
    try:
        img = card.find("img", src=True)
        if not img:
            continue

        src = img["src"]
        if not src.startswith("http"):
            src = "https://oldnavy.gap.com" + src

        alt = img.get("alt", "").strip()

        a_tag = card.find("a", href=True)
        href = a_tag["href"] if a_tag else ""
        if href and not href.startswith("http"):
            href = "https://oldnavy.gap.com" + href

        # Sanitize filename
        safe_alt = re.sub(r'[\\/*?:"<>|]', "_", alt)[:50] or f"product_{page_id}_{idx}"
        filename = f"{safe_alt}_pid{page_id}_{idx}.jpg"
        filepath = os.path.join(output_dir, filename)

        # Download image
        img_data = requests.get(src).content
        with open(filepath, "wb") as f:
            f.write(img_data)

        csv_writer.writerow([filename, alt, src, href])
        print(f"[{downloaded}] ✅ Saved: {filename}")
        downloaded += 1

    except Exception as e:
        print(f"[{downloaded}] ❌ Error: {e}")

csv_file.close()
driver.quit()
print(f"\n✅ Done! Downloaded {downloaded} product images from pageId={page_id}.")

[0] ✅ Saved: CloudMotion Space-Dye T-Shirt_pid3_0.jpg
[1] ✅ Saved: Crew-Neck Logo T-Shirt_pid3_1.jpg
[2] ✅ Saved: Crew-Neck Logo T-Shirt_pid3_2.jpg
[3] ✅ Saved: Crew-Neck Graphic T-Shirt_pid3_3.jpg
[4] ✅ Saved: Crew-Neck Graphic T-Shirt_pid3_4.jpg
[5] ✅ Saved: Crew-Neck Graphic T-Shirt_pid3_5.jpg
[6] ✅ Saved: Crew-Neck Graphic T-Shirt_pid3_6.jpg
[7] ✅ Saved: Crew-Neck Graphic T-Shirt_pid3_7.jpg
[8] ✅ Saved: Long-Sleeve Rotation T-Shirt_pid3_8.jpg
[9] ✅ Saved: CloudMotion T-Shirt 2-Pack_pid3_9.jpg
[10] ✅ Saved: CloudMotion T-Shirt 2-Pack_pid3_10.jpg
[11] ✅ Saved: CloudMotion T-Shirt 2-Pack_pid3_11.jpg
[12] ✅ Saved: Graphic T-Shirt_pid3_12.jpg
[13] ✅ Saved: NFL™ Patriots™ T-Shirt_pid3_13.jpg
[14] ✅ Saved: NFL™ Pittsburgh Steelers™ T-Shirt_pid3_14.jpg
[15] ✅ Saved: NFL™ Kansas City Chiefs™ T-Shirt_pid3_15.jpg
[16] ✅ Saved: NFL™ New York Giants™ T-Shirt_pid3_16.jpg
[17] ✅ Saved: CloudMotion T-Shirt_pid3_17.jpg
[18] ✅ Saved: Loose Fit Crew-Neck T-Shirt_pid3_18.jpg
[19] ✅ Saved: Crew-Neck Po

In [29]:
import os
import time
import csv
import requests
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# ---- CONFIG ----
page_id = 4  # 👈 Change this for pages 1–8
output_dir = f"mens_products_pageid_{page_id}"
csv_filename = f"mens_products_pageid_{page_id}.csv"
# ----------------

# Setup headless browser
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# Load base page and set hash
url = "https://oldnavy.gap.com/browse/men/shop-all-mens?cid=1031099"
driver.get(url)
time.sleep(4)
driver.execute_script(f"window.location.hash = 'pageId={page_id}'")
time.sleep(5)

# Scroll to load all items
for _ in range(6):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1.5)

# Parse HTML
soup = BeautifulSoup(driver.page_source, "html.parser")
product_cards = soup.find_all("div", class_="product-card")

# Setup output
os.makedirs(output_dir, exist_ok=True)
csv_file = open(csv_filename, "w", newline="", encoding="utf-8")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["Image Filename", "Alt Text", "Image URL", "Product URL"])

downloaded = 0
for idx, card in enumerate(product_cards):
    try:
        img = card.find("img", src=True)
        if not img:
            continue

        src = img["src"]
        if not src.startswith("http"):
            src = "https://oldnavy.gap.com" + src

        alt = img.get("alt", "").strip()

        a_tag = card.find("a", href=True)
        href = a_tag["href"] if a_tag else ""
        if href and not href.startswith("http"):
            href = "https://oldnavy.gap.com" + href

        # Sanitize filename
        safe_alt = re.sub(r'[\\/*?:"<>|]', "_", alt)[:50] or f"product_{page_id}_{idx}"
        filename = f"{safe_alt}_pid{page_id}_{idx}.jpg"
        filepath = os.path.join(output_dir, filename)

        # Download image
        img_data = requests.get(src).content
        with open(filepath, "wb") as f:
            f.write(img_data)

        csv_writer.writerow([filename, alt, src, href])
        print(f"[{downloaded}] ✅ Saved: {filename}")
        downloaded += 1

    except Exception as e:
        print(f"[{downloaded}] ❌ Error: {e}")

csv_file.close()
driver.quit()
print(f"\n✅ Done! Downloaded {downloaded} product images from pageId={page_id}.")

[0] ✅ Saved: Oversized Cropped Poplin Shirt_pid4_0.jpg
[1] ✅ Saved: Oversized Poplin Shirt_pid4_1.jpg
[2] ✅ Saved: Oversized Poplin Shirt_pid4_2.jpg
[3] ✅ Saved: Oversized Poplin Shirt_pid4_3.jpg
[4] ✅ Saved: Oversized Cropped Poplin Shirt_pid4_4.jpg
[5] ✅ Saved: Oversized Cropped Poplin Shirt_pid4_5.jpg
[6] ✅ Saved: Oversized Cropped Poplin Shirt_pid4_6.jpg
[7] ✅ Saved: Classic Fit Seersucker Shirt_pid4_7.jpg
[8] ✅ Saved: Classic Fit Pique Polo_pid4_8.jpg
[9] ✅ Saved: Classic Fit Pique Polo_pid4_9.jpg
[10] ✅ Saved: Classic Fit Pique Polo_pid4_10.jpg
[11] ✅ Saved: Classic Fit Pique Polo_pid4_11.jpg
[12] ✅ Saved: Uniform Pique Polo_pid4_12.jpg
[13] ✅ Saved: Quarter-Zip Waffle Polo Sweater_pid4_13.jpg
[14] ✅ Saved: Classic Fit Pique Polo_pid4_14.jpg
[15] ✅ Saved: CloudMotion Polo_pid4_15.jpg
[16] ✅ Saved: CloudMotion Pique Polo_pid4_16.jpg
[17] ✅ Saved: CloudMotion Polo_pid4_17.jpg
[18] ✅ Saved: CloudMotion Polo_pid4_18.jpg
[19] ✅ Saved: CloudMotion Polo_pid4_19.jpg
[20] ✅ Saved: CloudMo

In [30]:
import os
import time
import csv
import requests
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# ---- CONFIG ----
page_id = 5  # 👈 Change this for pages 1–8
output_dir = f"mens_products_pageid_{page_id}"
csv_filename = f"mens_products_pageid_{page_id}.csv"
# ----------------

# Setup headless browser
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# Load base page and set hash
url = "https://oldnavy.gap.com/browse/men/shop-all-mens?cid=1031099"
driver.get(url)
time.sleep(4)
driver.execute_script(f"window.location.hash = 'pageId={page_id}'")
time.sleep(5)

# Scroll to load all items
for _ in range(6):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1.5)

# Parse HTML
soup = BeautifulSoup(driver.page_source, "html.parser")
product_cards = soup.find_all("div", class_="product-card")

# Setup output
os.makedirs(output_dir, exist_ok=True)
csv_file = open(csv_filename, "w", newline="", encoding="utf-8")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["Image Filename", "Alt Text", "Image URL", "Product URL"])

downloaded = 0
for idx, card in enumerate(product_cards):
    try:
        img = card.find("img", src=True)
        if not img:
            continue

        src = img["src"]
        if not src.startswith("http"):
            src = "https://oldnavy.gap.com" + src

        alt = img.get("alt", "").strip()

        a_tag = card.find("a", href=True)
        href = a_tag["href"] if a_tag else ""
        if href and not href.startswith("http"):
            href = "https://oldnavy.gap.com" + href

        # Sanitize filename
        safe_alt = re.sub(r'[\\/*?:"<>|]', "_", alt)[:50] or f"product_{page_id}_{idx}"
        filename = f"{safe_alt}_pid{page_id}_{idx}.jpg"
        filepath = os.path.join(output_dir, filename)

        # Download image
        img_data = requests.get(src).content
        with open(filepath, "wb") as f:
            f.write(img_data)

        csv_writer.writerow([filename, alt, src, href])
        print(f"[{downloaded}] ✅ Saved: {filename}")
        downloaded += 1

    except Exception as e:
        print(f"[{downloaded}] ❌ Error: {e}")

csv_file.close()
driver.quit()
print(f"\n✅ Done! Downloaded {downloaded} product images from pageId={page_id}.")

[0] ✅ Saved: Dungeons & Dragons™ T-Shirt_pid5_0.jpg
[1] ✅ Saved: Journey™ T-Shirt_pid5_1.jpg
[2] ✅ Saved: Star Wars™ T-Shirt_pid5_2.jpg
[3] ✅ Saved: Heavyweight Cropped Graphic T-Shirt_pid5_3.jpg
[4] ✅ Saved: Heavyweight Cropped Graphic T-Shirt_pid5_4.jpg
[5] ✅ Saved: Heavyweight Cropped Graphic T-Shirt_pid5_5.jpg
[6] ✅ Saved: Heavyweight Cropped Graphic T-Shirt_pid5_6.jpg
[7] ✅ Saved: Crew-Neck Graphic T-Shirt_pid5_7.jpg
[8] ✅ Saved: SoSoft Fair Isle Sweater_pid5_8.jpg
[9] ✅ Saved: SoSoft Fair Isle Sweater_pid5_9.jpg
[10] ✅ Saved: So-Soft Crew-Neck Sweater_pid5_10.jpg
[11] ✅ Saved: Chenille Quarter Zip_pid5_11.jpg
[12] ✅ Saved: SoSoft Cable Sweater_pid5_12.jpg
[13] ✅ Saved: Textured Button-Down Sweater_pid5_13.jpg
[14] ✅ Saved: Crew-Neck Sweater_pid5_14.jpg
[15] ✅ Saved: V-Neck Sweater_pid5_15.jpg
[16] ✅ Saved: V-Neck Sweater_pid5_16.jpg
[17] ✅ Saved: Striped Sweater_pid5_17.jpg
[18] ✅ Saved: Shaker-Stitch Sweater_pid5_18.jpg
[19] ✅ Saved: SoSoft Fair Isle Vest_pid5_19.jpg
[20] ✅ Save

In [31]:
import os
import time
import csv
import requests
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# ---- CONFIG ----
page_id = 6  # 👈 Change this for pages 1–8
output_dir = f"mens_products_pageid_{page_id}"
csv_filename = f"mens_products_pageid_{page_id}.csv"
# ----------------

# Setup headless browser
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# Load base page and set hash
url = "https://oldnavy.gap.com/browse/men/shop-all-mens?cid=1031099"
driver.get(url)
time.sleep(4)
driver.execute_script(f"window.location.hash = 'pageId={page_id}'")
time.sleep(5)

# Scroll to load all items
for _ in range(6):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1.5)

# Parse HTML
soup = BeautifulSoup(driver.page_source, "html.parser")
product_cards = soup.find_all("div", class_="product-card")

# Setup output
os.makedirs(output_dir, exist_ok=True)
csv_file = open(csv_filename, "w", newline="", encoding="utf-8")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["Image Filename", "Alt Text", "Image URL", "Product URL"])

downloaded = 0
for idx, card in enumerate(product_cards):
    try:
        img = card.find("img", src=True)
        if not img:
            continue

        src = img["src"]
        if not src.startswith("http"):
            src = "https://oldnavy.gap.com" + src

        alt = img.get("alt", "").strip()

        a_tag = card.find("a", href=True)
        href = a_tag["href"] if a_tag else ""
        if href and not href.startswith("http"):
            href = "https://oldnavy.gap.com" + href

        # Sanitize filename
        safe_alt = re.sub(r'[\\/*?:"<>|]', "_", alt)[:50] or f"product_{page_id}_{idx}"
        filename = f"{safe_alt}_pid{page_id}_{idx}.jpg"
        filepath = os.path.join(output_dir, filename)

        # Download image
        img_data = requests.get(src).content
        with open(filepath, "wb") as f:
            f.write(img_data)

        csv_writer.writerow([filename, alt, src, href])
        print(f"[{downloaded}] ✅ Saved: {filename}")
        downloaded += 1

    except Exception as e:
        print(f"[{downloaded}] ❌ Error: {e}")

csv_file.close()
driver.quit()
print(f"\n✅ Done! Downloaded {downloaded} product images from pageId={page_id}.")

[0] ✅ Saved: Loose Graphic Sweatshirt_pid6_0.jpg
[1] ✅ Saved: Oversized Rotation Hoodie_pid6_1.jpg
[2] ✅ Saved: Oversized Rotation Hoodie_pid6_2.jpg
[3] ✅ Saved: Essential Oversized Zip Hoodie_pid6_3.jpg
[4] ✅ Saved: Oversized Essential Graphic Hoodie_pid6_4.jpg
[5] ✅ Saved: Nirvana™ Hoodie_pid6_5.jpg
[6] ✅ Saved: Quarter-Zip Sweater_pid6_6.jpg
[7] ✅ Saved: Oversized Rotation Hoodie_pid6_7.jpg
[8] ✅ Saved: Novelty Crew Socks for Men_pid6_8.jpg
[9] ✅ Saved: 6-Pack Athletic Quarter Crew Socks for Men_pid6_9.jpg
[10] ✅ Saved: 6-Pack Athletic Ankle Socks for Men_pid6_10.jpg
[11] ✅ Saved: 3-Pack Soft-Washed Boxer Shorts -- 3.75-inch insea_pid6_11.jpg
[12] ✅ Saved: 3-Pack Soft-Washed Boxer Shorts -- 3.75-inch insea_pid6_12.jpg
[13] ✅ Saved: 6-Pack Athletic Tube Socks for Men_pid6_13.jpg
[14] ✅ Saved: 6-Pack Athletic Tube Socks for Men_pid6_14.jpg
[15] ✅ Saved: 6-Pack Athletic Quarter Crew Socks for Men_pid6_15.jpg
[16] ✅ Saved: 6-Pack Athletic Quarter Crew Socks for Men_pid6_16.jpg
[17] ✅ Sa

# Obtaining Descriptions for Images

In [1]:
import os
import base64
import pandas as pd
import time
import json
from dotenv import load_dotenv
from typing import Optional
import boto3

# Load AWS credentials
load_dotenv()
AWS_ACCESS_KEY_ID = os.getenv('aws_access_key_id')
AWS_SECRET_ACCESS_KEY = os.getenv('aws_secret_access_key')

# AWS Bedrock setup
def get_bedrock_client(runtime: Optional[bool] = True):
    service_name = 'bedrock-runtime' if runtime else 'bedrock'
    return boto3.client(
        service_name=service_name,
        region_name="us-west-2",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
    )

bedrock_runtime = get_bedrock_client()

def invoke_model(body, model_id, accept, content_type):
    try:
        start_time = time.time()
        response = bedrock_runtime.invoke_model(
            body=json.dumps(body),
            modelId=model_id,
            accept=accept,
            contentType=content_type
        )
        elapsed_time = time.time() - start_time
        print(f"⏱️ Model invocation took {elapsed_time:.2f} seconds.")
        return json.loads(response['body'].read().decode())
    except Exception as e:
        print(f"❌ Error invoking model: {e}")
        return None

# Prompts
PROMPTS = [
    "Carefully analyze this image of a men's clothing item. Provide a detailed and precise description including garment type, fabric, color, texture, construction, fit, stitching, design elements, visible tags, and branding. Write it as if for a fashion catalog, using professional retail language.",
    "Based on the visual style of this men's clothing item, describe the overall vibe or aesthetic. Use relevant fashion terms like minimalist, retro, techwear, classic, preppy, etc. Also suggest the kind of person who might wear it, and in what types of settings or occasions.",
    "Imagine you're a 25-year-old man describing this clothing item to a friend. What does it look and feel like? How would you wear it or style it? Be casual, expressive, and authentic—as if you're giving a personal recommendation.",
    "List any small or easily overlooked details about the item that haven’t been covered yet. These could include minor accents, stitching patterns, internal tags, textures, color transitions, or subtle design choices that might be noticed on close inspection."
]

# Constants
MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0"
ACCEPT = "application/json"
CONTENT_TYPE = "application/json"

def encode_image_base64(image_path: str) -> str:
    with open(image_path, "rb") as img_file:
        return base64.b64encode(img_file.read()).decode("utf-8")

def query_model(image_b64: str, prompt: str) -> str:
    body = {
        "anthropic_version": "bedrock-2023-05-31",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/jpeg",
                            "data": image_b64
                        }
                    },
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ],
        "max_tokens": 1000
    }

    result = invoke_model(body, MODEL_ID, ACCEPT, CONTENT_TYPE)
    try:
        if result and 'content' in result:
            if isinstance(result['content'], list):
                return "\n".join([chunk.get("text", "") for chunk in result["content"]])
            elif isinstance(result['content'], str):
                return result['content']
        return "ERROR: Empty or unexpected response"
    except Exception as e:
        return f"ERROR parsing response: {e}"

def process_folder(folder_name: str):
    csv_path = f"{folder_name}.csv"
    output_path = f"{folder_name}_output.csv"
    df = pd.read_csv(csv_path)
    results = []

    for idx, row in df.iterrows():
        img_filename = row["Image Filename"]
        alt_text = row["Alt Text"]
        img_path = os.path.join(folder_name, img_filename)

        print(f"\n🖼️ Processing image {idx + 1}/{len(df)} in {folder_name}: {img_filename}")

        try:
            image_b64 = encode_image_base64(img_path)
            responses = []
            for prompt in PROMPTS:
                response = query_model(image_b64, prompt)
                responses.append(response)
                time.sleep(1)  # throttle requests

            results.append({
                "Image Filename": img_filename,
                "Alt Text": alt_text,
                "Response to Prompt 1": responses[0],
                "Response to Prompt 2": responses[1],
                "Response to Prompt 3": responses[2],
                "Response to Prompt 4": responses[3],
            })

        except Exception as e:
            print(f"❌ Failed to process {img_filename}: {e}")
            results.append({
                "Image Filename": img_filename,
                "Alt Text": alt_text,
                "Response to Prompt 1": "ERROR",
                "Response to Prompt 2": "ERROR",
                "Response to Prompt 3": "ERROR",
                "Response to Prompt 4": "ERROR"
            })

    out_df = pd.DataFrame(results)
    out_df.to_csv(output_path, index=False)
    print(f"\n✅ All results for {folder_name} saved to {output_path}")

if __name__ == "__main__":
    for i in range(1, 7):
        folder = f"mens_products_pageid_{i}"
        process_folder(folder)



🖼️ Processing image 1/115 in mens_products_pageid_1: Straight Tech Hybrid Pants_pid1_0.jpg
⏱️ Model invocation took 7.38 seconds.
⏱️ Model invocation took 15.96 seconds.
⏱️ Model invocation took 8.91 seconds.
⏱️ Model invocation took 8.80 seconds.

🖼️ Processing image 2/115 in mens_products_pageid_1: Straight Tech Hybrid Pants_pid1_1.jpg
⏱️ Model invocation took 9.46 seconds.
⏱️ Model invocation took 7.53 seconds.
⏱️ Model invocation took 6.08 seconds.
⏱️ Model invocation took 14.65 seconds.

🖼️ Processing image 3/115 in mens_products_pageid_1: Straight Tech Hybrid Pants_pid1_2.jpg
⏱️ Model invocation took 8.49 seconds.
⏱️ Model invocation took 8.25 seconds.
⏱️ Model invocation took 13.58 seconds.
⏱️ Model invocation took 6.09 seconds.

🖼️ Processing image 4/115 in mens_products_pageid_1: Straight Tech Hybrid Pants_pid1_3.jpg
⏱️ Model invocation took 7.57 seconds.
⏱️ Model invocation took 8.33 seconds.
⏱️ Model invocation took 9.05 seconds.
⏱️ Model invocation took 14.90 seconds.

🖼️ 

# Vector Embeddings

## Cohere Embeddings

In [4]:
import json
import pandas as pd
from tqdm import tqdm

# Bedrock client (must already be defined as `bedrock_runtime`)
# Example: bedrock_runtime = boto3.client("bedrock-runtime", region_name="us-west-2")

COHERE_EMBED_MODEL_ID = "cohere.embed-english-v3"

# ✅ Embedding function using Cohere
def get_cohere_embedding(text):
    if pd.isna(text) or not text.strip():
        return None

    body = {
        "texts": [text],                 # ✅ Correct key
        "input_type": "search_document"  # ✅ Required for Bedrock's Cohere model
    }

    try:
        response = bedrock_runtime.invoke_model(
            modelId="cohere.embed-english-v3",
            contentType="application/json",
            accept="application/json",
            body=json.dumps(body)
        )
        result = json.loads(response['body'].read())
        return json.dumps(result["embeddings"][0])  # ✅ Use correct path to embedding
    except Exception as e:
        print(f"Error embedding text: {e}")
        return None


# ✅ Load your CSV
df = pd.read_csv("mens_products_pageid_1_6_output.csv")

# ✅ Embed the 4 response columns
tqdm.pandas()
df["vector1"] = df["Response1"].progress_apply(lambda x: get_cohere_embedding(str(x)))
df["vector2"] = df["Response2"].progress_apply(lambda x: get_cohere_embedding(str(x)))
df["vector3"] = df["Response3"].progress_apply(lambda x: get_cohere_embedding(str(x)))
df["vector4"] = df["Response4"].progress_apply(lambda x: get_cohere_embedding(str(x)))

# ✅ Save result with vectors as JSON strings
df.to_csv("cohere_vector_embeddings_1_6.csv", index=False)


100%|██████████| 675/675 [01:42<00:00,  6.58it/s]
100%|██████████| 675/675 [01:25<00:00,  7.92it/s]
100%|██████████| 675/675 [01:35<00:00,  7.11it/s]
100%|██████████| 675/675 [01:21<00:00,  8.27it/s]


### Cosine Similarity

In [45]:
# ✅ Bedrock runtime client
bedrock_runtime = boto3.client("bedrock-runtime", region_name="us-west-2")

# ✅ Cohere embedding function (via Bedrock)
def embed_query_with_cohere(text):
    if not text.strip():
        return np.zeros(1024).tolist()

    body = {
        "texts": [text],
        "input_type": "search_document"
    }

    try:
        response = bedrock_runtime.invoke_model(
            modelId="cohere.embed-english-v3",
            contentType="application/json",
            accept="application/json",
            body=json.dumps(body)
        )
        result = json.loads(response['body'].read())
        return result["embeddings"][0]  # 1024-dim vector
    except Exception as e:
        print(f"Error embedding query: {e}")
        return np.zeros(1024).tolist()

# ✅ Load CSV (must contain Cohere vectors)
df = pd.read_csv("cohere_vector_embeddings.csv")

# ✅ Parse JSON strings back into float vectors
for col in ["vector1", "vector2", "vector3", "vector4"]:
    df[col] = df[col].apply(json.loads)

# ✅ Embed the user's query
user_query = "I am looking for a streetwear vibe. SOme baggy jeans."

#Generate HyDE-expanded version of the user query
def generate_hypothetical_description(prompt):
    body = {
        "anthropic_version": "bedrock-2023-05-31",
        "messages": [
            {
                "role": "user",
                "content": f"""
You are a fashion copywriter. Turn this user request into a professional fashion catalog description: "{prompt}". 
Describe the item in detail — garment type, color, fabric, graphic, fit, and style — as if writing for an e-commerce site like Uniqlo or H&M.
"""
            }
        ],
        "max_tokens": 300
    }

    result = invoke_model(body, model_id="anthropic.claude-3-sonnet-20240229-v1:0", accept="application/json", content_type="application/json")
    
    try:
        return result["content"][0]["text"]
    except Exception as e:
        print(f"Error generating HyDE description: {e}")
        return prompt  # fallback to raw prompt

hyde_text = generate_hypothetical_description(user_query)
print("HyDE-expanded query:", hyde_text)
query_vector = embed_query_with_cohere(hyde_text)

# ✅ Compute max cosine similarity across the 4 responses
def max_similarity(row):
    sims = []
    for i in range(1, 5):
        response_vector = np.array(row[f"vector{i}"]).reshape(1, -1)
        query_vec = np.array(query_vector).reshape(1, -1)
        sim = cosine_similarity(query_vec, response_vector)[0][0]
        sims.append(sim)
    return max(sims)

# ✅ Apply across DataFrame
tqdm.pandas()
df["similarity_score"] = df.progress_apply(max_similarity, axis=1)

# ✅ Get top results
top_results = df.sort_values(by="similarity_score", ascending=False).head(10)

# ✅ Show top results
print(top_results[["ImageFilename", "similarity_score"]])

Model invocation took 10.413 seconds.
HyDE-expanded query: Effortless Streetwear Chic: Baggy Denim Jeans

Embrace the ultimate in laid-back cool with our Baggy Denim Jeans. Crafted from premium cotton blend denim, these jeans offer unparalleled comfort and an authentic streetwear vibe. The relaxed silhouette features a generous fit through the hip and thigh, tapering slightly at the leg opening for a modern, on-trend look.

Available in a range of versatile washes, from classic indigo to trendy distressed finishes, these jeans are designed to elevate your casual wardrobe. The mid-rise waistline and roomy fit provide a flattering silhouette, while the five-pocket styling and zip-fly closure ensure both style and functionality.

Whether you're running errands or hanging out with friends, these Baggy Denim Jeans are the perfect canvas for your streetwear aesthetic. Pair them with a graphic tee and chunky sneakers for a laidback, effortlessly cool ensemble, or dress them up with a sleek bo

100%|██████████| 115/115 [00:01<00:00, 102.26it/s]

                                    ImageFilename  similarity_score
108           Slim Tech Hybrid Pants_pid0_108.jpg          0.610643
114           Slim Tech Hybrid Pants_pid0_114.jpg          0.597809
110       Straight Tech Hybrid Pants_pid0_110.jpg          0.572207
55      Dynamic Fleece 4.0 Zip Hoodie_pid0_55.jpg          0.565926
26   Dynamic Fleece 4.0 Cinched Pants_pid0_26.jpg          0.558138
25   Dynamic Fleece 4.0 Cinched Pants_pid0_25.jpg          0.557630
102              KnitTech Zip Hoodie_pid0_102.jpg          0.554554
23   Dynamic Fleece 4.0 Tapered Pants_pid0_23.jpg          0.552956
112           Slim Tech Hybrid Pants_pid0_112.jpg          0.551793
77         Straight Tech Hybrid Pants_pid0_77.jpg          0.548635





In [49]:
# ✅ User query
user_query = "I am looking for a streetwear vibe. Some baggy jeans."
query_vector = embed_query_with_cohere(user_query)

# ✅ Compute cosine similarity to average of vectors 1–4
def average_similarity(row):
    vectors = [np.array(row[f"vector{i}"]) for i in range(1, 5)]
    avg_vector = np.mean(vectors, axis=0).reshape(1, -1)
    query_vec = np.array(query_vector).reshape(1, -1)
    return cosine_similarity(query_vec, avg_vector)[0][0]

# ✅ Compute similarity for each row
tqdm.pandas()
df["similarity_score"] = df.progress_apply(average_similarity, axis=1)

# ✅ Sort by similarity
df_sorted = df.sort_values(by="similarity_score", ascending=False)

# ✅ Show top results
print(df_sorted[["ImageFilename", "similarity_score"]])



100%|██████████| 115/115 [00:00<00:00, 298.79it/s]

                                    ImageFilename  similarity_score
58    Essential Woven Workout Joggers_pid0_58.jpg          0.517888
29         Dynamic Fleece 4.0 Joggers_pid0_29.jpg          0.503895
108           Slim Tech Hybrid Pants_pid0_108.jpg          0.500501
30   Dynamic Fleece 4.0 Tapered Pants_pid0_30.jpg          0.496458
28         Dynamic Fleece 4.0 Joggers_pid0_28.jpg          0.495422
..                                            ...               ...
90                   CloudMotion Polo_pid0_90.jpg          0.328893
69      Go-Dry Cool Base Layer Shorts_pid0_69.jpg          0.327622
89                   CloudMotion Polo_pid0_89.jpg          0.324303
79   Go-Dry Rib-Knit Tank Tops 3-Pack_pid0_79.jpg          0.323483
70      Go-Dry Cool Base Layer Shorts_pid0_70.jpg          0.321300

[115 rows x 2 columns]





# Clothing Category (For Outfit)

In [10]:
import pandas as pd


def infer_category(row):
    text = " ".join([row[f"Response{i}"] for i in range(1, 5)]).lower()
    if "shirt" in text or "tee" in text or "t-shirt" in text:
        return "shirt"
    elif "pant" in text or "trouser" in text:
        return "pants"
    elif "short" in text or "trunk" in text:
        return "shorts"
    elif "jacket" in text or "coat" in text or "hoodie" in text:
        return "outerwear"
    elif "shoe" in text or "sneaker" in text or "boot" in text:
        return "shoes"
    else:
        return "other"

cohere_embed = pd.read_csv("C:/Users/petew/OneDrive/Desktop/Cal Poly/MSBA/GSB-570-(Gen-AI)/Code/Final Project/cohere_vector_embeddings_combined.csv")
cohere_embed["Category"] = cohere_embed.apply(infer_category, axis=1)


# Celebrity Comparison

In [3]:
import os
import base64
import pandas as pd
import time
import json
import io
from dotenv import load_dotenv
from pathlib import Path
from typing import Optional
from PIL import Image
import boto3

# === Load AWS credentials ===
load_dotenv()
AWS_ACCESS_KEY_ID = os.getenv('aws_access_key_id')
AWS_SECRET_ACCESS_KEY = os.getenv('aws_secret_access_key')

# === Claude setup ===
def get_bedrock_client(runtime: Optional[bool] = True):
    service_name = 'bedrock-runtime' if runtime else 'bedrock'
    return boto3.client(
        service_name=service_name,
        region_name="us-west-2",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
    )

bedrock_runtime = get_bedrock_client()
MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0"
ACCEPT = "application/json"
CONTENT_TYPE = "application/json"

# === Claude prompts ===
CELEB_STYLE_PROMPT = (
    "Carefully analyze this image of a celebrity. Describe their fashion style using fashion terminology. "
    "Create a verbose description of outfit elements, materials, colors, fit, and noticeable aesthetic choices. "
    "Focus mainly on the top (shirt/jacket) and bottom (shorts/pants) parts of the outfit."
)

ARCHETYPE_PROMPT = (
    "Based on this image, what male fashion archetype best describes the outfit? Examples could include: 'Stylish but Houseless', 'Posh on a Budget', 'Egyptian Nights', 'Exotic Flamingo', 'Simple but Sleek', 'Streetwear Savant', 'Mid 90s Tech Mogul', etc. Make them creative like these. Return just the archetype label."
)

# === Image encoder (supports .webp, .png, .avif, etc.) ===
def encode_image_base64(image_path: str) -> str:
    ext = Path(image_path).suffix.lower()
    if ext in [".jpg", ".jpeg"]:
        with open(image_path, "rb") as img_file:
            return base64.b64encode(img_file.read()).decode("utf-8")
    else:
        with Image.open(image_path) as img:
            img = img.convert("RGB")
            buffer = io.BytesIO()
            img.save(buffer, format="JPEG")
            return base64.b64encode(buffer.getvalue()).decode("utf-8")

# === Claude request ===
def query_claude(image_b64: str, prompt: str) -> str:
    body = {
        "anthropic_version": "bedrock-2023-05-31",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/jpeg",
                            "data": image_b64
                        }
                    },
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ],
        "max_tokens": 700
    }

    try:
        start = time.time()
        response = bedrock_runtime.invoke_model(
            body=json.dumps(body),
            modelId=MODEL_ID,
            accept=ACCEPT,
            contentType=CONTENT_TYPE
        )
        result = json.loads(response['body'].read().decode())
        print(f"⏱️ Claude took {time.time() - start:.2f} seconds.")
        return result["content"][0]["text"]
    except Exception as e:
        return f"ERROR: {e}"

# === Process Excel with image descriptions and archetypes ===
def process_celeb_excel(file_path: str, image_folder: str):
    df = pd.read_excel(file_path)
    image_dir = Path(image_folder)

    for idx, row in df.iterrows():
        filename = row["Image_FileName"]
        image_path = image_dir / filename

        print(f"\n🔍 Processing {row['Name']} ({filename})")

        if not image_path.exists():
            print(f"❌ Image not found: {filename}")
            df.iat[idx, 2] = "ERROR: Image not found"
            df.iat[idx, 4] = "N/A"
            continue

        try:
            image_b64 = encode_image_base64(str(image_path))

            # Fashion description
            desc = query_claude(image_b64, CELEB_STYLE_PROMPT)
            df.iat[idx, 2] = desc

            # Archetype classification
            archetype = query_claude(image_b64, ARCHETYPE_PROMPT)
            df.iat[idx, 4] = archetype.strip().lower()

        except Exception as e:
            df.iat[idx, 2] = f"ERROR: {e}"
            df.iat[idx, 4] = "ERROR"

        time.sleep(1)

    df.to_excel(file_path, index=False)
    print(f"\n✅ Done! Output saved back to: {file_path}")

# === Run ===
if __name__ == "__main__":
    process_celeb_excel(
        file_path="C:/Users/petew/OneDrive/Desktop/Cal Poly/MSBA/GSB-570-(Gen-AI)/Code/Final Project/Celeb_Comp_Embeddings.xlsx",
        image_folder="C:/Users/petew/OneDrive/Desktop/Cal Poly/MSBA/GSB-570-(Gen-AI)/Code/Final Project/CelebPics"
    )



🔍 Processing A$AP Rocky (rocky.jpg)
⏱️ Claude took 17.32 seconds.
⏱️ Claude took 4.45 seconds.


  df.iat[idx, 4] = archetype.strip().lower()



🔍 Processing Alex Turner (alexturner.jpg)
⏱️ Claude took 7.74 seconds.
⏱️ Claude took 5.56 seconds.

🔍 Processing Austin Butler (austinButler.jpg)
⏱️ Claude took 8.21 seconds.
⏱️ Claude took 3.11 seconds.

🔍 Processing Bad Bunny (badbunny.jpg)
⏱️ Claude took 14.96 seconds.
⏱️ Claude took 4.74 seconds.

🔍 Processing Brad Pitt (bradpitt.jpg)
⏱️ Claude took 7.46 seconds.
⏱️ Claude took 5.27 seconds.

🔍 Processing Brandon Flowers (brandonflowers.jpg)
⏱️ Claude took 15.67 seconds.
⏱️ Claude took 4.21 seconds.

🔍 Processing Bruno Mars (brunomars.jpg)
⏱️ Claude took 8.28 seconds.
⏱️ Claude took 7.48 seconds.

🔍 Processing Cam Newton (CamNewton.jpg)
⏱️ Claude took 7.80 seconds.
⏱️ Claude took 3.67 seconds.

🔍 Processing Carmelo Anthony (CarmeloAnthony.jpg)
⏱️ Claude took 7.91 seconds.
⏱️ Claude took 4.07 seconds.

🔍 Processing Chris Paul (ChrisPaul.jpg)
⏱️ Claude took 10.31 seconds.
⏱️ Claude took 9.78 seconds.

🔍 Processing Conor McGregor (ConorMcGregor.webp)
⏱️ Claude took 10.23 seconds.
⏱️

## Celebrity Outfit Description Embeddings

In [19]:
import os
import json
import time
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm
import boto3

# === Load AWS credentials ===
load_dotenv()
AWS_ACCESS_KEY_ID = os.getenv('aws_access_key_id')
AWS_SECRET_ACCESS_KEY = os.getenv('aws_secret_access_key')

# === Initialize Bedrock client ===
def get_bedrock_client(runtime=True):
    service_name = 'bedrock-runtime' if runtime else 'bedrock'
    bedrock_runtime = boto3.client(
        service_name=service_name,
        region_name="us-west-2",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
    )
    print("✅ Bedrock client initialized")
    return bedrock_runtime

bedrock_runtime = get_bedrock_client()

# === Cohere embedding function ===
def get_cohere_embedding(text):
    if pd.isna(text) or not text.strip():
        return None
    body = {
        "texts": [text],
        "input_type": "search_document"
    }
    try:
        response = bedrock_runtime.invoke_model(
            modelId="cohere.embed-english-v3",
            contentType="application/json",
            accept="application/json",
            body=json.dumps(body)
        )
        result = json.loads(response['body'].read())
        return json.dumps(result["embeddings"][0])  # JSON string for Excel storage
    except Exception as e:
        print(f"❌ Error embedding text at input: {e}")
        return None

# === Load Excel file ===
file_path = "C:/Users/petew/OneDrive/Desktop/Cal Poly/MSBA/GSB-570-(Gen-AI)/Code/Final Project/Celeb_Comp_Embeddings.xlsx"
df = pd.read_excel(file_path)

# === Embed and update missing rows ===
for idx, row in tqdm(df.iterrows(), total=len(df)):
    if pd.isna(row.get("CohereEmbedding")) and not pd.isna(row.get("ClaudeDesc")):
        text = row["ClaudeDesc"]
        embedding = get_cohere_embedding(text)
        df.at[idx, "CohereEmbedding"] = embedding
        time.sleep(0.5)  # Optional: throttle requests

# === Save back to Excel ===
df.to_excel(file_path, index=False)
print("✅ Embeddings added and file saved.")



✅ Bedrock client initialized


100%|██████████| 89/89 [00:00<00:00, 7166.17it/s]


✅ Embeddings added and file saved.


## Archetypes

In [28]:
import os
import base64
import pandas as pd
import time
import json
import io
import re
from dotenv import load_dotenv
from pathlib import Path
from typing import Optional
from PIL import Image
import boto3

# === Load AWS credentials ===
load_dotenv()
AWS_ACCESS_KEY_ID = os.getenv('aws_access_key_id')
AWS_SECRET_ACCESS_KEY = os.getenv('aws_secret_access_key')

# === Claude setup ===
def get_bedrock_client(runtime: Optional[bool] = True):
    service_name = 'bedrock-runtime' if runtime else 'bedrock'
    return boto3.client(
        service_name=service_name,
        region_name="us-west-2",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
    )

bedrock_runtime = get_bedrock_client()
MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0"
ACCEPT = "application/json"
CONTENT_TYPE = "application/json"

# === Archetype prompt (unchanged) ===
ARCHETYPE_PROMPT = (
    "Based on this image, what male fashion archetype best describes the outfit? "
    "Examples could include: 'Stylish but Houseless', 'Posh on a Budget', 'Egyptian Nights', 'Exotic Flamingo', "
    "'Simple but Sleek', 'Streetwear Savant', 'Mid 90s Tech Mogul', etc. Make them creative like these. "
    "Return just the 2-3 word archetype label in quotes. Do not output extra words like: 'Based on the outfits and overall style depicted in the image, I would label the fashion archetype as...'"
)

# === Image encoder ===
def encode_image_base64(image_path: str) -> str:
    ext = Path(image_path).suffix.lower()
    if ext in [".jpg", ".jpeg"]:
        with open(image_path, "rb") as img_file:
            return base64.b64encode(img_file.read()).decode("utf-8")
    else:
        with Image.open(image_path) as img:
            img = img.convert("RGB")
            buffer = io.BytesIO()
            img.save(buffer, format="JPEG")
            return base64.b64encode(buffer.getvalue()).decode("utf-8")

# === Claude request ===
def query_claude(image_b64: str, prompt: str) -> str:
    body = {
        "anthropic_version": "bedrock-2023-05-31",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/jpeg",
                            "data": image_b64
                        }
                    },
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ],
        "max_tokens": 300
    }

    try:
        start = time.time()
        response = bedrock_runtime.invoke_model(
            body=json.dumps(body),
            modelId=MODEL_ID,
            accept=ACCEPT,
            contentType=CONTENT_TYPE
        )
        result = json.loads(response['body'].read().decode())
        print(f"⏱️ Claude took {time.time() - start:.2f} seconds.")
        return result["content"][0]["text"]
    except Exception as e:
        return f"ERROR: {e}"

# === Archetype cleaner ===
def clean_archetype(raw: str) -> str:
    match = re.search(r'"([^"]+)"', raw)
    if match:
        phrase = match.group(1).strip().rstrip(".")
    else:
        phrase = raw.strip().rstrip(".")
    return ' '.join(word.capitalize() for word in phrase.split())

# === Main processor (all rows, writes back to Excel) ===
def process_archetypes_only(file_path: str, image_folder: str):
    df = pd.read_excel(file_path)
    image_dir = Path(image_folder)

    for idx, row in df.iterrows():
        filename = row["Image_FileName"]
        image_path = image_dir / filename

        print(f"\n🔁 Row {idx}: {row['Name']} - File: {filename}")
        print(f"📂 Checking if image exists at: {image_path}")

        if not image_path.exists():
            print(f"❌ Image not found: {filename}")
            df.iat[idx, 4] = "ERROR: Image not found"
            continue

        try:
            image_b64 = encode_image_base64(str(image_path))
            archetype_raw = query_claude(image_b64, ARCHETYPE_PROMPT)
            print(f"🧠 Claude Raw Response: {archetype_raw}")
            cleaned = clean_archetype(archetype_raw)
            print(f"🎯 Cleaned Archetype: {cleaned}")
            df.iat[idx, 4] = cleaned
        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")
            df.iat[idx, 4] = f"ERROR: {e}"

        time.sleep(1)  # prevent throttling

    df.to_excel(file_path, index=False)
    print(f"\n✅ All archetypes updated and saved to: {file_path}")

# === Run ===
if __name__ == "__main__":
    process_archetypes_only(
        file_path="C:/Users/petew/OneDrive/Desktop/Cal Poly/MSBA/GSB-570-(Gen-AI)/Code/Final Project/Celeb_Comp_Embeddings.xlsx",
        image_folder="C:/Users/petew/OneDrive/Desktop/Cal Poly/MSBA/GSB-570-(Gen-AI)/Code/Final Project/CelebPics"
    )



🔁 Row 0: A$AP Rocky - File: rocky.jpg
📂 Checking if image exists at: C:\Users\petew\OneDrive\Desktop\Cal Poly\MSBA\GSB-570-(Gen-AI)\Code\Final Project\CelebPics\rocky.jpg
⏱️ Claude took 7.07 seconds.
🧠 Claude Raw Response: "Camouflage Couture"
🎯 Cleaned Archetype: Camouflage Couture

🔁 Row 1: Alex Turner - File: alexturner.jpg
📂 Checking if image exists at: C:\Users\petew\OneDrive\Desktop\Cal Poly\MSBA\GSB-570-(Gen-AI)\Code\Final Project\CelebPics\alexturner.jpg
⏱️ Claude took 5.16 seconds.
🧠 Claude Raw Response: "Casual Rocker"
🎯 Cleaned Archetype: Casual Rocker

🔁 Row 2: Austin Butler - File: austinButler.jpg
📂 Checking if image exists at: C:\Users\petew\OneDrive\Desktop\Cal Poly\MSBA\GSB-570-(Gen-AI)\Code\Final Project\CelebPics\austinButler.jpg
⏱️ Claude took 5.90 seconds.
🧠 Claude Raw Response: "Casual Explorer"
🎯 Cleaned Archetype: Casual Explorer

🔁 Row 3: Bad Bunny - File: badbunny.jpg
📂 Checking if image exists at: C:\Users\petew\OneDrive\Desktop\Cal Poly\MSBA\GSB-570-(Gen-AI

# Agent

In [38]:
def call_claude(prompt: str, model_id: str = "anthropic.claude-3-sonnet-20240229-v1:0", max_tokens: int = 512) -> str:
    """
    Unified Claude 3 call via Bedrock that assumes invoke_model() already returns a parsed dict.
    """
    body = {
        "anthropic_version": "bedrock-2023-05-31",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens,
        "temperature": 0.5
    }

    try:
        response = invoke_model(
            body=body,
            model_id=model_id,
            accept="application/json",
            content_type="application/json"
        )

        # ✅ Assume response is already a parsed dict from invoke_model()
        return response.get("content", [{}])[0].get("text", "")

    except Exception as e:
        print("Error invoking Claude or parsing response:", e)
        return ""




## Tools

In [39]:
import pandas as pd
import numpy as np
from typing import List, Dict
from sklearn.metrics.pairwise import cosine_similarity
import ast

# Load the CSV once
df = pd.read_csv("C:/Users/petew/OneDrive/Desktop/Cal Poly/MSBA/GSB-570-(Gen-AI)/Code/Final Project/cohere_vector_embeddings.csv")

# Convert string embeddings (e.g. "[0.1, 0.2, ...]") to lists
def parse_embedding(embedding_str):
    return np.array(ast.literal_eval(embedding_str))

# Apply conversion to all embedding columns (assuming names like embedding_1, embedding_2, etc.)
embedding_cols = [col for col in df.columns if col.startswith("vector")]
for col in embedding_cols:
    df[col] = df[col].apply(parse_embedding)


In [42]:
def search_items(query_embedding: np.ndarray, item_type: str = None, top_k: int = 5) -> List[Dict]:
    """
    Find the top-k items most semantically similar to the query embedding.
    Optionally filter by item_type (e.g., 'pants', 'shirt').
    Returns a list of item dicts with ID, type, description, and score.
    """
    results = []

    for _, row in df.iterrows():
        if item_type and row['type'].lower() != item_type.lower():
            continue
        
        best_score = -1
        best_desc = None

        for i, col in enumerate(embedding_cols):
            item_vec = row[col]
            similarity = cosine_similarity([query_embedding], [item_vec])[0][0]
            if similarity > best_score:
                best_score = similarity
                best_desc = row[f"description_{i+1}"]

        results.append({
            "id": row['id'],
            "type": row['type'],
            "description": best_desc,
            "score": best_score
        })

    # Sort and return top_k
    results.sort(key=lambda x: x['score'], reverse=True)
    return results[:top_k]

def analyze_color_compatibility(base_desc: str, candidate_desc: str) -> dict:
    """
    Returns a 0.0–10.0 color compatibility score (with 1 decimal place)
    and a short 1–2 sentence explanation using Claude via Bedrock.
    """
    prompt = f"""
Evaluate how well the colors of these two clothing items go together:

Item 1: {base_desc}
Item 2: {candidate_desc}

Give your answer in this format:

Score: [number from 0.0 to 10.0, with one decimal place]
Explanation: [a short explanation of why these colors do or do not go well together]

Focus on color theory, contrast, harmony, and how well the tones or hues pair for coordinated outfits.
Do not evaluate fabric, texture, or fit.
"""

    response = call_claude(prompt)

    # Attempt to parse score + explanation
    try:
        lines = response.strip().splitlines()
        score_line = next(line for line in lines if line.lower().startswith("score:"))
        explanation_line = next(line for line in lines if line.lower().startswith("explanation:"))

        score = float(score_line.split(":")[1].strip())
        explanation = explanation_line.split(":", 1)[1].strip()

        return {
            "score": score,
            "explanation": explanation
        }

    except Exception as e:
        print("Error parsing color compatibility output:", e)


def analyze_style_compatibility(base_desc: str, candidate_desc: str) -> str:
    """
    Ask the LLM: Do these two clothing items share a compatible aesthetic (e.g., both minimalist, both casual)?
    Return: 'high', 'medium', or 'low'
    """
    prompt = f"""
    Based on the following descriptions, how well do the styles match?
    
    Item 1: {base_desc}
    Item 2: {candidate_desc}

    Consider things like vibe, aesthetic, formality, and fit.
    Respond only with one of: high, medium, or low.
    """
    return call_claude(prompt)

def score_combination(color_score: str, style_score: str) -> int:
    """
    Combine color + style compatibility into a numeric score.
    """
    score_map = {'low': 0, 'medium': 1, 'high': 2}
    return score_map[color_score] + score_map[style_score]


In [51]:
desc1 = "Orange linen short-sleeve shirt with coconut buttons and a relaxed fit"
desc2 = "Forest green tapered pants made from lightweight cotton"

color_score = analyze_color_compatibility(desc1, desc2)
print("Color compatibility:", color_score)


Model invocation took 2.426 seconds.
Color compatibility: {'score': 7.5, 'explanation': 'The orange and forest green colors create a complementary contrast, which can be visually appealing and harmonious. The warm orange tone complements the cool green shade, creating a vibrant yet balanced combination. However, the brightness of the orange shirt may overpower the deeper green pants, so careful consideration of the specific shades and proportions is recommended for a cohesive look.'}


# Category for Outfit

In [33]:
import pandas as pd

# File path
file_path = "C:/Users/petew/OneDrive/Desktop/Cal Poly/MSBA/GSB-570-(Gen-AI)/Code/Final Project/cohere_vector_embeddings_combined.csv"

# Define keywords
top_keywords = ["shirt", "polo", "jacket", "sweater", "hoodie", "top", "blazer", "coat"]
bottom_keywords = ["pants", "shorts", "trousers", "jeans", "bottom"]

# New classifier that prioritizes mentions near the beginning
def determine_category(responses):
    text = " ".join(str(r) for r in responses if pd.notna(r)).lower()

    # Look only in first 2–3 sentences
    head = ". ".join(text.split(".")[:3])

    if any(top in head for top in top_keywords):
        return "top"
    elif any(bottom in head for bottom in bottom_keywords):
        return "bottom"
    elif any(top in text for top in top_keywords):
        return "top"
    elif any(bottom in text for bottom in bottom_keywords):
        return "bottom"
    else:
        return "unknown"

# Load and classify
df = pd.read_csv(file_path)
df["Category"] = df.apply(
    lambda row: determine_category([row["Response1"], row["Response2"], row["Response3"], row["Response4"]]),
    axis=1
)

# Insert Category after vector4
cols = df.columns.tolist()
insert_at = cols.index("vector4") + 1
new_cols = cols[:insert_at] + ["Category"] + cols[insert_at:]
df = df[new_cols]

# Save
df.to_csv(file_path, index=False)
print("✅ Category column updated and saved.")


✅ Category column updated and saved.
