## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict
from sentence_transformers import SentenceTransformer
import pickle
import faiss
from together import Together
from difflib import get_close_matches
import os
from openai import OpenAI




## Define a Line-by-Line JSON Parser

In [None]:
def parse_json(filename, limit=None):
    """Parse a JSON Lines file into a list of decoded records.

    Args:
        filename: Path to a text file holding one JSON document per line.
        limit: Maximum number of lines to read from the start of the file
            (blank lines count towards the limit). ``None`` reads the whole
            file; ``0`` reads nothing.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            # Compare against None so that limit=0 means "read nothing"
            # instead of silently disabling the limit.
            if limit is not None and i >= limit:
                break
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

## Load Metadata

In [None]:
# Read every product metadata record and mirror it into a DataFrame.
sample_meta = parse_json("meta_Beauty_and_Personal_Care.jsonl")
meta_df = pd.DataFrame(sample_meta)

# Distinct, non-empty parent ASINs that appear in the metadata.
meta_asins = {
    asin
    for asin in (m.get("parent_asin") for m in sample_meta)
    if asin
}

## Load Only Reviews That Match Metadata

In [None]:
# Read all reviews, then keep only those whose product is in the metadata.
raw_reviews = parse_json("Beauty_and_Personal_Care.jsonl")

sample_reviews = list(
    filter(lambda r: r.get("parent_asin") in meta_asins, raw_reviews)
)
reviews_df = pd.DataFrame(sample_reviews)

## Group Reviews by Product ID

In [None]:
# Map each parent ASIN to the list of its review texts, in input order.
review_by_asin = defaultdict(list)
for r in sample_reviews:
    parent_asin = r.get("parent_asin")
    # Skip reviews that lack either a product id or any review text.
    if not parent_asin or not r.get("text"):
        continue
    review_by_asin[parent_asin].append(r["text"])

## Sanity Check

In [None]:
# Quick size report: loaded rows and the ASIN overlap between both datasets.
print("Reviews loaded:", len(sample_reviews))
print("Metadata loaded:", len(sample_meta))
review_asins = {r.get("parent_asin") for r in sample_reviews}
print("Common ASINs:", len(review_asins.intersection(meta_asins)))

Reviews loaded: 347941
Metadata loaded: 500000
Common ASINs: 101800


## Build RAG documents per product

In [None]:
def _field(record, key, default=""):
    """Return ``record[key]``, or ``default`` when the key is absent or None.

    ``dict.get(key, default)`` only covers a missing key; a key that is
    present with a null value would otherwise be rendered as the text "None".
    """
    value = record.get(key)
    return default if value is None else value


# Build one text document per product that has at least one grouped review.
# Each entry of `documents` is a (parent_asin, document_text) tuple.
documents = []

for product in sample_meta:
    parent_asin = product.get("parent_asin")
    if parent_asin not in review_by_asin:
        continue

    # `details` may be missing or null; normalise it so lookups cannot crash.
    details = product.get("details")
    if not isinstance(details, dict):
        details = {}

    rating = _field(product, "average_rating", "N/A")
    rating_count = _field(product, "rating_number", 0)

    lines = [
        f"Title: {_field(product, 'title')}",
        f"Brand: {_field(details, 'brand')}",
        f"Store: {_field(product, 'store')}",
        f"Main Category: {_field(product, 'main_category')}",
        f"Categories: {' > '.join(_field(product, 'categories', []))}",
        f"Price: ${_field(product, 'price', 'N/A')}",
        f"Average Rating: {rating} ({rating_count} reviews)",
    ]

    # Features (bullet points)
    features = product.get("features")
    if features:
        lines.append("Key Features:")
        lines.extend(f"- {f}" for f in features)

    # Description (multi-line); an empty description adds no header at all.
    desc = product.get("description")
    if isinstance(desc, list) and desc:
        lines.append("Description:")
        lines.extend(desc)
    elif isinstance(desc, str) and desc:
        lines.append(f"Description: {desc}")

    # Optional: include ingredients if available
    ingredients = details.get("ingredients")
    if ingredients:
        lines.append(f"Ingredients: {ingredients}")

    # Recommended bundles (at most 3)
    bundles = product.get("bought_together")
    if bundles:
        lines.append(f"Frequently Bought Together: {', '.join(bundles[:3])}")

    # Reviews (limit to 3), separated from the header lines by a blank line.
    reviews = review_by_asin[parent_asin][:3]
    doc = "\n".join(lines) + "\n\nUser Reviews:\n" + "\n".join(reviews)

    documents.append((parent_asin, doc))

## Print Summary + Sample

In [None]:
# Report how many documents were produced and show the first as a spot check.
print(f"\nGenerated {len(documents)} RAG-ready product documents.")
sample_doc = documents[0][1]
print("\nSample document:\n", sample_doc)


Generated 101797 RAG-ready product documents.

Sample document:
 Title: L.A. COLORS 5 Color Matte Eyeshadow, Brown Tweed, 0.25 Oz Powder
Brand: 
Store: L.A. COLORS
Main Category: All Beauty
Categories: Beauty & Personal Care > Makeup > Eyes > Eyeshadow
Price: $2.49
Average Rating: 4.3 (5367 reviews)
Key Features:
- Intense color
- Matte finish
- Silky texture
Description:
Wrap yourself in soft, cozy color. These matte palettes are full of five silky, soft and blendable shades. Applies on smooth and each palette is perfectly color coordinated to shade and highlight eyes. An array of shades from neutrals and bolds to create any look you desire.

User Reviews:
The pic looks great, but the colors are not-they are almost all dark grays and shades of gray-nothing like the vibrant purple shown
Very flattering!
I am not thrilled with the colors.  I have fair skin and most of the colors are too dark for me.  They also are more yellowish which doesn't work with my pale pink complexion.  the pro

## Embed documents & save to disk

In [None]:
# Sentence-embedding model; the same `model` object encodes the queries later.
model = SentenceTransformer("all-MiniLM-L6-v2")
# model = SentenceTransformer("all-mpnet-base-v2")

# Split the (asin, text) pairs into two parallel lists so that position i in
# `asins`, `texts` and `embeddings` all refer to the same product.
asins = [doc[0] for doc in documents]
texts = [doc[1] for doc in documents]
embeddings = model.encode(texts, show_progress_bar=True)

# Create the output directory first: open(..., "wb") does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs("vector_store", exist_ok=True)
with open("vector_store/product_embeddings.pkl", "wb") as f:
    pickle.dump((asins, texts, embeddings), f)

Batches:   0%|          | 0/3182 [00:00<?, ?it/s]

## Build and Save FAISS Index

In [None]:
# Cast the embeddings to a float32 array before handing them to FAISS.
embeddings = np.array(embeddings, dtype="float32")
dimension = embeddings.shape[1]

# Flat L2 index over all product embeddings, persisted next to the pickle.
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
faiss.write_index(index, "vector_store/product_index.faiss")

## Query + Retrieve Matches

In [None]:
# with open("vector_store/product_embeddings.pkl", "rb") as f:
#     asins, texts, embeddings = pickle.load(f)

# index = faiss.read_index("vector_store/product_index.faiss")

# query = "I want a shampoo and conditioner for oily scalp under $35"
# query_vec = model.encode([query]).astype("float32")

# D, I = index.search(query_vec, k=3)

# seen = set()
# for idx in I[0]:
#     if idx == -1 or idx in seen:
#         continue
#     seen.add(idx)
#     print(f"\n--- Match {idx} (ASIN: {asins[idx]}) ---\n")
#     print(texts[idx][:1000])  # Limit output length

#### Retrieve Top-K Relevant Products for a New Query
(The retrieval steps from the commented-out cell above, wrapped into a single clean block.)

In [None]:
query = "I want a shampoo for oily scalp under $35"
query_vec = model.encode([query]).astype("float32")

# Look up the 3 nearest product documents for the encoded query.
D, I = index.search(query_vec, k=3)

# Result slots holding -1 carry no match; drop them before indexing into the
# parallel `texts` / `asins` lists.
hit_ids = [i for i in I[0] if i != -1]
retrieved_texts = [texts[i] for i in hit_ids]
retrieved_asins = [asins[i] for i in hit_ids]

## Build Structured RAG Context from Retrieved Products

In [None]:
def build_structured_rag_context(asins, texts, meta_lookup, review_by_asin, max_products=3):
    """Render retrieved products as a structured text context for the LLM prompt.

    Args:
        asins: Product ids of the retrieved documents, best match first.
        texts: Retrieved document texts, parallel to ``asins``. Their content
            is not used; only their count limits how many products are
            rendered (the two sequences are zipped).
        meta_lookup: Mapping from product id to its metadata dict.
        review_by_asin: Mapping from product id to a list of review texts.
        max_products: Maximum number of products to include.

    Returns:
        One "=== PRODUCT ===" block per product (title, brand, price,
        description and up to 3 reviews), joined by "---" separator lines.
        An empty string when there is nothing to render.
    """
    context_blocks = []

    for asin, _text in zip(asins[:max_products], texts[:max_products]):
        product = meta_lookup.get(asin) or {}
        details = product.get("details")
        if not isinstance(details, dict):
            details = {}

        title = product.get("title") or "Unknown"
        # The per-product documents read the brand from `details` and also
        # record `store`, so look there too instead of only at a top-level key.
        # NOTE(review): falling back to `store` assumes the storefront name is
        # an acceptable stand-in for a missing brand - confirm.
        brand = (
            product.get("brand")
            or details.get("brand")
            or product.get("store")
            or "Unknown"
        )
        # A key that is present but null must not be rendered as "$None".
        price = product.get("price")
        if price is None:
            price = "N/A"

        description = product.get("description", "")
        if isinstance(description, list):
            description = " ".join(description)
        elif not isinstance(description, str):
            description = ""

        reviews = (review_by_asin.get(asin) or [])[:3]
        reviews_text = "\n".join(f"- {r}" for r in reviews)

        block = (
            "=== PRODUCT ===\n"
            f"Title: {title}\n"
            f"Brand: {brand}\n"
            f"Price: ${price}\n"
            f"Description: {description}\n"
            "User Reviews:\n"
            f"{reviews_text}\n"
        )
        context_blocks.append(block.strip())

    return "\n\n---\n\n".join(context_blocks)

In [None]:
# Lookup metadata by ASIN
# Index the metadata records by parent ASIN (records without one are skipped).
meta_lookup = dict(
    (p["parent_asin"], p) for p in sample_meta if "parent_asin" in p
)

# Render the retrieved products into the structured context for the prompt.
rag_context = build_structured_rag_context(
    asins=retrieved_asins,
    texts=retrieved_texts,
    meta_lookup=meta_lookup,
    review_by_asin=review_by_asin,
)


## Dynamic Category Matching Using the Query

In [None]:
# def get_best_category_match(query, category_list, cutoff=0.3):
#     query = query.lower()
#     matches = get_close_matches(query, category_list, n=1, cutoff=cutoff)
#     return matches[0] if matches else None

In [None]:
# # Collect all categories from metadata
# all_categories = set(cat for p in sample_meta for cat in p.get("categories", []))
# all_categories_lower = [cat.lower() for cat in all_categories]

# # Match query to best category
# best_match = get_best_category_match(query, all_categories_lower)

# # Build a lookup from ASIN to metadata
# asin_to_meta = {p["parent_asin"]: p for p in sample_meta if "parent_asin" in p}

# # Filter top-k results by category
# filtered_results = []
# for asin, text in zip(retrieved_asins, retrieved_texts):
#     product = asin_to_meta.get(asin)
#     if not product or not best_match:
#         continue
#     if best_match in [cat.lower() for cat in product.get("categories", [])]:
#         filtered_results.append((asin, text))

# # Use only filtered results to build the context
# rag_context = "\n\n---\n\n".join([text for _, text in filtered_results[:3]])

## Compose Prompt for the LLM Justifier

In [None]:
# Prompt template for the recommendation explanation. It interpolates the
# user's `query` and the structured `rag_context` built from the retrieved
# products. The text is provider-agnostic: use Together / DeepSeek or any
# LLM API you prefer.
prompt = f"""
You are a helpful assistant that explains which products best match the user's request. Use the provided product information and user reviews to justify your recommendation in natural language. Do not mention that you are an assistant or describe your reasoning process. Be friendly and concise.

User request: "{query}"

Product Information:
{rag_context}

Please write a clear and human-friendly explanation of which products are suitable and why. Avoid technical language or tags like <think>.
"""

## Call the LLM API

In [None]:
# The API key is read from the environment so it never appears in the notebook.
api_key = os.environ.get("TOGETHER_API_KEY")
client = Together(api_key=api_key)

# Send the prompt as a single user message and print the model's reply.
response = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
    messages=[dict(role="user", content=prompt)],
)
print(response.choices[0].message.content)

If you're looking for a shampoo to help with an oily scalp and you're on a budget of $35, I'd recommend checking out the Baxter of California Deep Clean Shampoo. It's priced at $29.75, which fits your budget, and users have reported that it does a great job of keeping their hair clean without leaving any buildup, which is especially helpful for oily hair. Plus, it smells wonderful and is safe for colored hair.

The J Beverly Hills Blue Everyday Moisturizing Shampoo is another option that's under $35, priced at $27. However, there aren't as many reviews available to confirm its effectiveness for oily scalps.

As for the KEVIN MURPHY Plumping Wash, it's unfortunately out of your budget at $81.49, so that might not be the best option for you.


## Deploy as a Gradio Web App

In [None]:
def recommend_products(query, top_k=3):
    """Retrieve the best-matching products for a query and explain them via the LLM.

    Relies on the notebook-level objects ``model``, ``index``, ``texts``,
    ``asins``, ``meta_lookup``, ``review_by_asin`` and ``client``.

    Args:
        query: Free-text description of what the user is looking for.
        top_k: Number of products to retrieve and put into the prompt. The
            default of 3 matches what the context builder actually used
            before, so no neighbours are fetched only to be thrown away.

    Returns:
        The LLM's explanation text, or a short notice when the query is empty
        or nothing could be retrieved.
    """
    # Guard against an empty submission: there is nothing meaningful to embed.
    if not query or not query.strip():
        return "Please describe the product you are looking for."

    query_vec = model.encode([query]).astype("float32")
    _distances, neighbours = index.search(query_vec, k=top_k)

    # Result slots holding -1 carry no match; skip them.
    hit_ids = [i for i in neighbours[0] if i != -1]
    if not hit_ids:
        return "Sorry, no matching products were found for that request."

    retrieved_texts = [texts[i] for i in hit_ids]
    retrieved_asins = [asins[i] for i in hit_ids]

    rag_context = build_structured_rag_context(
        retrieved_asins,
        retrieved_texts,
        meta_lookup,
        review_by_asin,
        max_products=top_k,
    )

    prompt = f"""
You are a helpful assistant that explains which products best match the user's request. Use the provided product information and user reviews to justify your recommendation in natural language. Do not mention that you are an assistant or describe your reasoning process. Be friendly and concise.

User request: "{query}"

Product Information:
{rag_context}

Please write a clear and human-friendly explanation of which products are suitable and why.
"""
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content


# def recommend_products(query):
#     rag_context = build_structured_rag_context(
#         retrieved_asins, retrieved_texts, meta_lookup, review_by_asin
#     )

#     # SAFETY / FALLBACK CHECK
#     if not rag_context.strip():
#         return (
#             "❗ We couldn’t find any perfect matches for your request. "
#             "Here are the most relevant products based on available reviews:\n\n"
#             f"{rag_context}"
#         )

#     # Otherwise continue as usual
#     prompt = f"""You are a helpful assistant that explains which products best match the user's request. Use the provided product information and user reviews to justify your recommendation in natural language. Do not mention that you are an assistant or describe your reasoning process. Be friendly and concise.

# Query:
# {query}

# Product Context (retrieved top matches):
# {rag_context}

# Please write a clear and human-friendly explanation of which products are suitable and why.
# """

#     response = client.chat.completions.create(
#         model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
#         messages=[{"role": "user", "content": prompt}]
#     )

#     return response.choices[0].message.content

## Create the Gradio Interface

In [None]:
import gradio as gr

# Colour overrides applied on top of the Soft theme.
theme_overrides = dict(
    body_background_fill="#232F3E",                # Amazon dark background
    body_text_color="#FFFFFF",
    button_primary_background_fill="#FF9900",      # Amazon orange
    button_primary_text_color="#000000",           # Black button text
    button_secondary_background_fill="#37475A",    # Darker gray secondary
    input_background_fill="#37475A",
    input_border_color="#FF9900",
    block_title_text_color="#FF9900",
    block_background_fill="#232F3E",
)

# Define theme
amazon_theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.orange,
    secondary_hue=gr.themes.colors.slate,
    neutral_hue=gr.themes.colors.gray,
).set(**theme_overrides)


def _reset_fields():
    """Return empty strings for the query box and the explanation box."""
    return "", ""


# Define interface: query input and buttons on the left, explanation on the right.
with gr.Blocks(theme=amazon_theme) as demo:
    gr.HTML(
        "<h2 style='text-align: center; color: #FF9900;'>🛒 JustifyMyBuy: <span style='color:white;'>Your AI Product Explainer</span></h2>"
    )

    gr.Markdown(
        "📦 *Note: This tool uses Amazon product listings and review data from 2023 to generate recommendations. Currently supports products in the Beauty & Personal Care category only.*"
    )

    with gr.Row():
        with gr.Column():
            user_query = gr.Textbox(
                label="Enter your product need",
                placeholder="e.g Lip balm under $10.",
            )
            submit_btn = gr.Button("Submit")
            clear_btn = gr.Button("Clear")

        with gr.Column():
            response = gr.Textbox(
                label="Recommended Products Explanation",
                lines=12,
            )

    # Submit runs the recommender; Clear empties both text boxes.
    submit_btn.click(fn=recommend_products, inputs=user_query, outputs=response)
    clear_btn.click(fn=_reset_fields, outputs=[user_query, response])

# Launch the app (share=True also requests a public share link).
demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


* Running on public URL: https://d173ab09671fb3f94f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


