In [9]:
import json, csv
from typing import List, Dict, Any, Optional, Tuple
import time
import requests
from bs4 import BeautifulSoup
import re

import numpy as np
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()  



In [7]:
# DO NOT RUN THIS - use uuid_list.csv, InventoryLookup.jsonl, and embedded_items.npy
# INITIAL CREATION + UPDATE OF ARTIFACTS

'''
# def write_json(path: str, lookup: Dict[str, Dict[str, Any]]) -> None:
#     with open(path, "w", encoding="utf-8") as f:
#         # write the entire lookup dict as a single JSON object
#         json.dump(lookup, f, ensure_ascii=False, indent=2)

# def write_csv(path: str, uuid_list: List[str]) -> None:
#    with open(path, "w", encoding="utf-8", newline="") as f:
#        w = csv.DictWriter(f, fieldnames=["row_id", "uuid"])
#        w.writeheader()
#        for i, u in enumerate(uuid_list):
#            w.writerow({"row_id": i, "uuid": u})

# def make_full_name(name, long_name):
#    if not long_name:   # catches "" and None
#        return name
#    return name + " - " + long_name

# def save_npy(path: str, matrix: np.ndarray) -> None:
#    np.save(path, matrix)
#    print(f"Saved matrix of shape {matrix.shape}, dtype {matrix.dtype} -> {path}")

# # open inventory database
# with open("inventory-5-31-25.json", "r", encoding="utf-8") as f:
#    inventory_json = json.load(f)   # inventory_json is already a list of dicts

# inventory_lookup = {}           # dict[uuid] -> {Name, Long name, Locations}
# uuid_list = []                  # list of uuids, in EXACT row order
# names_list = []                 # list of full_name strings, same order as UUID_list

# for item in inventory_json:
#     # inventory_json is a list of dicts, so item is a dict 

#     # Build fast lookup (N-length dict)
#     inventory_lookup[item["uuid"]] = {
#         "name": item["name"],
#         "long_name": item["long_name"],
#         "locations": item["locations"]  # keep structured as list of dicts
#     }

#     # Build index-aligned lists
#     uuid_list.append(item["uuid"])
#     names_list.append(make_full_name(item["name"], item["long_name"]))


# # Persist inventory_lookup 
# write_json("InventoryLookup.json", inventory_lookup)  

# # Persist uuid_list 
# write_csv("uuid_list.csv", uuid_list)

# # Vectorize names_list -> names_matrix (N x d), then L2-normalize rows
# names_matrix = embed_normalize(names_list)           # shape: N x d, d = 384

# # Persist names_matrix 
# save_npy("embedded_items.npy", names_matrix)
'''
# DO NOT RUN AGAIN
# UPDATE OF ARTIFACTS 

'''
# import json, csv

# # UPDATE INVENTORY LOOKUP TO INCLUDE DETAILS USEFUL FOR LLM 
# with open("data/inventory-5-31-25.json", "r", encoding="utf-8") as f:
#    inventory_json = json.load(f)   # inventory_json is already a list of dicts

# with open("artifacts/InventoryLookup.json", "r", encoding="utf-8") as f:
#     inventory_lookup = json.load(f)

# for item in inventory_json:
#     uuid = item.get("uuid")
#     role = item.get("role")
#     if uuid in inventory_lookup and role is not None:
#         inventory_lookup[uuid]["role"] = role

# with open("artifacts/InventoryLookup.json", "w", encoding="utf-8") as f:
#     json.dump(inventory_lookup, f, indent=2, ensure_ascii=False)

# # UPDATE UUID_LIST FOR MORE EFFICIENT STORAGE AND RETRIEVAL
# with open("artifacts/uuid_list.csv", "r", encoding="utf-8") as f:
#     reader = csv.reader(f)
#     uuid_list = [row[1] for row in reader]  # each row is a single uuid

# with open("artifacts/uuid_list.json", "w", encoding="utf-8") as f:
#     json.dump(uuid_list, f, indent=2, ensure_ascii=False)
'''

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Saved matrix of shape (1343, 384), dtype float32 -> embedded_items.npy


In [11]:
# LOAD DATA + ARTIFACTS

with open("artifacts/InventoryLookup.json", "r", encoding="utf-8") as f:
    inventory_lookup = json.load(f)

# Load uuid_list (list of uuids)
with open("artifacts/uuid_list.json", "r", encoding="utf-8") as f:
    uuid_list = json.load(f)

# Load names_matrix (numpy array of embeddings)
names_matrix = np.load("artifacts/embedded_items.npy")

with open("data/restocks-5-31-25.json", "r", encoding="utf-8") as f:
    restock_requests = json.load(f)

In [12]:
# FUNCTIONS 
# PARSING RESTOCK REQUESTS AND GETTING ITEM NAMES FROM LINKS

def _normalize_amazon_url(u: str) -> str:
    # strip trailing punctuation that often sneaks in from chat/markdown
    u = u.rstrip(").,;:]")
    # remove query params
    u = u.split("?", 1)[0]
    # collapse to canonical dp/ASIN form if present
    m = re.search(r"/dp/([A-Z0-9]{10})", u)
    if m:
        asin = m.group(1)
        return f"https://www.amazon.com/dp/{asin}"
    return u

_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

def _extract_title_from_html(html_text: str) -> Optional[str]:
    soup = BeautifulSoup(html_text, "html.parser")

    # 1) Amazon product title selectors
    for css in ["#productTitle", "#title", "span#productTitle"]:
        el = soup.select_one(css)
        if el:
            txt = el.get_text(strip=True)
            if txt:
                return txt

    # 2) Open Graph
    og = soup.find("meta", {"property": "og:title"})
    if og and og.get("content"):
        return og["content"].strip()

    # 3) Fallback <h1>
    h1 = soup.find("h1")
    if h1:
        txt = h1.get_text(strip=True)
        if txt:
            return txt

    # 4) Fallback <title>
    if soup.title:
        txt = soup.title.get_text(strip=True)
        if txt:
            return txt

    return None

def link_parser(urls: List[str], timeout: float = 10.0, sleep_sec: float = 0.4) -> List[str]:
    seen = set()
    titles: List[str] = []

    for raw_u in urls:
        u = _normalize_amazon_url(raw_u)
        if u in seen:
            continue
        seen.add(u)

        try:
            r = requests.get(u, headers=_HEADERS, timeout=timeout)
            # DEBUG: show status; uncomment next line to dump HTML if needed
            # print("DEBUG status:", r.status_code, "URL:", u)
            if r.status_code != 200 or not r.text:
                time.sleep(sleep_sec); continue

            title = _extract_title_from_html(r.text)
            if title:
                # Avoid adding useless 'Amazon.com' fallback as a signal
                if title.lower().strip() == "amazon.com":
                    # treat as failure; skip adding
                    pass
                else:
                    titles.append(title)

        except requests.RequestException as e:
            # DEBUG: print(e)  # uncomment if you want to see the error
            pass

        time.sleep(sleep_sec)

    return titles

def link_find_split(user_request: str) -> Tuple[str, List[str]]:
    url_pattern = re.compile(r"https?://\S+")
    urls = url_pattern.findall(user_request)
    user_request_text = url_pattern.sub("", user_request).strip()

    link_titles = link_parser(urls)
    return user_request_text, link_titles

In [13]:
# FUNCTIONS 
# EMBEDDING THE RESTOCK REQUEST + COMPARING TO INVENTORY 

def embed_normalize(texts: List[str], batch_size: int = 128) -> np.ndarray:
   emb = model.encode(
       texts,
       batch_size=batch_size,
       convert_to_numpy=True,
       normalize_embeddings=False,  # we’ll handle normalization ourselves
       show_progress_bar=True,
   ).astype(np.float32, copy=False)
   # L2-normalize rows
   norms = np.linalg.norm(emb, axis=1, keepdims=True)
   norms[norms == 0] = 1.0  # avoid division by zero
   emb = emb / norms
   return emb

def find_five_inventory_match(
    extracted_vector: np.ndarray,
    names_matrix: np.ndarray,
    uuid_list: List[str],
    inventory_lookup: Dict[str, Dict],
) -> List[Dict]:
    scores = names_matrix @ extracted_vector  # shape (N,)
    top_idx = np.argpartition(scores, -5)[-5:]
    top_idx = top_idx[np.argsort(scores[top_idx])[::-1]]  # sort descending

    matches: List[Dict] = []
    for i in top_idx:
        uuid = uuid_list[i]
        item = inventory_lookup[uuid]

        # Shallow copy (top-level only). Nested structures (like locations list) are shared.
        d = item.copy()

        # Attach retrieval metadata
        d["uuid"] = uuid
        d["score"] = float(scores[i])

        matches.append(d)

    return matches



In [16]:
# # FUNCTIONS 
# # SETTING UP LLM TO PROCESS SINGLE RESTOCK REQUEST + CANDIDATES 

# _oai_client = OpenAI(api_key="sk-")

# AREA_LABELS = [
#     "Cabinet 1","Cabinet 2","Cabinet 3", "Cabinet 4", "Cabinet 5","Cabinet 6","Cabinet 7", "Cabinet 9", "Cabinet 11", "Cabinet 15", "Pegboard 1","Fabric", "LFP", "Laser3D",
#     "Electronics", "Studio", "Spraypaint", "Cage/Crypt/Other"
# ]

# def llm_match(user_request_text: str,
#               links_items_list: List[str],
#               five_possible_matches: List[Dict]) -> Tuple[str, str]:

#     # Ensure scores are JSON-serializable
#     for c in five_possible_matches or []:
#         if "score" in c:
#             c["score"] = float(c["score"])

#     # --- System instructions ---
#     system_instructions = (
#         "You label makerspace restock requests.\n"
#         "TASK:\n"
#         "1) Pick exactly ONE inventory item from the provided CANDIDATES by its 'uuid'.\n"
#         "2) Assign exactly ONE area label from:\n"
#         f" {AREA_LABELS}.\n\n"
#         "INPUT FIELDS YOU WILL RECEIVE:\n"
#         "- user_request_text: free-text from the steward (links removed)\n"
#         "- links_items_list: list of product titles parsed from any links\n"
#         "- five_possible_matches: array of objects with {uuid, full_name, locations, score}\n\n"
#         "RULES:\n"
#         "- Only select a uuid that appears in three_possible_matches.\n"
#         "- If one of the candidate items is only in the Cage or Crypt, it is most likely not the correct option.\n"
#         "- If links_items_list contradict user_request_text, favor links_items_list unless clearly off-topic.\n"
#         "- If candidates are near-ties, prefer the higher score.\n"
#         "- Area mapping guidance:\n"
#         " • If any location.container contains 'Cabinet', map to the number cabinet as listed in Area Labels\n"
#         " • Map location.room (case-insensitive):\n"
#         "   - 'fabric' → 'Fabric'\n"
#         "   - 'lfp' → 'LFP'\n"
#         "   - '3d printer room' / '3d printers' → '3D Printer Room'\n"
#         "   - 'electronics' → 'Electronics'\n"
#         "   - 'studio' → 'Studio'\n"
#         "   - 'spraypaint' / 'spray booth' → 'Spraypaint'\n"
#         "   - 'cage', 'crypt', 'main', 'shop', 'other' → 'Cage/Crypt/Other'\n\n"
#         "OUTPUT:\n"
#         "Return ONLY this JSON (no prose):\n"
#         '{"item_uuid":"<candidate uuid>","item_location":"<one allowed area>"}'
#     )

#     user_payload = {
#         "user_request_text": user_request_text or "",
#         "links_items_list": list(links_items_list or []),
#         "five_possible_matches": five_possible_matches or [],
#     }

#     schema = {
#         "type": "object",
#         "properties": {
#             "item_uuid": {"type": "string"},
#             "item_location": {"type": "string", "enum": AREA_LABELS},
#         },
#         "required": ["item_uuid", "item_location"],
#         "additionalProperties": False,
#     }

#     try:
#         # ---- CALL THE CHAT COMPLETIONS API ----
#         # NOTE: 'response_format={"type": "json_object"}' instructs the model to return JSON,
#         # but the JSON still arrives in choices[0].message.content (string), not as 'output_text'.
#         resp = _oai_client.chat.completions.create(
#             model="gpt-4o-mini",
#             response_format={"type": "json_object"},
#             messages=[
#                 {"role": "system", "content": system_instructions},
#                 {"role": "user", "content": json.dumps(user_payload, ensure_ascii=False)},
#             ],
#             temperature=0,
#         )

#         # --- Unified extraction of raw text (handles both APIs if you later switch back) ---
#         raw: Optional[str] = getattr(resp, "output_text", None)  # Responses API
#         if not raw:
#             # Chat Completions API
#             raw = (resp.choices[0].message.content or "").strip()

#         if DEBUG_LLM:
#             print("[LLM RAW][:300]:", (raw or "")[:300])

#         if not raw:
#             raise ValueError("Empty LLM response")

#         # --- Parse JSON robustly ---
#         # 1) Try direct JSON
#         try:
#             data = json.loads(raw)
#         except json.JSONDecodeError:
#             # 2) Fallback: extract first {...} block
#             m = re.search(r"\{[\s\S]*\}", raw)
#             if not m:
#                 raise
#             data = json.loads(m.group(0))

#         if DEBUG_LLM:
#             print("[LLM PARSED]:", data)

#         cand_uuids = {c.get("uuid") for c in (three_possible_matches or []) if c.get("uuid")}
#         item_uuid = (data or {}).get("item_uuid")
#         item_location = (data or {}).get("item_location")

#         # --- Validate against candidates and allowed labels ---
#         if item_uuid in cand_uuids and item_location in AREA_LABELS:
#             if DEBUG_LLM:
#                 print("[LLM OK]:", item_uuid, "→", item_location)
#             return item_uuid, item_location

#         if DEBUG_LLM:
#             print(
#                 "[VALIDATION FAIL]:",
#                 item_uuid,
#                 item_location,
#                 "; cand_uuids sample:",
#                 list(cand_uuids)[:3],
#             )

#     except Exception as e:
#         print("[LLM ERROR]:", repr(e))  # <-- do not silently pass

#     # Sensible fallback
#     return "", "Cage/Crypt/Other"



_oai_client = OpenAI()

# Expect these to exist in your module
AREA_LABELS = [
   "Cabinet 1","Cabinet 2","Cabinet 3", "Cabinet 4", "Cabinet 5","Cabinet 6","Cabinet 7", "Cabinet 9", "Cabinet 11", "Cabinet 15", "Pegboard 1","Fabric", "LFP", "Laser3D",
   "Electronics", "Studio", "Spraypaint", "Cage/Crypt/Other"
]

LOCATION_GUIDE = {
    "Cabinet 3": ["jewelry","wire","chain","beads"],
    "Cabinet 4": ["paint (not spraypaint)","acrylic","tempera","gouache","watercolor"],
    "Cabinet 5": ["glue","adhesive","tape","masking tape","painter's tape"],
    "Cabinet 6": ["ribbon"],
    "Cabinet 7": ["paper","cardstock","construction paper","origami paper"],
    "Cabinet 9": ["leather","wood carving","clay","linoleum tools","stamp carving"],
    "Cabinet 11": ["hand tools","pliers","wrenches","screwdrivers","nuts","bolts","sandpaper"],
    "Fabric": ["fabric bolts","stuffing","stabilizer","yarn","felt","sewing fabric"],
    "Pegboard 1": ["safety pins","needles","sewing machine bits","bobbin","thimble"],
    "Laser3D": ["PLA","filament","3d printer","nozzle","laser wood","laser sheet"],
    "Spraypaint": ["spray paint","aerosol","sealant","clear coat"],
    "LFP": ["printer","ink","toner","plotter","large roll paper","rolls"]
}

def llm_match(
    comment_text: str,
    extracted_item_names: List[str],
    five_candidates: List[Dict],
) -> Tuple[str, str, str]:
    """
    Area-first selector.
    Returns (item_uuid, item_name, item_location).
    Falls back to ("", "", "Cage/Crypt/Other") on any error.
    """
    # ensure scores are JSON-serializable
    for c in five_candidates or []:
        if "score" in c:
            c["score"] = float(c["score"])

        # COMMENT OUT LATER   
        print(f"Candidate: {c.get('full_name') or c.get('name')} — {c.get('locations')}")

    # --- SYSTEM INSTRUCTIONS (explicit and strict) ---
    system_instructions = (
        "You label Makerspace restock requests.\n"
        "\n"
        "PRIMARY GOAL:\n"
        "• The most important choice is the LOCATION (area). The exact item is secondary.\n"
        "\n"
        "INPUTS (as JSON):\n"
        "• REQUEST: {comment, extracted_item_names}\n"
        "• CANDIDATES: exactly 5 items, each with {item_uuid, name, role (M|T), locations, cosine_score}\n"
        "• ALLOWED_AREAS: the only valid location labels you may output\n"
        "• LOCATION_GUIDE: advisory hints describing what tends to live in each area\n"
        "\n"
        "DECISION POLICY (follow in order):\n"
        "1) Choose the LOCATION strictly from ALLOWED_AREAS.\n"
        "   - Use LOCATION_GUIDE strongly to interpret the request.\n"
        "   - De-emphasize cosine_score; do not rely on it heavily.\n"
        "   - Prefer MATERIALS over TOOLS when ambiguous (role 'M' > 'T').\n"
        "   - If multiple candidates are similar and cluster in one area, that area becomes more likely.\n"
        "2) After choosing the LOCATION, pick the ITEM from among the 5 candidates that belongs to that LOCATION.\n"
        "   - If NONE of the 5 candidates are in the chosen LOCATION, leave item_uuid and name as empty strings.\n"
        "\n"
        "OUTPUT SPECIFICATION (STRICT):\n"
        "• Respond with exactly one JSON object (no extra text) with these keys ONLY:\n"
        "  {\"item_uuid\":\"<uuid or empty>\",\"name\":\"<candidate name or empty>\",\"location\":\"<one of ALLOWED_AREAS>\"}\n"
        "• Do not invent keys, do not output markdown or explanations."
    )

    # --- USER PAYLOAD (single source of truth for areas & guide) ---
    payload = {
        "REQUEST": {
            "comment": comment_text or "",
            "extracted_item_names": list(extracted_item_names or []),
        },
        "CANDIDATES": [
            {
                "item_uuid": c.get("uuid") or c.get("item_uuid") or "",
                "name": c.get("full_name") or c.get("name") or "",
                "role": c.get("role") or c.get("kind") or "",     # 'M' or 'T' (materials > tools)
                "locations": c.get("locations"),                   # pass raw locations through
                "cosine_score": c.get("score", None),
            }
            for c in (five_candidates or [])
        ],
        "ALLOWED_AREAS": AREA_LABELS,
        "LOCATION_GUIDE": LOCATION_GUIDE,
    }

    try:
        resp = _oai_client.chat.completions.create(
            model="gpt-4o-mini",
            response_format={"type": "json_object"},   # enforce JSON-only reply
            messages=[
                {"role": "system", "content": system_instructions},
                {"role": "user", "content": json.dumps(payload, ensure_ascii=False)},
            ],
            temperature=0,
        )

        raw: Optional[str] = getattr(resp, "output_text", None) or (resp.choices[0].message.content or "").strip()
        if not raw:
            raise ValueError("Empty LLM response")

        data = json.loads(raw)
        item_uuid = (data or {}).get("item_uuid") or ""
        item_name = (data or {}).get("name") or ""
        item_location = (data or {}).get("location") or ""

        # Validate location against allowed set
        if item_location not in AREA_LABELS:
            raise ValueError(f"Invalid location: {item_location}")

        # If a UUID is present, ensure it came from the 5; otherwise clear both uuid & name
        cand_by_uuid = {
            (c.get("uuid") or c.get("item_uuid")): c
            for c in (five_candidates or [])
            if (c.get("uuid") or c.get("item_uuid"))
        }
        if item_uuid:
            if item_uuid not in cand_by_uuid:
                item_uuid, item_name = "", ""
            else:
                # Canonicalize name from candidate record (source of truth)
                src = cand_by_uuid[item_uuid]
                canonical = src.get("full_name") or src.get("name") or item_name
                item_name = canonical

        return item_uuid, item_name, item_location

    # except Exception:
    #     return "", "", "Cage/Crypt/Other"

    except Exception as e:
        print("[LLM ERROR]", repr(e))
        if raw:
            # show a helpful snippet of what the model actually returned
            print("[LLM RAW][:800]:", raw[:800])
        # show quick context to debug common issues
        try:
            missing = set(LOCATION_GUIDE.keys()) - set(AREA_LABELS)
            if missing:
                print("[HINT] LOCATION_GUIDE keys missing from AREA_LABELS:", sorted(missing))
        except Exception:
            pass
        print("[HINT] Candidate UUIDs:", [c.get("uuid") or c.get("item_uuid") for c in (five_candidates or [])])
        return "", "", "Cage/Crypt/Other"


In [17]:
# MAIN PROGRAM - 'for' LOOP GOING OVER ALL RESTOCKS AND CREATING DATA FOR GRAPH

restocks_final_data = []

i = 0
for req in restock_requests:
    i += 1 
    if i == 3:
        break
    
    #  Skip unapproved requests
    # if req["is_approved"] is False:
    #     continue

    print ("\n *********************************************************** \n ")

    # Split text vs links
    user_request_text, links_items_list = link_find_split(req["item"])
    print ("\n text = ", user_request_text)
    print ("\n link items = ", links_items_list)
    
    # Build query string for embedding
    if len(links_items_list) != 0:
        # join all link titles into one string 
        extracted_vector = embed_normalize(["".join(links_items_list)])[0]
    else:
        extracted_vector = embed_normalize([user_request_text])[0]

    # Find top 5 inventory matches
    five_possible_matches = find_five_inventory_match(extracted_vector, names_matrix, uuid_list, inventory_lookup)

    final_item_uuid, final_item_name, final_item_location = llm_match(user_request_text, links_items_list, five_possible_matches)

    print ("LLM returns: \n", final_item_name, "\n", final_item_location)

    # Collect results (you can expand tuple into dict later for CSV/graph)
    restocks_final_data.append([final_item_uuid, final_item_name, final_item_location, req["timestamp_sent"], req["timestamp_completed"]])



 *********************************************************** 
 

 text =  

 link items =  ['Gute 10Pcs Cast Acrylic Sheet,Colored Acrylic Sheet Plastic Sheet 8 x 8 Inch (.118" Thick) for Signs, DIY Display Projects, Craft and Easy to Cut（10 Colors,Transparent）']


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.91it/s]


Candidate: Plastic Sheeting — [{'room': 'Cage', 'container': '1a', 'specific': ''}]
Candidate: Printer — [{'room': 'Cage', 'container': '8d', 'specific': None}]
Candidate: Iron Rest Pad — [{'room': 'Cage', 'container': '3b', 'specific': ''}]
Candidate: Black Acrylic Paint — [{'room': 'Cage', 'container': '5c', 'specific': 'Stack 21|-3'}]
Candidate: Laminator Sheets (Business Card Size) — [{'room': 'Cage', 'container': '1b', 'specific': ''}]
[LLM ERROR] AuthenticationError("Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-.... You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}")


UnboundLocalError: cannot access local variable 'raw' where it is not associated with a value

In [16]:
print (len(restocks_final_data))

160


In [18]:
import os
print(os.getenv("OPENAI_API_KEY"))

sk-...
