# WeaveForward — Web Scraper Extraction

**Purpose:** Scrape fiber-composition and clothing-type data directly from the
product pages of **25 sample Philippine clothing brands** and **25 sample multinational
fashion retailers**, parse fiber `%` values, compute EU Ecodesign
biodegradability tiers, and build a `BRAND_FIBER_LOOKUP` table.

**Outputs written to `data/webscraped_data/`:**
- `YYYYMMDD-HHMMSS-webscraped_catalog.csv` — timestamped full product catalog
- `webscraped_catalog.csv` — stable alias (always the latest run)

**Outputs written to `data/processed/`:**
- `YYYYMMDD-HHMMSS-brand_fiber_lookup.json` — timestamped brand fiber profile
- `brand_fiber_lookup.json` — stable alias

In [8]:
import subprocess, sys
from pathlib import Path

PACKAGES = [
    "requests",
    "beautifulsoup4",
    "lxml",
    "pandas",
    "numpy",
    "tqdm",
]
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *PACKAGES])

# ── Path definitions ──────────────────────────────────────────────────────
ROOT          = Path("..").resolve()
DATA_DIR      = ROOT / "data"
PROC_DIR      = DATA_DIR / "processed"
WEB_DIR       = DATA_DIR / "webscraped_data"

for d in [PROC_DIR, WEB_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("✓ environment ready")
print(f"  ROOT    : {ROOT}")
print(f"  WEB_DIR : {WEB_DIR}")
print(f"  PROC_DIR: {PROC_DIR}")


✓ environment ready
  ROOT    : /Users/gyalm/Desktop/WeaveForward_FiberClassificationML
  WEB_DIR : /Users/gyalm/Desktop/WeaveForward_FiberClassificationML/data/webscraped_data
  PROC_DIR: /Users/gyalm/Desktop/WeaveForward_FiberClassificationML/data/processed



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/opt/python@3.11/bin/python3.11 -m pip install --upgrade pip[0m


In [9]:
import json, re, time, random, datetime
from urllib.parse import urlparse, urljoin
import numpy as np
import pandas as pd
import requests
from bs4         import BeautifulSoup
from tqdm        import tqdm
from pathlib     import Path

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
SCRAPE_TIMEOUT  = 12   # seconds per request
SCRAPE_DELAY    = 1.2  # polite delay between requests

print("✓ imports ready")


✓ imports ready


---
## 1-A · Brand Catalogue — Philippine & Multinational Retailers

Each brand entry holds only the **root / homepage URL**.  
`discover_collection_urls()` (1-B-3) crawls that root to find all clothing-category
and collection sub-pages automatically, so no paths are hardcoded.

In [10]:
# ── 1-A  Brand catalogue ──────────────────────────────────────────────────
# Each entry: (brand_name, root_url, source_region, country)
# root_url is the brand homepage or top-level shop root.
# Collection sub-pages are discovered automatically by discover_collection_urls().

PH_BRANDS = [
    ("Bench",              "https://www.bench.com.ph",               "philippine", "PH"),
    ("Plains & Prints",    "https://www.plainsandprints.com",        "philippine", "PH"),
    ("Penshoppe",          "https://www.penshoppe.com",              "philippine", "PH"),
    ("Folded & Hung",      "https://www.foldedandhung.com.ph",       "philippine", "PH"),
    ("Bayo",               "https://www.bayo.com.ph",                "philippine", "PH"),
    ("Kashieca",           "https://www.kashieca.com",               "philippine", "PH"),
    ("Human",              "https://www.human.com.ph",               "philippine", "PH"),
    ("Oxygen",             "https://www.oxygenonline.com.ph",        "philippine", "PH"),
    ("Something Borrowed", "https://www.somethingborrowed.ph",       "philippine", "PH"),
    ("Artwork Studio",     "https://www.artworkstudio.com.ph",       "philippine", "PH"),
    ("Regatta",            "https://www.regatta.com.ph",             "philippine", "PH"),
    ("Caramelo",           "https://www.caramelo.ph",                "philippine", "PH"),
    ("Dimensione",         "https://www.dimensione.com.ph",          "philippine", "PH"),
    ("Hnos",               "https://www.hnos.ph",                    "philippine", "PH"),
    ("Island Souvenirs",   "https://www.islandsouvenirs.com.ph",     "philippine", "PH"),
    ("Freego",             "https://www.freego.com.ph",              "philippine", "PH"),
    ("Tyler Fashion",      "https://www.tylerfashion.com.ph",        "philippine", "PH"),
    ("Tomato",             "https://www.tomato.ph",                  "philippine", "PH"),
    ("Love Bonito PH",     "https://lovebonito.com",                 "philippine", "PH"),
    ("Giordano PH",        "https://www.giordano.com/ph",            "philippine", "PH"),
    ("Muji PH",            "https://www.muji.com/ph",                "philippine", "PH"),
    ("Koton PH",           "https://www.koton.com/ph/en",            "philippine", "PH"),
    ("Terranova PH",       "https://www.terranovastyle.com/ph/en",   "philippine", "PH"),
    ("The Varsity PH",     "https://www.thevarsity.ph",              "philippine", "PH"),
    ("Stradivarius PH",    "https://www.stradivarius.com/ph",        "philippine", "PH"),
]

MULTINATIONAL_BRANDS = [
    ("H&M",              "https://www2.hm.com/en_ph",                "multinational", "SE"),
    ("Zara",             "https://www.zara.com/ph/en",               "multinational", "ES"),
    ("Uniqlo",           "https://www.uniqlo.com/ph/en",             "multinational", "JP"),
    ("Nike",             "https://www.nike.com/ph",                  "multinational", "US"),
    ("Adidas",           "https://www.adidas.com.ph",                "multinational", "DE"),
    ("Levi's",           "https://www.levi.com/PH/en_PH",           "multinational", "US"),
    ("Mango",            "https://shop.mango.com/ph",                "multinational", "ES"),
    ("Pull&Bear",        "https://www.pullandbear.com/ph",           "multinational", "ES"),
    ("Bershka",          "https://www.bershka.com/ph",               "multinational", "ES"),
    ("Massimo Dutti",    "https://www.massimodutti.com/ph",          "multinational", "ES"),
    ("Tommy Hilfiger",   "https://www.tommy.com/en",                 "multinational", "US"),
    ("Calvin Klein",     "https://www.calvinklein.us",               "multinational", "US"),
    ("Lacoste",          "https://www.lacoste.com/ph",               "multinational", "FR"),
    ("Guess",            "https://www.guess.com/en",                 "multinational", "US"),
    ("Lululemon",        "https://www.lululemon.com",                "multinational", "CA"),
    ("Patagonia",        "https://www.patagonia.com",                "multinational", "US"),
    ("The North Face",   "https://www.thenorthface.com/en-us",       "multinational", "US"),
    ("Under Armour",     "https://www.underarmour.com/en-ph",        "multinational", "US"),
    ("Puma",             "https://ph.puma.com/ph/en",                "multinational", "DE"),
    ("New Balance",      "https://www.newbalance.com.ph",            "multinational", "US"),
    ("Columbia",         "https://www.columbia.com",                 "multinational", "US"),
    ("Everlane",         "https://www.everlane.com",                 "multinational", "US"),
    ("Gap",              "https://www.gap.com",                      "multinational", "US"),
    ("Ralph Lauren",     "https://www.ralphlauren.com",              "multinational", "US"),
    ("Banana Republic",  "https://bananarepublic.gap.com",           "multinational", "US"),
]

ALL_BRANDS = PH_BRANDS + MULTINATIONAL_BRANDS
print(f"✓ {len(PH_BRANDS)} Philippine brands + {len(MULTINATIONAL_BRANDS)} multinational = {len(ALL_BRANDS)} total")

✓ 25 Philippine brands + 25 multinational = 50 total


---
## 1-B · Live Scraper — requests + BeautifulSoup

### 1-B-1 · Fiber Pattern Parser & Clothing-Type Detector


In [11]:
# ── Regex & mapping constants ─────────────────────────────────────────────
# Fiber composition regex — matches patterns like "95% Cotton" or "100% Polyester"
FIBER_RE = re.compile(
    r'(\d{1,3})\s*%\s*(cotton|polyester|nylon|wool|linen|silk|rayon|viscose|'
    r'acrylic|elastane|spandex|lycra|modal|bamboo|hemp|denim|cashmere|tencel|'
    r'lyocell|polyamide|polypropylene|microfibre|microfiber|fleece|acetate|'
    r'angora|mohair|alpaca)',
    re.IGNORECASE,
)

# Keyword → canonical clothing category mapping
CLOTHING_TYPE_MAP = {
    "dress": "dress",       "dresses": "dress",
    "top": "top",           "tops": "top",
    "shirt": "shirt",       "shirts": "shirt",      "blouse": "shirt",
    "tee": "t-shirt",       "t-shirt": "t-shirt",   "tshirt": "t-shirt",
    "polo": "polo",
    "pants": "pants",       "trousers": "pants",    "slacks": "pants",
    "skirt": "skirt",       "skirts": "skirt",
    "jacket": "jacket",     "coat": "jacket",
    "jeans": "jeans",       "denim": "jeans",
    "shorts": "shorts",
    "suit": "suit",         "blazer": "blazer",
    "swimwear": "swimwear", "swim": "swimwear",
    "underwear": "underwear", "bra": "underwear",
    "hoodie": "hoodie",     "sweatshirt": "hoodie",
    "knit": "knitwear",     "sweater": "knitwear",  "pullover": "knitwear",
    "activewear": "activewear", "leggings": "activewear",
}

print("✓ FIBER_RE and CLOTHING_TYPE_MAP ready")


✓ FIBER_RE and CLOTHING_TYPE_MAP ready


In [12]:
def detect_clothing_type(text: str, url: str = "") -> str:
    """Infer clothing category from page text + URL keywords."""
    combined = (text + " " + url).lower()
    for kw, cat in CLOTHING_TYPE_MAP.items():
        if kw in combined:
            return cat
    return "unspecified"


print("✓ detect_clothing_type() ready")


✓ detect_clothing_type() ready


In [13]:
def parse_fiber_composition(text: str) -> dict:
    """
    Extract { fiber: percentage } from raw HTML page text.
    Normalises synonyms (spandex→elastane, polyamide→nylon, etc.).
    Rescales totals that are within ±15 % of 100.
    """
    fibers: dict = {}
    for m in FIBER_RE.finditer(text):
        pct = float(m.group(1))
        fib = (m.group(2).lower()
               .replace("spandex",   "elastane")
               .replace("lycra",     "elastane")
               .replace("polyamide", "nylon")
               .replace("microfiber","microfibre"))
        fibers[fib] = fibers.get(fib, 0.0) + pct
    total = sum(fibers.values())
    if total > 0 and abs(total - 100) <= 15:
        fibers = {k: round(v / total * 100, 2) for k, v in fibers.items()}
    return fibers


print("✓ parse_fiber_composition() ready")


✓ parse_fiber_composition() ready


### 1-B-2 · Page Scraper Function


In [14]:
def scrape_page(url: str, brand: str, source: str, country: str) -> list[dict]:
    """
    Fetch one collection/listing page, crawl up to 15 product detail links,
    and extract fiber composition + clothing type from each.

    Strategy:
      1. GET the listing page.
      2. Collect <a href> links containing product-path keywords
         (/products/, /p/, /item/, /detail, /pd/).
      3. Visit each product page; run parse_fiber_composition() on full text.
      4. Skip pages that yield no fiber data.

    Returns a list of record dicts (empty list on any hard failure).
    """
    records = []
    try:
        resp = requests.get(url, headers=HEADERS, timeout=SCRAPE_TIMEOUT)
        if resp.status_code != 200:
            return []
        soup = BeautifulSoup(resp.text, "lxml")

        product_links: set = set()
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if any(kw in href.lower() for kw in
                   ["/products/", "/p/", "/item/", "/detail", "/pd/"]):
                if href.startswith("http"):
                    product_links.add(href)
                elif href.startswith("/"):
                    product_links.add(urljoin(url, href))

        pages_to_parse = list(product_links)[:15] if product_links else [url]

        for purl in pages_to_parse:
            try:
                time.sleep(SCRAPE_DELAY + random.uniform(0, 0.5))
                p_resp = requests.get(purl, headers=HEADERS, timeout=SCRAPE_TIMEOUT)
                if p_resp.status_code != 200:
                    continue
                p_soup    = BeautifulSoup(p_resp.text, "lxml")
                page_text = p_soup.get_text(" ", strip=True)

                fibers = parse_fiber_composition(page_text)
                if not fibers:
                    continue

                name = ""
                for sel in ["h1", ".product-title", ".product-name",
                            '[data-testid="product-name"]', ".pdp-title"]:
                    node = p_soup.select_one(sel)
                    if node and node.get_text(strip=True):
                        name = node.get_text(strip=True)[:120]
                        break

                # Capture a short snippet of raw fabric text for auditability
                fab_text = ""
                for m in FIBER_RE.finditer(page_text):
                    start    = max(0, m.start() - 60)
                    end      = min(len(page_text), m.end() + 60)
                    fab_text = page_text[start:end].strip()[:200]
                    break

                records.append({
                    "brand":               brand,
                    "product_name":        name or brand,
                    "clothing_type":       detect_clothing_type(page_text, purl),
                    "fabric_composition":  fab_text,
                    "fiber_json":          json.dumps(fibers),
                    "most_dominant_fiber": max(fibers, key=fibers.get),
                    "source":              source,
                    "country_of_brand":    country,
                    "scraped_url":         purl,
                    "scraped_at":          datetime.datetime.utcnow().isoformat(),
                    "origin":              "live",
                })
            except Exception:
                continue
    except Exception:
        pass   # silently skip — seed catalog covers missing brands
    return records


print("✓ scrape_page() ready")


✓ scrape_page() ready


### 1-B-3 · Collection URL Discovery

`discover_collection_urls(root_url)` crawls the brand's homepage and returns
all clothing-category / collection listing pages it finds in `<a href>` links —
no hardcoded paths required. The run-scraper cell feeds these discovered URLs
into `scrape_page()` instead of a single fixed URL.

In [15]:
# ── Private regex constants for discover_collection_urls() ────────────────
# Path segments that indicate a clothing collection / category page
_COLLECTION_RE = re.compile(
    r'/('
    r'collections?|categories?|category|'
    r'clothing|clothes|apparel|'
    r'c/|browse/|shop/|products?/|'
    r'women|mens?|ladies|girls?|boys?|kids?|'
    r'tops?|shirts?|blouses?|dresses?|skirts?|'
    r'pants?|trousers?|jeans?|bottoms?|'
    r'jackets?|coats?|outerwear|knitwear|sweaters?|hoodies?|'
    r'activewear|swimwear|underwear|lingerie'
    r')',
    re.IGNORECASE,
)

# Path segments that are definitely NOT product listings
_SKIP_RE = re.compile(
    r'/(cart|checkout|account|login|register|search|wishlist|'
    r'faq|about|contact|blog|press|careers?|stores?|'
    r'sustainability|privacy|returns?|size.?guide|gift)',
    re.IGNORECASE,
)

print("✓ _COLLECTION_RE and _SKIP_RE ready")


✓ _COLLECTION_RE and _SKIP_RE ready


In [16]:
def discover_collection_urls(root_url: str, max_collections: int = 8) -> list[str]:
    """
    Crawl root_url (brand homepage) and return up to max_collections
    clothing-category / collection listing URLs found in <a href> links.

    Strategy:
      1. GET root_url.
      2. Walk every <a href> — resolve relative URLs, keep same-domain only.
      3. Filter by _COLLECTION_RE and exclude _SKIP_RE noise links.
      4. Sort by number of clothing keywords in the path (most specific first).
      5. Return up to max_collections unique URLs.
      6. Fallback: if nothing is found, return [root_url] so scrape_page()
         still has something to work with.
    """
    parsed_root = urlparse(root_url)
    base        = f"{parsed_root.scheme}://{parsed_root.netloc}"

    try:
        resp = requests.get(root_url, headers=HEADERS, timeout=SCRAPE_TIMEOUT)
        if resp.status_code != 200:
            return [root_url]
        soup = BeautifulSoup(resp.text, "lxml")
    except Exception:
        return [root_url]

    seen:       set  = set()
    candidates: list = []

    for a in soup.find_all("a", href=True):
        raw_href = a["href"].strip()
        if not raw_href or raw_href.startswith(("#", "javascript", "mailto")):
            continue

        # Resolve to absolute URL; strip query-string and trailing slash
        full = (raw_href if raw_href.startswith("http") else urljoin(base, raw_href))
        full = full.split("?")[0].split("#")[0].rstrip("/")

        # Keep same domain only
        if urlparse(full).netloc != parsed_root.netloc:
            continue

        path = urlparse(full).path.lower()

        if _SKIP_RE.search(path):
            continue
        if not _COLLECTION_RE.search(path):
            continue
        if full in seen:
            continue

        seen.add(full)
        candidates.append(full)

    # Rank: paths that contain more clothing-type keywords score higher
    _RANK_WORDS = {
        "tops", "shirts", "dresses", "clothing", "collections",
        "women", "men", "ladies", "apparel", "clothes",
    }
    candidates.sort(
        key=lambda u: sum(w in u.lower() for w in _RANK_WORDS),
        reverse=True,
    )

    result = candidates[:max_collections]
    if not result:
        result = [root_url]

    return result


print("✓ discover_collection_urls() ready")


✓ discover_collection_urls() ready


### 1-B-4 · Run Live Scraper

For each brand, `discover_collection_urls()` crawls the root homepage and
returns up to 8 clothing-category pages. `scrape_page()` is then called on
each discovered page to harvest product detail links and extract fiber data.  
Brands whose sites require JavaScript or block requests will yield 0 rows.

> **Testing mode:** Currently running with **2 Philippine + 2 multinational brands** (`TEST_BRANDS`).  
> To run the full 50-brand scrape, replace `TEST_BRANDS` with `ALL_BRANDS` in the scraper cell below.


In [20]:
# ── 1-B-4  Run live scraper across brand root URLs ────────────────────────
#
# TEST_MODE   = True  → scrape 2 PH + 2 multinational brands only
# TEST_MODE   = False → scrape all 50 brands (full production run)
#
TEST_MODE   = True
TEST_BRANDS = PH_BRANDS[:2] + MULTINATIONAL_BRANDS[:2]
RUN_BRANDS  = TEST_BRANDS if TEST_MODE else ALL_BRANDS

if TEST_MODE:
    print(f"⚠ TEST MODE — scraping {len(RUN_BRANDS)} brands "
          f"({len(PH_BRANDS[:2])} PH + {len(MULTINATIONAL_BRANDS[:2])} multinational):")
    for b in RUN_BRANDS:
        print(f"  • {b[0]}  ({b[3]})")
    print()
else:
    print(f"PRODUCTION MODE — scraping all {len(RUN_BRANDS)} brands")
    print()

_SCHEMA = [
    "brand", "product_name", "clothing_type", "fabric_composition",
    "fiber_json", "most_dominant_fiber", "source", "country_of_brand",
    "scraped_url", "scraped_at", "origin",
]

live_rows = []

try:
    for brand_name, root_url, source, country in tqdm(RUN_BRANDS, desc="Scraping brands"):
        collection_urls = discover_collection_urls(root_url)
        brand_rows = []
        for col_url in collection_urls:
            rows = scrape_page(col_url, brand_name, source, country)
            brand_rows.extend(rows)
        live_rows.extend(brand_rows)
        if brand_rows:
            print(f"  ✓ {brand_name:30s} {len(brand_rows):3d} products  "
                  f"({len(collection_urls)} collection pages crawled)")
except KeyboardInterrupt:
    print(f"\n⚠ Scrape interrupted — {len(live_rows)} rows collected so far will be saved.")

# ── Full 50-brand loop (kept for production use — set TEST_MODE = False above) ──
# try:
#     for brand_name, root_url, source, country in tqdm(ALL_BRANDS, desc="Scraping brands"):
#         collection_urls = discover_collection_urls(root_url)
#         brand_rows = []
#         for col_url in collection_urls:
#             rows = scrape_page(col_url, brand_name, source, country)
#             brand_rows.extend(rows)
#         live_rows.extend(brand_rows)
#         if brand_rows:
#             print(f"  ✓ {brand_name:30s} {len(brand_rows):3d} products  "
#                   f"({len(collection_urls)} collection pages crawled)")
# except KeyboardInterrupt:
#     print(f"\n⚠ Scrape interrupted — {len(live_rows)} rows collected so far will be saved.")

print(f"\n  Live scrape total: {len(live_rows)} rows from "
      f"{len({r['brand'] for r in live_rows})} brands")

df_catalog = (
    pd.DataFrame(live_rows, columns=_SCHEMA)
    if live_rows
    else pd.DataFrame(columns=_SCHEMA)
)
df_catalog = df_catalog.drop_duplicates(subset=["brand", "product_name"])

print(f"\n✓ df_catalog: {len(df_catalog):,} rows | {df_catalog['brand'].nunique()} brands")
print(f"\n  Clothing-type distribution:")
for ct, n in df_catalog["clothing_type"].value_counts().head(10).items():
    print(f"    {ct:<16} {n:>4}")


Scraping brands:   6%|▌         | 3/50 [08:55<2:41:30, 206.18s/it]

  ✓ Penshoppe                       87 products  (8 collection pages crawled)


Scraping brands:  64%|██████▍   | 32/50 [12:23<08:46, 29.26s/it]  

  ✓ Mango                           11 products  (8 collection pages crawled)


Scraping brands:  92%|█████████▏| 46/50 [13:42<00:49, 12.38s/it]

  ✓ Columbia                         4 products  (8 collection pages crawled)


Scraping brands:  94%|█████████▍| 47/50 [14:43<01:20, 26.95s/it]

  ✓ Everlane                         5 products  (8 collection pages crawled)


Scraping brands: 100%|██████████| 50/50 [14:49<00:00, 17.80s/it]


  Live scrape total: 107 rows from 4 brands

✓ df_catalog: 71 rows | 4 brands

  Clothing-type distribution:
    dress              58
    shirt               6
    t-shirt             5
    jacket              2





---
## 1-C · Biodegradability Tier Classification — BRAND_FIBER_LOOKUP

### Regulatory References

The biodegradability tiers applied to each product's fiber composition are
derived from two authoritative sources:

**1. EU Regulation 2024/1781 — Ecodesign for Sustainable Products (ESPR)**
> *Regulation (EU) 2024/1781 of the European Parliament and of the Council,*
> Official Journal of the European Union, 2024.
> Annex I (textile product groups) and the accompanying Commission staff
> working document on textile sustainability scoring methodology.
> The regulation establishes minimum recycled-content and natural-fiber
> thresholds for product sustainability labelling across EU member states.
> Delegated acts specifying exact numeric thresholds for textile
> biodegradability scoring are ongoing as of 2024–2026.

**2. GOTS v6.0 — Global Organic Textile Standard**
> *Global Organic Textile Standard, Version 6.0*, GOTS, 2020.
> Establishes **≥ 85 % certified organic natural fibers** as the minimum
> threshold for main-label GOTS certification — the origin of the 85 %
> boundary used in this notebook.

### Tier Mapping Applied in This Notebook

| Tier | Bio-fiber share | Interpretation |
|------|----------------|----------------|
| **high** | ≥ 85 % | Predominantly natural / biodegradable — GOTS-aligned |
| **medium** | 50 – 84 % | Mixed composition |
| **low** | < 50 % | Synthetic-dominant |

> ⚠ **Note:** The exact numeric thresholds in the ESPR delegated textile act
> are still being finalised. The 85 / 50 split is an evidence-based
> approximation aligned with GOTS v6.0 and the draft ESPR textile methodology.

---

### 1-C-1 · Bio-Share & Tier Helper Functions

In [21]:
# ── 1-C-1  Bio-fiber vocabulary ─────────────────────────────────────────

BIO_FIBERS = frozenset([
    "cotton", "linen", "hemp", "wool", "silk",
    "bamboo", "tencel", "lyocell", "modal",
    "cashmere", "viscose", "rayon", "acetate", "denim",
])


def bio_share(fibers: dict) -> float:
    """Return the percentage of bio/natural fibers in a fiber-composition dict."""
    total = sum(fibers.values())
    if total == 0:
        return 0.0
    return round(sum(v for k, v in fibers.items() if k in BIO_FIBERS) / total * 100, 2)


def biodeg_tier(bio_pct: float) -> str:
    """Map a bio-fiber percentage to an EU-Ecodesign-aligned biodegradability tier.

    Thresholds:
        ≥ 85 % → 'high'   (GOTS v6.0 main-label threshold)
        ≥ 50 % → 'medium'
        < 50 % → 'low'
    """
    if bio_pct >= 85:
        return "high"
    if bio_pct >= 50:
        return "medium"
    return "low"


print("✓ BIO_FIBERS vocabulary loaded:", len(BIO_FIBERS), "fiber types")
print("✓ bio_share() and biodeg_tier() ready")

✓ BIO_FIBERS vocabulary loaded: 14 fiber types
✓ bio_share() and biodeg_tier() ready


### 1-C-2 · Annotate Product Catalog

Parse the `fiber_json` column of `df_catalog` into Python dicts, then compute
`fs_bio_share` (% bio-fiber) and `fs_biodeg_tier` for every row.

In [22]:
# ── §1-C-2  Parse fiber_json → annotate df_catalog ───────────────────────

df_catalog["fiber_dict"] = df_catalog["fiber_json"].apply(
    lambda x: json.loads(x) if isinstance(x, str) else {}
)
df_catalog["fs_bio_share"]   = df_catalog["fiber_dict"].apply(bio_share)
df_catalog["fs_biodeg_tier"] = df_catalog["fs_bio_share"].apply(biodeg_tier)

print(f"✓ Annotated {len(df_catalog):,} rows")
print("\n  Biodegradability tier distribution (EU Ecodesign 2024/1781 / GOTS v6.0):")
for tier, cnt in df_catalog["fs_biodeg_tier"].value_counts().items():
    print(f"    {tier:<8} {cnt:>5}  ({cnt / len(df_catalog) * 100:.1f} %)")

print("\n  Sample rows:")
print(df_catalog[["brand", "clothing_type", "most_dominant_fiber",
                   "fs_bio_share", "fs_biodeg_tier", "source"]].head(8).to_string(index=False))

✓ Annotated 71 rows

  Biodegradability tier distribution (EU Ecodesign 2024/1781 / GOTS v6.0):
    low         32  (45.1 %)
    medium      20  (28.2 %)
    high        19  (26.8 %)

  Sample rows:
    brand clothing_type most_dominant_fiber  fs_bio_share fs_biodeg_tier     source
Penshoppe         dress           polyester          35.0            low philippine
Penshoppe         dress              cotton         100.0           high philippine
Penshoppe         dress           polyester          35.0            low philippine
Penshoppe         dress           polyester          35.0            low philippine
Penshoppe         dress           polyester           0.0            low philippine
Penshoppe         dress              cotton         100.0           high philippine
Penshoppe         dress           polyester           0.0            low philippine
Penshoppe         dress           polyester           0.0            low philippine


### 1-C-3 · Aggregate Per-Brand Median Profile → BRAND_FIBER_LOOKUP

For each brand, compute the **median fiber share** across all scraped products.
Fibers contributing < 2 % at the median are dropped to keep the profile clean.
The resulting dict `BRAND_FIBER_LOOKUP[brand]` holds:
`fibers`, `bio_share`, `biodeg_tier`, `item_count`, `source`.

In [23]:
# ── 1-C-3  Per-brand median fiber profile ───────────────────────────────

_brand_records = []
for brand, grp in df_catalog[df_catalog["fiber_dict"].apply(bool)].groupby("brand"):
    agg: dict = {}
    for fdict in grp["fiber_dict"]:
        for k, v in fdict.items():
            agg.setdefault(k, []).append(v)

    # median share per fiber; drop fibers with median ≤ 2 %
    brand_fiber = {
        k: round(float(np.median(v)), 1)
        for k, v in agg.items()
        if np.median(v) > 2
    }

    # re-normalise to 100 %
    total = sum(brand_fiber.values())
    if total > 0:
        brand_fiber = {k: round(v / total * 100, 1) for k, v in brand_fiber.items()}

    _brand_records.append({
        "brand":       brand.lower().strip(),
        "fibers":      brand_fiber,
        "bio_share":   bio_share(brand_fiber),
        "biodeg_tier": biodeg_tier(bio_share(brand_fiber)),
        "item_count":  len(grp),
        "source":      grp["source"].iloc[0],
    })

BRAND_FIBER_LOOKUP: dict = {r["brand"]: r for r in _brand_records}

print(f"✓ BRAND_FIBER_LOOKUP built: {len(BRAND_FIBER_LOOKUP)} brands")
print("\n  Sample entries:")
for brand, rec in list(BRAND_FIBER_LOOKUP.items())[:4]:
    print(f"    {brand:<22} tier={rec['biodeg_tier']:<8} bio={rec['bio_share']:5.1f}%  "
          f"items={rec['item_count']:>3}  fibers={rec['fibers']}")

✓ BRAND_FIBER_LOOKUP built: 4 brands

  Sample entries:
    columbia               tier=low      bio=  0.0%  items=  4  fibers={'nylon': 38.3, 'elastane': 5.2, 'polyester': 56.5}
    everlane               tier=medium   bio= 67.6%  items=  5  fibers={'alpaca': 32.4, 'wool': 8.8, 'tencel': 35.3, 'cotton': 23.5}
    mango                  tier=high     bio=100.0%  items= 11  fibers={'cotton': 100.0}
    penshoppe              tier=low      bio= 49.6%  items= 51  fibers={'polyester': 23.9, 'cotton': 22.1, 'nylon': 26.5, 'viscose': 27.6}


---
## Save Outputs

In [24]:
# ── Save webscraped catalog CSV — timestamped ─────────────────────────────
_ts              = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
catalog_csv      = WEB_DIR / f"{_ts}-webscraped_catalog.csv"
catalog_csv_stable = WEB_DIR / "webscraped_catalog.csv"   # stable alias

_save_cols = [
    "brand", "product_name", "clothing_type", "fabric_composition",
    "fiber_json", "most_dominant_fiber", "fs_bio_share", "fs_biodeg_tier",
    "source", "country_of_brand", "scraped_url", "scraped_at", "origin",
]
df_catalog[_save_cols].to_csv(catalog_csv,        index=False, encoding="utf-8")
df_catalog[_save_cols].to_csv(catalog_csv_stable, index=False, encoding="utf-8")

print(f"✓ {catalog_csv.name}  →  {catalog_csv}")
print(f"✓ webscraped_catalog.csv (stable alias)  →  {catalog_csv_stable}")
print(f"  {len(df_catalog):,} rows | {df_catalog['brand'].nunique()} brands | "
      f"{len(_save_cols)} columns")


✓ 20260224-212202-webscraped_catalog.csv  →  /Users/gyalm/Desktop/WeaveForward_FiberClassificationML/data/webscraped_data/20260224-212202-webscraped_catalog.csv
✓ webscraped_catalog.csv (stable alias)  →  /Users/gyalm/Desktop/WeaveForward_FiberClassificationML/data/webscraped_data/webscraped_catalog.csv
  71 rows | 4 brands | 13 columns


### Save BRAND_FIBER_LOOKUP

Writes `brand_fiber_lookup.json` to `data/processed/` for backward compatibility
with the main classification notebook.

In [None]:
# ── Save BRAND_FIBER_LOOKUP JSON (timestamped + stable alias) ─────────────
lookup_path        = PROC_DIR / f"{_ts}-brand_fiber_lookup.json"
lookup_path_stable = PROC_DIR / "brand_fiber_lookup.json"

for path in [lookup_path, lookup_path_stable]:
    with open(path, "w") as f:
        json.dump(BRAND_FIBER_LOOKUP, f, indent=2)

print(f"✓ {lookup_path.name}  →  {lookup_path}")
print(f"✓ brand_fiber_lookup.json (stable alias)  →  {lookup_path_stable}")
print(f"  {len(BRAND_FIBER_LOOKUP):,} brand entries")

print(f"\n── Webscraper extraction complete — run ID: {_ts} ─────────────────")
print(f"   Next: load  data/webscraped_data/{catalog_csv.name}")
print(f"         into  weaveforward_fiber_recommendation.ipynb")
