In [1]:
from google.colab import files

uploaded = files.upload()

Saving Nanofiltration Flat Sheet Membranes.html to Nanofiltration Flat Sheet Membranes.html
Saving Concentration of Fruits Juice and Extracts.html to Concentration of Fruits Juice and Extracts.html
Saving Food & Beverage Crossflow Filtration.html to Food & Beverage Crossflow Filtration.html
Saving Food & Beverage Capsule Filters.html to Food & Beverage Capsule Filters.html
Saving General Lab Essentials.html to General Lab Essentials.html
Saving Food & Beverage Specialty Products.html to Food & Beverage Specialty Products.html
Saving Food & Beverage Filtration Process.html to Food & Beverage Filtration Process.html
Saving custom-capsule-filters.html to custom-capsule-filters.html
Saving custom-syringe-filters.html to custom-syringe-filters.html
Saving Clarification Process_Sample Preparation - Syringe Filters.html to Clarification Process_Sample Preparation - Syringe Filters.html
Saving Chemical and Gravimetric Analysis - Laboratory Filter Papers.html to Chemical and Gravimetric Analysi

In [2]:
import json
from bs4 import BeautifulSoup


def clean_text(element):
    if not element:
        return None
    return " ".join(element.get_text(separator=" ", strip=True).split())


def extract_category(soup):
    crumbs = soup.select(".breadcrumbs ul.items li")
    categories = []

    for li in crumbs:
        text = clean_text(li)
        if text and text.lower() != "home":
            categories.append(text)

    return " > ".join(categories)


def extract_description(soup):
    desc_div = soup.select_one(".category-description")
    return clean_text(desc_div)


def extract_products(soup):
    products = []

    table = soup.select_one("table#product-items")
    if not table:
        return products

    rows = table.select("tbody tr.item")

    for row in rows:
        def cell(selector):
            return clean_text(row.select_one(selector))

        product = {
            "sku": cell("td.product.sku a"),
            "product_name": cell("td.product.name a"),
            "pore_size": cell("td.product.attribute.pore_size"),
            "diameter_mm": cell("td.product.attribute.diameter"),
            "pack_size": cell("td.product.attribute.pack_size"),
            "availability": cell("td.product.availability span"),
            "price": cell("span.price"),
            "url": (
                row.select_one("td.product.name a")["href"]
                if row.select_one("td.product.name a")
                else None
            ),
        }

        products.append(product)

    return products


def parse_html(filename, html_content):
    soup = BeautifulSoup(html_content, "lxml")

    category = extract_category(soup)
    description = extract_description(soup)
    products = extract_products(soup)

    records = []
    for p in products:
        records.append({
            "html_filename": filename,
            "category": category,
            "description": description,
            **p
        })

    return records


# ==========================
# MAIN EXECUTION
# ==========================

all_results = []

for filename, content in uploaded.items():
    print(f"Processing: {filename}")
    html_text = content.decode("utf-8", errors="ignore")
    all_results.extend(parse_html(filename, html_text))

print(f"\nExtracted {len(all_results)} products")

output_file = "sterlitech_products.json"

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)

files.download(output_file)



Processing: Nanofiltration Flat Sheet Membranes.html
Processing: Concentration of Fruits Juice and Extracts.html
Processing: Food & Beverage Crossflow Filtration.html
Processing: Food & Beverage Capsule Filters.html
Processing: General Lab Essentials.html
Processing: Food & Beverage Specialty Products.html
Processing: Food & Beverage Filtration Process.html
Processing: custom-capsule-filters.html
Processing: custom-syringe-filters.html
Processing: Clarification Process_Sample Preparation - Syringe Filters.html
Processing: Chemical and Gravimetric Analysis - Laboratory Filter Papers.html
Processing: Microbiological Analysis - Microfiltration Membranes.html
Processing: dermatological-patches.html
Processing: Nobuto Blood Filter Strips.html
Processing: Kato Katz Kits.html
Processing: Schistosome Test Kits.html
Processing: MCE, Nitrocellulose Mixed Esters.html
Processing: Grade E Borosilicate Glass Filter.html
Processing: Grade TSS Borosilicate Glass Fiber Filter.html
Processing: Grade VSS

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
import json
from bs4 import BeautifulSoup


def clean_text(el):
    if not el:
        return None
    return " ".join(el.get_text(separator=" ", strip=True).split())


# =========================
# CATEGORY EXTRACTION
# =========================
def extract_category_path(soup):
    crumbs = soup.select(".breadcrumbs ul.items li")
    values = []

    for li in crumbs:
        text = clean_text(li)
        if text and text.lower() != "home":
            values.append(text)

    return values


def extract_category_name(soup):
    path = extract_category_path(soup)
    return path[-1] if path else None


def extract_description(soup):
    return clean_text(soup.select_one(".category-description"))


# =========================
# MAIN PRODUCT TABLE
# =========================
def extract_main_products(soup):
    products = []

    table = soup.select_one("table#product-items")
    if not table:
        return products

    for row in table.select("tbody tr.item"):
        def cell(css):
            return clean_text(row.select_one(css))

        name_el = row.select_one("td.product.name a")

        products.append({
            "product_type": "main",
            "sku": cell("td.product.sku a"),
            "product_name": cell("td.product.name a"),
            "pore_size": cell("td.product.attribute.pore_size"),
            "diameter_mm": cell("td.product.attribute.diameter"),
            "pack_size": cell("td.product.attribute.pack_size"),
            "availability": cell("td.product.availability span"),
            "price": cell("span.price"),
            "url": name_el["href"] if name_el else None,
        })

    return products


# =========================
# ACCESSORY TABLE
# =========================
def extract_accessories(soup):
    accessories = []

    table = soup.select_one("table#accessory-items")
    if not table:
        return accessories

    for row in table.select("tbody tr.item"):
        sku_el = row.select_one("td.product.sku a")
        name_el = row.select_one("td.product.name a")

        accessories.append({
            "product_type": "accessory",
            "sku": clean_text(sku_el),
            "product_name": clean_text(name_el),
            "pore_size": None,
            "diameter_mm": None,
            "pack_size": None,
            "availability": None,
            "price": clean_text(row.select_one("span.price")),
            "url": name_el["href"] if name_el else None,
        })

    return accessories


# =========================
# PARSER
# =========================
def parse_html(filename, html):
    soup = BeautifulSoup(html, "lxml")

    category_path = extract_category_path(soup)
    category_name = extract_category_name(soup)
    description = extract_description(soup)

    products = extract_main_products(soup)
    accessories = extract_accessories(soup)

    records = []
    for item in products + accessories:
        records.append({
            "html_filename": filename,
            "category_path": " > ".join(category_path),
            "category_name": category_name,
            "description": description,
            **item
        })

    return records


# =========================
# MAIN EXECUTION (Colab)
# =========================
all_results = []

for filename, content in uploaded.items():
    print(f"Processing: {filename}")
    html_text = content.decode("utf-8", errors="ignore")
    all_results.extend(parse_html(filename, html_text))

print(f"\n✅ Extracted {len(all_results)} total products (main + accessories)")

output_file = "sterlitech_products2.json"

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)

files.download(output_file)

Processing: Nanofiltration Flat Sheet Membranes.html
Processing: Concentration of Fruits Juice and Extracts.html
Processing: Food & Beverage Crossflow Filtration.html
Processing: Food & Beverage Capsule Filters.html
Processing: General Lab Essentials.html
Processing: Food & Beverage Specialty Products.html
Processing: Food & Beverage Filtration Process.html
Processing: custom-capsule-filters.html
Processing: custom-syringe-filters.html
Processing: Clarification Process_Sample Preparation - Syringe Filters.html
Processing: Chemical and Gravimetric Analysis - Laboratory Filter Papers.html
Processing: Microbiological Analysis - Microfiltration Membranes.html
Processing: dermatological-patches.html
Processing: Nobuto Blood Filter Strips.html
Processing: Kato Katz Kits.html
Processing: Schistosome Test Kits.html
Processing: MCE, Nitrocellulose Mixed Esters.html
Processing: Grade E Borosilicate Glass Filter.html
Processing: Grade TSS Borosilicate Glass Fiber Filter.html
Processing: Grade VSS

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
import json
from bs4 import BeautifulSoup


def clean_text(el):
    if not el:
        return None
    return " ".join(el.get_text(separator=" ", strip=True).split())


# =========================
# CATEGORY
# =========================
def extract_category_path(soup):
    crumbs = soup.select(".breadcrumbs ul.items li")
    values = []

    for li in crumbs:
        text = clean_text(li)
        if text and text.lower() != "home":
            values.append(text)

    return values


def extract_category_name(soup):
    path = extract_category_path(soup)
    return path[-1] if path else None


def extract_description(soup):
    return clean_text(soup.select_one(".category-description"))


# =========================
# FAQ (RAW HTML)
# =========================
def extract_faq_html(soup):
    faq_div = soup.select_one("div#faq-content")
    if not faq_div:
        return None

    # Store raw inner HTML (not prettified, not cleaned)
    return str(faq_div)


# =========================
# MAIN PRODUCT TABLE
# =========================
def extract_main_products(soup):
    products = []

    table = soup.select_one("table#product-items")
    if not table:
        return products

    for row in table.select("tbody tr.item"):
        def cell(css):
            return clean_text(row.select_one(css))

        name_el = row.select_one("td.product.name a")

        products.append({
            "product_type": "main",
            "sku": cell("td.product.sku a"),
            "product_name": cell("td.product.name a"),
            "pore_size": cell("td.product.attribute.pore_size"),
            "diameter_mm": cell("td.product.attribute.diameter"),
            "pack_size": cell("td.product.attribute.pack_size"),
            "availability": cell("td.product.availability span"),
            "price": cell("span.price"),
            "url": name_el["href"] if name_el else None,
        })

    return products


# =========================
# ACCESSORY TABLE
# =========================
def extract_accessories(soup):
    accessories = []

    table = soup.select_one("table#accessory-items")
    if not table:
        return accessories

    for row in table.select("tbody tr.item"):
        sku_el = row.select_one("td.product.sku a")
        name_el = row.select_one("td.product.name a")

        accessories.append({
            "product_type": "accessory",
            "sku": clean_text(sku_el),
            "product_name": clean_text(name_el),
            "pore_size": None,
            "diameter_mm": None,
            "pack_size": None,
            "availability": None,
            "price": clean_text(row.select_one("span.price")),
            "url": name_el["href"] if name_el else None,
        })

    return accessories


# =========================
# PARSER
# =========================
def parse_html(filename, html):
    soup = BeautifulSoup(html, "lxml")

    category_path = extract_category_path(soup)
    category_name = extract_category_name(soup)
    description = extract_description(soup)
    faq_html = extract_faq_html(soup)

    products = extract_main_products(soup)
    accessories = extract_accessories(soup)

    records = []
    for item in products + accessories:
        records.append({
            "html_filename": filename,
            "category_path": " > ".join(category_path),
            "category_name": category_name,
            "description": description,
            "faq_html": faq_html,
            **item
        })

    return records


# =========================
# MAIN EXECUTION (COLAB)
# =========================
all_results = []

for filename, content in uploaded.items():
    print(f"Processing: {filename}")
    html_text = content.decode("utf-8", errors="ignore")
    all_results.extend(parse_html(filename, html_text))

print(f"\n✅ Extracted {len(all_results)} rows (main + accessory)")

Processing: Nanofiltration Flat Sheet Membranes.html
Processing: Concentration of Fruits Juice and Extracts.html
Processing: Food & Beverage Crossflow Filtration.html
Processing: Food & Beverage Capsule Filters.html
Processing: General Lab Essentials.html
Processing: Food & Beverage Specialty Products.html
Processing: Food & Beverage Filtration Process.html
Processing: custom-capsule-filters.html
Processing: custom-syringe-filters.html
Processing: Clarification Process_Sample Preparation - Syringe Filters.html
Processing: Chemical and Gravimetric Analysis - Laboratory Filter Papers.html
Processing: Microbiological Analysis - Microfiltration Membranes.html
Processing: dermatological-patches.html
Processing: Nobuto Blood Filter Strips.html
Processing: Kato Katz Kits.html
Processing: Schistosome Test Kits.html
Processing: MCE, Nitrocellulose Mixed Esters.html
Processing: Grade E Borosilicate Glass Filter.html
Processing: Grade TSS Borosilicate Glass Fiber Filter.html
Processing: Grade VSS

In [8]:
import json
from bs4 import BeautifulSoup


def clean_text(el):
    if not el:
        return None
    return " ".join(el.get_text(separator=" ", strip=True).split())


# =========================
# CATEGORY
# =========================
def extract_category_path(soup):
    crumbs = soup.select(".breadcrumbs ul.items li")
    values = []

    for li in crumbs:
        text = clean_text(li)
        if text and text.lower() != "home":
            values.append(text)

    return values


def extract_category_name(soup):
    path = extract_category_path(soup)
    return path[-1] if path else None


def extract_description(soup):
    return clean_text(soup.select_one(".category-description"))


# =========================
# FAQ (FORMATTED TEXT)
# =========================
def extract_faq_text(soup):
    faq_container = soup.select_one("#faq-content")
    if not faq_container:
        return None

    faq_blocks = faq_container.select(".panel.panel-default")
    faq_pairs = []

    for block in faq_blocks:
        question_el = block.select_one(".panel-title")
        answer_el = block.select_one(".panel-body")

        question = clean_text(question_el)
        answer = clean_text(answer_el)

        if question and answer:
            # Remove leading "Q:" if present
            if question.lower().startswith("q:"):
                question = question[2:].strip()

            faq_pairs.append(
                f"Question: {question}\n"
                f"Answer: {answer}"
            )

    if not faq_pairs:
        return None

    return "\n\n".join(faq_pairs)


# =========================
# MAIN PRODUCT TABLE
# =========================
def extract_main_products(soup):
    products = []

    table = soup.select_one("table#product-items")
    if not table:
        return products

    for row in table.select("tbody tr.item"):
        def cell(css):
            return clean_text(row.select_one(css))

        name_el = row.select_one("td.product.name a")

        products.append({
            "product_type": "main",
            "sku": cell("td.product.sku a"),
            "product_name": cell("td.product.name a"),
            "pore_size": cell("td.product.attribute.pore_size"),
            "diameter_mm": cell("td.product.attribute.diameter"),
            "pack_size": cell("td.product.attribute.pack_size"),
            "availability": cell("td.product.availability span"),
            "price": cell("span.price"),
            "url": name_el["href"] if name_el else None,
        })

    return products


# =========================
# ACCESSORY TABLE
# =========================
def extract_accessories(soup):
    accessories = []

    table = soup.select_one("table#accessory-items")
    if not table:
        return accessories

    for row in table.select("tbody tr.item"):
        sku_el = row.select_one("td.product.sku a")
        name_el = row.select_one("td.product.name a")

        accessories.append({
            "product_type": "accessory",
            "sku": clean_text(sku_el),
            "product_name": clean_text(name_el),
            "pore_size": None,
            "diameter_mm": None,
            "pack_size": None,
            "availability": None,
            "price": clean_text(row.select_one("span.price")),
            "url": name_el["href"] if name_el else None,
        })

    return accessories


# =========================
# PARSER
# =========================
def parse_html(filename, html):
    soup = BeautifulSoup(html, "lxml")

    category_path = extract_category_path(soup)
    category_name = extract_category_name(soup)
    description = extract_description(soup)
    faq_text = extract_faq_text(soup)

    products = extract_main_products(soup)
    accessories = extract_accessories(soup)

    records = []
    for item in products + accessories:
        records.append({
            "html_filename": filename,
            "category_path": " > ".join(category_path),
            "category_name": category_name,
            "description": description,
            "faq": faq_text,
            **item
        })

    return records


# =========================
# MAIN EXECUTION (COLAB)
# =========================
all_results = []

for filename, content in uploaded.items():
    print(f"Processing: {filename}")
    html_text = content.decode("utf-8", errors="ignore")
    all_results.extend(parse_html(filename, html_text))

print(f"\n✅ Extracted {len(all_results)} rows with formatted FAQ")

output_file = "sterlitech_products3.json"

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)

files.download(output_file)

Processing: Nanofiltration Flat Sheet Membranes.html
Processing: Concentration of Fruits Juice and Extracts.html
Processing: Food & Beverage Crossflow Filtration.html
Processing: Food & Beverage Capsule Filters.html
Processing: General Lab Essentials.html
Processing: Food & Beverage Specialty Products.html
Processing: Food & Beverage Filtration Process.html
Processing: custom-capsule-filters.html
Processing: custom-syringe-filters.html
Processing: Clarification Process_Sample Preparation - Syringe Filters.html
Processing: Chemical and Gravimetric Analysis - Laboratory Filter Papers.html
Processing: Microbiological Analysis - Microfiltration Membranes.html
Processing: dermatological-patches.html
Processing: Nobuto Blood Filter Strips.html
Processing: Kato Katz Kits.html
Processing: Schistosome Test Kits.html
Processing: MCE, Nitrocellulose Mixed Esters.html
Processing: Grade E Borosilicate Glass Filter.html
Processing: Grade TSS Borosilicate Glass Fiber Filter.html
Processing: Grade VSS

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
all_results[-1]

{'html_filename': 'Silver Membranes - Membrane Disc Filters _ Sterlitech.html',
 'category_path': 'Filters > Membrane Disc Filters > Silver Membranes',
 'category_name': 'Silver Membranes',
 'description': "Silver metal membrane filters are used in a variety of filtration applications, and their ability to withstand extreme chemical and thermal stress makes them ideal laboratory filtration equipment for applications involving aggressive fluids and/or high temperatures. Silver metal filter membranes are pure metallic silver (99.97% pure silver) and are available with particle retention ratings of 0.2 to 5.0 microns. These silver metal membrane disc filters and sheets are specified in a National Institute for Occupational Safety and Health (NIOSH) standard for the analysis of crystalline and amorphous silica, lead sulfide, boron carbide, and chrysotile asbestos. Sterlitech's silver metal membranes can be used as the collection media and subsequent x-ray diffraction substrate for quantify

In [10]:
product_urls = [item['url'] for item in all_results if item.get('url')]
print(f"Extracted {len(product_urls)} product URLs.")
print("First 5 URLs:")
for url in product_urls[:5]:
    print(url)

Extracted 2173 product URLs.
First 5 URLs:
https://www.sterlitech.com/junior-cartridge-filters.html
https://www.sterlitech.com/junior-capsule-filters.html
https://www.sterlitech.com/rd65-capsule-filters.html
https://www.sterlitech.com/skv-pes-capsule-filters.html
https://www.sterlitech.com/skl-series-ptfe-capsule-filters.html


In [11]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time


HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"
}


def clean_text(el):
    if not el:
        return None
    return " ".join(el.get_text(separator=" ", strip=True).split())


def extract_product_detail(product_url):
    r = requests.get(product_url, headers=HEADERS, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "lxml")

    # =========================
    # PRODUCT IMAGE
    # =========================
    img_el = soup.select_one("img.fotorama__img")
    product_image = (
        img_el["src"]
        if img_el and img_el.get("src")
        else None
    )

    # =========================
    # NAME
    # =========================
    product_name = clean_text(
        soup.select_one("h1.page-title span.base")
    )

    # =========================
    # SHORT DESCRIPTION
    # =========================
    short_description = clean_text(
        soup.select_one(".product.attribute.overview .value")
    )

    # =========================
    # SKU / BRAND / EST SHIP
    # =========================
    def attribute_value(label):
        label_el = soup.find("label", string=lambda x: x and x.strip() == label)
        if not label_el:
            return None
        value_el = label_el.find_next("span", class_="attribute-value")
        return clean_text(value_el)

    sku = attribute_value("SKU:")
    brand = attribute_value("Brand:")
    est_ship = attribute_value("Est Ship:")

    # =========================
    # PRICE
    # =========================
    price_el = soup.select_one(
        ".price-final_price [data-price-amount]"
    )
    price = price_el["data-price-amount"] if price_el else None

    currency_el = soup.select_one(
        ".price-final_price meta[itemprop='priceCurrency']"
    )
    currency = currency_el["content"] if currency_el else None

    # =========================
    # SPECIFICATIONS (TEXT)
    # =========================
    spec_tab = soup.select_one("#amcustomtabs_tabs_85")
    specifications_text = None

    if spec_tab:
        specifications_text = clean_text(spec_tab)

    # =========================
    # APPLICATIONS (TEXT)
    # =========================
    app_tab = soup.select_one("#amcustomtabs_tabs_87")
    applications_text = None

    if app_tab:
        applications_text = clean_text(app_tab)

    return {
        "product_url": product_url,
        "product_name": product_name,
        "sku": sku,
        "brand": brand,
        "est_ship": est_ship,
        "price": price,
        "currency": currency,
        "product_image": product_image,
        "short_description": short_description,
        "specifications_text": specifications_text,
        "applications_text": applications_text,
    }


In [12]:
results = []

for url in product_urls:
    print(f"Fetching {url}")
    try:
        results.append(extract_product_detail(url))
        time.sleep(1)  # polite delay
    except Exception as e:
        print("❌ Error:", e)

results

Fetching https://www.sterlitech.com/junior-cartridge-filters.html
Fetching https://www.sterlitech.com/junior-capsule-filters.html
Fetching https://www.sterlitech.com/rd65-capsule-filters.html
Fetching https://www.sterlitech.com/skv-pes-capsule-filters.html
Fetching https://www.sterlitech.com/skl-series-ptfe-capsule-filters.html
Fetching https://www.sterlitech.com/d90-capsule-filters.html
Fetching https://www.sterlitech.com/d4-syringe-filters.html
Fetching https://www.sterlitech.com/d13-syringe-filters.html
Fetching https://www.sterlitech.com/d25-syringe-filters.html
Fetching https://www.sterlitech.com/d40c-syringe-filters.html
Fetching https://www.sterlitech.com/d50-syringe-filters.html
Fetching https://www.sterlitech.com/g50-vacuum-vent-syringe-filters.html
Fetching https://www.sterlitech.com/d65-syringe-filters.html
Fetching https://www.sterlitech.com/dermatological-patch-study-patches-for-hript.html
Fetching https://www.sterlitech.com/dermatological-patch-study-patches-for-hript-sem

[{'product_url': 'https://www.sterlitech.com/junior-cartridge-filters.html',
  'product_name': 'MJF Junior Cartridges',
  'sku': 'MJF',
  'brand': 'Sterlitech',
  'est_ship': 'Contact Us',
  'price': None,
  'currency': None,
  'product_image': None,
  'short_description': 'These Junior Cartridges are ready-to-use filters that offer high flows, increased throughputs, and high strength, all with the convenience of a small package. Designed for small applications in pharmaceutical, biotechnology, food and beverage, medical, chemical, and DI water industries.',
  'specifications_text': None,
  'applications_text': None},
 {'product_url': 'https://www.sterlitech.com/junior-capsule-filters.html',
  'product_name': 'JKL Junior Capsules',
  'sku': 'JKL',
  'brand': 'Sterlitech',
  'est_ship': 'Contact Us',
  'price': None,
  'currency': None,
  'product_image': None,
  'short_description': 'Our Junior capsule filter assemblies are ready-to-use filters that offer high flows, increased throughp

In [13]:
output_file = "sterlitech_products_detailed.json"

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
import requests
from bs4 import BeautifulSoup
import re


def extract_tab_by_title(url, title_keyword):
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, "lxml")

    # Find tab link containing the title
    for tab in soup.select("a[data-toggle='tab'], a.data.switch"):
        title = tab.get_text(strip=True).lower()
        if title_keyword.lower() in title:
            tab_id = tab.get("href", "").replace("#", "")
            if not tab_id:
                continue

            content = soup.find(id=tab_id)
            if content:
                text = content.get_text(separator="\n", strip=True)
                return clean_text(text)

    return None


def clean_text(text):
    text = re.sub(r'\n{2,}', '\n', text)
    text = re.sub(r'[ \t]{2,}', ' ', text)
    return text.strip()


url = "https://www.sterlitech.com/13mm-glass-microanalysis-holder-glass-frit-support-250-ml-for-use-with-25mm-membranes.html"

specs = extract_tab_by_title(url, "spec")
apps = extract_tab_by_title(url, "application")

print("\n--- SPECIFICATIONS ---")
print(specs[:1500] if specs else "❌ NOT FOUND")

print("\n--- APPLICATIONS ---")
print(apps[:800] if apps else "❌ NOT FOUND")


--- SPECIFICATIONS ---
Filter Size (mm)
25
25
47
47
47
47
90
90
90
Model No.
VF5
VF8
VF6
VF7
VF3
VF12
VF15
VF16
VF17
Brand
Rocker
Rocker
Rocker
Rocker
Rocker
Rocker
Rocker
Rocker
Rocker
Support Material
Sintered glass
SS with PTFE gasket
Sintered glass
SS with PTFE gasket
Sintered glass
Sintered glass
SS316
Sintered glass
Sintered glass
Funnel Capacity (ml)
15
15
300
300
300
300
1000
1000
1000
Funnel Material
Borosilicate glass
Borosilicate glass
Borosilicate glass
Borosilicate glass
Borosilicate glass
Borosilicate glass
Borosilicate glass
Borosilicate glass
Borosilicate glass
Stopper Material
Silicone stopper
Silicone stopper
Silicone stopper
Silicone stopper
N/A
Silicone stopper
Silicone stopper
Silicone stopper
Silicone stopper
GL45 Filtration Adaptor Material
N/A
N/A
N/A
N/A
N/A
PP (autoclavable)
PP (autoclavable)
PP (autoclavable)
PP (autoclavable)
Clamp
Aluminum
Aluminum
Aluminum
Aluminum
Aluminum
Aluminum
Aluminum
Aluminum
Aluminum
Filtration area (cm2)
3.1
2.5
13.2
9.6
13.2
13

In [17]:
import requests
from bs4 import BeautifulSoup


def get_tab_node(url, keyword):
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, "lxml")

    for tab in soup.select("a.data.switch, a[data-toggle='tab']"):
        if keyword.lower() in tab.get_text(strip=True).lower():
            tab_id = tab.get("href", "").replace("#", "")
            if not tab_id:
                continue
            return soup.find(id=tab_id)

    return None

import re

def clean_tables_from_node(node):
    rows = []

    for table in node.select("table"):
        for tr in table.select("tr"):
            cells = tr.find_all(["td", "th"])
            if len(cells) < 2:
                continue

            values = []
            for cell in cells:
                # unwrap links
                for a in cell.find_all("a"):
                    a.replace_with(a.get_text(strip=True))

                text = cell.get_text(" ", strip=True)
                text = re.sub(r"\s+", " ", text)
                values.append(text)

            label = values[0]
            row_data = values[1:]
            rows.append(f"{label}: " + " | ".join(row_data))

    return "\n".join(rows).strip()


url = "https://www.sterlitech.com/13mm-glass-microanalysis-holder-glass-frit-support-250-ml-for-use-with-25mm-membranes.html"

spec_node = get_tab_node(url, "spec")

if spec_node:
    clean_table = clean_tables_from_node(spec_node)
    print(clean_table[:2000])
else:
    print("❌ Spec tab not found")

Filter Size (mm): 25 | 25 | 47 | 47 | 47 | 47 | 90 | 90 | 90
Model No.: VF5 | VF8 | VF6 | VF7 | VF3 | VF12 | VF15 | VF16 | VF17
Brand: Rocker | Rocker | Rocker | Rocker | Rocker | Rocker | Rocker | Rocker | Rocker
Support Material: Sintered glass | SS with PTFE gasket | Sintered glass | SS with PTFE gasket | Sintered glass | Sintered glass | SS316 | Sintered glass | Sintered glass
Funnel Capacity (ml): 15 | 15 | 300 | 300 | 300 | 300 | 1000 | 1000 | 1000
Funnel Material: Borosilicate glass | Borosilicate glass | Borosilicate glass | Borosilicate glass | Borosilicate glass | Borosilicate glass | Borosilicate glass | Borosilicate glass | Borosilicate glass
Stopper Material: Silicone stopper | Silicone stopper | Silicone stopper | Silicone stopper | N/A | Silicone stopper | Silicone stopper | Silicone stopper | Silicone stopper
GL45 Filtration Adaptor Material: N/A | N/A | N/A | N/A | N/A | PP (autoclavable) | PP (autoclavable) | PP (autoclavable) | PP (autoclavable)
Clamp: Aluminum | Alu

In [26]:
# def get_tab_node(soup, keyword):
#     for tab in soup.select("a.data.switch, a[data-toggle='tab']"):
#         if keyword.lower() in tab.get_text(strip=True).lower():
#             tab_id = tab.get("href", "").replace("#", "")
#             if tab_id:
#                 return soup.find(id=tab_id)
#     return None

def get_tab_node(url, keyword):
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, "lxml")

    for tab in soup.select("a.data.switch, a[data-toggle='tab']"):
        if keyword.lower() in tab.get_text(strip=True).lower():
            tab_id = tab.get("href", "").replace("#", "")
            if not tab_id:
                continue
            return soup.find(id=tab_id)

    return None

def clean_tables_from_node(node):
    rows = []

    for table in node.select("table"):
        for tr in table.select("tr"):
            cells = tr.find_all(["td", "th"])
            if len(cells) < 2:
                continue

            values = []
            for cell in cells:
                # unwrap links
                for a in cell.find_all("a"):
                    a.replace_with(a.get_text(strip=True))

                text = cell.get_text(" ", strip=True)
                text = re.sub(r"\s+", " ", text)
                values.append(text)

            label = values[0]
            row_data = values[1:]
            rows.append(f"{label}: " + " | ".join(row_data))

    return "\n".join(rows).strip()

def clean_text_block(node):
    if not node:
        return None

    for table in node.select("table"):
        table.decompose()

    text = node.get_text("\n", strip=True)
    text = re.sub(r'\n{2,}', '\n', text)
    text = re.sub(r'[ \t]{2,}', ' ', text)

    return text.strip() or None

# def extract_clean_tables(node):
#     if not node:
#         return None

#     rows = []

#     for table in node.select("table"):
#         for tr in table.select("tr"):
#             cells = tr.find_all(["td", "th"])
#             if len(cells) < 2:
#                 continue

#             values = []
#             for cell in cells:
#                 for a in cell.find_all("a"):
#                     a.replace_with(a.get_text(strip=True))

#                 text = re.sub(r'\s+', ' ', cell.get_text(" ", strip=True))
#                 values.append(text)

#             rows.append(f"{values[0]}: " + " | ".join(values[1:]))

#     return "\n".join(rows) or None

# html = requests.get(url, timeout=30).text
# soup = BeautifulSoup(html, "lxml")

spec_node = get_tab_node(url, "spec")
# app_node = get_tab_node(soup, "application")

# print(spec_node)
specifications = {
    "text": clean_text_block(spec_node),
    "table": clean_tables_from_node(spec_node)
}

# applications = clean_text_block(app_node)
print(specifications)
# print(applications)

{'text': None, 'table': ''}
