In [1]:
from __future__ import annotations

import re
import time
from dataclasses import dataclass
from datetime import date, timedelta, datetime
from pathlib import Path
from urllib.parse import parse_qsl, urlencode, urlparse, urlsplit, urlunsplit

import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def slug_underscore(s: str) -> str:
    """Turn free text into a lowercase, underscore-separated ASCII slug.

    The input is trimmed and lowercased, hyphens and whitespace runs become
    underscores, every character outside ``a-z``, ``0-9`` and ``_`` is dropped,
    repeated underscores are squeezed into one, and leading/trailing
    underscores are removed.
    """
    slug = s.strip().lower().replace("-", "_")
    # Order matters: whitespace -> "_", then drop disallowed characters,
    # then squeeze the underscore runs the first two steps may have produced.
    for pattern, replacement in (
        (r"\s+", "_"),
        (r"[^a-z0-9_]+", ""),
        (r"_+", "_"),
    ):
        slug = re.sub(pattern, replacement, slug)
    return slug.strip("_")

def derive_subfolders_from_url(base_url: str) -> tuple[str, str, str]:
    """Derive ``(category, brand, product)`` folder slugs from a product URL.

    Only the path segments that follow the ``products`` segment are used.
    The last segment is the product and the one before it is the brand.
    With four or more segments the category is the fourth from the end
    (``/products/<category>/<subcategory>/<brand>/<product>``); with exactly
    three segments it is the first of them
    (``/products/<category>/<brand>/<product>``).

    Args:
        base_url: Product page URL containing a ``/products/`` path segment.

    Returns:
        The three parts, each normalised with ``slug_underscore``.

    Raises:
        ValueError: If the path has no ``products`` segment or fewer than
            three segments after it.
    """
    segments = urlparse(base_url).path.strip("/").split("/")
    if "products" not in segments:
        raise ValueError("URL does not contain '/products/' segment.")

    tail = segments[segments.index("products") + 1:]
    if len(tail) < 3:
        raise ValueError(f"Not enough segments after /products/: {tail}")

    # The guard above admits three segments, but tail[-4] on a three-item list
    # raised IndexError. Use the first of the three as the category instead.
    cat = tail[-4] if len(tail) >= 4 else tail[-3]
    brand, product = tail[-2], tail[-1]
    return slug_underscore(cat), slug_underscore(brand), slug_underscore(product)

In [3]:
# Regex alternation (a single capturing group) of the English three-letter
# month abbreviations; interpolated into the leading-date pattern used by
# clean_review_text.
MONTHS = r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"

def parse_absolute_date(text: str) -> date | None:
    """Find the first ``<day> <Mon> <year>`` stamp in ``text`` and parse it.

    The stamp is one or two digits, a three-letter word and a four-digit
    year, separated by whitespace (e.g. ``07 Dec 2025``).

    Returns:
        The parsed ``date``, or ``None`` when no stamp is found or the
        matched text is not a valid ``%d %b %Y`` date.
    """
    found = re.search(r"\b(\d{1,2})\s+([A-Za-z]{3})\s+(\d{4})\b", text)
    if found is None:
        return None

    # Re-join the captured pieces with single spaces so strptime sees a
    # canonical "DD Mon YYYY" string regardless of the original spacing.
    stamp = " ".join(found.groups())
    try:
        parsed = datetime.strptime(stamp, "%d %b %Y")
    except ValueError:
        return None
    return parsed.date()


def _parse_amount(token: str) -> int:
    token = token.lower()
    if token in ("a", "an"):
        return 1
    return int(token)


def parse_relative_to_date(text: str, today: date | None = None) -> date | None:
    """Resolve a relative timestamp such as ``'3 days ago'`` to a calendar date.

    Recognised forms (case-insensitive): ``a``/``an``/digits followed by
    ``second``, ``minute``, ``hour`` or ``day`` (optionally plural) and ``ago``,
    e.g. 'a minute ago', 'an hour ago', '5 hours ago', '3 days ago'.

    Args:
        text: Text that may contain a relative timestamp.
        today: Reference date; defaults to ``date.today()``.

    Returns:
        A ``date`` (not a datetime), or ``None`` when no relative timestamp
        is found.
    """
    reference = date.today() if today is None else today

    hit = re.search(r"\b(a|an|\d+)\s+(second|minute|hour|day)s?\s+ago\b", text, flags=re.I)
    if hit is None:
        return None

    amount_token, unit = hit.groups()
    amount = _parse_amount(amount_token)

    # Only whole days move the calendar date; seconds, minutes and hours are
    # treated as falling on the reference date itself.
    if unit.lower() == "day":
        return reference - timedelta(days=amount)
    return reference


def extract_date(card_text: str, today: date | None = None) -> tuple[str | None, date | None]:
    """Pull the review timestamp out of a card's text.

    An absolute stamp ('07 Dec 2025') takes precedence over a relative one
    ('6 days ago', 'an hour ago').

    Args:
        card_text: Flattened text of one review card.
        today: Reference date for relative stamps; defaults to ``date.today()``.

    Returns:
        ``(date_raw, date_parsed)`` where ``date_raw`` is the matched text and
        ``date_parsed`` is what the matching parser returned for it, or
        ``(None, None)`` when the card carries no recognisable timestamp.
    """
    reference = today if today is not None else date.today()

    absolute = re.search(r"\b\d{1,2}\s+[A-Za-z]{3}\s+\d{4}\b", card_text)
    if absolute is not None:
        stamp = absolute.group(0)
        return stamp, parse_absolute_date(stamp)

    relative = re.search(
        r"\b(a|an|\d+)\s+(second|minute|hour|day)s?\s+ago\b", card_text, flags=re.I
    )
    if relative is None:
        return None, None

    stamp = relative.group(0)
    return stamp, parse_relative_to_date(stamp, today=reference)

In [4]:
def make_driver(headless: bool = True) -> webdriver.Chrome:
    """Create a Chrome WebDriver configured with this notebook's flags.

    Args:
        headless: When true, ``--headless=new`` is passed to Chrome ahead of
            the other flags.

    Returns:
        A ``webdriver.Chrome`` built with a ``Service`` pointing at the path
        returned by ``ChromeDriverManager().install()``.
    """
    flags = ["--headless=new"] if headless else []
    flags += [
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1200,900",
        "--log-level=3",
    ]

    chrome_options = Options()
    for flag in flags:
        chrome_options.add_argument(flag)

    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)

In [5]:
def get_html(
    url: str,
    headless: bool = True,
    timeout_s: int = 30,
    scroll: bool = True,
    scroll_rounds: int = 4,
    scroll_pause_s: float = 0.8,
) -> str:
    """Load ``url`` in a fresh Chrome session and return the rendered HTML.

    The function waits until an element matching
    ``div.list-reviews, div.review-card`` is present, optionally scrolls to
    the bottom of the page a few times, and always quits the driver.

    Args:
        url: Page to load.
        headless: Forwarded to ``make_driver``.
        timeout_s: Maximum seconds to wait for the review container/card.
        scroll: Whether to scroll to the bottom of the page before reading
            the source.
        scroll_rounds: Number of scroll-to-bottom steps (previously a
            hard-coded 4).
        scroll_pause_s: Seconds to sleep after each scroll step (previously a
            hard-coded 0.8).

    Returns:
        ``driver.page_source`` after the wait and optional scrolling.

    NOTE(review): if no matching element appears within ``timeout_s`` the
    wait presumably raises selenium's ``TimeoutException`` - confirm how
    callers should treat that.
    """
    driver = make_driver(headless=headless)
    try:
        driver.get(url)

        WebDriverWait(driver, timeout_s).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.list-reviews, div.review-card"))
        )

        if scroll:
            for _ in range(scroll_rounds):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                # Pause so content triggered by the scroll has time to render.
                time.sleep(scroll_pause_s)

        return driver.page_source
    finally:
        # Always release the browser, even when the wait or a script fails.
        driver.quit()

In [6]:
def clean_review_text(s: str) -> str | None:
    """Strip leading card metadata from a review body and normalise whitespace.

    Removed, in order: a leading absolute date ('07 Dec 2025'), a leading
    relative timestamp ('2 days ago'), a leading
    "<username> recommends / doesn't recommend this product!" banner, and
    everything from a 'Usage Period:' or 'Purchase Point:' label onwards.

    Args:
        s: Raw review text; non-string input is rejected.

    Returns:
        The cleaned text, or ``None`` for non-string input or when nothing
        is left after cleaning.
    """
    if not isinstance(s, str):
        return None
    x = s.strip()

    # remove leading absolute date
    x = re.sub(rf"^\s*\d{{1,2}}\s+{MONTHS}\s+\d{{4}}\s+", "", x)

    # remove leading relative timestamp ('2 days ago', 'an hour ago', '5 minutes ago', etc.)
    x = re.sub(r"^\s*(?:a|an|\d+)\s+(?:second|minute|hour|day)s?\s+ago\s+", "", x, flags=re.I)

    # remove leading "username recommends/doesn't recommend this product!"
    # The apostrophe class previously held the mis-decoded bytes of U+2019, so
    # banners written with a typographic apostrophe were never stripped. The
    # \u2019 escape keeps the pattern ASCII-only and immune to re-encoding.
    x = re.sub(
        r"^\s*\S+\s+(?:doesn[\u2019']?t\s+)?recommends?\s+this\s+product!\s*",
        "",
        x,
        flags=re.IGNORECASE
    )

    # cut off metadata if it leaks into the wrapper
    x = re.split(r"\bUsage Period\b\s*:", x, maxsplit=1, flags=re.IGNORECASE)[0]
    x = re.split(r"\bPurchase Point\b\s*:", x, maxsplit=1, flags=re.IGNORECASE)[0]

    x = re.sub(r"\s+", " ", x).strip()
    return x or None


def clean_purchase_point(s: str) -> str | None:
    """Normalise a purchase-point label by trimming trailing numeric tokens.

    Whitespace runs are collapsed to single spaces first, then any trailing
    all-digit tokens are removed (the first token is always kept):
      'Shopee 0 0' -> 'Shopee'
      'Alfamart 0' -> 'Alfamart'
      'Female Daily Studio 0 0' -> 'Female Daily Studio'

    Returns:
        The cleaned label, or ``None`` for non-string input or an empty result.
    """
    if not isinstance(s, str):
        return None

    tokens = re.sub(r"\s+", " ", s).strip().split(" ")

    # Peel digit-only tokens off the end; a digit token is only removable when
    # something precedes it, so the first token always survives.
    while len(tokens) > 1 and re.fullmatch(r"\d+", tokens[-1]):
        tokens.pop()

    return " ".join(tokens) or None

In [7]:
# CSS selector matching one review card; default for parse_reviews and
# ScrapeConfig.card_selector.
CARD_SELECTOR = "div.review-card"

def parse_reviews(html: str, card_selector: str = CARD_SELECTOR, today: date | None = None) -> list[dict]:
    """Parse every review card in ``html`` into a flat record.

    Args:
        html: Page source containing the review cards.
        card_selector: CSS selector that matches one card.
        today: Reference date for relative timestamps; defaults to
            ``date.today()``.

    Returns:
        One dict per card with keys ``date`` (ISO string or ``None``),
        ``date_raw``, ``username``, ``review``, ``average_rating``,
        ``is_recommended``, ``usage_period`` and ``purchase_point``. Cards
        whose cleaned review text is empty are skipped.
    """
    reference_day = date.today() if today is None else today
    star_selector = "i.icon-ic_big_star_full, i[class*='star'][class*='full'], i[class*='star_full']"

    records: list[dict] = []

    for card in BeautifulSoup(html, "html.parser").select(card_selector):
        card_text = card.get_text(" ", strip=True)

        recommended = card.select_one("p.recommend") is not None
        date_raw, date_parsed = extract_date(card_text, today=reference_day)

        # Username heuristic: text of the first link that is non-empty and at
        # most 30 characters long.
        link_texts = (anchor.get_text(strip=True) for anchor in card.select("a"))
        username = next((text for text in link_texts if text and len(text) <= 30), None)

        # Rating = number of "full star" icons; no icons means unknown.
        rating = len(card.select(star_selector)) or None

        usage_hit = re.search(r"Usage Period\s*:\s*([^:]+?)(?:Purchase Point|$)", card_text, flags=re.I)
        usage_period = usage_hit.group(1).strip() if usage_hit else None

        # Purchase point runs to the end of the card text, so it is cleaned
        # afterwards to remove trailing numeric tokens.
        purchase_hit = re.search(r"Purchase Point\s*:\s*(.+)$", card_text, flags=re.I)
        purchase_raw = purchase_hit.group(1).strip() if purchase_hit else None
        purchase_point = clean_purchase_point(purchase_raw) if purchase_raw else None

        # Prefer the dedicated content wrapper; fall back to the whole card.
        wrapper = card.select_one("div.review-content-wrapper")
        review = clean_review_text(wrapper.get_text(" ", strip=True) if wrapper else card_text)
        if review is None:
            continue

        records.append({
            "date": date_parsed.isoformat() if date_parsed else None,
            "date_raw": date_raw,
            "username": username,
            "review": review,
            "average_rating": rating,
            "is_recommended": recommended,
            "usage_period": usage_period,
            "purchase_point": purchase_point,
        })

    return records

In [14]:
@dataclass(frozen=True)
class ScrapeConfig:
    """Immutable bundle of settings consumed by ``scrape``."""

    # Product review URL; scrape() requests it once per page number.
    base_url: str
    # Forwarded to get_html(): run the browser headless.
    headless: bool = True
    # Forwarded to get_html(): seconds to wait for the review elements.
    timeout_s: int = 30
    # Forwarded to get_html(): scroll to the page bottom before reading HTML.
    scroll: bool = True
    # Seconds scrape() sleeps after processing each page.
    polite_sleep_s: float = 1.0
    # CSS selector for one review card, passed to parse_reviews().
    card_selector: str = CARD_SELECTOR


def _page_url(base_url: str, page: int) -> str:
    """Return ``base_url`` with its ``page`` query parameter set to ``page``."""
    parts = urlsplit(base_url)
    # Keep any existing query parameters (minus a stale ``page``) and leave
    # the fragment in place instead of blindly appending "?page=N".
    query = [(k, v) for k, v in parse_qsl(parts.query, keep_blank_values=True) if k != "page"]
    query.append(("page", str(page)))
    return urlunsplit(parts._replace(query=urlencode(query)))


def scrape(cfg: ScrapeConfig, max_pages: int = 200) -> pd.DataFrame:
    """Scrape review pages one by one until they run out.

    Pages ``1..max_pages`` of ``cfg.base_url`` are fetched with ``get_html``
    and parsed with ``parse_reviews``. Scraping stops early when a page
    yields no reviews, or when every review on it was already seen (keyed by
    ``(date, username, review)``).

    Args:
        cfg: Scrape settings.
        max_pages: Upper bound on the number of pages requested.

    Returns:
        A DataFrame with one row per unique review, in scrape order.
    """
    all_rows: list[dict] = []
    seen = set()

    # Fix the reference date once so relative stamps ("2 days ago") resolve
    # identically on every page, even if the run crosses midnight; the dedup
    # key below includes the resolved date.
    today = date.today()

    for p in range(1, max_pages + 1):
        print(f"Scraping page {p}...")
        url = _page_url(cfg.base_url, p)
        html = get_html(url, headless=cfg.headless, timeout_s=cfg.timeout_s, scroll=cfg.scroll)

        rows = parse_reviews(html, cfg.card_selector, today=today)

        if not rows:
            print("No reviews found \u2014 stopping.")
            break

        new_rows = []
        for r in rows:
            key = (r.get("date"), r.get("username"), r.get("review"))
            if key not in seen:
                seen.add(key)
                new_rows.append(r)

        if not new_rows:
            print("No new reviews \u2014 stopping.")
            break

        all_rows.extend(new_rows)
        time.sleep(cfg.polite_sleep_s)

    return pd.DataFrame(all_rows)

In [15]:
# Root output folder, relative to the current working directory; created up
# front so per-product subfolders can be added beneath it.
ROOT_PRODUCTS_DIR = Path("products")
ROOT_PRODUCTS_DIR.mkdir(parents=True, exist_ok=True)

In [16]:
# Product page whose reviews are scraped in this run.
BASE_URL = "https://reviews.femaledaily.com/products/cleanser/facial-wash/hada-labo/gokyujyun-ultimate-moisturizing-face-wash"

# Mirror the URL structure on disk: products/<category>/<brand>/<product>.
cat, brand, product = derive_subfolders_from_url(BASE_URL)
out_dir = ROOT_PRODUCTS_DIR / cat / brand / product
# NOTE(review): out_dir is created here but nothing in the visible cells
# writes to it - presumably a later cell saves the scraped data; confirm.
out_dir.mkdir(parents=True, exist_ok=True)

cfg = ScrapeConfig(
    base_url=BASE_URL,
    headless=True,
    scroll=True,
    card_selector=CARD_SELECTOR,
)

# Launches a browser per page and hits the network; capped at 5 pages here.
df = scrape(cfg, max_pages=5)
df.head()

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...


Unnamed: 0,date,date_raw,username,review,average_rating,is_recommended,usage_period,purchase_point
0,2025-12-17,an hour ago,alphandro,masih terjangkau. harga masuk akal gak overpri...,5,True,3 months - 6 months,Alfamart
1,2025-12-15,2 days ago,miss_cullen,"review jujur,awalnya coba aja,tapi ternyata ba...",5,True,1 week - 1 month,Shopee
2,2025-12-11,6 days ago,carmennita_,"Simple dan oke, bisa membersihkan wajah dengan...",5,True,1 month - 3 months,Gift
3,2025-12-07,07 Dec 2025,wulanroshinta,sangat worth to price dan membuat wajah halus ...,5,True,More than 1 year,Shopee
4,2025-12-07,07 Dec 2025,miftanjnh,"Facewash Hada Labo tuh lembut banget di kulit,...",5,True,More than 1 year,Shopee


In [17]:
# Drop the raw timestamp text now that the parsed ``date`` column is kept.
# errors="ignore" makes the cell safe to re-run: once the column is gone, a
# plain drop would raise KeyError on the second execution.
df = df.drop(columns=["date_raw"], errors="ignore")

Unnamed: 0,date,username,review,average_rating,is_recommended,usage_period,purchase_point
0,2025-12-17,alphandro,masih terjangkau. harga masuk akal gak overpri...,5,True,3 months - 6 months,Alfamart
1,2025-12-15,miss_cullen,"review jujur,awalnya coba aja,tapi ternyata ba...",5,True,1 week - 1 month,Shopee
2,2025-12-11,carmennita_,"Simple dan oke, bisa membersihkan wajah dengan...",5,True,1 month - 3 months,Gift
3,2025-12-07,wulanroshinta,sangat worth to price dan membuat wajah halus ...,5,True,More than 1 year,Shopee
4,2025-12-07,miftanjnh,"Facewash Hada Labo tuh lembut banget di kulit,...",5,True,More than 1 year,Shopee
5,2025-12-07,NoviWulandari03,Hadalabo ini sangat bermanfaat untuk kulit muk...,5,True,More than 1 year,Female Daily Studio
6,2025-12-06,aureliape_,"cocok untuk kulit kombinasi, tidak bikin ketar...",5,True,1 month - 3 months,Female Daily Event
7,2025-12-06,Ekaaaay2,"face wash ternyaman, busanya lembut, ga bikin ...",4,True,1 month - 3 months,Traditional Market
8,2025-12-06,nadianir,ini adalah salah satu facewash terbaikk yang a...,5,True,More than 1 year,Shopee
9,2025-12-06,aurora75,Hada Labo Gokujyun Ultimate Moisturizing Face ...,5,True,1 month - 3 months,Shopee
