In [32]:
import os
import json
import time
import random
import requests
import traceback
import xml.etree.ElementTree as ET
from functools import wraps

def safe_open(file_path, mode='w', **kwargs):
    """
    Opens a file, ensuring that the folder or path exists.
    
    Args:
        file_path (str): The path to the file.
        mode (str): The mode in which to open the file (e.g., 'w', 'r', 'a').
        **kwargs: Additional arguments to pass to the `open()` function.

    Returns:
        file object: The opened file object.
    """
    # Ensure the parent folder exists
    folder_path = os.path.dirname(file_path)
    if folder_path:  # Only create directories if a folder path exists
        os.makedirs(folder_path, exist_ok=True)
    
    # Open the file
    return open(file_path, mode, **kwargs)

def parse_to_dict(data: str) -> dict:
    """
    Parses a string that could be either JSON or XML and returns it as a dictionary.

    Args:
        data (str): The input string, either in JSON or XML format.

    Returns:
        dict: The parsed data as a dictionary.

    Raises:
        ValueError: If the input is not valid JSON or XML.
    """
    # Attempt to parse as JSON
    try:
        return json.loads(data)
    except json.JSONDecodeError:
        pass  # It's not JSON, so we try XML

    # Attempt to parse as XML
    try:
        root = ET.fromstring(data)
        return xml_to_dict(root)
    except ET.ParseError:
        pass  # It's neither valid JSON nor XML
    
    raise ValueError("Input is neither valid JSON nor XML")

def xml_to_dict(element: ET.Element) -> dict:
    """
    Recursively converts an XML element and its children into a dictionary.

    Args:
        element (ET.Element): The XML element to convert.

    Returns:
        dict: The XML element represented as a dictionary.
    """
    # Convert the element's attributes and children to a dictionary
    data = {key: value for key, value in element.attrib.items()}  # Add attributes if any

    # Add children
    if element.text and element.text.strip():
        data["text"] = element.text.strip()  # Add text content if present

    for child in element:
        child_dict = xml_to_dict(child)
        if child.tag not in data:
            data[child.tag] = child_dict
        else:
            # If multiple children with the same tag exist, store them as a list
            if not isinstance(data[child.tag], list):
                data[child.tag] = [data[child.tag]]
            data[child.tag].append(child_dict)

    return data

def random_pause(min_delay=0.5, max_delay=3):
    """
    A decorator that pauses for a random duration before executing the wrapped function.

    Args:
        min_delay (float): Minimum duration (in seconds) for the pause. Default is 0.5 seconds.
        max_delay (float): Maximum duration (in seconds) for the pause. Default is 3 seconds.
    """
    def decorator(func):
        @wraps(func)  # Preserve the original function's metadata
        def wrapper(*args, **kwargs):
            # Generate a random delay
            pause_duration = random.uniform(min_delay, max_delay)
            print(f"Pausing for {pause_duration:.2f} seconds before calling '{func.__name__}'...")
            time.sleep(pause_duration)  # Pause for the random duration
            return func(*args, **kwargs)  # Call the original function
        return wrapper
    return decorator

@random_pause(0.01, 0.5)
def scrape(url: str) -> dict:
    try:
        headers = {
            'accept': '*/*',
            'accept-language': 'en-US,en;q=0.9',
            'access-control-request-headers': 'authorization,cache-control,expires,if-modified-since,pragma,queue-target,queueit-target,vary',
            'access-control-request-method': 'GET',
            'cache-control': 'max-age=0',
            'origin': 'https://www.watsons.com.hk',
            'priority': 'u=1, i',
            'referer': 'https://www.watsons.com.hk/',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-site',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        }

        response = requests.get(url, headers=headers)
        if response.status_code >= 200 and response.status_code < 400: return parse_to_dict(response.text)
        
    except Exception as e:
        print(e)
        print(traceback.format_exception())


In [33]:
# brand index

index_url = "https://api.watsons.com.hk/api/v2/wtchk/brands?fields=FULL&lang=en_HK&curr=HKD"
index_json = scrape(index_url)
with safe_open("watsons_online_store/brand_index.json", "w+") as j: json.dump(index_json, j, indent=2)


Pausing for 0.49 seconds before calling 'scrape'...


In [34]:
def scrape_brand_products_catalog(brand: dict):
    name = brand.get("name", "")
    code = brand.get("code", "")
    folder_name = f"{name}_{code}"
    full_product_list = []
    items_per_page = 64
    current_page = 0
    url = f"https://api.watsons.com.hk/api/v2/wtchk/products/search?fields=FULL&query=%3AproductBrandCode%3AproductBrandCode%3A{code}&pageSize={items_per_page}&currentPage={current_page}&sort=bestSeller&brandRedirect=true&ignoreSort=false&lang=en_HK&curr=HKD"
    brand_details = scrape(url)
    with safe_open(f"watsons_online_store/brands/{folder_name}/catalog/page_{current_page}.json", "w+") as j: json.dump(brand_details, j, indent=2)
    products_in_this_page = brand_details.get("products",[])
    full_product_list.extend(products_in_this_page)
    page_products_count = len(products_in_this_page)
    
    # need next page
    while page_products_count == items_per_page:
        current_page += 1
        url = f"https://api.watsons.com.hk/api/v2/wtchk/products/search?fields=FULL&query=%3AproductBrandCode%3AproductBrandCode%3A{code}&pageSize={items_per_page}&currentPage={current_page}&sort=bestSeller&brandRedirect=true&ignoreSort=false&lang=en_HK&curr=HKD"
        brand_details = scrape(url)
        with safe_open(f"watsons_online_store/brands/{folder_name}/catalog/page_{current_page}.json", "w+") as j: json.dump(brand_details, j, indent=2)
        products_in_this_page = brand_details.get("products",[])
        full_product_list.extend(products_in_this_page)
        page_products_count = len(products_in_this_page)
    
    with safe_open(f"watsons_online_store/brands/{folder_name}/catalog/full_product_list.json", "w+") as j: json.dump({"products": full_product_list}, j, indent=2)

In [36]:
# scrape for brand details
brands = index_json.get("brands", [])
random.shuffle(brands)

for idx, brand in enumerate(brands):
    scrape_brand_products_catalog(brand)
    if idx % 50 == 0: print(f"{idx}/{len(brands)} done")
    
    

Pausing for 0.19 seconds before calling 'scrape'...
0/893 done
Pausing for 0.37 seconds before calling 'scrape'...
Pausing for 0.31 seconds before calling 'scrape'...
Pausing for 0.01 seconds before calling 'scrape'...
Pausing for 0.19 seconds before calling 'scrape'...
Pausing for 0.13 seconds before calling 'scrape'...
Pausing for 0.04 seconds before calling 'scrape'...
Pausing for 0.10 seconds before calling 'scrape'...
Pausing for 0.04 seconds before calling 'scrape'...
Pausing for 0.22 seconds before calling 'scrape'...
Pausing for 0.05 seconds before calling 'scrape'...
Pausing for 0.09 seconds before calling 'scrape'...
Pausing for 0.12 seconds before calling 'scrape'...
Pausing for 0.19 seconds before calling 'scrape'...
Pausing for 0.08 seconds before calling 'scrape'...
Pausing for 0.13 seconds before calling 'scrape'...
Pausing for 0.05 seconds before calling 'scrape'...
Pausing for 0.26 seconds before calling 'scrape'...
Pausing for 0.50 seconds before calling 'scrape'...
P

In [39]:
def process_brands_folder(brands_folder: str = "watsons_online_store/brands"):
    """
    Loops through all 'xxx/catalog/full_product_list.json' under the 'brands' folder
    and performs scraping for each product.
    
    Args:
        brands_folder (str): The path to the 'brands' folder.
    """
    for root, dirs, files in os.walk(brands_folder):
        # Check if the current directory is a 'catalog' folder
        if os.path.basename(root) == 'catalog':
            # Look for 'full_product_list.json' in the current 'catalog' folder
            if 'full_product_list.json' in files:
                # Extract the 'xxx' part of the path
                # root: brands/xxx/catalog
                # os.path.dirname(root): brands/xxx
                # os.path.basename(os.path.dirname(root)): xxx
                subfolder_name = os.path.basename(os.path.dirname(root))
                
                # Construct the full path to the JSON file
                json_file_path = os.path.join(root, 'full_product_list.json')
                
                # Perform your desired action here
                print(f"Processing file: {json_file_path}")
                print(f"Subfolder: {subfolder_name}")
                
                product_list = []
                with safe_open(json_file_path, "r") as j:
                    product_list_json = json.load(j)
                    product_list = product_list_json.get("products", [])
                
                print(f"scraping for {len(product_list)} products...")
                for product in product_list:
                    product_code = product.get("code", "")
                    url = f"https://api.watsons.com.hk/api/v2/wtchk/products/{product_code}?fields=code,purchasable,name,summary,price(formattedValue),images(galleryIndex,FULL),baseProduct,averageRating,classifications,manufacturer,numberOfReviews,categories(FULL),baseOptions,variantOptions,variantType,FULL,availableForPickup,configurable,configuratorType,defaultVariantCode,description,elabCountryDeliveryModeMap,elabDisplayGrossWeight,elabEndPricePromotion(FULL),elabPreOrderDeliveryDate,elabPreOrderStartDate,elabPreOrderEndDate,elabExclusiveBrand,elabIsAdultOnly,elabIsSDD,elabIsStorePickupAllowed,elabMarkDownEliteMemPrice(FULL),elabMarkDownMemPrice(FULL),elabMarkDownPrice(FULL),elabMaxOrderQuantity,elabPrice(FULL),elabPromoEndDate,elabPromoStartDate,elabPromotionEndPrice(FULL),elabRoutingLocation(FULL),elabSubscribable,elabSubscriptionMode,elabVariantType,elabWeight,elabXBorderReferencePricePerProduct,elabXBorderReferenceTaxPricePerProduct,elabXBorderTaxPricePerProduct,masterBrand(FULL),maxOrderQuantity,multidimensional,priceRange,stock(ASIA_DEFAULT),subscriptionOfferList,tags,url&lang=en_HK&curr=HKD"
                    product_details = scrape(url)
                    with safe_open(f"{brands_folder}/{subfolder_name}/products/{product_code}.json", "w+") as j:
                        json.dump(product_details, j, indent=2)


process_brands_folder('watsons_online_store/brands')

Processing file: watsons_online_store/brands/WAY WAY_123174/catalog/full_product_list.json
Subfolder: WAY WAY_123174
scraping for 3 products...
Pausing for 0.47 seconds before calling 'scrape'...
Pausing for 0.14 seconds before calling 'scrape'...
Pausing for 0.50 seconds before calling 'scrape'...
Processing file: watsons_online_store/brands/VITASOY_122543/catalog/full_product_list.json
Subfolder: VITASOY_122543
scraping for 7 products...
Pausing for 0.17 seconds before calling 'scrape'...
Pausing for 0.04 seconds before calling 'scrape'...
Pausing for 0.45 seconds before calling 'scrape'...
Pausing for 0.42 seconds before calling 'scrape'...
Pausing for 0.37 seconds before calling 'scrape'...
Pausing for 0.45 seconds before calling 'scrape'...
Pausing for 0.27 seconds before calling 'scrape'...
Processing file: watsons_online_store/brands/FLUIMUCIL_106900/catalog/full_product_list.json
Subfolder: FLUIMUCIL_106900
scraping for 3 products...
Pausing for 0.04 seconds before calling 'scr

In [41]:
# get all promotions

def scrape_all_promotions(store_folder: str = "watsons_online_store"):
    """
    Loops through all 'xxx/catalog/full_product_list.json' under the 'brands' folder
    and performs scraping for each product.
    
    Args:
        brands_folder (str): The path to the 'brands' folder.
    """
    brands_folder = store_folder + "/brands"
    for root, dirs, files in os.walk(brands_folder):
        # Check if the current directory is a 'catalog' folder
        if os.path.basename(root) == 'catalog':
            # Look for 'full_product_list.json' in the current 'catalog' folder
            if 'full_product_list.json' in files:
                # Extract the 'xxx' part of the path
                # root: brands/xxx/catalog
                # os.path.dirname(root): brands/xxx
                # os.path.basename(os.path.dirname(root)): xxx
                subfolder_name = os.path.basename(os.path.dirname(root))
                
                # Construct the full path to the JSON file
                json_file_path = os.path.join(root, 'full_product_list.json')
                
                # Perform your desired action here
                print(f"Processing file: {json_file_path}")
                print(f"Subfolder: {subfolder_name}")
                
                product_list = []
                with safe_open(json_file_path, "r") as j:
                    product_list_json = json.load(j)
                    product_list = product_list_json.get("products", [])
                
                print(f"scraping for {len(product_list)} products...")
                for product in product_list:
                    product_code = product.get("code", "")
                    product_code = product_code[3:]
                    url = f"https://api.watsons.com.hk/api/v2/wtchk/products/{product_code}/promotions?fields=FULL&lang=en_HK&curr=HKD"
                    promotion_details = scrape(url)
                    with safe_open(f"{store_folder}/promotions/{product_code}.json", "w+") as j:
                        json.dump(promotion_details, j, indent=2)

scrape_all_promotions()

Processing file: watsons_online_store/brands/WAY WAY_123174/catalog/full_product_list.json
Subfolder: WAY WAY_123174
scraping for 3 products...
Pausing for 0.22 seconds before calling 'scrape'...
Pausing for 0.26 seconds before calling 'scrape'...
Pausing for 0.44 seconds before calling 'scrape'...
Processing file: watsons_online_store/brands/VITASOY_122543/catalog/full_product_list.json
Subfolder: VITASOY_122543
scraping for 7 products...
Pausing for 0.17 seconds before calling 'scrape'...
Pausing for 0.25 seconds before calling 'scrape'...
Pausing for 0.02 seconds before calling 'scrape'...
Pausing for 0.19 seconds before calling 'scrape'...
Pausing for 0.03 seconds before calling 'scrape'...
Pausing for 0.49 seconds before calling 'scrape'...
Pausing for 0.22 seconds before calling 'scrape'...
Processing file: watsons_online_store/brands/FLUIMUCIL_106900/catalog/full_product_list.json
Subfolder: FLUIMUCIL_106900
scraping for 3 products...
Pausing for 0.13 seconds before calling 'scr