# In [None]:  (Jupyter notebook cell prompt left over from export)
import csv, random, os, time
from selenium import webdriver
from urllib.parse import urlparse
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent

from selenium.webdriver.common.by import By 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def configure_driver(options):
    """Start a Chrome WebDriver using a chromedriver fetched by webdriver_manager.

    Args:
        options: The Chrome ``Options`` instance to launch the browser with.

    Returns:
        The started ``webdriver.Chrome`` instance.
    """
    # Function-scope import so this block brings its own dependency with it.
    from selenium.webdriver.chrome.service import Service

    # Selenium 4 expects the driver executable to be supplied through a
    # Service object; passing the path as the first positional argument was
    # deprecated and later removed.
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

def get_categories(driver, url):
    """Return the category slugs to scrape, printing each one on its own line.

    The list is hard-coded; ``driver`` and ``url`` are accepted but not used.

    Returns:
        A new list of category slug strings.
    """
    # Full set of category slugs, kept for reference:
    #   bakery, fruit-veg, poultry-meat-seafood, deli-chilled-meals, freezer,
    #   international-foods, beer-wine-spirits, dairy-eggs-fridge, lunch-box,
    #   snacks-confectionery, drinks, health-wellness, baby, pantry, pet,
    #   cleaning-maintenance, electronics, beauty-personal-care, home-lifestyle
    # Note from the original author: electronics and home-lifestyle were
    # omitted due to size and because IGA and Coles do not carry them.
    selected = ['cleaning-maintenance']
    print(*selected, sep="\n")
    return selected

def write_product_to_csv(product, filepath):
    """Append one product to a CSV file as: id, name, price, absolute URL.

    Args:
        product: Mapping with the keys ``'id'``, ``'name'``, ``'price'`` and
            ``'product_link'``. A relative link is resolved against the
            Woolworths site root; an absolute http(s) link is kept as is.
        filepath: Path of the CSV file. It is opened in append mode, so it is
            created when missing and extended otherwise.

    Raises:
        KeyError: If ``product`` lacks one of the required keys.
    """
    base_url = "https://www.woolworths.com.au"
    product_link = product['product_link']
    if product_link.startswith(("http://", "https://")):
        # Already absolute: prefixing the site root would corrupt the URL.
        full_url = product_link
    else:
        if not product_link.startswith('/'):
            product_link = '/' + product_link
        full_url = base_url + product_link

    row = [product['id'], product['name'], product['price'], full_url]
    # Explicit UTF-8 so non-ASCII product names are written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(filepath, "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow(row)


def get_max_page_number(driver):
    """Read the total number of result pages from ``span.page-count``.

    Waits up to 10 seconds for the element to be present on the page the
    driver is currently showing.

    Args:
        driver: Selenium WebDriver already navigated to a listing page.

    Returns:
        The page count as an int. Falls back to 1 when the count cannot be
        determined (wait timed out, element text is not an integer, ...), so
        a caller doing ``range(1, count + 1)`` still visits the first page
        instead of failing on ``None + 1``.
    """
    try:
        # until() returns the located element, so a second lookup is unnecessary.
        page_count_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "span.page-count"))
        )
        return int(page_count_element.text)
    except Exception as e:
        # Best effort: report the problem and assume a single page.
        print(f"Could not determine the max page number: {e}")
        return 1

def scrape_products_in_category(driver, category, url, first):
    """Scrape every result page of one category into ``woolworths/<category>.csv``.

    The CSV lives in a ``woolworths`` directory under the current working
    directory and is appended to, one row per product.

    Args:
        driver: Selenium WebDriver used to load the pages.
        category: Category slug appended to ``url``; also the CSV file stem.
        url: Base browse URL.
        first: Not used by this function; kept so existing callers still work.
    """
    print(f"Scraping products in the category: {category}")
    # Join with "/" explicitly: os.path.join uses the OS path separator
    # (a backslash on Windows) and discards the base when the second part
    # starts with a slash, neither of which is valid for a URL.
    category_link = f"{url.rstrip('/')}/{category.lstrip('/')}"
    driver.get(category_link)

    max_page_number = get_max_page_number(driver)
    print(f"Max page number found: {max_page_number}")
    if max_page_number is None:
        # Page count unavailable: still try the first page rather than
        # raising TypeError on None + 1 below.
        max_page_number = 1

    # The output location is identical for every page, so prepare it once.
    woolworthspath = os.path.join(os.getcwd(), 'woolworths')
    os.makedirs(woolworthspath, exist_ok=True)
    filepath = os.path.join(woolworthspath, f"{category}.csv")

    for page in range(1, max_page_number + 1):
        print(f"{page}/{max_page_number}")
        driver.get(f"{category_link}?pageNumber={page}")

        for product in get_products_on_page(driver):
            write_product_to_csv(product, filepath)
        # Optional throttle between pages (disabled):
        # time.sleep(random.randint(1, 6))

def get_shadow_root(driver, element):
    """Fetch ``element``'s shadow root by running JavaScript through the driver.

    Args:
        driver: Object exposing ``execute_script``.
        element: The shadow host; handed to the script as ``arguments[0]``.

    Returns:
        Whatever ``execute_script`` yields for ``arguments[0].shadowRoot``.
    """
    script = 'return arguments[0].shadowRoot'
    shadow_root = driver.execute_script(script, element)
    return shadow_root

def get_products_on_page(driver):
    """Collect name, price, link and id for each product tile on the current page.

    Waits up to 10 seconds for ``wc-product-tile`` elements to be present,
    then reads each tile's fields from its shadow root via JavaScript.
    A tile that cannot be read is reported on stdout and skipped.

    Args:
        driver: Selenium WebDriver showing a product listing page.

    Returns:
        A list of dicts with the keys ``'name'``, ``'price'``,
        ``'product_link'`` and ``'id'``. The id is the second-to-last
        segment of the link's path.
    """
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "wc-product-tile"))
    )

    products = driver.find_elements(By.CSS_SELECTOR, "wc-product-tile")
    product_details = []

    # The scripts do not change between tiles, so define them once.
    name_script = "return arguments[0].querySelector('div.title a').textContent;"
    price_script = "return arguments[0].querySelector('div.primary').textContent;"
    link_script = "return arguments[0].querySelector('a[href]').getAttribute('href');"

    for shadow_host in products:
        # Reset per tile so the error report below can never hit an unbound
        # variable or show the name of the previous tile.
        name = None
        try:
            shadow_root = get_shadow_root(driver, shadow_host)

            name = driver.execute_script(name_script, shadow_root)
            price = driver.execute_script(price_script, shadow_root)
            product_link = driver.execute_script(link_script, shadow_root)

            product_id = urlparse(product_link).path.split("/")[-2]

            product_details.append({
                'name': name.strip(),
                'price': price.strip(),
                'product_link': product_link.strip(),
                'id': product_id
            })
        except Exception as e:
            # Best effort: skip this tile but say which one failed and why.
            label = name.strip() if isinstance(name, str) else "<unknown>"
            print(f"Error no textContent: {label} ({type(e).__name__})")
    return product_details

def main():
    """Launch Chrome, scrape every configured category, then quit the browser.

    Any exception raised while scraping is printed rather than propagated,
    and the driver is always shut down afterwards.
    """
    url = "https://www.woolworths.com.au/shop/browse"

    options = Options()
    for flag in (
        "--enable-javascript",
        "--enable-cookies",
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
    ):
        options.add_argument(flag)

    # A randomly chosen user-agent string is used for the session.
    options.add_argument(f"user-agent={UserAgent().random}")

    driver = configure_driver(options)

    try:
        print("Here we go...")
        categories = get_categories(driver, url)
        total = len(categories)

        # The enumerate index doubles as the running counter handed to the
        # scraper as its last argument.
        for index, category in enumerate(categories):
            print(f"\nNumber: {index}/{total}")
            scrape_products_in_category(driver, category, url, index)
            # time.sleep(random.randint(1, 6))

        print("Finished")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        driver.quit()

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()