# In [None]:  (Jupyter notebook cell marker left over from export)
import csv, random, os, time
from selenium import webdriver
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent

def configure_driver(options):
    """Create a Chrome WebDriver using a driver binary fetched by webdriver_manager.

    Args:
        options: selenium ``Options`` instance holding the Chrome arguments.

    Returns:
        A started ``webdriver.Chrome`` instance; the caller must ``quit()`` it.
    """
    # Imported locally so this function is self-contained with respect to the
    # file's import block.
    from selenium.webdriver.chrome.service import Service

    # Selenium 4 takes the driver path through a Service object. Passing the
    # path positionally is deprecated in 4.x and no longer accepted by newer
    # releases, where it collides with the ``options`` keyword.
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

def get_categories(driver, url):
    """Load the browse page and return the category links that are not excluded.

    Args:
        driver: Selenium WebDriver used to fetch ``url + "/browse"``.
        url: Base site URL (no trailing slash).

    Returns:
        List of BeautifulSoup ``<a>`` tags whose ``href`` is not in the
        exclusion list below. The text of each kept tag is printed.
    """
    # Category hrefs that are skipped on this run.
    # Original author's note: pantry needs to be added later (presumably it is
    # only excluded temporarily — confirm before removing it from this list).
    excluded_hrefs = (
        "/browse/down-down",
        "/on-special",
        "/browse/bonus-ovenware-credits",
        "/browse/dairy-eggs-fridge",
        "/browse/fruit-vegetables",
        "/browse/meat-seafood",
        "/browse/tobacco",
        "/browse/liquor",
        "/browse/bakery",
        "/browse/deli",
        "/browse/drinks",
        "/browse/frozen",
        "/browse/baby",
        "/browse/pet",
        "/browse/household",
        "/browse/health-beauty",
        "/browse/pantry",
    )

    driver.get(f"{url}/browse")
    page = BeautifulSoup(driver.page_source, "html.parser")
    anchors = page.find_all(
        "a",
        class_="coles-targeting-ShopCategoriesShopCategoryStyledCategoryContainer",
    )

    kept = []
    for anchor in anchors:
        if anchor.get("href") in excluded_hrefs:
            continue
        print(anchor.text)
        kept.append(anchor)
    return kept

def _write_product_rows(writer, soup, url):
    """Write one CSV row per complete product tile in *soup*; return rows written."""
    written = 0
    for product in soup.find_all("header", class_="product__header"):
        name = product.find("h2", class_="product__title")
        price = product.find("span", class_="price__value")
        anchor = product.find("a", class_="product__link")
        href = anchor.get("href") if anchor else None
        # Skip tiles missing any field rather than crashing on a missing link.
        if not (name and price and href):
            continue
        # The product code is taken as the text after the last "-" in the link.
        product_code = href.split("-")[-1]
        writer.writerow([product_code, name.text.strip(), price.text.strip(), url + href])
        written += 1
    return written


def _last_page_number(soup):
    """Return the last page number shown in the pagination list, or 1 if unavailable."""
    pagination = soup.find("ul", class_="coles-targeting-PaginationPaginationUl")
    if not pagination:
        return 1
    items = pagination.find_all("li")
    try:
        # The second-to-last <li> is read as the highest page number
        # (presumably the final <li> is the "next" control — confirm).
        return int(items[-2].text.strip())
    except (IndexError, ValueError):
        return 1


def scrape_products_in_category(driver, category, url, first, start_page=2):
    """Scrape every product page of one category and append the rows to a CSV.

    Rows are ``[product_code, name, price, link]`` and are appended to
    ``coles/<category text>.csv`` under the current working directory.

    Args:
        driver: Selenium WebDriver used to load the pages.
        category: BeautifulSoup ``<a>`` tag for the category; its ``href`` is
            appended to *url* and its text names the CSV file.
        url: Base site URL (no trailing slash).
        first: When ``0``, the function sleeps 35 seconds after the initial
            page load (presumably to let a first-visit check complete —
            confirm).
        start_page: First ``?page=N`` to fetch after the landing page.
            Defaults to 2; pass a higher value to resume an interrupted run.
            The landing page is always scraped.
    """
    category_link = url + category.get("href")
    print(category_link)
    driver.get(category_link)
    if first == 0:
        time.sleep(35)

    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Make the output directory if it doesn't exist.
    os.makedirs("coles", exist_ok=True)
    filepath = os.path.join(os.getcwd(), "coles", category.text + ".csv")

    # Explicit UTF-8 so product names do not depend on the platform's default
    # encoding.
    with open(filepath, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        _write_product_rows(writer, soup, url)

        last_page = _last_page_number(soup)
        for page in range(start_page, last_page + 1):
            driver.get(f"{category_link}?page={page}")
            page_soup = BeautifulSoup(driver.page_source, "html.parser")
            print(f"{page}/{last_page}")
            _write_product_rows(writer, page_soup, url)
            # Random pause between page requests.
            time.sleep(random.randint(1, 6))

def main():
    """
    Entry point: build the Chrome driver, list the categories, and scrape each.

    Any exception raised while scraping is printed, and the driver is always
    shut down at the end.
    """
    base_url = "https://www.coles.com.au"

    chrome_options = Options()
    for flag in (
        "--enable-javascript",
        "--enable-cookies",
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
    ):
        chrome_options.add_argument(flag)

    # Use a random user-agent string supplied by fake_useragent.
    random_agent = UserAgent().random
    chrome_options.add_argument(f"user-agent={random_agent}")

    driver = configure_driver(chrome_options)

    try:
        print("Here we go...")
        categories = get_categories(driver, base_url)
        total = len(categories)

        # The enumerate index doubles as the "first" flag: it is 0 only for
        # the first category.
        for index, category in enumerate(categories):
            print(f"\nNumber: {index}/{total}")
            scrape_products_in_category(driver, category, base_url, index)
            time.sleep(random.randint(1, 6))

        print("Finished")
    except Exception as exc:
        print(f"An error occurred: {exc}")
    finally:
        driver.quit()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()