# In [None]:
# This script scrapes product details (ID, name, price, URL) from the IGA online
# shop (igashop.com.au) and writes them to per-subcategory CSV files under "agi2".

import csv, random, os, time, math
from selenium import webdriver
from urllib.parse import urlparse
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent

from selenium.webdriver.common.by import By 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def configure_driver(options):
    """Create the Chrome WebDriver used by the scraper.

    Args:
        options: Chrome ``Options`` object forwarded to ``webdriver.Chrome``.

    Returns:
        The newly created ``webdriver.Chrome`` instance.
    """
    # The value returned by ChromeDriverManager().install() is handed to Chrome
    # as its first positional argument.
    # NOTE(review): recent Selenium 4 releases expect a Service object instead
    # of a positional driver location -- confirm against the pinned version.
    chromedriver_location = ChromeDriverManager().install()
    return webdriver.Chrome(chromedriver_location, options=options)

def get_categories(driver, url):
    """Collect the category page URLs advertised in the site's navigation menu.

    Loads ``url``, waits up to 10 seconds for the navigation-menu buttons to be
    present, and turns each button's ``id`` attribute into a category URL.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the page that contains the navigation menu.

    Returns:
        A list of category URLs, one per navigation button found.
    """
    menu_button_selector = "li.rds-navigation-menu__item button"
    base_url = "https://www.igashop.com.au/categories/"

    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, menu_button_selector))
    )
    # find_elements(By..., ...) replaces the find_elements_by_* helpers, which
    # were removed from newer Selenium releases.
    buttons = driver.find_elements(By.CSS_SELECTOR, menu_button_selector)

    category_links = []
    for button in buttons:
        button_id = button.get_attribute('id')
        # Drop the 'radix-4-trigger-' prefix from the id, then slugify what is
        # left: spaces become hyphens, commas are removed, everything lowercased.
        slug = (
            button_id.replace('radix-4-trigger-', '')
            .replace(' ', '-')
            .replace(',', '')
            .lower()
        )
        category_links.append(base_url + slug)
    return category_links

def get_subcategories(driver, category_url):
    """Return the subcategory links shown on a category page.

    Loads ``category_url`` and waits up to 10 seconds for the subcategory
    anchors to be present before reading their ``href`` attributes.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        category_url: URL of the category page to inspect.

    Returns:
        A list with the ``href`` of every matching anchor element.
    """
    subcategory_selector = "div.min-w-fit.snap-start div a"

    driver.get(category_url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, subcategory_selector))
    )
    # find_elements(By..., ...) replaces the find_elements_by_* helpers, which
    # were removed from newer Selenium releases.
    anchors = driver.find_elements(By.CSS_SELECTOR, subcategory_selector)
    return [anchor.get_attribute('href') for anchor in anchors]

def write_product_to_csv(product, filepath):
    """Append one product to a CSV file, adding the header row when needed.

    The header ``ID, Name, Price, URL`` is written first when the file does not
    exist yet or exists but is empty. The product row is always written exactly
    once.

    Args:
        product: Mapping with the keys ``'id'``, ``'name'``, ``'price'`` and
            ``'product_link'``.
        filepath: Path of the CSV file to append to.

    Raises:
        KeyError: If ``product`` lacks one of the required keys (raised before
            the file is opened).
    """
    row = [product['id'], product['name'], product['price'], product['product_link']]

    # Decide about the header before opening: opening in append mode would
    # create a missing file and hide the "new file" case.
    needs_header = not os.path.isfile(filepath) or os.path.getsize(filepath) == 0

    with open(filepath, "a", newline="") as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(["ID", "Name", "Price", "URL"])
        writer.writerow(row)

def get_n_pages_from_subcategory(driver, sub_category_link):
    """Build the list of page links for a subcategory from its product count.

    Loads the subcategory page, waits for a visible ``h2`` whose text contains
    ``"("``, parses the integer after that parenthesis as the product count and
    derives the number of pages from it.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        sub_category_link: URL of the subcategory's listing page.

    Returns:
        A list of page links (possibly empty). An empty list is also returned,
        after printing a message, when the heading cannot be found or parsed.
    """
    # presumably the site lists 20 products per page -- verify against the site
    products_per_page = 20

    driver.get(sub_category_link)
    try:
        heading = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, "h2"))
        )
        # The heading can be visible before the count has been filled in, so
        # wait until the "(" that precedes the count shows up in its text.
        WebDriverWait(driver, 10).until(lambda _driver: "(" in heading.text)
        text = heading.text

        # Extract the number between the parentheses.
        number = int(text.split("(")[1].replace(")", ""))
        n_pages = math.ceil(number / products_per_page)

        # When the link ends in "1", "...1?page=N" collapses to "...N";
        # otherwise the "?page=N" suffix is kept unchanged.
        return [
            f"{sub_category_link}?page={i}".replace('1?page=', '')
            for i in range(1, n_pages + 1)
        ]
    except Exception:
        # Catch Exception rather than using a bare except so that Ctrl-C and
        # SystemExit still stop the scraper.
        print(f"No h2 element found in {sub_category_link}")
        return []

def get_shadow_root(driver, element):
    """Return the ``shadowRoot`` of ``element`` by evaluating a script via ``driver``.

    Args:
        driver: Object exposing ``execute_script(script, *args)``.
        element: Element passed to the script as ``arguments[0]``.

    Returns:
        Whatever ``driver.execute_script`` returns for the shadow-root lookup.
    """
    script = 'return arguments[0].shadowRoot'
    shadow_root = driver.execute_script(script, element)
    return shadow_root

def _extract_product(card):
    """Build the product record (id, name, price, link) for one product card."""
    link_element = card.find_element(By.CSS_SELECTOR, "a[href^='/product/']")
    product_link = link_element.get_attribute('href')

    # The product code is whatever follows the last hyphen of the link.
    product_code = product_link.split('-')[-1]

    # The name is derived from the last path segment of the link: hyphens
    # become spaces, words are title-cased and the product code is removed.
    name = product_link.split('/')[-1].replace('-', ' ').title()
    name = name.replace(product_code, '').strip()

    try:
        # The wait is scoped to the card so the XPath is resolved relative to it.
        price_element = WebDriverWait(card, 10).until(
            EC.presence_of_element_located((By.XPATH, ".//div[contains(@class, 'text-') and contains(@class, '18px')]//span[contains(@class, 'font-bold') and contains(@class, 'capsize')]"))
        )
        price = price_element.text.strip()
    except Exception as e:
        print(f"An error occurred while trying to find the price element for {name}: {e}")
        price = "N/A"  # Use a default value for the price if it cannot be found

    return {
        "id": product_code,
        "name": name,
        "price": price,
        "product_link": product_link
    }


def get_products_on_page(driver, page_link, csv_path):
    """Scrape every product card on a listing page and append each to a CSV.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        page_link: URL of the listing page to scrape.
        csv_path: CSV file each product is appended to via
            ``write_product_to_csv``.

    Returns:
        A list of the product dicts (keys ``id``, ``name``, ``price``,
        ``product_link``) that were written. Cards that fail to parse are
        reported on stdout and skipped.

    Raises:
        selenium.common.exceptions.TimeoutException: If no product card is
            present within 10 seconds.
    """
    # Imported locally so this function brings its own dependency into scope.
    from selenium.common.exceptions import WebDriverException

    card_selector = "[data-product-card='true']"

    driver.get(page_link)

    # Dismissing the modal is best-effort: if it never shows up or cannot be
    # clicked, keep going instead of aborting the whole scrape.
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'button[data-rds-modal-close]'))
        ).click()
    except WebDriverException as e:
        print(f"Could not close the modal on {page_link}: {e}")

    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_selector))
    )
    # find_elements(By..., ...) replaces the find_elements_by_* helpers, which
    # were removed from newer Selenium releases.
    cards = driver.find_elements(By.CSS_SELECTOR, card_selector)

    products = []
    for card in cards:
        try:
            record = _extract_product(card)
            write_product_to_csv(record, csv_path)
            # Record the product so the returned list reflects what was written.
            products.append(record)
        except Exception as e:
            print(f"An error occurred while processing a product card: {e}")

    return products
    

def main():
    """Scrape every not-yet-processed category into per-subcategory CSV files.

    Starts a Chrome session with a random user agent, reads the category list
    from the shop's home page, skips the categories named in ``already_done``
    and, for each remaining category, writes one CSV per subcategory under
    ``<cwd>/agi2/<category>/``. The browser is always shut down at the end,
    even when scraping fails part-way.
    """
    url = "https://www.igashop.com.au/"
    options = Options()
    options.add_argument("--enable-javascript")
    options.add_argument("--enable-cookies")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--no-sandbox")

    # Use a randomly chosen user-agent string for this session.
    options.add_argument(f"user-agent={UserAgent().random}")

    driver = configure_driver(options)
    try:
        print("Here we go...")
        all_categories = get_categories(driver, url)

        # Category slugs that must not be scraped again.
        already_done = {
            'fruit-and-vegetable', 'pantry', 'meat-seafood-and-deli',
            'frozen', 'bakery', 'drinks', 'baby', 'dairy-eggs-and-fridge',
            'health-and-beauty', 'household', 'pet', 'other',
        }
        categories = [i for i in all_categories if i.split('/')[-1] not in already_done]
        len_categories = len(categories)

        for x, category in enumerate(categories, start=1):
            path_category = category.split('/')[-1]
            agi_category = os.path.join(os.getcwd(), 'agi2', path_category)
            print(f"Category: {path_category} {x}/{len_categories}")

            os.makedirs(agi_category, exist_ok=True)

            sub_categories = get_subcategories(driver, category)
            len_sub_categories = len(sub_categories)
            for i, sub_category_link in enumerate(sub_categories, start=1):
                # The subcategory name is the second-to-last path segment.
                subcat = sub_category_link.split('/')[-2]
                print(f"    {i}/{len_sub_categories}: {subcat}")

                csv_path = os.path.join(agi_category, f"{subcat}.csv")

                links = get_n_pages_from_subcategory(driver, sub_category_link)
                len_links = len(links)
                for y, link in enumerate(links, start=1):
                    print(f"        {y}/{len_links}: {link}")
                    get_products_on_page(driver, link, csv_path)
    finally:
        # Always release the browser process, including on errors or Ctrl-C.
        driver.quit()

# Start the scraper only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()