In [1]:
import logging
from selenium.common.exceptions import NoSuchElementException
import sqlite3
# Send log records both to the notebook output and to a log file on disk.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[
        logging.StreamHandler(),  # default stream (stderr), shown in the notebook cell output
        # Explicit UTF-8: messages logged from this notebook contain non-ASCII
        # (Turkish) characters, which the platform-default file encoding may
        # not be able to represent.
        logging.FileHandler("logfile.log", encoding="utf-8"),
    ],
)
# Named logger used by the helper functions defined later in this notebook.
logger = logging.getLogger("web-scraper")

In [2]:
from app.tasks.actions import WebScraper
from selenium.webdriver.common.by import By
import time
# Shared scraper instance used by every cell below.
# NOTE(review): presumably this starts the browser session — confirm in app.tasks.actions.
scraper = WebScraper()

In [3]:
# Placeholder values.
# NOTE(review): `url` is reassigned in the next cell before it is used, and
# `elements` is not referenced by any visible cell — confirm whether this cell is still needed.
url, elements = "http://example.com", "text"

In [4]:
# Amazon UK Best Sellers page; this replaces the placeholder `url` assigned in the previous cell.
url = "https://www.amazon.co.uk/Best-Sellers/zgbs/ref=zg_bs_unv_drugstore_0_2826701031_4"
# Element that is checked for after the URL has been opened.
locator = (By.XPATH, '//div[@id="zg_left_colmask"]')
# `definition` is the label that appears in the log as "Operation: ...".
# NOTE(review): wait_time=3 is presumably in seconds — confirm against WebScraper.open_page.
scraper.open_page(url,locator,wait_time=3,definition="openning amazon uk bestseller")

2024-12-30 00:42:37,183 - web-scraper - INFO - Operation: openning amazon uk bestseller
2024-12-30 00:42:37,184 - web-scraper - INFO - Opening URL: https://www.amazon.co.uk/Best-Sellers/zgbs/ref=zg_bs_unv_drugstore_0_2826701031_4
2024-12-30 00:42:42,275 - web-scraper - INFO - Operation: Check element after opening URL: https://www.amazon.co.uk/Best-Sellers/zgbs/ref=zg_bs_unv_drugstore_0_2826701031_4
2024-12-30 00:42:42,276 - web-scraper - INFO - Checking for element with xpath: //div[@id="zg_left_colmask"]
2024-12-30 00:42:42,295 - web-scraper - INFO - Element found and visible
2024-12-30 00:42:42,296 - web-scraper - INFO - Page loaded successfully and element is present


In [5]:

# Click the location slot in the navigation bar (the following cells then fill in a postcode).
scraper.click((By.XPATH,'//*[@id="nav-global-location-slot"]'),definition="clicking locaiton box")

2024-12-30 00:42:42,299 - web-scraper - INFO - Operation: clicking locaiton box
2024-12-30 00:42:42,300 - web-scraper - INFO - Waiting to click element with xpath: //*[@id="nav-global-location-slot"]
2024-12-30 00:42:42,361 - web-scraper - INFO - Click action performed successfully


In [6]:
# Click the postcode input field (id GLUXZipUpdateInput) before typing into it.
scraper.click((By.XPATH,'//*[@id="GLUXZipUpdateInput"]'),"clicking location input bar")

2024-12-30 00:42:42,365 - web-scraper - INFO - Operation: clicking location input bar
2024-12-30 00:42:42,365 - web-scraper - INFO - Waiting to click element with xpath: //*[@id="GLUXZipUpdateInput"]
2024-12-30 00:42:42,926 - web-scraper - INFO - Click action performed successfully


In [7]:
# Type the postcode into the same input field.
# NOTE(review): the log label says "clicking" although this step enters text — consider renaming it.
scraper.input_text((By.XPATH,'//*[@id="GLUXZipUpdateInput"]'),"WC2H 9jq","clicking location input update")

2024-12-30 00:42:42,929 - web-scraper - INFO - Operation: clicking location input update
2024-12-30 00:42:42,930 - web-scraper - INFO - Preparing to input text into element with xpath: //*[@id="GLUXZipUpdateInput"]
2024-12-30 00:42:42,985 - web-scraper - INFO - Successfully input text: 'WC2H 9jq' into element with xpath: //*[@id="GLUXZipUpdateInput"]


In [8]:
# Click the second <div> inside GLUXZipInputSection, labelled here as the accept-location button.
scraper.click((By.XPATH,'//*[@id="GLUXZipInputSection"]/div[2]'),"clicking accept location button")





2024-12-30 00:42:42,988 - web-scraper - INFO - Operation: clicking accept location button
2024-12-30 00:42:42,989 - web-scraper - INFO - Waiting to click element with xpath: //*[@id="GLUXZipInputSection"]/div[2]
2024-12-30 00:42:43,045 - web-scraper - INFO - Click action performed successfully


In [9]:
# Click a <span> inside the popover with id a-popover-1.
# NOTE(review): the purpose is unclear from the label ("dont know now") — presumably it
# confirms or dismisses the location popover; verify and give the step a descriptive label.
scraper.click((By.XPATH,'//*[@id="a-popover-1"]/div/div[2]/span'),"dont know now")


2024-12-30 00:42:43,048 - web-scraper - INFO - Operation: dont know now
2024-12-30 00:42:43,049 - web-scraper - INFO - Waiting to click element with xpath: //*[@id="a-popover-1"]/div/div[2]/span
2024-12-30 00:42:43,614 - web-scraper - INFO - Click action performed successfully


In [10]:
# Accept the cookie banner.
# NOTE(review): the trailing positional arguments (1, True) are not self-explanatory. The log
# output suggests repeated click attempts and a check that the element disappears afterwards —
# confirm against WebScraper.click and consider passing them by keyword.
scraper.click((By.XPATH,'//*[@id="a-autoid-0"]'),"accept cookie",1,True)

2024-12-30 00:42:43,617 - web-scraper - INFO - Operation: accept cookie
2024-12-30 00:42:43,618 - web-scraper - INFO - Waiting to click element with xpath: //*[@id="a-autoid-0"]
2024-12-30 00:42:43,914 - web-scraper - INFO - Click attempt 1 performed successfully
2024-12-30 00:42:44,978 - web-scraper - INFO - Click attempt 2 performed successfully
2024-12-30 00:42:44,992 - web-scraper - INFO - Element disappeared after click


In [11]:
def is_leaf(driver, xpaths=None):
    """
    Check whether any of the given XPaths matches an element on the current page.

    :param driver: Selenium WebDriver instance (any object exposing ``find_element``).
    :param xpaths: Optional iterable of XPath strings to probe, in order. When omitted,
        a single default XPath is used: a ``div`` with ``role='group'`` that is a
        following sibling of a navigation-tree browse-item ``div`` containing a link.
    :return: True as soon as one XPath matches an element, otherwise False.
    """
    if xpaths is None:
        # NOTE(review): this XPath depends on generated-looking CSS class names, which
        # may change when the site is redeployed — verify it still matches.
        xpaths = [
            "//div[@class='_p13n-zg-nav-tree-all_style_zg-browse-item__1rdKf _p13n-zg-nav-tree-all_style_zg-browse-height-large__1z5B8' and .//a]/following-sibling::div[@role='group']",
        ]

    for xpath in xpaths:
        try:
            # find_element raises NoSuchElementException when nothing matches,
            # so reaching the return below means an element was found.
            driver.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            continue
        return True

    return False

In [12]:
def collect_function(path, level):
    """
    Save a collected menu path and its level to the SQLite database.

    The path segments are joined with " > " and inserted as one row into the
    ``menu_paths`` table of ``menu_data.db``. This function does not create the
    table, so it must already exist.

    :param path: Iterable of strings forming the menu path.
    :param level: Level value stored alongside the path.
    :return: None. Any error is logged through ``logger`` and not re-raised.
    """
    try:
        conn = sqlite3.connect('menu_data.db')
        try:
            path_str = " > ".join(path)
            cursor = conn.cursor()
            cursor.execute(
                "INSERT INTO menu_paths (path, level) VALUES (?, ?)",
                (path_str, level),
            )
            conn.commit()
        finally:
            # Always release the connection, even when the join/insert/commit
            # above fails; otherwise a failed write would leave it open.
            conn.close()
        logger.info(f"Veri kaydedildi: {path_str} (Seviye {level})")
    except Exception as e:
        # Best-effort persistence: report the failure and let the caller carry on.
        logger.error(f"Veri kaydedilirken hata oluştu: {e}")

In [13]:
# Run the menu scrape. Arguments, in order: locator for the tree-item links, locator for
# the tree-item containers, the is_leaf callback, and the integer 1.
# NOTE(review): the meaning of the final `1` is not visible here — confirm against
# WebScraper.scrape_and_save_menu. collect_function (defined above) is not passed in this call.
scraper.scrape_and_save_menu((By.XPATH,"//div[@role='group']//div[@role='treeitem']//a"),(By.XPATH,"//div[@role='group']//div[@role='treeitem']"),is_leaf,1)

2024-12-30 00:42:45,006 - root - INFO - İşleniyor: Root (Seviye: 0)
2024-12-30 00:42:47,329 - root - INFO - İşleniyor: Amazon Devices & Accessories (Seviye: 1)
2024-12-30 00:42:48,366 - root - INFO - İşleniyor: Amazon Renewed (Seviye: 1)
2024-12-30 00:42:49,652 - root - INFO - İşleniyor: Apps & Games (Seviye: 1)
2024-12-30 00:42:51,213 - root - INFO - İşleniyor: Automotive (Seviye: 1)
2024-12-30 00:42:52,604 - root - INFO - İşleniyor: Baby Products (Seviye: 1)
2024-12-30 00:42:54,122 - root - INFO - İşleniyor: Beauty (Seviye: 1)
2024-12-30 00:42:55,495 - root - INFO - İşleniyor: Books (Seviye: 1)
2024-12-30 00:42:57,169 - root - INFO - İşleniyor: Business, Industry & Science (Seviye: 1)
2024-12-30 00:42:58,979 - root - INFO - İşleniyor: CDs & Vinyl (Seviye: 1)
2024-12-30 00:43:00,632 - root - INFO - Progress saved to scraping_progress/20241230_004245_visited.json and scraping_progress/20241230_004245_located.json
2024-12-30 00:43:00,632 - root - INFO - İşleniyor: Climate Pledge Friendl

In [14]:
# Quit the browser held by `scraper` once scraping is finished.
scraper.quit()

2024-12-30 12:39:53,792 - web-scraper - INFO - Operation: Quit the browser
2024-12-30 12:39:53,792 - web-scraper - INFO - Quitting the browser
