In [1]:
"""
This module sets up the necessary imports and configurations for web scraping using Selenium.
It includes functions for configuring the Edge WebDriver, setting up logging, and utility functions
for processing product data.
"""
import csv
import os

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import(
     TimeoutException,
     StaleElementReferenceException,
      NoSuchElementException
)
from typing import List, Tuple
from time import sleep
from drivers_setup.edge_driver_setup import configure_edge_webdriver
from drivers_setup.logging_setup import (
    setup_logging,
    LogLevel
)
from utils.utility_functions import UtilityFunctions
from utils.urls import *



  """


In [3]:
"""
This cell initializes the UtilityFunctions class instance.
"""
utils = UtilityFunctions()

In [None]:
"""
This module contains the
DadduChargerScraper class which is used to scrape product
data from the DadduCharger website.
"""
class DadduChargerScraper:
    def __init__(self, driver_path: str, log_level=LogLevel.INFO):
        self.driver = configure_edge_webdriver(driver_path)
        self.logger = setup_logging(log_level)

    def get_total_pages(self) -> int:
        """Extract the total number of pages from the pagination element."""
        try:
            pagination = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'page-numbers'))
            )
            page_numbers = pagination.find_elements(By.CLASS_NAME, 'page-numbers')

            last_page = max(
                int(page.text) for page in page_numbers if page.text.isdigit()
            )

            return last_page

        except TimeoutException:
            self.logger.error("Pagination not found.")
            return 1

    def scrape_category(self, url: str) -> List[Tuple[str, str, str, str, str, str, str]]:
        """Scrapes all pages of a category by dynamically counting total pages."""
        self.driver.get(url)
        total_pages = self.get_total_pages()
        wait = WebDriverWait(self.driver, 10)
        products = []

        for page_num in range(1, total_pages + 1):
            paginated_url = f"{url}/?product-page={page_num}"
            self.driver.get(paginated_url)

            try:
                product_list = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'products')))
                product_items = product_list.find_elements(By.CLASS_NAME, 'product')
                for item in product_items:
                    try:
                        title_element = item.find_element(By.CLASS_NAME, 'woocommerce-loop-product__title')
                        title = title_element.text

                        price_element = item.find_elements(By.CLASS_NAME, 'price')
                        price_text = price_element[0].text if price_element else 'N/A'
                        price = utils.extract_price(price_text)

                        product_url = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
                        image_url = item.find_element(By.TAG_NAME, 'img').get_attribute('src')

                        vendor = title.split()[0] if title else "Unknown"
                        condition = "Used" if "Used" in title else "New"
                        category = utils.categorize_product(title)

                        #availability = self.scrape_availability(product_url)

                        products.append((utils.clean_title(title), price, category, vendor, condition, "DadduCharger", product_url, image_url))

                    except StaleElementReferenceException:
                        self.logger.error("Stale element reference error while scraping a product.")

            except TimeoutException:
                self.logger.error(f"Timeout while loading page {page_num} for URL: {url}")
                break

        return products


    def scrape_all_categories(self) -> List[Tuple[str, str, str, str, str, str, str]]:
        """Scrapes all predefined category URLs."""
        all_products = []
        for url in dadducharger_urls:
            self.logger.info(f"Scraping category: {url}")
            products = self.scrape_category(url)
            all_products.extend(products)
        return all_products


    def run(self):
        """Runs the scraping process."""
        try:
            data = self.scrape_all_categories()
        finally:
            self.driver.quit()

        utils.save_to_csv(data, folder="output_data", filename="dadducharger_products.csv")
        self.logger.info("Data saved to dadducharger_products.csv")


if __name__ == "__main__":
    scraper = DadduChargerScraper(WEB_DRIVER_EXECUTABLE_PATH)
    scraper.run()


In [None]:
"""
TechMatchedScraper class is used to scrape product data from the TechMatched website.

Methods:
    get_last_page_number: Extracts the last page number from the pagination element.
    scrape_category: Scrapes all products from a given category URL.
    scrape_all_categories: Scrapes all predefined category URLs.
    save_to_csv: Saves scraped data to a CSV file.
    run: Runs the scraping process.
"""
class TechMatchedScraper:
    def __init__(self, driver_path: str, log_level=LogLevel.INFO):
        self.driver = configure_edge_webdriver(driver_path)
        self.wait = WebDriverWait(self.driver, 5)
        self.logger = setup_logging(log_level)



    def scrape_category(self, url: str) -> List[Tuple[str, str, str, str, str, str, str]]:
        products = []
        page = 1

        while True:
            full_url = f"{url}page/{page}/"
            self.logger.info(f"Scraping page {page} of {url}...")
            self.driver.get(full_url)

            try:
                product_list = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'products')))
                product_items = product_list.find_elements(By.CLASS_NAME, 'product')

                if not product_items:
                    break

                for item in product_items:
                    title_element = item.find_element(By.CLASS_NAME, 'woocommerce-loop-product__title')
                    title = title_element.text

                    price_element = item.find_elements(By.CLASS_NAME, 'price')
                    price_text = price_element[0].text if price_element else 'N/A'
                    price = utils.extract_price(price_text)

                    product_url = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    image_url = item.find_element(By.TAG_NAME, 'img').get_attribute('src')

                    vendor = title.split()[0] if title else "Unknown"
                    condition = "Used" if "Used" in title else "New"

                    category = utils.categorize_product(title)

                    website = "TechmMatched"
                    products.append((utils.clean_title(title), price, category, vendor, condition,website, product_url, image_url))

                last_page = self.get_last_page_number()
                if page >= last_page:
                    break

                page += 1

            except TimeoutException:
                self.logger.error(f"Timeout while loading page {page}.")
                break
            except StaleElementReferenceException:
                self.logger.error("Stale element reference error.")
                break
            except Exception as e:
                self.logger.error(f"Unexpected error: {e}")
                break

        return products

    def scrape_all_categories(self) -> List[Tuple[str, str, str, str, str, str, str]]:
        all_products = []
        for url in techmatched_urls:
            all_products.extend(self.scrape_category(url))
        return all_products


    def run(self):
        """Runs the scraping process."""
        try:
            data = self.scrape_all_categories()
        finally:
            self.driver.quit()

        utils.save_to_csv(data,folder="output_data", filename="techmatched_products.csv")
        self.logger.info("Data saved to techmatched_products.csv")


if __name__ == "__main__":
    driver = configure_edge_webdriver(WEB_DRIVER_EXECUTABLE_PATH)
    scraper = TechMatchedScraper(WEB_DRIVER_EXECUTABLE_PATH)
    scraper.run()

In [None]:
"""
JunaidTechScraper class is used to scrape product data from the JunaidTech website.

Methods:
    get_total_pages: Extracts the total number of pages from the pagination element.
    scrape_category: Scrapes all products from a given category URL.
    scrape_all_categories: Scrapes all predefined category URLs.
    run: Runs the scraping process.
"""
class JunaidTechScraper:
    def __init__(self, driver_path: str, log_level=LogLevel.INFO):
        self.driver = configure_edge_webdriver(driver_path)
        self.wait = WebDriverWait(self.driver, 10)
        self.logger = setup_logging(log_level)

    def get_total_pages(self) -> int:
        """Extracts the total number of pages from the pagination element."""
        try:
            pagination = self.wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, 'pagination'))
            )
            page_numbers = pagination.find_elements(By.CLASS_NAME, 'PageNumber')
            last_page = max(int(page.text) for page in page_numbers if page.text.isdigit())
            return last_page
        except TimeoutException:
            self.logger.error("Pagination not found.")
            return 1

    def scrape_category(self, url: str) -> List[Tuple[str, str, str, str, str, str, str, str]]:
        """Scrapes all pages of a category by dynamically counting total pages."""
        self.driver.get(url)
        total_pages = self.get_total_pages()
        products = []

        for page_num in range(1, total_pages + 1):
            paginated_url = f"{url}?sort=2&page={page_num}"
            print(paginated_url)
            self.driver.get(paginated_url)
            try:
                self.wait.until(EC.presence_of_element_located((By.ID, 'divListView')))

                product_items = self.driver.find_elements(By.XPATH, "//div[contains(@class, 'item') and contains(@class, 'list-view')]")

                for item in product_items:
                    try:
                        title_element = item.find_element(By.XPATH, ".//h4[@name='list-productname']/a")
                        title = title_element.text.strip()
                        product_url = paginated_url + title_element.get_attribute("href")

                        image_element = item.find_element(By.XPATH, ".//div[@class='image']/a/img")
                        image_url = image_element.get_attribute("src")

                        try:
                            price_element = item.find_element(By.CLASS_NAME, 'price')
                            price = price_element.text.strip()
                        except NoSuchElementException:
                            price = "N/A"

                        vendor = title.split()[0] if title else "Unknown"
                        condition = "Used" if "Used" in title else "New"

                        category = UtilityFunctions.categorize_product(title)
                        website = "JunaidTech"

                        products.append((UtilityFunctions.clean_title(title), price, category, vendor, condition, website, product_url, image_url))

                    except Exception as e:
                        print(f"Error extracting product details: {e}")
            except TimeoutException:
                self.logger.error(f"Timeout while loading page {page_num} for URL: {url}")
                break

        return products

    def scrape_all_categories(self) -> List[Tuple[str, str, str, str, str, str, str, str]]:
        all_products = []
        for url in junaidtech_urls:
            all_products.extend(self.scrape_category(url))
        return all_products

    def run(self):
        """Runs the scraping process."""
        try:
            data = self.scrape_all_categories()
        finally:
            self.driver.quit()

        utils.save_to_csv(data, folder="output_data", filename="junaidtech_products.csv")
        self.logger.info("Data saved to junaidtech_products.csv")


if __name__ == "__main__":
    scraper = JunaidTechScraper(WEB_DRIVER_EXECUTABLE_PATH)
    scraper.run()


In [None]:
class PakLapScraper:
    def __init__(self, driver_path: str, log_level=LogLevel.INFO):
        self.driver = configure_edge_webdriver(driver_path)
        self.wait = WebDriverWait(self.driver, 10)
        self.logger = setup_logging(log_level)

    import re

    def get_total_pages(self) -> int:
        """Extracts the total number of pages by counting numeric page links."""
        try:
            pagination = self.wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, 'pages'))
            )
            page_items = pagination.find_elements(By.CSS_SELECTOR, '.pages-items li')

            page_texts = [item.text.strip() for item in page_items]
            print(f"Extracted Page Items: {page_texts}")

            page_numbers = [int(num) for text in page_texts for num in re.findall(r'\d+', text)]

            print(f"Extracted Page Numbers: {page_numbers}")

            return max(page_numbers) if page_numbers else 1
        except TimeoutException:
            print("Pagination not found. Assuming 1 page.")
            return 1

    def scrape_category(self, url: str) -> List[Tuple[str, str, str, str, str, str, str]]:
        """Scrapes all pages of a category dynamically."""
        self.driver.get(url)
        total_pages = self.get_total_pages()
        print(f"Total pages for {url}: {total_pages}")

        products = []
        for page_num in range(1, total_pages + 1):
            paginated_url = f"{url}&p={page_num}" if "?" in url else f"{url}?p={page_num}"
            print(f"Scraping: {paginated_url}")
            self.driver.get(paginated_url)

            try:
                product_list = self.wait.until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'products'))
                )
                product_items = product_list.find_elements(By.CLASS_NAME, 'product-item')

                for item in product_items:
                    try:
                        title_element = item.find_element(By.CLASS_NAME, 'product-item-link')
                        title = title_element.text.strip()

                        price_element = item.find_elements(By.CLASS_NAME, 'price')
                        price_text = price_element[0].text if price_element else 'N/A'
                        price = utils.extract_price(price_text)

                        product_url = title_element.get_attribute('href')
                        image_url = item.find_element(By.TAG_NAME, 'img').get_attribute('src')

                        vendor = title.split()[0] if title else "Unknown"
                        condition = "Used" if "Used" in title else "New"
                        category = utils.categorize_product(title)

                        products.append((utils.clean_title(title), price, category, vendor, condition, "PakLap", product_url, image_url))

                    except StaleElementReferenceException:
                        print("Stale element reference error while scraping a product.")

            except TimeoutException:
                print(f"Timeout while loading page {page_num} for URL: {url}")
                break

        return products


    def scrape_all_categories(self):
        all_products = []
        for url in paklap_urls:
            all_products.extend(self.scrape_category(url))
        return all_products

    def run(self):
        """Runs the scraping process."""
        try:
            data = self.scrape_all_categories()
        finally:
            self.driver.quit()

        utils.save_to_csv(data, folder="output_data", filename="paklap_products.csv")
        print("Data saved to paklap_products.csv")

if __name__ == "__main__":
    scraper = PakLapScraper(WEB_DRIVER_EXECUTABLE_PATH)
    scraper.run()

Extracted Page Items: ["You're currently reading page\n1", 'Page\n2', 'Page\n3', 'Page\n4', 'Page\n5', 'Page\nNext']
Extracted Page Numbers: []
Total pages for https://www.paklap.pk/laptops-prices.html: 1
Scraping: https://www.paklap.pk/laptops-prices.html?p=1
Pagination not found. Assuming 1 page.
Total pages for https://www.paklap.pk/accessories.html: 1
Scraping: https://www.paklap.pk/accessories.html?p=1
Pagination not found. Assuming 1 page.
Total pages for https://www.paklap.pk/used-laptop-price-pakistan.html: 1
Scraping: https://www.paklap.pk/used-laptop-price-pakistan.html?p=1
Extracted Page Items: ["You're currently reading page\n1", 'Page\n2', 'Page\n3', 'Page\nNext']
Extracted Page Numbers: []
Total pages for https://www.paklap.pk/apple-products.html: 1
Scraping: https://www.paklap.pk/apple-products.html?p=1
Data saved to output_data\paklap_products.csv
Data saved to paklap_products.csv
