In [68]:
import glob
import os
import re
from typing import Union, List, Dict
from urllib.parse import unquote, quote_plus

import pandas as pd
# !pip install webdriver_manager

In [69]:
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [72]:
# Search keywords, one per product category. Scrapper uses this list as its
# default `categories`, so each entry becomes one Amazon search and one
# per-category CSV file name (spaces replaced with underscores).
electronic_household_items = [
    "television", "iron", "vacuum cleaner", "blender", "microwave", "toaster", "coffee maker",
    "dishwasher", "refrigerator", "oven", "washer", "dryer", "air conditioner", "heater", "fan",
    "hair dryer", "electric kettle", "rice cooker", "slow cooker", "pressure cooker", "food processor",
    "mixer", "juicer", "grill", "waffle maker", "sandwich maker", "popcorn maker", "ice cream maker",
    "electric griddle", "hot plate", "water purifier", "humidifier", "dehumidifier", "air purifier",
    "ceiling fan", "space heater", "robot vacuum", "cordless drill", "security camera", "smart doorbell",
    "smart lock", "thermostat", "smart light switch", "smart plug", "smart speaker", "sound bar",
    "home theater system", "streaming device", "gaming console", "router", "smart TV", "wireless charger",
    "Bluetooth speaker", "tablet", "e-reader", "smartphone", "smartwatch", "fitness tracker", "laptop",
    "desktop computer", "monitor", "keyboard", "mouse", "external hard drive", "USB flash drive",
    "headphones", "earbuds", "webcam", "digital camera", "camcorder", "drone", "smart light bulb",
    "electric toothbrush", "electric shaver", "hair straightener", "hair curler", "foot massager",
    "electric blanket", "electric skillet", "bread maker", "sous vide cooker", "food dehydrator",
    "electric wine opener", "electric can opener", "electric knife", "electric pressure washer",
    "smart refrigerator", "smart oven", "smart microwave", "robot lawn mower", "video doorbell",
    "smart garage door opener", "smart blinds", "smart irrigation system", "smart smoke detector",
    "smart carbon monoxide detector", "smart thermostat", "smart security system", "home automation hub",
    "smart scale", "robot mop", "smart ceiling fan", "smart alarm clock"
]

In [73]:
class Product:
    """Record of one scraped product together with the reviews attached to it."""

    def __init__(self, product_id, name, category, price, description, about, brand, rating, image_url):
        """Store the scraped fields as given and start with no reviews."""
        # Paired assignments run left to right, so attributes are created in
        # the order: product_id, name, price, category, description, about,
        # brand, rating, image_url, reviews.
        self.product_id, self.name = product_id, name
        self.price, self.category = price, category
        self.description, self.about = description, about
        self.brand, self.rating = brand, rating
        self.image_url = image_url
        # Filled later through add_review().
        self.reviews = []

    def add_review(self, review):
        """Attach one more review to this product, keeping insertion order."""
        collected = self.reviews
        collected.append(review)


In [74]:
class Review:
    """Holder for the body text of a single customer review."""

    def __init__(self, text: str):
        """Keep the review body exactly as it was passed in."""
        self.text: str = text

In [75]:
class ReviewScrapper:
    """Scrape customer review texts for one product page with headless Chrome.

    The browser is started in ``__init__`` and shut down at the end of
    ``fetch_reviews``, so each instance serves a single ``fetch_reviews`` call.
    """

    def __init__(self):
        """Start a headless Chrome session used by ``fetch_reviews``."""
        options = Options()
        # The ``Options.headless`` setter was deprecated in Selenium 4.8 and
        # dropped in later releases; passing the Chrome flag directly keeps the
        # browser headless regardless of the installed Selenium 4 version.
        options.add_argument("--headless=new")
        self.driver = webdriver.Chrome(options=options)

    def fetch_reviews(self, product_link, max_pages=2):
        """Collect reviews from the product's review listing.

        Opens ``product_link``, follows the "See more reviews" link, reads the
        first listing page and then follows the "next page" link until
        ``max_pages`` pages have been read or no next link is present.

        Args:
            product_link: URL of the product page.
            max_pages: Upper bound on listing pages to read. The first page is
                always read; the default of 2 matches the previous behaviour.

        Returns:
            A list of ``Review`` objects; whatever was collected before an
            error occurred (possibly empty). Errors are printed, not raised.
        """
        reviews = []

        try:
            # Navigation is inside the try block so that a failed page load
            # still reaches the ``finally`` clause and releases the browser.
            self.driver.get(product_link)
            time.sleep(5)

            self.driver.find_element(By.PARTIAL_LINK_TEXT, "See more reviews").click()
            time.sleep(5)

            # First page of the review listing.
            reviews.extend(self.parse_review_page(self.driver.page_source))

            # Remaining pages, one click on the pagination link per page.
            for _ in range(max_pages - 1):
                next_page = self.driver.find_elements(By.CSS_SELECTOR, "li.a-last a")
                if not next_page:
                    break
                next_page[0].click()
                time.sleep(5)  # Wait for the next page to load
                reviews.extend(self.parse_review_page(self.driver.page_source))

        except Exception as e:
            # Best effort: a product without the expected links simply yields
            # the reviews gathered so far.
            print(f"Error fetching reviews: {e}")
        finally:
            self.cleanup()
        return reviews

    def parse_review_page(self, html_content):
        """Return a ``Review`` for every ``span[data-hook=review-body]`` in the HTML."""
        soup = BeautifulSoup(html_content, 'html.parser')
        review_texts = [tag.get_text(strip=True) for tag in soup.find_all('span', {'data-hook': 'review-body'})]
        return [Review(text) for text in review_texts]

    def cleanup(self):
        """Quit the browser session."""
        self.driver.quit()


In [76]:
class Scrapper:
    """Collect Amazon.ca product details and reviews for a list of keywords.

    For every entry in ``self.categories`` the scraper runs a search, extracts
    details for the selected result links, fetches reviews through
    ``ReviewScrapper`` and writes one CSV per category. ``merge_csv`` then
    concatenates the per-category files into a single dataset.
    """

    # Seconds after which an HTTP request is abandoned, so that a stalled
    # connection cannot hang the whole run.
    REQUEST_TIMEOUT: int = 30

    def __init__(self):
        """Set default scraping parameters and start a headless Chrome session."""
        self.keyword: Union[str, None] = None
        # Search results are sliced as [product_start:product_count], i.e.
        # product_count is an exclusive end index rather than a length.
        self.product_start: int = 10
        self.product_count: int = 11
        self.base_url: str = "https://www.amazon.ca"
        self.products: List[Product] = []
        self.skipped_categories: List[str] = []
        # Number of attempts made by search_products before giving up.
        self.max_retries: int = 1
        # Assign a shorter list here to scrape only a subset of categories.
        self.categories: List[str] = electronic_household_items

        self.headers: Dict[str, str] = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            'Accept-Language': 'en-US, en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Referer': 'https://www.google.com/',
        }

        # Initialize Selenium WebDriver. Within this class the driver is only
        # touched by cleanup(); review scraping uses ReviewScrapper's own driver.
        options = Options()
        # The ``Options.headless`` setter was deprecated in Selenium 4.8 and
        # dropped in later releases; the Chrome flag works in either case.
        options.add_argument("--headless=new")
        self.driver = webdriver.Chrome(options=options)

    def set_keyword(self, keyword: str):
        """Set the search keyword, which is also stored as the product category."""
        self.keyword = keyword

    def search_products(self):
        """Search for ``self.keyword`` and return the selected product links.

        Makes up to ``self.max_retries`` attempts. When every attempt fails the
        keyword is appended to ``self.skipped_categories``.

        Returns:
            The slice ``[product_start:product_count]`` of the result links
            found on the search page, or an empty list on failure.
        """
        search_url = f"{self.base_url}/s?k={quote_plus(self.keyword)}"

        for attempt in range(self.max_retries):
            try:
                response = requests.get(search_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                print(search_url, response.status_code)

                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    product_links = [
                        self.base_url + tag['href']
                        for tag in soup.find_all('a', {'class': 'a-link-normal s-no-outline'}, href=True)
                    ]

                    print(f"Found {len(product_links)} products")
                    return product_links[self.product_start:self.product_count]

                print(f"Failed to fetch the search results. Status code: {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"Exception Error fetching search results: {e}")
            except Exception as e:
                print(f"Error fetching search results: {e}")

            # Wait before retrying, but not after the final attempt.
            if attempt < self.max_retries - 1:
                time.sleep(5)

        # All attempts failed: remember the keyword so run() can report it.
        self.skipped_categories.append(self.keyword)
        return []

    @staticmethod
    def _extract_asin(product_link: str):
        """Return the 10-character ASIN that follows ``/dp/`` in the link, or None."""
        # Decode first so percent-encoded redirect targets (``%2Fdp%2F...``)
        # are matched as well.
        decoded_url = unquote(product_link)
        # The ASIN may be followed by a path, a query string, a fragment, or
        # nothing at all.
        asin_match = re.search(r'/dp/([A-Z0-9]{10})(?=[/?#&]|$)', decoded_url)
        return asin_match.group(1) if asin_match else None

    def extract_product_details(self, product_link: str):
        """Download a product page and build a ``Product`` from it.

        The product title is required; every other field is optional and set
        to ``None`` when its element is absent from the page.

        Args:
            product_link: URL of the product page.

        Returns:
            A ``Product`` (with ``category`` set to ``self.keyword``), or
            ``None`` when the page cannot be fetched or has no title element.
        """
        try:
            response = requests.get(product_link, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            # One unreachable product must not abort the whole run.
            print(f"Failed to fetch the product page: {e}")
            return None

        if response.status_code != 200:
            print("Failed to fetch the product page.")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')

        title_tag = soup.find(id="productTitle")
        if title_tag is None:
            print("Error parsing product details.")
            return None
        name = title_tag.get_text(strip=True) or None

        # Brand: prefer the product overview table, fall back to the byline.
        brand = None
        brand_row = soup.select_one('tr.po-brand')
        if brand_row is not None:
            brand = brand_row.get_text(strip=True).replace('Brand', '')
        if brand is None:
            byline = soup.select_one('a#bylineInfo')
            if byline is not None:
                brand = byline.get_text(strip=True).replace('Brand:', '').strip()

        # Price: first off-screen price text, else whole + fraction parts.
        price_tag = soup.select_one('span.a-offscreen')
        price = price_tag.text.replace('$', '') if price_tag is not None else None
        if not price:
            whole_tag = soup.select_one('span.a-price-whole')
            fraction_tag = soup.select_one('span.a-price-fraction')
            if whole_tag is not None and fraction_tag is not None:
                # Drop a trailing separator in the whole part so the result
                # does not contain a doubled decimal point.
                price_whole = whole_tag.get_text(strip=True).rstrip('.')
                price_decimal = fraction_tag.get_text(strip=True)
                price = f'{price_whole}.{price_decimal}'
            else:
                price = None

        about_tag = soup.find(id="feature-bullets")
        about = (about_tag.get_text(strip=True) or None) if about_tag is not None else None

        description_tag = soup.select_one('#productDescription')
        description = description_tag.text.strip() if description_tag is not None else None

        image_tag = soup.find("img", {"id": "landingImage"})
        image_url = image_tag.get('src') if image_tag is not None else None

        # Rating: the popover title reads like "<value> out of 5 stars".
        rating_tag = soup.select_one('#acrPopover')
        rating_title = rating_tag.attrs.get('title') if rating_tag is not None else None
        rating = rating_title.replace('out of 5 stars', '').strip() if rating_title else None

        return Product(
            product_id=self._extract_asin(product_link),
            name=name,
            category=self.keyword,
            price=price,
            description=description,
            about=about,
            brand=brand,
            rating=rating,
            image_url=image_url
        )

    def aggregate_data(self):
        """Search the current keyword and collect products with their reviews.

        Each successfully parsed product is appended to ``self.products`` and
        receives the reviews returned by a fresh ``ReviewScrapper``.
        """
        for link in self.search_products():
            product = self.extract_product_details(link)
            if product is None:
                continue

            self.products.append(product)
            # A new ReviewScrapper per product: fetch_reviews quits its
            # browser when it finishes, so instances are single-use.
            for review in ReviewScrapper().fetch_reviews(link):
                product.add_review(review)

    def save_to_csv(self, file_name: str):
        """Write the collected products to ``file_name``, one row per review.

        Products without reviews contribute no rows. Does nothing when no
        products are collected. The target directory is created if needed and
        ``self.products`` is emptied after a successful write.
        """
        columns = ['Id', 'Name', 'Category', 'Price', 'Description', 'About', 'Rating', 'ImageUrl', 'Brand', 'Review']

        if len(self.products) == 0:
            return

        data = [
            {
                'Id': product.product_id,
                'Name': product.name,
                'Category': product.category,
                'Price': product.price,
                'Description': product.description,
                'About': product.about,
                'Rating': product.rating,
                'ImageUrl': product.image_url,
                'Brand': product.brand,
                'Review': review.text,
            }
            for product in self.products
            for review in product.reviews
        ]

        # to_csv does not create missing directories, so make sure the
        # destination exists (a bare file name has no parent to create).
        parent_dir = os.path.dirname(file_name)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        df = pd.DataFrame(data, columns=columns)
        df.to_csv(file_name, index=False)

        # reset the current data
        self.products = []

        print(f"Data saved to {file_name}")

    def merge_csv(self):
        """Concatenate every CSV in ``./data`` into one combined CSV file.

        Does nothing when ``./data`` contains no CSV files.
        """
        # Sorted so the merged row order does not depend on filesystem order.
        csv_files = sorted(glob.glob('./data/*.csv'))

        if len(csv_files) == 0:
            return

        dataset = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)
        dataset.to_csv('amazon_electronics_products_reviews.csv', index=False)

        print("Data saved and merged.")

    def cleanup(self):
        """Quit the browser session started in ``__init__``."""
        self.driver.quit()

    def run(self):
        """Scrape every category, save one CSV per category, then merge them."""
        try:
            for category in self.categories:
                name = category.replace(' ', '_')
                file_name = f'./data/amazon_{name}_products_reviews.csv'
                self.set_keyword(category)
                self.aggregate_data()
                self.save_to_csv(file_name)

                # Pause between categories.
                time.sleep(5)
        finally:
            # Release the browser even if a category raised.
            self.cleanup()

        print(f'Skipped categories are: {self.skipped_categories}')
        self.merge_csv()


# Entry point: build the scraper (this starts a Chrome session) and process
# every category; per-category CSVs are written under ./data and then merged.
if __name__ == "__main__":
    scraper = Scrapper()
    scraper.run()


https://www.amazon.ca/s?k=tablet 200
Found 69 products
Data saved to ./data/amazon_tablet_products_reviews.csv
Skipped categories are: []
