In [1]:
link = 'https://www.amazon.com/Apple-2024-MacBook-13-inch-Laptop/product-reviews/B0CX3NVXV9/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews'

## Funtions

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains


In [3]:
def wait_for_page_load(driver, timeout=10):
    '''
    Wait for the page to load within the given timeout period.
    If the page does not load in the specified time, an exception is raised.
    '''
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
    except Exception as e:
        print(f"Error while waiting for page load: {e}")


In [4]:
def load_page(driver, URL):
    '''
    Load the given URL and wait until the page is fully loaded.
    If the page does not load within 10 seconds, an exception will be raised.
    '''
    driver.get(URL)
    wait_for_page_load(driver)
    assert "No results found." not in driver.page_source

In [5]:
def scroll_page(driver, pause_time=1, scroll_increment=300, scroll_pause=0.5):
    '''
    Scrolls down the entire page slowly to mimic human behavior and allow content to load properly.
    
    Parameters:
    - driver: The Selenium WebDriver instance.
    - pause_time: Time to wait (in seconds) after scrolling to allow content to load.
    - scroll_increment: Number of pixels to scroll in each step.
    - scroll_pause: Time to wait (in seconds) between each scroll increment.
    '''
    last_height = driver.execute_script("return document.body.scrollHeight")
    
    while True:
        driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
        
        time.sleep(scroll_pause)
        
        new_height = driver.execute_script("return document.body.scrollHeight")
        
        if new_height == last_height:
            break
        
        last_height = new_height
        
        time.sleep(pause_time)

    print("Reached the bottom of the page.")


In [6]:
def click_next_page(driver):
    '''
    Attempts to find and click the "Next page" button if it exists.
    Handles the case where the last page does not contain a link.
    '''
    try:
        next_button = driver.find_element(By.XPATH, "//li/a[text()='Next page']")
        
        href = next_button.get_attribute("href")
        if href:
            next_button.click()
            return True
        else:
            return False
    except NoSuchElementException:
        try:
            li_tag = driver.find_element(By.XPATH, "//li[text()='Next page']")
            return False
        except NoSuchElementException:
            return False


In [7]:
def get_reviews(driver):
    '''
    Retrieves and returns a list of reviews from the current page.
    '''
    reviews = []
    try:
        review_elements = driver.find_elements(By.CLASS_NAME, 'a-section.review.aok-relative')
        
        for review in review_elements:
            try:
                print("Processing a review")
                reviewer_name = review.find_element(By.XPATH, ".//span[@class='a-profile-name']").text
                print('name fetched',reviewer_name)
                review_title = review.find_element(By.XPATH, ".//a[contains(@class, 'review-title-content')]/span[2]").text
                print('title fetched')
                
                star_rating = review.find_element(By.XPATH, ".//i[@data-hook='review-star-rating']/span").text
                print('rating fetched')
                review_text = review.find_element(By.XPATH, ".//span[contains(@class, 'review-text-content')]/span").text
                print('text fetched')
                review_date = review.find_element(By.XPATH, ".//span[contains(@class, 'review-date')]").text
                print('date fetched')
                reviews.append({
                    'reviewer_name': reviewer_name,
                    'review_title': review_title,
                    'star_rating': star_rating,
                    'review_text': review_text,
                    'review_date': review_date
                })
            
            except Exception:
                print(f"Error while processing a review: {review}")
        
    except Exception as e:
        print(f"Error while getting reviews: {e}")
    
    return reviews


In [8]:
def start_browser():
    '''
    Initializes the WebDriver and returns the driver instance.
    '''
    driver = webdriver.Edge()  
    return driver


In [9]:
def close_browser(driver):
    '''
    Closes the browser after a brief delay.
    '''
    print('Closing the browser')
    time.sleep(3)
    driver.quit()

## main

In [10]:
driver  = start_browser()

There was an error managing msedgedriver (error decoding response body); using driver found in the cache


In [11]:
load_page(driver,link)

In [12]:
data = []

In [13]:
run = True
i = 1
while(run):
    print(f'\n\n\nOn page {i}')
    i+=1
    data.append(get_reviews(driver))
    wait_for_page_load(driver)
    time.sleep(5)
    scroll_page(driver,pause_time=5)
    run = click_next_page(driver)





On page 1
Processing a review
name fetched buyer
title fetched
rating fetched
text fetched
date fetched
Processing a review
name fetched Michelle
title fetched
rating fetched
text fetched
date fetched
Processing a review
name fetched Joe
title fetched
rating fetched
text fetched
date fetched
Processing a review
name fetched Nicholas Baab
title fetched
rating fetched
text fetched
date fetched
Processing a review
name fetched Adam M.
title fetched
rating fetched
text fetched
date fetched
Processing a review
name fetched Lee Pallansch
title fetched
rating fetched
text fetched
date fetched
Processing a review
name fetched Jeremy
title fetched
rating fetched
text fetched
date fetched
Processing a review
name fetched AmazonUser476
title fetched
rating fetched
text fetched
date fetched
Processing a review
name fetched Mehr
title fetched
rating fetched
text fetched
date fetched
Processing a review
name fetched divadayton
title fetched
rating fetched
text fetched
date fetched
Reached the bot

In [14]:
flattened_list = [item for sublist in data for item in sublist]

In [15]:
len(flattened_list)

10

In [16]:
flattened_list

[{'reviewer_name': 'buyer',
  'review_title': 'Totally capable and worth the buy',
  'star_rating': '',
  'review_text': "I recently bought this machine for two purposes. The deal was I'd use this for my freelance work for a month while my 2019 Macbook Pro was being fixed and then my niece gets it for college. So it's really a gift, but I got to use it for a few weeks.\n\nMy biggest grip is that the ports are all on one side, the side that makes plugging things in on my desk awkward. My Macbook Pro has two ports on each side, and I always end up using the right side ports for charging and for the dongle.\n\nBut I almost never use it as a LAPtop. It's really a desktop machine that I take from desk to desk with rare appearances not plugged into an external monitor and other devices. It is more portable than my Pro but not enough for it to make a big difference. On a small popup desk in a university auditorium and bouncing around campus, this is probably more noticeable, I just don't have

In [17]:
close_browser(driver=driver)

Closing the browser
