In [2]:
pip install selenium undetected-chromedriver pandas tqdm


Note: you may need to restart the kernel to use updated packages.


In [11]:
!pip install webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.1.0 webdriver-manager-4.0.2


In [28]:
import os
import re
import time
import pickle
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Paths (relative, so they resolve against the notebook's working directory)
cookies_path = '../notebooks/cookies'  # directory that load_cookies() scans for *.pkl cookie files
urls_path = '../data/raw/review_urls.txt'  # input list read further down: one review-page URL per line, blank lines skipped
output_csv = '../data/raw/amazon_reviews_raw.csv'  # result file: read to find finished productIDs, then extended with one row per product

# Load cookies
def load_cookies(driver, cookies_path):
    """Add every cookie stored in the ``.pkl`` files of a directory to a browser session.

    Args:
        driver: Object exposing ``add_cookie(cookie)``; in this notebook a
            Selenium WebDriver that has already navigated to the target site.
        cookies_path: Directory whose entries ending in ``.pkl`` are unpickled.
            Each file must unpickle to an iterable of cookie dicts.

    Files are visited in sorted name order so that repeated runs add cookies in
    the same order (``os.listdir`` alone gives no ordering guarantee).

    Any exception raised by ``driver.add_cookie`` or by unpickling propagates.
    """
    for file in sorted(os.listdir(cookies_path)):
        if not file.endswith('.pkl'):
            continue

        # NOTE(security): pickle.load can execute arbitrary code while loading.
        # Only point cookies_path at files you created yourself.
        with open(os.path.join(cookies_path, file), "rb") as cookie_file:
            cookies = pickle.load(cookie_file)

        for cookie in cookies:
            # A sameSite value of 'None' is rewritten to 'Strict' before the
            # cookie is handed to the driver.
            # NOTE(review): presumably because the driver rejects 'None' - confirm.
            if cookie.get('sameSite') == 'None':
                cookie['sameSite'] = 'Strict'
            driver.add_cookie(cookie)

# Scroll to load all dynamic reviews
def scroll_to_load_reviews(driver, pause=2, max_scrolls=8):
    """Scroll to the bottom of the page until its height stops changing.

    Args:
        driver: Object exposing ``execute_script(js)``; in this notebook a
            Selenium WebDriver showing a review page.
        pause: Seconds to sleep after each scroll before the page height is
            measured again.
        max_scrolls: Upper bound on the number of scrolls, so a page that keeps
            growing cannot stall the scraper. Defaults to the 8 scrolls this
            function has always used.

    Returns:
        None. The only effect is the scrolling performed in the browser.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # The scroll did not change the page height: stop early.
            break
        last_height = new_height

# Extract review texts only from current page
def extract_reviews_on_page(driver):
    """Collect the review body text of every review on the currently loaded page.

    Args:
        driver: Object exposing ``find_elements(by, selector)``; in this
            notebook a Selenium WebDriver showing a review page.

    Returns:
        list[str]: One whitespace-stripped string per ``.review`` element whose
        ``.review-text-content span`` child could be read, in page order.
        Strings may be empty; elements whose body lookup fails are skipped.
    """
    reviews = []
    for review in driver.find_elements(By.CSS_SELECTOR, ".review"):
        try:
            body = review.find_element(By.CSS_SELECTOR, ".review-text-content span")
            reviews.append(body.text.strip())
        except Exception:
            # Best effort: a review without a readable body is skipped.
            # `Exception` (not a bare except) keeps Ctrl-C / SystemExit working.
            continue
    return reviews

# Click next page if exists
def click_next_page(driver, pause=1.5):
    """Click the pagination link matched by ``li.a-last a``, if there is one.

    Args:
        driver: Object exposing ``find_element(by, selector)`` and
            ``execute_script(js, *args)``; in this notebook a Selenium WebDriver.
        pause: Seconds to wait between scrolling the link into view and
            clicking it. Defaults to the 1.5 s this function has always used.

    Returns:
        bool: True if the link was found and clicked; False if any step raised
        an ordinary exception (for example no element matched the selector).
    """
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, "li.a-last a")
        driver.execute_script("arguments[0].scrollIntoView();", next_button)
        time.sleep(pause)
        next_button.click()
        return True
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit are
        # not reported as "no next page".
        return False

# Extract overall product rating
def extract_overall_product_rating(driver):
    """Read the overall rating shown in the ``rating-out-of-text`` element.

    Args:
        driver: Object exposing ``find_element(by, selector)``; in this
            notebook a Selenium WebDriver showing a review page.

    Returns:
        float | str: The first number found in the element's text as a float,
        or the empty string ``""`` when the element is missing, contains no
        number, or any other ordinary exception occurs.
    """
    try:
        rating_element = driver.find_element(By.CSS_SELECTOR, 'span[data-hook="rating-out-of-text"]')
        rating_text = rating_element.text.strip()
        # Match one well-formed decimal number. A looser "[\d.]+" would also
        # capture trailing punctuation ("4.5."), which float() then rejects.
        match = re.search(r'(\d+(?:\.\d+)?|\.\d+)', rating_text)
        return float(match.group(1)) if match else ""
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit are
        # not turned into a missing rating.
        return ""

# Extract product ID from URL
def extract_product_id(url):
    """Return the 10-character product ID that follows ``/product-reviews/`` in a URL.

    Only upper-case letters and digits are accepted. When the URL contains
    several such segments the first one wins; when it contains none the
    result is None.
    """
    candidates = re.findall(r'/product-reviews/([A-Z0-9]{10})', url)
    if not candidates:
        return None
    return candidates[0]

# Get already scraped productIDs
def get_completed_product_ids(output_csv):
    """Return the product IDs already present in a results CSV.

    Args:
        output_csv: Path of the CSV to inspect.

    Returns:
        set[str]: Distinct non-missing values of the ``productID`` column.
        Empty when the file is missing, has zero bytes, lacks a ``productID``
        column, or cannot be parsed (in which case a warning is printed).
    """
    if not (os.path.exists(output_csv) and os.path.getsize(output_csv) > 0):
        return set()

    try:
        # Read only the header first so a file without the column is handled
        # without parsing any data rows.
        header = pd.read_csv(output_csv, nrows=0)
        if "productID" not in header.columns:
            return set()

        # Load just the ID column, forced to text: an all-digit ID such as
        # "0143454211" would otherwise be parsed as a number and come back
        # without its leading zero (or as "143454211.0" next to a blank cell).
        ids = pd.read_csv(
            output_csv,
            usecols=["productID"],
            dtype={"productID": str},
        )["productID"]
        return set(ids.dropna().astype(str).unique())
    except Exception as e:
        print(f"⚠️ Error reading existing CSV: {e}")
    return set()

# Scrape reviews
def scrape_amazon_reviews(driver, product_urls, max_reviews_per_product=500):
    """Scrape review texts and the overall rating for each URL and save them to CSV.

    For every URL whose product ID is not yet in the module-level ``output_csv``
    file, the page is opened, the overall rating is read, and review texts are
    collected page by page until ``max_reviews_per_product`` is reached or no
    next page can be clicked. One row per product is then added to
    ``output_csv`` with the columns ``productID``, ``overall_rating`` and
    ``review1`` ... ``review<max_reviews_per_product>`` (unused slots are "").

    Args:
        driver: Selenium WebDriver used for navigation and extraction.
        product_urls: Iterable of review-page URLs. URLs from which no product
            ID can be extracted are skipped silently.
        max_reviews_per_product: Maximum number of review texts stored per product.

    Raises:
        Exception: Whatever pandas raises if a non-empty ``output_csv`` cannot
            be parsed. The file is left untouched in that case rather than
            being replaced by a single new row.
    """
    completed_ids = get_completed_product_ids(output_csv)

    for url in tqdm(product_urls, desc="Scraping Products"):
        product_id = extract_product_id(url)
        if not product_id or product_id in completed_ids:
            continue

        driver.get(url)
        time.sleep(3)  # give the page time to render before reading it

        overall_rating = extract_overall_product_rating(driver)
        reviews_collected = []

        while len(reviews_collected) < max_reviews_per_product:
            scroll_to_load_reviews(driver)
            page_reviews = [text for text in extract_reviews_on_page(driver) if text]

            # Take only as many reviews as still fit under the cap.
            remaining = max_reviews_per_product - len(reviews_collected)
            reviews_collected.extend(page_reviews[:remaining])

            if not click_next_page(driver):
                break
            time.sleep(3)  # wait for the next page of reviews to load

        # Prepare row with review1...review<max_reviews_per_product>
        row = {
            "productID": product_id,
            "overall_rating": overall_rating
        }
        for i in range(max_reviews_per_product):
            row[f"review{i+1}"] = reviews_collected[i] if i < len(reviews_collected) else ""

        _append_product_row(row, output_csv)

        # Remember the product so a duplicate URL later in this run is skipped.
        completed_ids.add(product_id)

        print(f"✅ Reviews and rating for {product_id} saved.")


def _append_product_row(row, csv_path):
    """Add one product row to ``csv_path`` without discarding rows already on disk."""
    df_new = pd.DataFrame([row])

    existing_columns = None
    if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
        try:
            existing_columns = list(pd.read_csv(csv_path, nrows=0).columns)
        except pd.errors.EmptyDataError:
            # Non-empty file with nothing parseable in it: nothing to preserve.
            existing_columns = None

    if not existing_columns:
        # First row ever: create the file with a header.
        df_new.to_csv(csv_path, index=False)
    elif existing_columns == list(df_new.columns):
        # Same layout: append in place instead of re-reading and rewriting
        # every previously saved row for each product.
        df_new.to_csv(csv_path, mode="a", header=False, index=False)
    else:
        # Layout differs (e.g. another max_reviews_per_product was used before):
        # merge on the union of columns. Reading everything as text with no NA
        # conversion keeps the stored values exactly as they were written.
        df_existing = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        df_combined = pd.concat([df_existing, df_new], ignore_index=True)
        df_combined.to_csv(csv_path, index=False)

# Setup Selenium
options = Options()
options.add_argument('--start-maximized')
options.add_argument('--disable-blink-features=AutomationControlled')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Everything that uses the browser runs inside try/finally so the Chrome and
# chromedriver processes are shut down even if a step below raises.
try:
    # Inject cookies: the site must be open before cookies can be added for it,
    # and the refresh makes the page reload with those cookies applied.
    driver.get("https://www.amazon.in")
    time.sleep(3)
    load_cookies(driver, cookies_path)
    driver.refresh()
    time.sleep(5)
    input("🔐 Check if logged in. Press Enter to continue... ")

    # Load URLs (one per line, blank lines ignored)
    with open(urls_path, 'r') as f:
        all_urls = [line.strip() for line in f if line.strip()]

    # Test mode control: when enabled, only the first num_test_products URLs are scraped
    test_mode = True
    num_test_products = 57
    product_urls = all_urls[:num_test_products] if test_mode else all_urls

    # Start scraper
    scrape_amazon_reviews(driver, product_urls)
finally:
    driver.quit()


🔐 Check if logged in. Press Enter to continue...  


Scraping Products:   2%|█                                                             | 1/57 [01:09<1:04:29, 69.09s/it]

✅ Reviews and rating for B0DSG8SNXH saved.


Scraping Products:   4%|██▏                                                           | 2/57 [02:17<1:03:09, 68.90s/it]

✅ Reviews and rating for B0DFY3XCB6 saved.


Scraping Products:   5%|███▎                                                          | 3/57 [03:26<1:01:58, 68.85s/it]

✅ Reviews and rating for B09TVVGXWS saved.


Scraping Products:   7%|████▎                                                         | 4/57 [04:35<1:00:46, 68.81s/it]

✅ Reviews and rating for B071Z8M4KX saved.


Scraping Products:   9%|█████▌                                                          | 5/57 [05:44<59:37, 68.79s/it]

✅ Reviews and rating for B0C8JB3G5W saved.


Scraping Products:  11%|██████▋                                                         | 6/57 [06:53<58:36, 68.95s/it]

✅ Reviews and rating for B07WFPL9PB saved.


Scraping Products:  12%|███████▊                                                        | 7/57 [08:02<57:30, 69.02s/it]

✅ Reviews and rating for B0D22QWQHR saved.


Scraping Products:  14%|████████▉                                                       | 8/57 [09:11<56:23, 69.05s/it]

✅ Reviews and rating for B0D3R1JQ7D saved.


Scraping Products:  16%|██████████                                                      | 9/57 [10:21<55:18, 69.14s/it]

✅ Reviews and rating for B07WHQHNZC saved.


Scraping Products:  18%|███████████                                                    | 10/57 [11:29<54:02, 68.99s/it]

✅ Reviews and rating for B077BFH786 saved.


Scraping Products:  19%|████████████▏                                                  | 11/57 [12:38<52:49, 68.91s/it]

✅ Reviews and rating for B098NS6PVG saved.


Scraping Products:  21%|█████████████▎                                                 | 12/57 [13:46<51:35, 68.80s/it]

✅ Reviews and rating for B0DH3J6LB9 saved.


Scraping Products:  23%|██████████████▎                                                | 13/57 [14:55<50:25, 68.77s/it]

✅ Reviews and rating for B07R3386PP saved.


Scraping Products:  25%|███████████████▍                                               | 14/57 [16:04<49:15, 68.73s/it]

✅ Reviews and rating for B0BDRVFDKP saved.


Scraping Products:  26%|████████████████▌                                              | 15/57 [17:12<48:06, 68.72s/it]

✅ Reviews and rating for B0CBTTCJL6 saved.


Scraping Products:  28%|█████████████████▋                                             | 16/57 [18:23<47:21, 69.31s/it]

✅ Reviews and rating for B0DFQ1R3W4 saved.


Scraping Products:  30%|██████████████████▊                                            | 17/57 [19:32<46:09, 69.23s/it]

✅ Reviews and rating for B07WHS99FG saved.


Scraping Products:  32%|███████████████████▉                                           | 18/57 [20:41<44:52, 69.04s/it]

✅ Reviews and rating for B0D2R2MXXJ saved.


Scraping Products:  33%|█████████████████████                                          | 19/57 [21:50<43:39, 68.94s/it]

✅ Reviews and rating for B0DCNWN8NZ saved.


Scraping Products:  35%|██████████████████████                                         | 20/57 [22:58<42:28, 68.88s/it]

✅ Reviews and rating for B0D63CNLJ9 saved.


Scraping Products:  37%|███████████████████████▏                                       | 21/57 [24:07<41:20, 68.90s/it]

✅ Reviews and rating for B0DSG51QM1 saved.


Scraping Products:  39%|████████████████████████▎                                      | 22/57 [25:16<40:08, 68.81s/it]

✅ Reviews and rating for B0D5YCYS1G saved.


Scraping Products:  40%|█████████████████████████▍                                     | 23/57 [26:25<39:02, 68.88s/it]

✅ Reviews and rating for B01DEWVZ2C saved.


Scraping Products:  42%|██████████████████████████▌                                    | 24/57 [27:34<37:51, 68.84s/it]

✅ Reviews and rating for B0CVN4DNWY saved.


Scraping Products:  44%|███████████████████████████▋                                   | 25/57 [28:43<36:43, 68.87s/it]

✅ Reviews and rating for B0CZ3ZPD8B saved.


Scraping Products:  46%|████████████████████████████▋                                  | 26/57 [29:51<35:32, 68.79s/it]

✅ Reviews and rating for B097JJ2CK6 saved.


Scraping Products:  47%|█████████████████████████████▊                                 | 27/57 [31:00<34:22, 68.74s/it]

✅ Reviews and rating for B0CRH561RC saved.


Scraping Products:  49%|██████████████████████████████▉                                | 28/57 [32:08<33:12, 68.70s/it]

✅ Reviews and rating for B0D7HZ3KK9 saved.


Scraping Products:  51%|████████████████████████████████                               | 29/57 [33:17<32:03, 68.71s/it]

✅ Reviews and rating for B0BZ3WNQ5T saved.


Scraping Products:  53%|█████████████████████████████████▏                             | 30/57 [34:26<30:55, 68.72s/it]

✅ Reviews and rating for B0CHX1W1XY saved.


Scraping Products:  54%|██████████████████████████████████▎                            | 31/57 [35:35<29:47, 68.75s/it]

✅ Reviews and rating for B0D7MNX9Y5 saved.


Scraping Products:  56%|███████████████████████████████████▎                           | 32/57 [36:43<28:38, 68.75s/it]

✅ Reviews and rating for B0BDYVC5TD saved.


Scraping Products:  58%|████████████████████████████████████▍                          | 33/57 [37:52<27:31, 68.80s/it]

✅ Reviews and rating for B0DLW427YG saved.


Scraping Products:  60%|█████████████████████████████████████▌                         | 34/57 [39:01<26:23, 68.86s/it]

✅ Reviews and rating for B09N3ZNHTY saved.


Scraping Products:  61%|██████████████████████████████████████▋                        | 35/57 [40:10<25:13, 68.78s/it]

✅ Reviews and rating for B0D18192T2 saved.


Scraping Products:  63%|███████████████████████████████████████▊                       | 36/57 [41:19<24:06, 68.88s/it]

✅ Reviews and rating for B0C5RK3X6F saved.


Scraping Products:  65%|████████████████████████████████████████▉                      | 37/57 [42:28<22:58, 68.92s/it]

✅ Reviews and rating for B0DFMHM9CM saved.


Scraping Products:  67%|██████████████████████████████████████████                     | 38/57 [43:37<21:49, 68.93s/it]

✅ Reviews and rating for B09R24JBYV saved.


Scraping Products:  68%|███████████████████████████████████████████                    | 39/57 [44:46<20:41, 68.97s/it]

✅ Reviews and rating for B0DZHXTPK1 saved.


Scraping Products:  70%|████████████████████████████████████████████▏                  | 40/57 [45:55<19:31, 68.90s/it]

✅ Reviews and rating for B0CP54XBWN saved.


Scraping Products:  72%|█████████████████████████████████████████████▎                 | 41/57 [47:04<18:23, 68.94s/it]

✅ Reviews and rating for B0DCZ3WDTB saved.


Scraping Products:  74%|██████████████████████████████████████████████▍                | 42/57 [48:12<17:12, 68.84s/it]

✅ Reviews and rating for B01HJI0FS2 saved.


Scraping Products:  75%|███████████████████████████████████████████████▌               | 43/57 [49:21<16:03, 68.82s/it]

✅ Reviews and rating for B07WHRZRD7 saved.


Scraping Products:  77%|████████████████████████████████████████████████▋              | 44/57 [50:30<14:55, 68.90s/it]

✅ Reviews and rating for B082WYMTWF saved.


Scraping Products:  79%|█████████████████████████████████████████████████▋             | 45/57 [51:39<13:46, 68.90s/it]

✅ Reviews and rating for B0B3MNYGTW saved.


Scraping Products:  81%|██████████████████████████████████████████████████▊            | 46/57 [52:48<12:37, 68.82s/it]

✅ Reviews and rating for B0DLW4QD72 saved.


Scraping Products:  82%|███████████████████████████████████████████████████▉           | 47/57 [53:57<11:27, 68.78s/it]

✅ Reviews and rating for B0D2R26HFV saved.


Scraping Products:  84%|█████████████████████████████████████████████████████          | 48/57 [55:06<10:20, 68.95s/it]

✅ Reviews and rating for B0DLW1L5PR saved.


Scraping Products:  86%|██████████████████████████████████████████████████████▏        | 49/57 [56:15<09:12, 69.01s/it]

✅ Reviews and rating for B07XCM6T4N saved.


Scraping Products:  88%|███████████████████████████████████████████████████████▎       | 50/57 [57:24<08:03, 69.11s/it]

✅ Reviews and rating for B0BG8LZNYL saved.


Scraping Products:  89%|████████████████████████████████████████████████████████▎      | 51/57 [58:33<06:54, 69.07s/it]

✅ Reviews and rating for B00LVMTA2A saved.


Scraping Products:  91%|█████████████████████████████████████████████████████████▍     | 52/57 [59:42<05:44, 69.00s/it]

✅ Reviews and rating for B00MVV81MK saved.


Scraping Products:  93%|████████████████████████████████████████████████████████▋    | 53/57 [1:00:51<04:35, 68.96s/it]

✅ Reviews and rating for B0CHX2F5QT saved.


Scraping Products:  95%|█████████████████████████████████████████████████████████▊   | 54/57 [1:02:01<03:27, 69.14s/it]

✅ Reviews and rating for B0DLW44CGS saved.


Scraping Products:  96%|██████████████████████████████████████████████████████████▊  | 55/57 [1:03:10<02:18, 69.10s/it]

✅ Reviews and rating for B09ZPL5VYM saved.


Scraping Products:  98%|███████████████████████████████████████████████████████████▉ | 56/57 [1:04:19<01:09, 69.27s/it]

✅ Reviews and rating for B0CHX3TW6X saved.


Scraping Products: 100%|█████████████████████████████████████████████████████████████| 57/57 [1:05:29<00:00, 68.93s/it]

✅ Reviews and rating for B0DH8BZ7V9 saved.



