In [41]:
import os
import time
import pickle
import re
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import undetected_chromedriver as uc

# ---------- Setup Chrome ----------
options = uc.ChromeOptions()
options.add_argument('--start-maximized')
driver = uc.Chrome(options=options)
time.sleep(2)

# ---------- Load Cookies ----------
cookies_path = "cookies/amazon_cookies.pkl"
driver.get("https://www.amazon.in/")
time.sleep(3)

if os.path.exists(cookies_path):
    with open(cookies_path, "rb") as f:
        cookies = pickle.load(f)
        for cookie in cookies:
            if 'sameSite' in cookie:
                cookie['sameSite'] = 'Strict'
            try:
                driver.add_cookie(cookie)
            except:
                continue

driver.refresh()
time.sleep(3)

# ---------- Helpers ----------
def parse_review_count(text):
    text = text.replace(",", "").lower().strip()
    try:
        if "k" in text:
            return int(float(text.replace("k", "")) * 1000)
        elif "m" in text:
            return int(float(text.replace("m", "")) * 1_000_000)
        return int(text)
    except:
        return 0

def extract_asin(url):
    match = re.search(r'/product-reviews/([A-Z0-9]{10})', url)
    if match:
        return match.group(1)
    return None

# ---------- Auto Resume ----------
save_path = "../data/raw/review_urls.txt"
os.makedirs(os.path.dirname(save_path), exist_ok=True)

collected_urls = set()
collected_asins = set()

if os.path.exists(save_path):
    with open(save_path, "r") as f:
        for line in f:
            url = line.strip()
            asin = extract_asin(url)
            if url:
                collected_urls.add(url)
            if asin:
                collected_asins.add(asin)

print(f"🟡 Loaded {len(collected_urls)} URLs and {len(collected_asins)} ASINs from previous runs.")

# ---------- Start Scraping ----------
page = 1
max_pages = 50

base_url = "https://www.amazon.in/s?i=electronics&rh=n%3A976419031&fs=true"
driver.get(base_url)
time.sleep(5)

try:
    while page <= max_pages and len(collected_urls) < 50:
        if not driver.window_handles:
            print("❌ All browser windows are closed. Exiting.")
            break

        driver.switch_to.window(driver.window_handles[0])

        print(f"\n🔍 Scanning page {page}")
        time.sleep(4)

        products = driver.find_elements(By.XPATH, "//div[@data-component-type='s-search-result']")

        for product in products:
            try:
                link_elem = product.find_element(By.XPATH, ".//a[contains(@class, 'a-link-normal s-no-outline')]")
                ActionChains(driver).key_down(Keys.CONTROL).click(link_elem).key_up(Keys.CONTROL).perform()
                time.sleep(3)

                driver.switch_to.window(driver.window_handles[-1])
                time.sleep(3)

                driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.5);")
                time.sleep(2)

                try:
                    see_reviews_btn = driver.find_element(By.XPATH, "//a[contains(@data-hook, 'see-all-reviews-link-foot')]")
                    see_reviews_btn.click()
                    time.sleep(3)
                except NoSuchElementException:
                    print("ℹ️ No 'See all reviews' link found.")
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
                    continue

                time.sleep(3)
                try:
                    count_elem = driver.find_element(By.XPATH, "//div[contains(@data-hook, 'cr-filter-info-review-rating-count')]")
                    count_text = count_elem.text.strip()
                    print(f"🔹 Raw count text: {count_text}")

                    match = re.search(r'([\d,]+)\s+with reviews', count_text)
                    if match:
                        review_count = parse_review_count(match.group(1))
                        print(f"🔹 Written reviews found: {review_count}")

                        if review_count >= 500:
                            review_url = driver.current_url
                            asin = extract_asin(review_url)

                            if asin and asin not in collected_asins:
                                collected_asins.add(asin)
                                collected_urls.add(review_url)
                                print(f"✅ Saved: {review_url}")
                                with open(save_path, "a") as f:
                                    f.write(review_url + "\n")
                            else:
                                print(f"⚠️ Skipped duplicate ASIN: {asin}")

                    else:
                        print("❌ Could not extract written review count.")

                except Exception as e:
                    print(f"❌ Failed to extract review count: {e}")

                driver.close()
                driver.switch_to.window(driver.window_handles[0])
                time.sleep(2)

            except Exception as e:
                print(f"⚠️ Skipped product due to: {e}")
                try:
                    if len(driver.window_handles) > 1:
                        driver.close()
                        driver.switch_to.window(driver.window_handles[0])
                except:
                    pass
                continue

        if len(collected_urls) >= 50:
            break

        try:
            next_btn = driver.find_element(By.XPATH, "//a[contains(@class, 's-pagination-next')]")
            driver.execute_script("arguments[0].scrollIntoView();", next_btn)
            time.sleep(1)
            next_btn.click()
            page += 1
            time.sleep(4)
        except:
            print("❌ No next page or failed to click.")
            break

except KeyboardInterrupt:
    print("\n🛑 Interrupted by user. Saving progress...")

except Exception as e:
    print(f"\n🚨 Unexpected error: {e}")

finally:
    driver.quit()
    print(f"\n✅ Done. Collected total {len(collected_urls)} unique review URLs with ≥500 written reviews.")


🟡 Loaded 0 URLs and 0 ASINs from previous runs.

🔍 Scanning page 1
🔹 Raw count text: 21 total ratings, 1,165 with reviews
🔹 Written reviews found: 1165
✅ Saved: https://www.amazon.in/product-reviews/B0DSG8SNXH/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
❌ Failed to extract review count: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//div[contains(@data-hook, 'cr-filter-info-review-rating-count')]"}
  (Session info: chrome=135.0.7049.85); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00FA80E3+60707]
	GetHandleVerifier [0x00FA8124+60772]
	(No symbol) [0x00DD0683]
	(No symbol) [0x00E18660]
	(No symbol) [0x00E189FB]
	(No symbol) [0x00E61022]
	(No symbol) [0x00E3D094]
	(No symbol) [0x00E5E824]
	(No symbol) [0x00E3CE46]
	(No symbol) [0x00E0C5D3]
	(No symbol) [0x00E0D424]
	GetHandleVerifier [0x011EBBC3+2435075