In [14]:
import os
import sys
import time
import json
import requests
import yaml

import numpy as np
import pandas as pd
from tqdm import tqdm

from urllib.parse import urlparse, urlunparse
from sqlalchemy import create_engine

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

def start_headless_browser():
    """Start a headless Chrome session and return its driver.

    Chrome is launched with a fixed list of command-line switches, after
    which the user agent is overridden through the DevTools protocol command
    ``Network.setUserAgentOverride``.

    Returns:
        The ``webdriver.Chrome`` instance that was created.
    """
    chrome_switches = (
        '--headless',
        '--disable-gpu',
        '--disable-extensions',
        '--disable-infobars',
        '--disable-dev-shm-usage',
        '--no-sandbox',
        '--disable-popup-blocking',
        '--disable-default-apps',
        '--disable-translate',
        '--disable-web-security',
        '--disable-features=IsolateOrigins,site-per-process',
        '--blink-settings=imagesEnabled=false',
        '--disable-javascript',
    )

    chrome_options = Options()
    for switch in chrome_switches:
        chrome_options.add_argument(switch)

    driver = webdriver.Chrome(options=chrome_options)

    # Override the user agent for every request made by this session.
    user_agent_override = {
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    driver.execute_cdp_cmd('Network.setUserAgentOverride', user_agent_override)
    return driver

def query_comments(url):
    """Open a reviews page in the global ``browser`` and pick the last language option.

    The page's combobox (a ``div`` whose ``role`` attribute is ``combobox``)
    is clicked to open the language list, then the last ``li`` of the list
    labelled "语言" / "Language" is clicked.

    Args:
        url: Address of the reviews page to load.

    Raises:
        LookupError: If the combobox or the language list cannot be found.
    """
    browser.get(url)

    comboboxes = [
        div for div in browser.find_elements(By.CSS_SELECTOR, "div")
        if div.get_attribute("role") == "combobox"
    ]
    if not comboboxes:
        raise LookupError(f"no element with role='combobox' found on {url}")
    # When several comboboxes exist, the last one in document order is used.
    combobox = comboboxes[-1]
    combobox.find_elements(By.CSS_SELECTOR, "div")[0].click()
    combobox.click()

    # Give the drop-down a moment to render before searching for it.
    time.sleep(0.5)

    language_lists = [
        ul for ul in browser.find_elements(By.CSS_SELECTOR, "ul")
        if ul.get_attribute("aria-label") in ("语言", "Language")
    ]
    if not language_lists:
        raise LookupError(f"language list did not open on {url}")
    language_lists[-1].find_elements(By.CSS_SELECTOR, "li")[-1].click()

def next_page_comments():
    """Click the "Load more" control of the page shown in the global ``browser``.

    Spans are scanned from the end of the document backwards; the first one
    whose text is exactly "加载更多" or "Load more" is clicked.

    Raises:
        LookupError: If no span with one of those labels exists.
    """
    load_more_labels = ("加载更多", "Load more")
    for span in reversed(browser.find_elements(By.CSS_SELECTOR, "span")):
        if span.text in load_more_labels:
            span.click()
            return
    raise LookupError('no "Load more" control found on the current page')

def parse_comments(path=None):
    """Extract the reviews of a loaded reviews page into a DataFrame.

    Args:
        path: Optional path of a saved HTML file. When given, the review
            markup is read from that file; otherwise ``browser.page_source``
            is parsed. In both cases the CSS class that identifies review
            sections is read from the live global ``browser``.

    Returns:
        pandas.DataFrame with the columns ``rate``, ``user``, ``comment``,
        ``comment_time`` and ``helpful``, one row per review section that
        contains every expected element. ``comment_time`` is converted to
        datetime when all values match one of the two known date formats and
        is left as scraped text otherwise. The frame has no columns when no
        review could be parsed.

    Raises:
        IndexError: If the browser page contains no ``section`` element.
    """
    if path is not None:
        with open(path, "r", encoding="utf-8") as file:
            html = file.read()
    else:
        html = browser.page_source

    # NOTE(review): no parser is named here, so BeautifulSoup chooses one from
    # what is installed; pin one explicitly once the intended parser is confirmed.
    soup = BeautifulSoup(html)

    # Read the review CSS class from the live browser: the last <section> on
    # the page is taken to be a review section.
    section_classes = [
        section.get_attribute("class")
        for section in browser.find_elements(By.CSS_SELECTOR, "section")
    ]
    comment_class = section_classes[-1]

    results = []
    for section in tqdm(soup.find_all("section", attrs={"class": f"{comment_class}"})):
        try:
            results.append({
                "rate": section.find("div", attrs={"class": "B1UG8d"}).attrs["title"],
                "user": section.find("span").text,
                "comment": section.find("p").text,
                "comment_time": section.find("span", attrs={"class": "ydlbEf"}).text,
                "helpful": section.find("span", attrs={"class": "ZRk0Tb"}).text,
            })
        except (AttributeError, KeyError):
            # A section lacking one of the expected elements is skipped.
            continue

    comment_output = pd.DataFrame(results)
    if comment_output.empty:
        return comment_output

    # Try the Chinese date layout first, then the English one; keep the raw
    # strings when neither matches every row.
    for date_format in ("%Y年%m月%d日", "%b %d, %Y"):
        try:
            comment_output["comment_time"] = pd.to_datetime(
                comment_output["comment_time"], format=date_format
            )
            break
        except (ValueError, TypeError):
            continue
    return comment_output

def check_browser(log_dir="log/excptions"):
    """Save a screenshot and the page HTML of the global ``browser`` for debugging.

    Both files are named after the current local time formatted as
    ``MM-DD-HH-MM``, so dumps taken within the same minute overwrite each
    other.

    Args:
        log_dir: Directory that receives the ``.png`` and ``.html`` files.
            It is created when it does not exist yet.
    """
    # Without this, open() below fails on a fresh checkout with no log folder.
    os.makedirs(log_dir, exist_ok=True)
    time_now = pd.Timestamp.now().strftime("%m-%d-%H-%M")
    browser.save_screenshot(f"{log_dir}/{time_now}.png")
    with open(f"{log_dir}/{time_now}.html", "w", encoding="utf-8") as file:
        file.write(browser.page_source)

In [19]:
# Scrape the reviews of every extension listed in the YAML config and store
# one CSV per extension. Extensions whose CSV already exists are skipped.
MAX_LOAD_MORE_CLICKS = 150   # upper bound on "Load more" clicks per extension
SNAPSHOT_EVERY = 200         # dump the raw page HTML every N clicks
OUTPUT_DIR = "data/chrome_extension"

browser = start_headless_browser()
check_browser()  # debug dump of the freshly started browser

os.makedirs(OUTPUT_DIR, exist_ok=True)

with open("config/coupon_chrome_extensions_url.yaml","r") as file:
    tools = yaml.safe_load(file.read())

for name,url in tools.items():
    if os.path.exists(f"{OUTPUT_DIR}/{name}.csv"):
        continue
    print(name,url)
    query_comments(url)
    for i in tqdm(range(MAX_LOAD_MORE_CLICKS)):
        # Retry once before giving up. `except Exception` (rather than a bare
        # except) keeps the loop interruptible from the keyboard.
        try:
            next_page_comments()
        except Exception:
            try:
                next_page_comments()
            except Exception:
                check_browser()
                break
        time.sleep(1.5)
        # NOTE(review): with 150 iterations and an interval of 200 this only
        # fires at i == 0 -- confirm the intended snapshot interval.
        if i % SNAPSHOT_EVERY == 0:
            with open(f"{OUTPUT_DIR}/{name}_{i}.html","w",encoding="utf-8") as file:
                file.write(browser.page_source)
    output_data = parse_comments(path=None)
    output_data.to_csv(f"{OUTPUT_DIR}/{name}.csv")

# Display the reviews of the page that is still loaded in the browser.
parse_comments()

couponbirds https://chromewebstore.google.com/detail/couponbirds-smartcoupon-c/pnedebpjhiaidlbbhmogocmffpdolnek/reviews?hl=en


100%|████████████████████████████████████████████████████████████████████████████████| 150/150 [04:34<00:00,  1.83s/it]
100%|████████████████████████████████████████████████████████████████████████████| 1510/1510 [00:00<00:00, 6848.03it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1510/1510 [00:00<00:00, 5540.98it/s]


Unnamed: 0,rate,user,comment,comment_time,helpful
0,4 stars,Ben George,Always making an effort to save and often succ...,2024-04-10,
1,4 stars,Heather West,"This has worked once for me, but I'm saving mo...",2024-04-10,
2,5 stars,Jordan Banks,good,2024-04-10,
3,5 stars,CharlieGTR,VERY GOOD!,2024-04-10,
4,5 stars,Rosshondra Irving,almost 100 off,2024-04-10,
...,...,...,...,...,...
1505,5 stars,Rachel (RP Imagery),Just saved 100 bucks! thank you!,2024-01-18,
1506,5 stars,Perry Carrison,The easiest $21 I ever saved. Go for it! Coupo...,2024-01-18,
1507,5 stars,Alex,,2024-01-18,
1508,5 stars,Igor Lebediev,,2024-01-18,


In [16]:
# Re-parse whatever page is currently loaded in the global `browser`.
output_data = parse_comments()

100%|██████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:00<?, ?it/s]


In [17]:
# Display the parsed reviews.
output_data

In [20]:
def fetchUrl(url,cookies=None,timeout=30):
    """Fetch ``url`` with a GET request using a fixed desktop User-Agent.

    Args:
        url: Address to request.
        cookies: Optional mapping of cookie name to value sent with the
            request; an empty mapping is used when omitted.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response`` object. The status code is not checked.
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    headers = {"User-Agent": user_agent}
    if cookies is None:
        cookies = {}

    return requests.get(url, headers=headers, cookies=cookies, timeout=timeout)

def amazon_cookie_manage(path="config/amazon_cookie.json"):
    """Load exported cookies from a JSON file as a ``{name: value}`` dict.

    Args:
        path: JSON file holding a top-level ``"cookies"`` list whose items
            each carry a ``"name"`` and a ``"value"`` key.

    Returns:
        dict mapping each cookie name to its value.

    Raises:
        KeyError: If the ``"cookies"`` list or a ``"name"``/``"value"`` key
            is missing.
    """
    with open(path, "r", encoding="utf-8") as file:
        exported = json.load(file)
    return {cookie["name"]: cookie["value"] for cookie in exported["cookies"]}

In [21]:
# Smoke test: fetch a single product page using the saved cookies.
cookies = amazon_cookie_manage()
result = fetchUrl("https://www.amazon.com/Wordle-Party-Players-Official-Inspired/dp/B0B5B9CP17",cookies=cookies)

In [25]:
# Save the fetched page body to disk so it can be inspected offline.
with open("amazon_test.html","w",encoding="utf-8") as file:
    file.write(result.text)

In [170]:
# Text of the first div whose id is "cm-cr-dp-review-list".
# NOTE(review): no cell above assigns a `response` that has `find_all`;
# presumably a BeautifulSoup object left in the kernel by an earlier run -- confirm.
response.find_all("div",attrs={"id":"cm-cr-dp-review-list"})[0].text

"k. dorst5.0 out of 5 stars\nTaste amazing!\nReviewed in the United States on February 26, 2024Flavor Name: RaspberryVerified Purchase\nThis was highly recommended to me by my daughter who is deep into fitness and super healthy in general. It’s amazing all the benefits you get from taking this daily. I take a full tablespoon every morning on an empty stomach before having my coffee. I feel like it’s helping my skin and hair as well as helping to keep me healthy. Several coworkers are sick and I work side by side  them and yet I haven’t caught their sickness!! Can’t say 100% if it’s due to the sea moss but also can’t say that it’s not. None of my coworkers are taking sea moss, so yeah! I bought the raspberry one, it’s soooo good!! I love the taste!!! I will be ordering again soon!\nRead more\n21 people found this helpful\n\n\n              Helpful\n\n\nReport\nCaprice Caliendo5.0 out of 5 stars\nGreat taste\nReviewed in the United States on March 20, 2024Flavor Name: Apple CinnamonVerif

In [171]:
def parse_page(path):
    """Read a saved product page and collect its key fields into a dict.

    Args:
        path: Location of the HTML file to parse.

    Returns:
        dict with the keys ``title``, ``rate``, ``rate_number``,
        ``sold_number``, ``description``, ``rate_dist``, ``ai_comment``,
        ``keyword_comments``, ``nav`` and ``user_reviews``.

    Raises:
        AttributeError, IndexError: When one of the looked-up elements is
            absent from the page.
    """
    with open(f"{path}","r",encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file.read())

    def by_id(tag_name, element_id):
        # First tag of the given type carrying the given id (None if absent).
        return soup.find(tag_name, attrs={"id": element_id})

    insight_selector = "#cr-product-insights-cards > div:nth-child(2) > div.a-section.a-spacing-mini._cr-product-insights_style_sentiment-section__6zKPq > div "
    breadcrumb_selector = "#wayfinding-breadcrumbs_feature_div > ul > li"

    # Fields are evaluated in the order listed, top to bottom.
    return {
        "title": by_id("span", "productTitle").text.strip(),
        "rate": by_id("span", "acrPopover").span.a.span.text.strip(),
        "rate_number": by_id("span", "acrCustomerReviewText").text.strip(),
        "sold_number": by_id("span", "social-proofing-faceout-title-tk_bought").span.text.strip(),
        "description": by_id("div", "feature-bullets").ul.text,
        "rate_dist": [
            row.text
            for row in soup.select("#cm_cr_dp_d_rating_histogram")[0].find_all("tr")
        ],
        "ai_comment": soup.select("#product-summary")[0].text,
        "keyword_comments": [
            keyword.text
            for keyword in soup.select(insight_selector)[0].find_all("span")
        ],
        "nav": " ".join([crumb.text.strip() for crumb in soup.select(breadcrumb_selector)]),
        "user_reviews": soup.find_all("div",attrs={"id":"cm-cr-dp-review-list"})[0].text
    }

In [144]:
# Collect the names of the files saved under data/amazon_html/.
url_list = os.listdir("data/amazon_html/")
url_list

['amazon_0.html',
 'amazon_1.html',
 'amazon_10.html',
 'amazon_100.html',
 'amazon_101.html',
 'amazon_102.html',
 'amazon_103.html',
 'amazon_104.html',
 'amazon_105.html',
 'amazon_106.html',
 'amazon_107.html',
 'amazon_108.html',
 'amazon_109.html',
 'amazon_11.html',
 'amazon_110.html',
 'amazon_111.html',
 'amazon_112.html',
 'amazon_113.html',
 'amazon_114.html',
 'amazon_115.html',
 'amazon_116.html',
 'amazon_117.html',
 'amazon_118.html',
 'amazon_119.html',
 'amazon_12.html',
 'amazon_120.html',
 'amazon_121.html',
 'amazon_122.html',
 'amazon_123.html',
 'amazon_124.html',
 'amazon_125.html',
 'amazon_126.html',
 'amazon_127.html',
 'amazon_128.html',
 'amazon_129.html',
 'amazon_13.html',
 'amazon_130.html',
 'amazon_131.html',
 'amazon_132.html',
 'amazon_133.html',
 'amazon_134.html',
 'amazon_135.html',
 'amazon_136.html',
 'amazon_137.html',
 'amazon_138.html',
 'amazon_139.html',
 'amazon_14.html',
 'amazon_140.html',
 'amazon_141.html',
 'amazon_142.html',
 'amazon_

In [172]:
# Parse every saved product page. Pages that cannot be parsed (an expected
# element is missing) are skipped, and their file names are kept in
# `failed_pages` so the drop-outs can be inspected afterwards.
output = []
failed_pages = []
for url in tqdm(url_list):
    try:
        result = parse_page(f"data/amazon_html/{url}")
        output.append(result)
    except Exception:
        # `Exception` rather than a bare except, so the kernel can still be
        # interrupted while the loop runs.
        failed_pages.append(url)

100%|████████████████████████████████████████████████████████████████████████████████| 362/362 [01:40<00:00,  3.61it/s]


In [173]:
# Show the parsed product pages as a table, one row per successfully parsed page.
pd.DataFrame(output)

Unnamed: 0,title,rate,rate_number,sold_number,description,rate_dist,ai_comment,keyword_comments,nav,user_reviews
0,"Amazon Fire TV Stick, HD, sharp picture qualit...",4.7,"462,719 ratings",10K+ bought in past month,50% more powerful than the previous generati...,"[5 star82%, 4 star11%, 3 star3%, 2 star1%, 1 s...",Customers sayCustomers like the performance an...,"[Performance, Ease of use, Quality]",,CARLOS PINEDA5.0 out of 5 stars\nAwesome quali...
1,Pure Instinct Roll-On - The Original Pheromone...,3.9,"81,031 ratings",10K+ bought in past month,"More than 20 years ago, Pure Instinct change...","[5 star52%, 4 star13%, 3 star14%, 2 star8%, 1 ...",Customers sayCustomers like the portability of...,"[Appearance, Portability, Ease of application,...",Beauty & Personal Care › Fragrance › Men's › C...,Natalia5.0 out of 5 stars\nWorks! Just not how...
2,Quest 2 — Advanced All-In-One Virtual Reality ...,4.7,"71,784 ratings",30K+ bought in past month,Experience total immersion with 3D positiona...,"[5 star84%, 4 star9%, 3 star3%, 2 star1%, 1 st...",Customers sayCustomers like the ease of setup ...,"[Quality, Entertainment value, Ease of setup, ...",Video Games › Virtual Reality › Standalone Har...,Yaura5.0 out of 5 stars\nAmazing Overall\nRevi...
3,Google Pixel 6a - 5G Android Phone - Unlocked ...,4.3,"4,762 ratings",200+ bought in past month,Google Pixel 6a adapts to you; it’s super fa...,"[5 star67%, 4 star15%, 3 star5%, 2 star4%, 1 s...",Customers sayCustomers like the value of the c...,"[Value, Photos, Quality, Battery life, Chargin...",Cell Phones & Accessories › Cell Phones,Nurse Bill5.0 out of 5 stars\nSolid as a rock\...
4,"WalkingPad Folding Treadmill, Ultra Slim Folda...",3.9,803 ratings,500+ bought in past month,Ultra Slim Foldable Running Board For Easy S...,"[5 star57%, 4 star16%, 3 star6%, 2 star6%, 1 s...",Customers sayCustomers like the ease of foldin...,"[Ease of folding, Performance, Ease of use, Qu...",Sports & Outdoors › Exercise & Fitness › Cardi...,Amazon Customer5.0 out of 5 stars\nWalking whi...
...,...,...,...,...,...,...,...,...,...,...
98,Amazon Basics High-Speed HDMI Cable For Televi...,4.7,"533,087 ratings",10K+ bought in past month,IN THE BOX: HDMI cable (A Male to A Male) fo...,"[5 star82%, 4 star12%, 3 star4%, 2 star1%, 1 s...","Customers sayCustomers like the quality, value...","[Performance, Quality, Value, Fit, Picture qua...",Electronics › Television & Video › Accessories...,carla5.0 out of 5 stars\nYear-Long Reliability...
99,Hero Cosmetics Mighty Patch™ Original Patch - ...,4.6,"142,363 ratings",100K+ bought in past month,🏆 The Original Award-Winning Acne Patch: Mig...,"[5 star76%, 4 star12%, 3 star7%, 2 star2%, 1 s...","Customers sayCustomers like the quality, visib...","[Performance, Effect on skin, Quality, Sensiti...",Beauty & Personal Care › Skin Care › Face › Tr...,Ana Maria5.0 out of 5 stars\nMaybe not the bes...
100,"Nugenix Total-T, Free and Total Testosterone B...",4.1,"15,820 ratings",10K+ bought in past month,"Scientifically formulated, our new-and-impro...","[5 star59%, 4 star15%, 3 star12%, 2 star5%, 1 ...","Customers sayCustomers like the energy, qualit...","[Energy, Quality, Effects, Performance, Sleep ...",Health & Household › Diet & Sports Nutrition ›...,Odizie Ajike5.0 out of 5 stars\nUOUR PRODUCT\n...
101,URO Vaginal Probiotics for Women pH Balance wi...,4.3,340 ratings,100K+ bought in past month,Promotes Vaginal Health: URO Probiotic for w...,"[5 star74%, 4 star8%, 3 star3%, 2 star3%, 1 st...","Customers sayCustomers like the vitamins, perf...","[Performance, Smell, Vitamins, Health benefits...",Health & Household › Health Care › Women's Hea...,A. Williams5.0 out of 5 stars\nI’ve noticed a ...


In [175]:
def sleep_time(base,random=1):
    """Sleep for ``base`` seconds plus a random jitter.

    The jitter is drawn from ``np.random.randint(10000, 99999) / 10**5``,
    i.e. from 0.1 up to just under 1 second, and multiplied by ``random``.

    Args:
        base: Minimum number of seconds to sleep.
        random: Scale applied to the jitter. The default of 1 adds the full
            jitter; 0 disables it so the call sleeps exactly ``base`` seconds.
    """
    jitter = np.random.randint(10000,99999)/(10**5)
    time.sleep(base + jitter * random)

In [174]:
# Copy the parsed product table to the system clipboard.
pd.DataFrame(output).to_clipboard()

In [176]:
# Read the product URLs to download from the "URL" column of the CSV.
# Note: this rebinds `url_list`, which an earlier cell used for the names of
# the saved HTML files.
input_data = pd.read_csv("data/amazon_url_test.csv")
url_list = input_data["URL"].to_list()

In [None]:

# Single-page fetch; relies on `cookies` having been loaded by an earlier cell.
result = fetchUrl("https://www.amazon.com/Wordle-Party-Players-Official-Inspired/dp/B0B5B9CP17",cookies=cookies)

In [None]:
# Download one batch of product pages and save each as
# data/amazon_html/amazon_<index>.html, where <index> is the URL's position in
# `url_list`. The index comes from enumerate so it cannot drift from the slice.
START_INDEX = 1500
END_INDEX = 2000

for i, url in enumerate(tqdm(url_list[START_INDEX:END_INDEX]), start=START_INDEX):
    # Append the language parameter without breaking URLs that already carry
    # a query string.
    separator = "&" if "?" in url else "?"
    url = f"{url}{separator}language=en_US"
    sleep_time(1)
    # The cookie file is re-read on every iteration.
    # NOTE(review): presumably so it can be refreshed while the loop runs -- confirm.
    cookies = amazon_cookie_manage()
    response = fetchUrl(url,cookies=cookies)
    with open(f"data/amazon_html/amazon_{i}.html","w",encoding="utf-8") as file:
        file.write(response.text)
    sleep_time(3)

  5%|███▊                                                                           | 24/500 [03:24<1:06:31,  8.39s/it]