# JD.com (Jingdong): Extract Review Info for the Top 50 Power Banks

In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options  # for change chrome setting 
from selenium.webdriver.chrome.service import Service  # for manage chromedriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import time
import random
import json
from pprint import pprint

In [2]:
def get_chromedriver(webdriver_path="chromedriver.exe",
                     disable_logging=True,
                     headless=False,
                     user_agent=None,
                     disable_automation_features=True,
                     patch_webdriver_property=True,
                     implicit_wait=10):
    """
    Launch a Chrome WebDriver configured for scraping.

    Args:
        webdriver_path (str): Path to chromedriver.exe. Defaults to the
            current directory.
        disable_logging (bool): Whether to exclude Chrome's "enable-logging"
            switch to suppress console logs. Defaults to True.
        headless (bool): Whether to run Chrome in headless mode (no GUI).
            Defaults to False.
        user_agent (str or None): Custom User-Agent string. If None, the
            browser default is kept. Your current User-Agent can be looked up
            at https://www.whatismybrowser.com/detect/what-is-my-user-agent/.
        disable_automation_features (bool): Whether to turn off Chrome's
            automation flags (AutomationControlled blink feature, automation
            extension, "enable-automation" switch). Defaults to True.
        patch_webdriver_property (bool): Whether to inject a script that makes
            navigator.webdriver return undefined on every new document.
            Defaults to True.
        implicit_wait (int): Global implicit wait time in seconds.
            Defaults to 10.

    Returns:
        webdriver.Chrome: Initialized Chrome WebDriver instance.
    """
    options = Options()

    # "excludeSwitches" is ONE experimental option: each call to
    # add_experimental_option() with that name replaces the previous value.
    # Collect every switch first and register the list once, so enabling both
    # features no longer silently drops "enable-logging".
    excluded_switches = []

    # Suppress Chrome logs
    if disable_logging:
        excluded_switches.append("enable-logging")

    # Enable headless mode (no browser UI)
    if headless:
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        # Chrome parses the window size as "WIDTH,HEIGHT" (comma, not "x").
        options.add_argument("--window-size=1920,1080")

    # Set custom User-Agent if provided
    if user_agent is not None:
        options.add_argument(f"user-agent={user_agent}")

    # Disable automation detection features
    if disable_automation_features:
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_experimental_option("useAutomationExtension", False)
        excluded_switches.append("enable-automation")

    if excluded_switches:
        options.add_experimental_option("excludeSwitches", excluded_switches)

    # Initialize WebDriver
    wd = webdriver.Chrome(service=Service(webdriver_path), options=options)

    # Apply the global implicit wait
    wd.implicitly_wait(implicit_wait)

    # Make navigator.webdriver report undefined on every new document
    if patch_webdriver_property:
        wd.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                })
            """
        })

    return wd


In [3]:
def random_sleep(min_sec=9.0, max_sec=12.0):
    """
    Pause execution for a uniformly random number of seconds.

    The randomised pause imitates human pacing so the scraper is less likely
    to be flagged as an automated script.

    Args:
        min_sec (float): Lower bound of the pause in seconds. Defaults to 9.0.
        max_sec (float): Upper bound of the pause in seconds. Defaults to 12.0.
    """
    time.sleep(random.uniform(min_sec, max_sec))

In [4]:
def clean_and_add_cookies(wd, cookie_path):
    """
    Load cookies from a JSON file, normalise them, and add them to the driver.

    The file is expected to contain a JSON list of cookie dicts. Each cookie's
    "sameSite" value is rewritten to one of "Strict", "Lax" or "None":
    case variants such as "lax" / "strict" keep their meaning, and anything
    else (missing, null, "no_restriction", "unspecified", ...) becomes "None".

    Args:
        wd (webdriver.Chrome): The initialized Chrome WebDriver instance; only
            its add_cookie() method is used.
        cookie_path (str): Path to the cookie file in JSON format.
    """
    with open(cookie_path, "r", encoding="utf-8") as f:
        cookies = json.load(f)

    # Lower-cased spelling -> the exact spelling used when adding the cookie.
    canonical_same_site = {"strict": "Strict", "lax": "Lax", "none": "None"}

    for cookie in cookies:
        raw_value = cookie.get("sameSite")
        lookup_key = raw_value.lower() if isinstance(raw_value, str) else ""
        # Unknown or absent values fall back to "None", as before.
        cookie["sameSite"] = canonical_same_site.get(lookup_key, "None")

    # Add the cleaned cookies to the WebDriver
    for cookie in cookies:
        wd.add_cookie(cookie)

## Extract Top 50 Powerbanks SKUs

In [5]:
# Desktop Chrome User-Agent string handed to the browser via get_chromedriver()
ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"

# Initialize the Chrome WebDriver (visible window, 30 s implicit wait)
driver = get_chromedriver(
    webdriver_path="../../chromedriver.exe",
    disable_logging=True,
    headless=False,
    user_agent=ua,
    disable_automation_features=True,
    patch_webdriver_property=True,
    implicit_wait=30
)

In [None]:
# Paths and URLs used by the SKU-extraction cells below
cookie_path = "../data/jd_cache.json"
# Alternative category URL without the sort parameters (kept for reference):
# jd_category_page = "https://list.jd.com/list.html?cat=9987,830,13658"
jd_category_page = "https://list.jd.com/list.html?cat=9987%2C830%2C13658&psort=3&psort=3&0.7649336460749802#J_main"
jd_skus_path = "../data/jd_top50_skus.csv"

# Navigate to the JD homepage first so the browser is on the site
# before clean_and_add_cookies() is called
print("Navigating to JD homepage...")
driver.get("https://www.jd.com/?country=USA")
random_sleep()

# Load cookies from the JSON file and add them to the WebDriver
print("Loading cookies from file...")
clean_and_add_cookies(driver, cookie_path)

# Refresh the page to ensure cookies are applied
print("Refreshing the page to apply cookies...")
driver.refresh()
random_sleep()

Navigating to JD homepage...
Loading cookies from file...
Refreshing the page to apply cookies...


In [None]:
# Navigate to the JD category page (URL defined in the previous cell),
# then pause for a random interval before touching the page
print("Navigating to JD category page...")
driver.get(jd_category_page)
random_sleep()

Navigating to JD category page...


In [29]:
# Count the product tiles currently present in the goods list before scrolling
items_before = len(driver.find_elements(By.CSS_SELECTOR, "#J_goodsList .gl-item"))
# Equivalent XPath lookup (kept for reference):
# items_before = len(driver.find_elements(By.XPATH, '//*[@id="J_goodsList"]//li[contains(@class, "gl-item")]'))
print(f"Items before scrolling: {items_before}")

# Scroll to the bottom of the page to load more products
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
random_sleep()

# Re-count after the pause to see how many tiles the scroll added
items_after = len(driver.find_elements(By.CSS_SELECTOR, "#J_goodsList .gl-item"))
print(f"Items after scrolling: {items_after}")

Items before scrolling: 30
Items after scrolling: 60


In [31]:
# Get the full list of items after scrolling
items = driver.find_elements(By.CSS_SELECTOR, "#J_goodsList .gl-item")
print(f"Total items found: {len(items)}")

# Extract SKU and store information from the first 50 items
sku_store_list = []

for i, item in enumerate(items[:50]):
    sku = item.get_attribute("data-sku")

    # find_elements() returns an empty list when nothing matches, so a tile
    # without a shop label yields "" without a catch-all except that would
    # also swallow KeyboardInterrupt and unrelated errors.
    shop_elems = item.find_elements(By.CSS_SELECTOR, ".p-shop")
    store = shop_elems[0].text.strip() if shop_elems else ""

    # Tiles without a data-sku attribute are skipped
    if sku:
        sku_store_list.append({
            "SKU": sku,
            "Store": store
        })
        print(f"Item {i+1}: SKU = {sku}, Store = {store}")


Total items found: 60
Item 1: SKU = 100024102298, Store = 罗马仕京东自营旗舰店
Item 2: SKU = 100096418777, Store = 小米京东自营旗舰店
Item 3: SKU = 100097634241, Store = CUKTECH酷态科京东自营旗舰店
Item 4: SKU = 100073745484, Store = 罗马仕京东自营旗舰店
Item 5: SKU = 100140584252, Store = 倍思（Baseus）京东自营旗舰店
Item 6: SKU = 100105445886, Store = 绿联（UGREEN）京东自营旗舰店
Item 7: SKU = 100200931898, Store = 罗马仕京东自营旗舰店
Item 8: SKU = 100157733819, Store = 罗马仕京东自营旗舰店
Item 9: SKU = 100199404088, Store = CUKTECH酷态科京东自营旗舰店
Item 10: SKU = 10127956763136, Store = epcbook京东自营官方旗舰店
Item 11: SKU = 100133826752, Store = 京东京造自营旗舰店
Item 12: SKU = 100042985009, Store = 倍思（Baseus）京东自营旗舰店
Item 13: SKU = 100031532138, Store = 小米京东自营旗舰店
Item 14: SKU = 100144381044, Store = 绿联（UGREEN）京东自营旗舰店
Item 15: SKU = 100044819549, Store = 罗马仕京东自营旗舰店
Item 16: SKU = 10142242864010, Store = IMF旗舰店
Item 17: SKU = 10090856414152, Store = 超级马官方旗舰店
Item 18: SKU = 100047622232, Store = 睿量（REMAX）京东自营旗舰店
Item 19: SKU = 10129004694713, Store = 炫威达旗舰店
Item 20: SKU = 10006380301

In [32]:
# Persist the collected SKU/Store pairs; "utf-8-sig" writes a BOM-prefixed UTF-8 file
df = pd.DataFrame(sku_store_list)
df.to_csv(jd_skus_path, index=False, encoding="utf-8-sig")

In [33]:
# Shut down the browser session used for SKU extraction
driver.quit()

## Extract Review Info

In [5]:
def parse_count(text: str) -> int:
    """
    Parse a Chinese count string like '2万+', '200+' or '27' into an integer.

    Surrounding whitespace, a trailing '+' (ASCII or full-width) and thousands
    separators are ignored. A trailing '万' multiplies by 10,000 and a trailing
    '亿' by 100,000,000.

    Args:
        text (str): The count string to parse.

    Returns:
        int: The parsed integer value.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    unit_factors = {"万": 10_000, "亿": 100_000_000}

    cleaned = text.strip().rstrip("+＋").strip().replace(",", "")
    if cleaned and cleaned[-1] in unit_factors:
        # round() instead of int(): the float product can land just below the
        # intended whole number, and truncation would then lose one unit.
        return round(float(cleaned[:-1]) * unit_factors[cleaned[-1]])
    return int(cleaned)

### Test for one product

In [6]:
# Desktop Chrome User-Agent string handed to the browser via get_chromedriver()
ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"

# Initialize the Chrome WebDriver (visible window, 30 s implicit wait)
driver = get_chromedriver(
    webdriver_path="../../chromedriver.exe",
    disable_logging=True,
    headless=False,
    user_agent=ua,
    disable_automation_features=True,
    patch_webdriver_property=True,
    implicit_wait=30
)

In [None]:
# Input files: exported cookies and the SKU list written by the extraction step
cookie_path = "../data/jd_cache.json"
jd_skus_path = "../data/jd_top50_skus.csv"

# Navigate to the JD homepage, then pause 12-15 s
driver.get("https://www.jd.com/?country=USA")
random_sleep(12, 15)

# Load cookies from the JSON file and add them to the WebDriver
clean_and_add_cookies(driver, cookie_path)

# Refresh the page to ensure cookies are applied
driver.refresh()
random_sleep(12, 15)

In [None]:
# Navigate to the JD powerbank product page
sku = "100024102298"
store = "罗马仕京东自营旗舰店"
url = f"https://item.jd.com/{sku}.html"
driver.get(url)
random_sleep()

In [None]:
# initialize with defaults
review = {
    "SKU": sku, 
    "product_name": "",
    "store": store,
    "positive_rate": "0%",
    "positive_count": 0,
    "neutral_count": 0,
    "negative_count": 0,
    "negative_review_1": "", 
    "negative_review_2": "",
    "negative_review_3": "",
    "negative_review_4": "",
    "negative_review_5": ""
}

# save the product name
review["product_name"] = driver.find_element(
    By.XPATH, '/html/body/div[4]/div/div[2]/div/div[5]/div[1]/span[1]'
).text.strip()

print(f"Product Name: {review['product_name']}")

Product Name: 罗马仕（ROMOSS）55W自带线充电宝【京东首发】20000毫安快充移动电源 可上飞机适用于小米/苹果/红米/笔记本 黑色


In [61]:
# Click the "All Reviews" button to expand the reviews section
all_reviews_button = WebDriverWait(driver, 30).until(
    EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="comment-root"]/div[3]/div')
    )
)
all_reviews_button.click()
random_sleep()

In [64]:
# wait for the review tags overlay
rate_list =  WebDriverWait(driver, 30).until(
    EC.visibility_of_element_located((By.ID, "rateList"))
)
tag_divs = driver.find_elements(
    By.XPATH, '//*[@id="rateList"]/div/div[2]/div/div'
)
random_sleep()

print(f"Total review tags found: {len(tag_divs)}")

Total review tags found: 15


In [65]:
# parse each tag
for tag in tag_divs:
    label = tag.find_element(By.XPATH, './span[1]').text.strip()
    value = tag.find_element(By.XPATH, './span[2]').text.strip()

    if label == "全部":
        m = re.match(r'(\d+\.?\d*)%', value)
        review["positive_rate"] = (m.group(1) + "%") if m else "0%"
    elif label == "好评":
        review["positive_count"] = parse_count(value)
    elif label == "中评":
        review["neutral_count"] = parse_count(value)
    elif label == "差评":
        review["negative_count"] = parse_count(value)
        tag.click()
        random_sleep()

print(review)

{'SKU': '100157733819', 'product_name': '罗马仕（ROMOSS）55W自带线充电宝【京东首发】20000毫安快充移动电源 可上飞机适用于小米/苹果/红米/笔记本 黑色', 'positive_rate': '97%', 'positive_count': 20000, 'neutral_count': 200, 'negative_count': 200, 'negative_review_1': '', 'negative_review_2': '', 'negative_review_3': '', 'negative_review_4': '', 'negative_review_5': ''}


In [70]:
# if there are bad reviews, scroll to load up to 5 items
if review["negative_count"] > 0:
    # the scrollable container for review list
    scrollable_div = driver.find_element(
        By.XPATH, '//*[@id="rateList"]/div/div[3]'
    )
    # scroll down a few times to load more reviews
    for _ in range(5):
        driver.execute_script("arguments[0].scrollTop += 300", scrollable_div)
        random_sleep(2, 3)

    # extract up to 5 bad review texts
    for i in range(1, 6):
        try:
            review_xpath = f'//*[@id="rateList"]/div/div[3]/div[3]/div/div/div/div/div[{i}]/div/div/div[2]/div[2]/span'
            review_elem = driver.find_element(By.XPATH, review_xpath)
            review[f"negative_review_{i}"] = review_elem.text.strip()
        except:
            # no more reviews at this index
            review[f"negative_review_{i}"] = ""

for i in range(1, 6):
    print(f"Negative Review {i}: {review[f'negative_review_{i}']}")

Negative Review 1: 比起我买的其它牌子10000m.的比这个20000m还能冲多几次电. 不建议购买
Negative Review 2: 充电慢，说的有线，送来没有线
Negative Review 3: 其他特色：充电中时不时停止充电
Negative Review 4: 不支持vivo闪充 呵呵
Negative Review 5: 不怎么样，缝隙挺大的，已经退货


### Test for multiple products

In [32]:
# Desktop Chrome User-Agent string handed to the browser via get_chromedriver()
ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"

# Initialize the Chrome WebDriver (visible window, 30 s implicit wait)
driver = get_chromedriver(
    webdriver_path="../../chromedriver.exe",
    disable_logging=True,
    headless=False,
    user_agent=ua,
    disable_automation_features=True,
    patch_webdriver_property=True,
    implicit_wait=30
)

In [33]:
# Input files: exported cookies and the SKU list written by the extraction step
cookie_path = "../data/jd_cache.json"
jd_skus_path = "../data/jd_top50_skus.csv"

# Navigate to the JD homepage
driver.get("https://www.jd.com/?country=USA")
random_sleep()

# Load cookies from the JSON file and add them to the WebDriver
clean_and_add_cookies(driver, cookie_path)

# Refresh the page to ensure cookies are applied
driver.refresh()
random_sleep()

In [None]:
# Zero-based, half-open window [start_idx, end_idx) of rows to process.
# The output file name uses the matching 1-based item numbers.
start_idx = 14
end_idx = 15
jd_reviews_path = f"../data/jd_reviews_{start_idx+1:02d}_{end_idx:02d}.csv"

# Read the SKU list from the CSV file
df = pd.read_csv(jd_skus_path, encoding="utf-8-sig")

# One summary dict per processed SKU
reviews_info = []

# Slice the window directly instead of walking the whole frame and skipping rows
for idx, row in df.iloc[start_idx:end_idx].iterrows():
    # Extract SKU and Store from the DataFrame row
    sku = row["SKU"]
    store = row["Store"]
    print(f"Processing item {idx + 1}/{len(df)}: SKU = {sku}")

    review = {
        "SKU": sku,
        "product_name": "",
        "store": store,
        "positive_rate": "0%",
        "positive_count": 0,
        "neutral_count": 0,
        "negative_count": 0,
        "negative_review_1": "",
        "negative_review_2": "",
        "negative_review_3": "",
        "negative_review_4": "",
        "negative_review_5": ""
    }

    # Open this SKU's product page. Without this step every iteration would
    # scrape whatever page the browser happened to be showing already.
    driver.get(f"https://item.jd.com/{sku}.html")
    random_sleep()

    # Save the product name
    review["product_name"] = driver.find_element(
        By.XPATH, '/html/body/div[4]/div/div[2]/div/div[5]/div[1]/span[1]'
    ).text.strip()

    # Click the "All Reviews" button to expand the reviews section
    all_reviews_button = WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable(
            (By.XPATH, '//div[@id="comment-root"]/div[@class="all-btn"]')
        )
    )
    all_reviews_button.click()
    random_sleep()

    # Wait for the review tags overlay; the returned element itself is not needed
    WebDriverWait(driver, 30).until(
        EC.visibility_of_element_located((By.ID, "rateList"))
    )
    tag_divs = driver.find_elements(
        By.XPATH, '//*[@id="rateList"]/div/div[2]/div/div'
    )
    random_sleep()

    # Parse each tag: first span is the label, second span is its value
    for tag in tag_divs:
        label = tag.find_element(By.XPATH, './span[1]').text.strip()
        value = tag.find_element(By.XPATH, './span[2]').text.strip()

        if label == "全部":
            m = re.match(r'(\d+\.?\d*)%', value)
            review["positive_rate"] = (m.group(1) + "%") if m else "0%"
        elif label == "好评":
            review["positive_count"] = parse_count(value)
        elif label == "中评":
            review["neutral_count"] = parse_count(value)
        elif label == "差评":
            review["negative_count"] = parse_count(value)
            # Click the "差评" tag to load negative reviews
            tag.click()
            random_sleep()

    # If there are negative reviews, scroll to load them and keep up to 5 texts
    if review["negative_count"] > 0:
        # The scrollable container for the review list
        scrollable_div = driver.find_element(
            By.XPATH, '//*[@id="rateList"]/div/div[3]'
        )
        # Scroll down a few times to load more reviews
        for _ in range(5):
            driver.execute_script("arguments[0].scrollTop += 300", scrollable_div)
            random_sleep(2, 3)

        # Extract up to 5 negative review texts
        for i in range(1, 6):
            review_xpath = f'//*[@id="rateList"]/div/div[3]/div[@class="_list_1ygkr_67"]/div/div/div/div/div[{i}]/div/div/div[2]/div[2]/span'
            # find_elements() yields an empty list when the i-th review is
            # absent, so no catch-all except is needed.
            matches = driver.find_elements(By.XPATH, review_xpath)
            review[f"negative_review_{i}"] = matches[0].text.strip() if matches else ""

    # Append the review to the list
    reviews_info.append(review)

Processing item 15/50: SKU = 10146872281876


In [35]:
# Pretty-print the collected review records for a visual check
pprint(reviews_info)

[{'SKU': 10146872281876,
  'negative_count': 3,
  'negative_review_1': '差评，原装自带的线不是快充，边冲边用，电量不长',
  'negative_review_2': '充电宝非常垃圾，充满电用不了两次就没电了！还说2万的！连5000的都比不了！购买的时候一定要注意！',
  'negative_review_3': '我真的要笑死了。标个小米跟我说适用于小米机型，我是冲着小米机型来买的吗？',
  'negative_review_4': '',
  'negative_review_5': '',
  'neutral_count': 0,
  'positive_count': 4,
  'positive_rate': '56%',
  'product_name': '小米（MI)米家适用66W充电宝20000毫安超大容量双向超级快充便携户外移动电源 商务黑【快充】 20000毫安'}]


In [13]:
# Shut down the browser session used for the multi-product run
driver.quit()

In [None]:
# Export to CSV file
reviews_df = pd.DataFrame(reviews_info)
reviews_df.to_csv(jd_reviews_path, index=False, encoding="utf-8-sig")

### Test for functions in .py file

In [7]:
from jd_pb_50review import *

In [8]:
# Shared settings for exercising the functions imported from jd_pb_50review
webdriver_path = "../../chromedriver.exe"
cookie_path = "../data/jd_cache.json"
category_url = "https://list.jd.com/list.html?cat=9987%2C830%2C13658&psort=3&psort=3&0.7649336460749802#J_main"
jd_skus_path = "../data/jd_top50_skus.csv"
ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"

In [20]:
# Scrape one batch of SKUs; the output file name carries the 1-based
# first item number and the end index (here: 46 to 50)
start_idx = 45
end_idx = 50
jd_reviews_path = f"../data/jd_reviews_{start_idx+1:02d}_{end_idx:02d}.csv"
# scrape_jd_reviews is not defined in this notebook; presumably it comes from
# the star import of jd_pb_50review — verify against that module
scrape_jd_reviews(
    webdriver_path=webdriver_path,
    cookie_path=cookie_path,
    skus_csv_path=jd_skus_path,
    output_csv_path=jd_reviews_path,
    user_agent=ua,
    start_idx=start_idx,
    end_idx=end_idx,
    headless=False
)

Processing item 46/50: SKU = 10119956611544
Processing item 47/50: SKU = 10141253217046
Processing item 48/50: SKU = 100133267851
Processing item 49/50: SKU = 10136859742730
Processing item 50/50: SKU = 10060515593137
Saved reviews for SKUs 46 to 50 into ../data/jd_reviews_46_50.csv


[{'SKU': 10119956611544,
  'product_name': '威顿【66W超级快充丨国家3C认证】充电宝20000毫安适用小米移动电源大容量自带线户外可上飞机小巧便携 【皓月白】20000毫安+品牌电芯+快至当日达 2025年新款丨自带三快充线丨国家3C认证',
  'store': '得意影音娱乐专营店',
  'positive_rate': '99%',
  'positive_count': 2000,
  'neutral_count': 0,
  'negative_count': 3,
  'negative_review_1': '充了几个小时还是1%垃圾充电宝，',
  'negative_review_2': '',
  'negative_review_3': '',
  'negative_review_4': '',
  'negative_review_5': ''},
 {'SKU': 10141253217046,
  'product_name': '充电宝20000毫安大容量超级快充自带线可上飞机移动电源适用小米 蓝-标准版【自带四线+普通快充】 【极速发货/次日达】可上飞机',
  'store': '米链手机数码配件工作室',
  'positive_rate': '99%',
  'positive_count': 1000,
  'neutral_count': 1,
  'negative_count': 0,
  'negative_review_1': '',
  'negative_review_2': '',
  'negative_review_3': '',
  'negative_review_4': '',
  'negative_review_5': ''},
 {'SKU': 100133267851,
  'product_name': 'CUKTECH酷态科10号超级电能棒Plus15000毫安时充电宝兼容120W/100W大功率快充支持笔记本/手机/小米/苹果',
  'store': 'CUKTECH酷态科京东自营旗舰店',
  'positive_rate': '99%',
  'positive_count': 5000,
  'neutral_count': 3

In [14]:
# Start a fresh visible browser session and open the JD homepage
# (no cookies are loaded in this cell)
driver = get_chromedriver(
    webdriver_path="../../chromedriver.exe",
    disable_logging=True,
    headless=False,
    user_agent=ua,
    disable_automation_features=True,
    patch_webdriver_property=True,
    implicit_wait=30
)
driver.get("https://www.jd.com/?country=USA")

In [18]:
# Shut down the manually opened browser session
driver.quit()

## Combine All CSV Files into One

In [21]:
import os
import pandas as pd
from typing import List

def combine_jd_reviews_csv(
    data_dir: str = "../data/",
    output_filename: str = "jd_reviews_all.csv"
) -> None:
    """
    Combine all CSV files with 'jd_reviews' in their filename from the given
    directory into a single CSV file.

    Input files are read in sorted filename order. The output file itself is
    never used as an input, so running the function again does not duplicate
    rows from a previous run.

    Args:
        data_dir (str): Directory containing the review CSV files.
        output_filename (str): Name of the combined output CSV file, written
            into data_dir.
    """
    # List all files in the data directory
    files: List[str] = os.listdir(data_dir)
    # Keep CSV files that contain 'jd_reviews' in the filename, excluding the
    # combined output (its default name matches the same pattern). Sorting
    # makes the row order independent of the directory listing order.
    review_files = sorted(
        f for f in files
        if f.endswith(".csv") and "jd_reviews" in f and f != output_filename
    )

    if not review_files:
        print(f"No 'jd_reviews' CSV files found in {data_dir}")
        return

    # Read each CSV into a DataFrame
    df_list: List[pd.DataFrame] = []
    for filename in review_files:
        path = os.path.join(data_dir, filename)
        print(f"Reading {path}...")
        df_list.append(pd.read_csv(path, encoding="utf-8-sig"))

    # Concatenate all DataFrames
    combined_df = pd.concat(df_list, ignore_index=True)
    output_path = os.path.join(data_dir, output_filename)

    # Save the combined DataFrame to CSV
    combined_df.to_csv(output_path, index=False, encoding="utf-8-sig")
    print(f"Saved combined reviews to {output_path}")

In [22]:
# Merge every per-batch jd_reviews CSV under ../data/ into one file
combine_jd_reviews_csv(
    data_dir="../data/",
    output_filename="jd_reviews_all.csv"
)

Reading ../data/jd_reviews_01_05.csv...
Reading ../data/jd_reviews_06_10.csv...
Reading ../data/jd_reviews_11_15.csv...
Reading ../data/jd_reviews_16_20.csv...
Reading ../data/jd_reviews_21_25.csv...
Reading ../data/jd_reviews_26_30.csv...
Reading ../data/jd_reviews_31_35.csv...
Reading ../data/jd_reviews_36_40.csv...
Reading ../data/jd_reviews_41_45.csv...
Reading ../data/jd_reviews_46_50.csv...
Saved combined reviews to ../data/jd_reviews_all.csv


In [24]:
# Reload the combined file to confirm the row count and preview the first rows
reviews_df = pd.read_csv("../data/jd_reviews_all.csv", encoding="utf-8-sig")
print(f"Total reviews loaded: {len(reviews_df)}")
reviews_df.head()

Total reviews loaded: 50


Unnamed: 0,SKU,product_name,store,positive_rate,positive_count,neutral_count,negative_count,negative_review_1,negative_review_2,negative_review_3,negative_review_4,negative_review_5
0,100024102298,罗马仕【热销200万+件】充电宝20000毫安22.5W快充 可上飞机苹果20W快充 适用华...,罗马仕京东自营旗舰店,97%,200000,2000,5000,此用户未填写评价内容,此用户未及时填写评价内容,此用户未及时填写评价内容,此用户未填写评价内容,此用户未及时填写评价内容
1,100096418777,小米（MI）小米自带线充电宝20000 33W 浅咖色 适用小米/红米/redmi/苹果/安...,小米京东自营旗舰店,98%,100000,1000,1000,充电过程中插头过热烫手,刚买了几天就发现降价了，体验感不好,此用户未填写评价内容,容量很一般，性价比不高！,嗯，那个壳被压的稀烂了，还是有点小
2,100097634241,CUKTECH酷态科电能块自带线20000毫安时移动电源55W快充充电宝可上飞机适用苹果16...,CUKTECH酷态科京东自营旗舰店,97%,20000,200,200,充电一个小时一直卡在99格,说好的可以上飞机呢，到机场都给我扣了,此用户未及时填写评价内容,国补的东西有点小瑕疵哈,刚买就降价了，还没有价保，幽默
3,100073745484,罗马仕30W自带线充电宝20000毫安时双向快充 支持苹果15可上飞机 适用于苹果华为手机平...,罗马仕京东自营旗舰店,98%,1000000,10000,10000,刚买1天就降价，看到心不爽,根本没收到货，快递丢了,此用户未填写评价内容,第一天买了第二天就降价了！,东西不错快递慢了一天
4,100140584252,倍思（Baseus）充电宝20000毫安自带线大容量22.5W快充极速数显适用于苹果15/1...,倍思（Baseus）京东自营旗舰店,98%,5000,90,89,买回来就那个上面就有划痕花了唉,才用了一次就充不上电,打开快递发现盒子已经被撕开的痕迹，明显这不是纯新包装,充满了，没用会自己掉电,早上八点开始充电，充到晚上九点五十，充到了27%。离谱！
