# 京东Top50充电宝商品&店铺分析

In [None]:
import pandas as pd
from hanlp import HanLP
from collections import Counter

In [None]:
# =============================
# 1. Read the review CSV
# =============================
df = pd.read_csv("jd_reviews_all.csv")

print(f"读取完成：共有 {len(df)} 行")

# =============================
# 2. Load the lightweight single-task tokenization model
# =============================
tokenizer = HanLP.load('FINE_ELECTRA_SMALL_ZH')

print("HanLP 分词模型加载完成：FINE_ELECTRA_SMALL_ZH")

# =============================
# 3. Tokenize every negative review into one flat token list
# =============================
# The review column names do not depend on the row, so build them once.
review_columns = [f'negative_review_{i}' for i in range(1, 6)]

all_tokens = []
for _, record in df.iterrows():
    for column in review_columns:
        text = record.get(column, '')
        # Only non-blank strings are tokenized; anything else is skipped.
        if not isinstance(text, str) or not text.strip():
            continue
        all_tokens += tokenizer(text)

print(f"共分词数量：{len(all_tokens)}")

In [None]:
# =============================
# 4. Load the stop-word list
# =============================
# 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark.
# With plain 'utf-8' a BOM would stay attached to the first stop word
# (str.strip() does not remove it), so that word would never match a token.
with open('stopwords.txt', 'r', encoding='utf-8-sig') as f:
    # Strip each line once and keep only the non-empty results.
    stopwords = {word for word in map(str.strip, f) if word}

print(f"加载停用词表完成，共 {len(stopwords)} 个停用词")

# =============================
# 5. Remove stop words and (optionally) single-character tokens
# =============================
filtered_tokens = [w for w in all_tokens if w not in stopwords and len(w) > 1]

print(f"过滤后剩余词数：{len(filtered_tokens)}")

In [None]:
# =============================
# 6. Count token frequencies
# =============================
freq = Counter()
freq.update(filtered_tokens)

# Print the 50 most frequent tokens, most common first
print("Top 50 高频词：")
top_entries = freq.most_common(50)
for entry in top_entries:
    word, count = entry
    print(f"{word}: {count}")

In [None]:
# =============================
# 7. Save the frequency table to CSV
# =============================
# One (word, count) pair per row, ordered from most to least frequent.
ranked_pairs = freq.most_common()
df_freq = pd.DataFrame(ranked_pairs, columns=["word", "count"])

# Written without the index column, encoded as UTF-8 with a BOM.
df_freq.to_csv(
    "word_frequency.csv",
    index=False,
    encoding="utf-8-sig",
)

print("\n已保存到 word_frequency.csv")