In [1]:
import pandas as pd
import re
import os
from collections import Counter
import spacy
import torch
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import glob

# 查找Data目录中所有以_processed.xlsx结尾的文件
processed_files = glob.glob(os.path.join("Data", "*_processed.xlsx"))

# 确保找到了文件
if not processed_files:
    print("错误：在Data目录中没有找到任何以_processed.xlsx结尾的文件")
    Asin_List_file = None  # 设置为None表示未找到文件
    input_file_path = None
    input_file = None
else:
    # 获取第一个匹配文件的完整路径
    input_file_path = processed_files[0]
    
    # 同时设置input_file为选定文件的路径字符串(不是列表)
    input_file = input_file_path
    
    # 仅提取文件名（不包含路径）
    Asin_List_file = os.path.basename(input_file_path)
    
    print(f"找到了 {len(processed_files)} 个处理过的Excel文件:")
    for file in processed_files:
        file_name = os.path.basename(file)
        if file_name == Asin_List_file:
            print(f" - {file_name} (已选择)")
        else:
            print(f" - {file_name}")
    
    print(f"\nAsin_List_file = \"{Asin_List_file}\"")
    print(f"完整文件路径：{input_file}")

# 创建并保存输出目录路径为全局变量，以便在所有代码块中使用
output_dir = '生成结果/social_media/'
print(f"输出目录: {output_dir}")

# 创建目录（如果不存在）
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# # 从Excel文件名中提取基础文件夹名称
# base_folder_name = os.path.splitext(input_file)[0]

# # 创建目录（如果不存在）
# if not os.path.exists(base_folder_name):
#     os.makedirs(base_folder_name)

# 创建并保存输出目录路径为全局变量，以便在所有代码块中使用
output_dir = '生成结果/social_media/'
print(f"输出目录: {output_dir}")

# 创建目录（如果不存在）
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 检查GPU加速
def get_device():
    if torch.cuda.is_available():
        device = "cuda"
        print(f"使用CUDA加速 - {torch.cuda.get_device_name(0)}")
    elif torch.backends.mps.is_available():
        device = "mps"
        print("使用MPS加速")
    else:
        device = "cpu"
        print("使用CPU")
    return device

# 替换原来的设备检测
device = get_device()

# 清理文本数据
def clean_text(text):
    if not isinstance(text, str):
        return ""
    # 转为小写
    text = text.lower()
    # 移除特殊字符和数字，但保留产品相关的关键字符如"-", "+", "&"
    text = re.sub(r'[^\w\s\-\+&]', ' ', text)
    # 移除多余空格
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# 清理和规范化关键词
def clean_keyword(keyword):
    if not isinstance(keyword, str):
        return ""
    
    # 删除常见的非规范开头字符
    keyword = re.sub(r'^[\'"\*\-\(\)\[\]\{\}]+', '', keyword)
    
    # 删除常见的非规范结尾字符
    keyword = re.sub(r'[\'"\*\-\(\)\[\]\{\}]+$', '', keyword)
    
    # 移除奇怪的符号和标点
    keyword = re.sub(r'[^\w\s\-]', '', keyword)
    
    # 规范化空格
    keyword = re.sub(r'\s+', ' ', keyword).strip()
    
    # 移除数字序号开头的情况
    keyword = re.sub(r'^\d+[\.\)\-]\s*', '', keyword)
    
    # 检查关键词长度和格式
    if len(keyword) <= 1:
        return ""
    if keyword.count(' ') > 3:  # 不超过4个词的组合
        return ""
    if len(keyword) > 50:  # 不超过50个字符
        return ""
    
    return keyword

# 步骤2: 数据读取和预处理
def load_and_preprocess_data():
    # 根据文件扩展名决定读取方式
    print("正在读取数据文件...")
    file_extension = os.path.splitext(input_file)[1].lower()
    
    try:
        if file_extension == '.csv':
            df = pd.read_csv(input_file)
            print(f"成功读取CSV文件: {len(df)} 条记录")
        elif file_extension in ['.xlsx', '.xls']:
            df = pd.read_excel(input_file, sheet_name=0)
            print(f"成功读取Excel文件: {len(df)} 条记录")
        else:
            print(f"不支持的文件格式: {file_extension}")
            df = pd.DataFrame()
    except Exception as e:
        print(f"读取文件时出错: {e}")
        df = pd.DataFrame()
    
    # 确保需要的列存在
    required_columns = ['title', 'nodeLabelPath']
    for col in required_columns:
        if col not in df.columns:
            df[col] = ""
    
    # 只汇总'title'和'nodeLabelPath'两列的文本
    df['combined_text'] = df.apply(
        lambda row: f"{row['title']} {row['nodeLabelPath']}", 
        axis=1
    )
    
    # 清理组合文本
    df['cleaned_text'] = df['combined_text'].apply(clean_text)
    
    # 去重步骤
    original_count = len(df)
    df = df.drop_duplicates(subset=['cleaned_text'])
    removed_count = original_count - len(df)
    print(f"去重完成: 移除了 {removed_count} 条重复记录，剩余 {len(df)} 条记录")
    
    return df


# 修改步骤3: 加载NLP模型（不使用spaCy）
# 加载NLP模型时明确指定设备
def load_nlp_models():
    print("正在加载NLP模型...")
    
    # 尝试使用spaCy
    nlp = None
    try:
        import spacy
        nlp = spacy.load("en_core_web_sm")
        print("成功加载spaCy模型")
    except:
        print("无法加载spaCy模型，将使用简单文本处理")
    
    # 明确加载并移动模型到适当设备
    print(f"加载facebook/bart-large-mnli模型到 {device} 设备...")
    classifier = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=0 if device == "cuda" else device  # 对CUDA使用GPU索引
    )
    
    return nlp, classifier

# 修改步骤4: 从文本提取初始关键词（不依赖spaCy）
def extract_initial_keywords(df, nlp):
    print("提取初始关键词...")
    all_keywords = []
    
    # 如果spaCy可用，使用spaCy提取
    if nlp is not None:
        print("使用spaCy提取关键词")
        for _, row in df.iterrows():
            cleaned_text = row['cleaned_text']
            
            # 只处理有意义的文本
            if len(cleaned_text) > 5:
                # 使用spaCy进行分析
                doc = nlp(cleaned_text)
                
                # 提取名词和形容词
                for token in doc:
                    if token.pos_ in ['NOUN', 'PROPN', 'ADJ'] and len(token.text) > 2:
                        keyword = clean_keyword(token.lemma_)
                        if keyword:
                            all_keywords.append(keyword)
                
                # 提取名词短语
                for chunk in doc.noun_chunks:
                    if len(chunk.text) > 2:
                        keyword = clean_keyword(clean_text(chunk.text))
                        if keyword:
                            all_keywords.append(keyword)
    else:
        # 使用简单的文本分词方法
        print("使用简单方法提取关键词")
        for _, row in df.iterrows():
            cleaned_text = row['cleaned_text']
            
            # 只处理有意义的文本
            if len(cleaned_text) > 5:
                # 简单分词
                words = cleaned_text.split()
                
                # 提取单词
                for word in words:
                    if len(word) > 2:  # 只保留长度大于2的词
                        keyword = clean_keyword(word)
                        if keyword:
                            all_keywords.append(keyword)
                
                # 提取词组（简单的2-gram和3-gram）
                for i in range(len(words)-1):
                    if i < len(words)-1:
                        bigram = f"{words[i]} {words[i+1]}"
                        keyword = clean_keyword(bigram)
                        if keyword:
                            all_keywords.append(keyword)
                    
                    if i < len(words)-2:
                        trigram = f"{words[i]} {words[i+1]} {words[i+2]}"
                        keyword = clean_keyword(trigram)
                        if keyword:
                            all_keywords.append(keyword)
    
    return all_keywords

# 步骤5: 使用TF-IDF提取重要关键词并自动生成产品标签
def extract_tfidf_keywords_and_labels(df, max_features=200):
    print("使用TF-IDF提取文档级关键词并生成产品标签...")
    
    vectorizer = TfidfVectorizer(
        max_features=max_features, 
        stop_words='english',
        ngram_range=(1, 2)  # 提取1-gram和2-gram
    )
    
    # 拟合并转换文档
    tfidf_matrix = vectorizer.fit_transform(df['cleaned_text'].tolist())
    
    # 获取特征名称（关键词）
    feature_names = vectorizer.get_feature_names_out()
    
    # 计算每个词的平均TF-IDF得分
    mean_tfidf_scores = np.array(tfidf_matrix.mean(axis=0)).flatten()
    
    # 按得分排序
    sorted_indices = np.argsort(mean_tfidf_scores)[::-1]
    top_feature_indices = sorted_indices[:max_features]
    
    # 获取顶级关键词
    top_keywords = []
    for i in top_feature_indices:
        keyword = clean_keyword(feature_names[i])
        if keyword:
            top_keywords.append(keyword)
    
    # 自动生成产品标签列表 - 使用TF-IDF得分最高的前10个词作为标签
    # 不需要任何预定义的模式或关键词
    product_labels = top_keywords[:10]
    
    print(f"自动生成的产品标签列表: {product_labels}")
    
    return top_keywords, product_labels

# 步骤6: 使用BART-MNLI模型过滤关键词 - 优化版本
def filter_keywords_with_bart(keywords, product_labels, classifier):
    print("使用BART-MNLI模型过滤关键词...")
    
    # 如果没有足够的产品标签，则使用关键词本身作为标签
    if not product_labels:
        print("没有足够的产品标签，使用关键词自身作为标签")
        counter = Counter(keywords)
        product_labels = [kw for kw, _ in counter.most_common(10)]
    
    # 增加批量大小以充分利用GPU
    batch_size = 128 if device == "cuda" else 20
    total_batches = (len(keywords) + batch_size - 1) // batch_size
    
    filtered_keywords = []
    
    print(f"处理 {len(keywords)} 个关键词，分为 {total_batches} 批")
    
    # 为所有关键词准备查询
    queries = [f"Relevance of the term: {keyword}" for keyword in keywords]
    
    # 批量处理关键词
    for i in range(0, len(keywords), batch_size):
        print(f"处理批次 {i//batch_size + 1}/{total_batches}")
        batch_keywords = keywords[i:i+batch_size]
        batch_queries = queries[i:i+batch_size]
        
        try:
            # 使用模型的批处理能力一次性处理整个批次
            batch_results = classifier(
                batch_queries, 
                candidate_labels=product_labels,
                multi_label=True,
                batch_size=32  # 控制内部批处理大小
            )
            
            # 如果返回的不是列表（单个查询的情况），则转换为列表
            if not isinstance(batch_results, list):
                batch_results = [batch_results]
            
            # 处理每个结果
            for j, result in enumerate(batch_results):
                if max(result['scores']) > 0.3:
                    filtered_keywords.append(batch_keywords[j])
                
        except Exception as e:
            print(f"处理批次 {i//batch_size + 1} 时出错: {e}")
            # 出错时改为逐个处理
            for j, keyword in enumerate(batch_keywords):
                try:
                    result = classifier(
                        batch_queries[j],
                        candidate_labels=product_labels,
                        multi_label=True
                    )
                    if max(result['scores']) > 0.3:
                        filtered_keywords.append(keyword)
                except Exception as e:
                    print(f"过滤关键词时出错 '{keyword}': {e}")
    
    print(f"过滤后保留了 {len(filtered_keywords)} 个关键词")
    return filtered_keywords

# 步骤7: 组合、去重和排序关键词
def process_final_keywords(keywords):
    # 再次清理每个关键词
    cleaned_keywords = []
    for kw in keywords:
        cleaned = clean_keyword(kw)
        if cleaned:
            # 检查关键词的组成
            words = cleaned.split()
            if len(words) > 1:
                # 检查多词组合中是否有无意义的词
                if all(len(word) > 1 for word in words):
                    cleaned_keywords.append(cleaned)
            else:
                # 单词关键词至少要有2个字符
                if len(cleaned) >= 2:
                    cleaned_keywords.append(cleaned)
    
    # 去重
    unique_keywords = list(set(cleaned_keywords))
    
    # 按字母顺序排序
    sorted_keywords = sorted(unique_keywords)
    
    return sorted_keywords

# 步骤8: 保存关键词到文件
def save_keywords(keywords):
    print("正在保存关键词...")
    # 将初始关键词保存到临时文件
    output_path = os.path.join(output_dir, 'keywords_raw.txt')
    
    with open(output_path, 'w', encoding='utf-8') as f:
        for keyword in keywords:
            # 确保每一行都是格式规范的关键词
            f.write(f"{keyword}\n")
    
    print(f"关键词已保存到 {output_path}")

找到了 1 个处理过的Excel文件:
 - 环球一百狗窝打标_校对_processed.xlsx (已选择)

Asin_List_file = "环球一百狗窝打标_校对_processed.xlsx"
完整文件路径：Data\环球一百狗窝打标_校对_processed.xlsx
输出目录: 生成结果/social_media/
输出目录: 生成结果/social_media/
使用CUDA加速 - NVIDIA GeForce RTX 4090 D


In [2]:
df = load_and_preprocess_data()
df

正在读取数据文件...
成功读取Excel文件: 202 条记录
去重完成: 移除了 6 条重复记录，剩余 196 条记录


Unnamed: 0,asin,skuList,overviews,brand,brandUrl,title,asinUrl,imageUrl,parent,nodeLabelPath,...,Carrying Case,Straps (Carry/Shoulder),dimensions.1,Height,Length,Length - Level,Width,InteriorPadHeight,combined_text,cleaned_text
0,B0D8H5ZSHC,Color: Black/Grey,Color:Black/Grey | Material:Velvet | Brand:Woo...,Wooaidagg,/stores/WOOAIDAGG/page/DA557548-E7D2-4825-93B1...,"Dog Car Seat for Medium Sized Dog,Pet Travel C...",https://www.amazon.com/dp/B0D8H5ZSHC?psc=1,https://m.media-amazon.com/images/I/41UPRkZow+...,B0D5LYP1DH,Pet Supplies:Dogs:Carriers & Travel Products:C...,...,['Not Mention'],['Not Mention'],20 x 20 x 20 inches,19.69,21.65,20.01-25,21.26,,"Dog Car Seat for Medium Sized Dog,Pet Travel C...",dog car seat for medium sized dog pet travel c...
1,B0DNSNMPWT,Pattern Name: BlackBrown,Color:BlackBrown | Brand:melafa365 | Maximum W...,melafa365,/stores/Qualityproductsqualityservice/page/B62...,"Dog Car Seat for Small/Medium Dogs, Memory Foa...",https://www.amazon.com/dp/B0DNSNMPWT?psc=1,https://m.media-amazon.com/images/I/51j7IiGGHt...,B0DNSKSWPN,Pet Supplies:Dogs:Carriers & Travel Products:C...,...,['Not Mention'],['Not Mention'],13.7 x 9 x 7 inches,19.00,17.00,15.01-20,17.00,6.00,"Dog Car Seat for Small/Medium Dogs, Memory Foa...",dog car seat for small medium dogs memory foam...
2,B0DP4DDJJT,Color: Black/Grey | Size: Medium,Color:Black/Grey | Material:Velvet | Brand:IND...,INDYBUD,/stores/INDYBUD/page/7FC74FB8-CCA1-45CA-A23C-3...,"Dog Booster Car Seat for Dogs Up to 35lbs, Saf...",https://www.amazon.com/dp/B0DP4DDJJT?psc=1,https://m.media-amazon.com/images/I/41mSv17wdk...,B0DKXNXNQT,Pet Supplies:Dogs:Carriers & Travel Products:C...,...,['Not Mention'],['Not Mention'],19 x 19 x 19 inches,19.00,19.00,15.01-20,19.00,5.91,"Dog Booster Car Seat for Dogs Up to 35lbs, Saf...",dog booster car seat for dogs up to 35lbs safe...
3,B0D1QYCCV2,Color: Black/Brown,Color:Black/Brown | Material:Velvet | Brand:Wo...,Wooaidagg,/stores/WOOAIDAGG/page/DA557548-E7D2-4825-93B1...,"Dog Car Seat for Medium Sized Dog,Pet Travel C...",https://www.amazon.com/dp/B0D1QYCCV2?psc=1,https://m.media-amazon.com/images/I/41NXcIa4I3...,B0D5LYP1DH,Pet Supplies:Dogs:Carriers & Travel Products:C...,...,['Not Mention'],['Not Mention'],20 x 20 x 20 inches,20.00,20.00,15.01-20,20.00,,"Dog Car Seat for Medium Sized Dog,Pet Travel C...",dog car seat for medium sized dog pet travel c...
4,B09H26QDXG,Material Type: Cationic Blue | Pattern Name: U...,Color:Dark Blue | Material:Cationic Blue | Bra...,GENORTH,/GENORTH/b/ref=bl_dp_s_web_119204902011?ie=UTF...,"Dog Car Seats for Small and Medium Dogs,Portab...",https://www.amazon.com/dp/B09H26QDXG?psc=1,https://m.media-amazon.com/images/I/412zzYGgSn...,B0CKRBQYBJ,Pet Supplies:Dogs:Carriers & Travel Products:C...,...,['Not Mention'],['Not Mention'],18.9 x 14.96 x 10.62 inches,10.60,18.90,15.01-20,14.96,,"Dog Car Seats for Small and Medium Dogs,Portab...",dog car seats for small and medium dogs portab...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,B0CRB9NRSV,Color: Black,Color:Black | Material:Short plush | Brand:NEE...,NEEZUKAR,/stores/NEEZUKAR/page/0FCDAB2E-9534-4BE5-B71D-...,"Dog Car Seat for Large Medium Dogs,Portable Wa...",https://www.amazon.com/dp/B0CRB9NRSV?psc=1,https://m.media-amazon.com/images/I/41MjhbAeCc...,B0CW98M1LG,Pet Supplies:Dogs:Carriers & Travel Products:C...,...,['Not Mention'],['Not Mention'],29.5 x 20 x 20 inches,20.00,29.50,Above 25.01,20.00,4.00,"Dog Car Seat for Large Medium Dogs,Portable Wa...",dog car seat for large medium dogs portable wa...
198,B0CRBBKV6K,Color: Black,Brand:GL GLENSLAVE | Breed Recommendation:smal...,GL GLENSLAVE,/stores/GLGLENSLAVE/page/AF5DA707-F137-4FFF-84...,"Dog Car Seat for Small Medium Dogs, Memory Foa...",https://www.amazon.com/dp/B0CRBBKV6K?psc=1,https://m.media-amazon.com/images/I/41u4KEsHDn...,B0CRD2FK2H,Pet Supplies:Dogs:Carriers & Travel Products:C...,...,['Not Mention'],['Not Mention'],19.69 x 17.72 x 11.81 inches,11.70,19.70,15.01-20,17.70,,"Dog Car Seat for Small Medium Dogs, Memory Foa...",dog car seat for small medium dogs memory foam...
199,B0CSSK989S,Color: Blue,Color:Blue | Material:Polyester | Brand:Guloko...,Gulokoka,/Gulokoka/b/ref=bl_dp_s_web_36890320011?ie=UTF...,"Dog Car Seat for Small Dogs, Small Dog Booster...",https://www.amazon.com/dp/B0CSSK989S?psc=1,https://m.media-amazon.com/images/I/41udyDUeGr...,B0CSSJY7N9,Pet Supplies:Dogs:Carriers & Travel Products:C...,...,['Not Mention'],['Yes'],17.01 x 6.89 x 6.85 inches,13.00,20.50,20.01-25,19.00,3.00,"Dog Car Seat for Small Dogs, Small Dog Booster...",dog car seat for small dogs small dog booster ...
200,B0CSY76B8L,Color: Grey | Size: M,"Color:Grey | Material:Metal, Plastic, Velvet, ...",Ytmuzic,/Ytmuzic/b/ref=bl_dp_s_web_121070718011?ie=UTF...,"Dog Car Seat for Small Medium Dogs, Pet Car Se...",https://www.amazon.com/dp/B0CSY76B8L?psc=1,https://m.media-amazon.com/images/I/41Ibg4XcCk...,B0D78B6X1N,Pet Supplies:Dogs:Carriers & Travel Products:C...,...,['Not Mention'],['Not Mention'],18.9 x 18.9 x 19 inches,18.90,19.00,15.01-20,18.90,,"Dog Car Seat for Small Medium Dogs, Pet Car Se...",dog car seat for small medium dogs pet car sea...


In [3]:
# 步骤2: 加载和预处理数据
df = load_and_preprocess_data()

# 步骤3: 加载NLP模型
nlp, classifier = load_nlp_models()

# 步骤4: 提取初始关键词
initial_keywords = extract_initial_keywords(df, nlp)

# 步骤5: 提取TF-IDF关键词并生成产品标签
tfidf_keywords, product_labels = extract_tfidf_keywords_and_labels(df)

# 步骤6: 使用BART-MNLI模型过滤关键词
combined_keywords = list(set(initial_keywords + tfidf_keywords))
filtered_keywords = filter_keywords_with_bart(combined_keywords, product_labels, classifier)

# 步骤7: 整合、清理和排序关键词
final_keywords = process_final_keywords(filtered_keywords)

# 步骤8: 保存关键词
save_keywords(final_keywords)

正在读取数据文件...
成功读取Excel文件: 202 条记录
去重完成: 移除了 6 条重复记录，剩余 196 条记录
正在加载NLP模型...
成功加载spaCy模型
加载facebook/bart-large-mnli模型到 cuda 设备...


Device set to use cuda:0


提取初始关键词...
使用spaCy提取关键词
使用TF-IDF提取文档级关键词并生成产品标签...
自动生成的产品标签列表: ['car', 'travel', 'seat', 'dog', 'dogs', 'booster', 'pet', 'car seat', 'seats', 'dog car']
使用BART-MNLI模型过滤关键词...
处理 734 个关键词，分为 6 批
处理批次 1/6
处理批次 2/6
处理批次 3/6
处理批次 4/6
处理批次 5/6
处理批次 6/6
过滤后保留了 561 个关键词
正在保存关键词...
关键词已保存到 生成结果/social_media/keywords_raw.txt


In [4]:
# print(product_labels)

In [5]:
import re
import os
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import time
import random

# 确定处理设备
if torch.backends.mps.is_available():
    device = "mps"
    print("使用MPS加速")
elif torch.cuda.is_available():
    device = "cuda"
    print("使用CUDA加速")
else:
    device = "cpu"
    print("使用CPU")

# 从Excel文件名中获取基础文件夹名称
excel_file = Asin_List_file
#base_folder_name = os.path.splitext(excel_file)[0]

base_folder_name = os.path.join("生成结果", "social_media")

# 配置文件路径
input_file = os.path.join(base_folder_name, "keywords_raw.txt")
# 创建一个临时文件
temp_output_file = os.path.join(base_folder_name, "keywords_temp1.txt")

def clean_keyword_format(keyword):
    """第一阶段：清理关键词的基本格式问题"""
    if not isinstance(keyword, str):
        return ""
    
    # 原始关键词备份
    original = keyword
    
    # 基本清理
    keyword = keyword.strip().lower()
    keyword = re.sub(r'\s+', ' ', keyword)
    
    # 删除非标准字符
    keyword = re.sub(r'[^\w\s\-]', '', keyword)
    
    # 空字符串检查
    if not keyword:
        return ""
    
    # 移除数字前缀和单位
    if re.match(r'^\d+', keyword):
        # 移除纯数字+单位格式
        if re.match(r'^\d+\s*(inch|inche|cm|mm|pcs|pc|piece|pieces)$', keyword):
            return ""
        
        # 提取有实际意义的部分
        cleaned = re.sub(r'^\d+\s*(inch|inche|cm|mm|pcs|pc|piece|pieces)\s*', '', keyword)
        if cleaned and len(cleaned) > 2:
            keyword = cleaned
    
    # 移除尾部的pcs/pc等标记
    if re.search(r'\b(pcs|pc|piece|pieces)$', keyword):
        keyword = re.sub(r'\s*(pcs|pc|piece|pieces)$', '', keyword)
    
    # 移除商品数量标记（如2pcs, 3pcs等）
    if re.search(r'^\d+(pcs|pc)\b', keyword):
        keyword = re.sub(r'^\d+(pcs|pc)\s*', '', keyword)
    
    # 如果清理后过短，则放弃
    if len(keyword) < 3:
        return ""
    
    # 检查是否只剩下了代码/ID等无意义字符
    if re.match(r'^[a-z0-9]+$', keyword) and len(keyword) < 5:
        return ""
    
    return keyword

def load_keywords(filepath):
    """从文件加载关键词"""
    keywords = []
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                keyword = line.strip()
                if keyword:
                    keywords.append(keyword)
        print(f"成功加载 {len(keywords)} 个关键词")
    except Exception as e:
        print(f"读取文件时出错: {e}")
    
    return keywords

def create_custom_classifier():
    """创建自定义分类器，使用更适合的预训练模型"""
    print("加载预训练模型...")
    
    # 使用RoBERTa模型，它对细微语义差异更敏感
    model_name = "roberta-large"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    
    # 将模型移至适当设备
    model = model.to(device)
    
    # 创建分类器
    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=device if device != "mps" else -1  # MPS需要特殊处理
    )
    
    return classifier

def prepare_classifier_prompt(keyword):
    """准备分类器提示，更精确地指导模型判断"""
    # 构建提示模板
    prompts = [
        f"Is '{keyword}' a specific product feature, component, or attribute that can be used to search for products on social media? Answer: ",
        f"Would '{keyword}' help identify a specific product category or type when searching on social media? Answer: ",
        f"Is '{keyword}' a descriptive term about a product's physical characteristics rather than just its potential users? Answer: "
    ]
    
    # 随机选择一个提示以增加多样性
    return random.choice(prompts)

def filter_keywords_with_custom_logic(keywords):
    """使用自定义逻辑过滤关键词"""
    # 第一阶段：格式清理
    print("第一阶段：格式清理")
    format_cleaned = []
    rejected_format = []
    
    for keyword in tqdm(keywords, desc="格式清理"):
        cleaned = clean_keyword_format(keyword)
        if cleaned:
            format_cleaned.append(cleaned)
        else:
            rejected_format.append(keyword)
    
    print(f"格式清理后保留: {len(format_cleaned)}/{len(keywords)} 关键词")
    
    # 第二阶段：语义过滤
    print("\n第二阶段：语义过滤")
    
    # 定义明确的排除模式
    exclude_patterns = [
        r'^(boy|girl|adult|kids|children|teen|baby|women|men)s?$',  # 纯人群词
        r'^[a-z]$',  # 单个字母
        r'^\d+$',  # 纯数字
        r'^[0-9a-z]{5,10}$',  # 可能是产品代码
        r'^(amazon|ebay|walmart|target)$',  # 商店名称
        r'^(made in|shipping|delivery)$'  # 物流词
    ]
    
    # 手动过滤明确的模式
    filtered_keywords = []
    rejected_patterns = []
    
    for keyword in tqdm(format_cleaned, desc="模式过滤"):
        # 检查是否匹配任何排除模式
        if any(re.match(pattern, keyword) for pattern in exclude_patterns):
            rejected_patterns.append(keyword)
            continue
        
        filtered_keywords.append(keyword)
    
    print(f"模式过滤后保留: {len(filtered_keywords)}/{len(format_cleaned)} 关键词")
    
    # # 第三阶段：使用预训练模型进行高级过滤
    # print("\n第三阶段：高级语义过滤")
    
    # # 加载自定义分类器
    # classifier = create_custom_classifier()
    
    # semantic_filtered = []
    # rejected_semantic = []
    
    # # 批量处理以提高效率
    # batch_size = 16
    
    # # 对剩余关键词进行语义过滤
    # for i in tqdm(range(0, len(filtered_keywords), batch_size), desc="语义过滤"):
    #     batch = filtered_keywords[i:i+batch_size]
        
    #     for keyword in batch:
    #         # 构建提示
    #         prompt = prepare_classifier_prompt(keyword)
            
    #         try:
    #             # 使用模型检查关键词的相关性
    #             time.sleep(0.05)  # 轻微延迟以避免设备过载
    #             result = []
                
    #             # 判定关键词是否与产品相关
    #             is_related = False
                
    #             # 使用简单启发式规则优先处理明显的产品关键词
    #             if any(product_term in keyword for product_term in ['bag', 'backpack', 'case', 'holder', 'pack', 'product', 'accessory']):
    #                 is_related = True
    #             else:
    #                 # 让模型判断更复杂的情况
    #                 result = classifier(prompt)
    #                 label = result[0]['label']
    #                 score = result[0]['score']
                    
    #                 # RoBERTa的标签判断
    #                 is_related = (label == "LABEL_1" and score > 0.85) or (label == "LABEL_0" and score < 0.25)
                
    #             if is_related:
    #                 semantic_filtered.append(keyword)
    #             else:
    #                 rejected_semantic.append(keyword)
                    
    #         except Exception as e:
    #             print(f"\n处理关键词时出错 '{keyword}': {e}")
    #             # 出错时保守地保留关键词
    #             semantic_filtered.append(keyword)
    
    # print(f"语义过滤后保留: {len(semantic_filtered)}/{len(filtered_keywords)} 关键词")
    
    # # 打印一些被过滤的示例
    # if rejected_semantic:
    #     sample_size = min(10, len(rejected_semantic))
    #     print(f"\n被过滤掉的示例: {random.sample(rejected_semantic, sample_size)}")
    
    # return semantic_filtered
    return filtered_keywords

def save_keywords(keywords, filepath):
    """保存过滤后的关键词到文件"""
    # 按字母顺序排序
    sorted_keywords = sorted(set(keywords))
    
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            for keyword in sorted_keywords:
                f.write(f"{keyword}\n")
        print(f"成功保存 {len(sorted_keywords)} 个关键词到 {filepath}")
    except Exception as e:
        print(f"保存文件时出错: {e}")

# 直接执行流程
print(f"开始处理文件: {input_file}")

# 1. 加载关键词
keywords = load_keywords(input_file)
if keywords:
    # 2. 使用增强过滤逻辑过滤关键词
    filtered_keywords = filter_keywords_with_custom_logic(keywords)
    
    # 3. 保存结果
    save_keywords(filtered_keywords, temp_output_file)
    
    print("关键词清理完成!")
else:
    print("没有找到关键词，退出程序")

# # 清理临时文件
# try:
#     os.remove(os.path.join(base_folder_name, "keywords_temp1.txt"))
#     os.remove(os.path.join(base_folder_name, "keywords_raw.txt"))
#     print("已清理临时文件")
# except Exception as e:
#     print(f"临时文件清理失败: {e}")

使用CUDA加速
开始处理文件: 生成结果\social_media\keywords_raw.txt
成功加载 549 个关键词
第一阶段：格式清理


格式清理: 100%|██████████████████████████████████████████████████████████████████| 549/549 [00:00<00:00, 109692.88it/s]


格式清理后保留: 484/549 关键词

第二阶段：语义过滤


模式过滤: 100%|██████████████████████████████████████████████████████████████████| 484/484 [00:00<00:00, 193263.82it/s]

模式过滤后保留: 335/484 关键词
成功保存 335 个关键词到 生成结果\social_media\keywords_temp1.txt
关键词清理完成!





In [6]:
import os
import pandas as pd
import spacy
import numpy as np
from tqdm import tqdm
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from typing import List, Optional, Set
import requests
import warnings

# 忽略特定警告 (例如 InsecureRequestWarning)
from requests.packages.urllib3.exceptions import InsecureRequestWarning
warnings.simplefilter('ignore', InsecureRequestWarning)

# --- REMOVED: Static GENERIC_CATEGORY_BLOCKLIST ---
# We will now derive potentially problematic single words from product_labels

# 从Excel文件名中获取基础文件夹名称
# excel_file = Asin_List_file
# base_folder_name = os.path.splitext(excel_file)[0]
base_folder_name = os.path.join("生成结果", "social_media")
os.makedirs(base_folder_name, exist_ok=True)

# 配置文件路径
input_file = os.path.join(base_folder_name, "keywords_temp1.txt")
output_file = os.path.join(base_folder_name, "keywords.txt")

# 加载spaCy模型
print("加载NLP模型...")
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    print("未找到 en_core_web_lg 模型，尝试下载...")
    try:
        spacy.cli.download("en_core_web_lg")
        nlp = spacy.load("en_core_web_lg")
    except Exception as e:
        print(f"下载或加载 spaCy 模型失败: {e}")
        exit()

# --- 新增：辅助函数，从产品标签提取单字标签 ---
def get_single_word_labels(labels: Optional[List[str]]) -> Set[str]:
    """从产品标签列表中提取所有单字条目（小写）"""
    if not labels:
        return set()
    single_words = set()
    for label in labels:
        processed_label = label.strip().lower()
        # 检查处理后的标签是否只包含一个单词（没有空格）
        if processed_label and ' ' not in processed_label:
            single_words.add(processed_label)
    return single_words
# --- 结束新增 ---

# --- 修改：更新 AI Prompt (简化关于通用词的说明) ---
def create_keyword_refinement_prompt(keywords, product_labels=None):
    """为QWEN模型创建关键词精炼提示词 (已更新，简化通用词说明)"""

    product_context = ""
    label_words_str = ""

    if product_labels and len(product_labels) > 0:
        product_context = f"The product likely relates to these concepts: {', '.join(product_labels)}. "
        # 提取一些标签词用于上下文提示
        temp_label_words = set()
        for label in product_labels:
             words = [word.lower() for word in re.findall(r'\b[a-z]{3,}\b', label.lower())]
             temp_label_words.update(words)
        label_words_str = ', '.join(list(temp_label_words)[:10])
        product_context += f"Keywords should be specific to this context (related to: {label_words_str})."

    # 提示词稍微简化，因为部分单字过滤已在前面处理
    prompt = f"""You are an e-commerce keyword expert. Refine the following keyword list to keep only terms describing specific product functions, usage scenarios, user experience, materials, features, target audience, or styles.

{product_context}

Perform these operations:

1.  Remove keywords that are too generic or vague *for the given product context*. For example, if the context is 'travel pillow', remove standalone 'travel' or 'accessories' if present, but keep 'travel pillow'.
2.  Remove pure numbers, standalone size specs (e.g., "XL"), unless they are a key model/feature.
3.  Remove standalone packaging units (e.g., "bag", "box", "set", "pcs", "pack") ONLY if they are not the product's core identity (e.g., for furniture, remove 'box'; for a 'storage box', keep 'box'). Use context: {label_words_str}.
4.  Remove dimensional/capacity specs (e.g., "120l", "5kg") unless they are a defining feature.
5.  Keep phrases about usage/experience (e.g., "easy to assemble", "comfortable grip").
6.  Keep terms for functions/scenarios (e.g., "outdoor camping", "office work").
7.  Keep terms for features/materials (e.g., "wireless charging", "stainless steel").
8.  Remove vague adjectives if standalone (e.g., remove "good", keep "good quality").
9.  Make minor improvements for clarity/search habits.

IMPORTANT: Focus on specificity relevant to the product context ({label_words_str}).

Keyword list:
{', '.join(keywords)}

Return only the processed keyword list, one keyword per line, without explanations or numbering. If the list is short (under 200), be slightly more lenient.
"""
    return prompt
# --- 结束修改 ---


def refine_keywords_with_ai(
    keywords: List[str],
    api_key: Optional[str] = None,
    product_labels: Optional[List[str]] = None,
    base_folder_name: str = None
) -> List[str]:
    """使用QWEN大模型精炼关键词 (API调用逻辑保持不变)"""
    if not keywords:
        print("输入AI精炼的关键词列表为空，跳过AI步骤。")
        return []

    if not api_key:
        api_key = os.environ.get("QWEN_API_KEY")
        if not api_key:
            print("警告: 未提供QWEN API密钥，跳过AI精炼步骤")
            return keywords # 返回未经AI处理的列表

    prompt = create_keyword_refinement_prompt(keywords, product_labels)

    if base_folder_name:
        try:
            with open(os.path.join(base_folder_name, "ai_prompt.txt"), 'w', encoding='utf-8') as f:
                f.write(prompt)
        except Exception as e:
            print(f"无法写入 ai_prompt.txt: {e}")

    refined_keywords = []
    max_retries = 2
    current_retry = 0

    # --- API 调用逻辑 (保持和上一个版本一致，包含重试和错误处理) ---
    while current_retry < max_retries and not refined_keywords:
        current_retry += 1
        print(f"尝试使用QWEN大模型精炼关键词 (尝试 {current_retry}/{max_retries})...")
        # 尝试方法1: OpenAI 兼容模式
        try:
            print("正在使用 OpenAI 兼容模式 (方法1)...")
            from openai import OpenAI, APITimeoutError, APIConnectionError, RateLimitError
            client = OpenAI(api_key=api_key, base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", timeout=60.0)
            completion = client.chat.completions.create(
                model="qwen-max",
                messages=[
                    {"role": "system", "content": "You are an e-commerce and SEO keyword expert."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1, max_tokens=4090, top_p=0.9,
            )
            result = completion.choices[0].message.content.strip()
            if result and isinstance(result, str):
                refined_keywords = [kw.strip().lower() for kw in result.splitlines() if kw.strip()]
                if refined_keywords:
                    print(f"方法1成功获取精炼关键词，共 {len(refined_keywords)} 个")
                    break
                else: print("方法1返回了空或无效的结果。")
            else: print("方法1未能获取有效响应内容。")
        except (APITimeoutError, APIConnectionError) as e: print(f"方法1连接或超时错误: {e}")
        except RateLimitError as e: print(f"方法1触发速率限制: {e}")
        except Exception as e: print(f"方法1精炼关键词时发生未知错误: {e}")

        # 如果方法1失败，尝试方法2: 原始 API 调用
        if not refined_keywords and current_retry <= max_retries:
            try:
                print("方法1失败，尝试使用原始API调用 (方法2)...")
                headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
                payload = {
                    "model": "qwen-max",
                    "input": {"messages": [
                        {"role": "system", "content": "You are an e-commerce and SEO keyword expert."},
                        {"role": "user", "content": prompt}
                    ]},
                    "parameters": {"temperature": 0.1, "max_tokens": 4090, "top_p": 0.9, "result_format": "message"}
                }
                response = requests.post(
                    "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation",
                    headers=headers, json=payload, timeout=60.0, verify=False
                )
                if response.status_code == 200:
                    result_data = response.json()
                    if "output" in result_data and "choices" in result_data["output"] and len(result_data["output"]["choices"]) > 0:
                        content = result_data["output"]["choices"][0]["message"]["content"].strip()
                        if content and isinstance(content, str):
                            refined_keywords = [kw.strip().lower() for kw in content.splitlines() if kw.strip()]
                            if refined_keywords:
                                print(f"方法2成功获取精炼关键词，共 {len(refined_keywords)} 个")
                                break
                            else: print(f"方法2返回了空或无效的结果内容。响应: {response.text[:200]}...")
                        else: print(f"方法2未能获取有效响应内容。响应: {response.text[:200]}...")
                    else: print(f"方法2 API响应结构不符合预期。响应: {response.text[:200]}...")
                else: print(f"方法2 API调用失败，状态码: {response.status_code}, 响应: {response.text[:200]}...")
            except requests.exceptions.Timeout: print("方法2请求超时。")
            except requests.exceptions.RequestException as e: print(f"方法2请求发生错误: {e}")
            except Exception as e: print(f"方法2精炼关键词时发生未知错误: {e}")

    if not refined_keywords:
        print("警告：AI大模型调用失败，将使用AI处理之前的关键词列表")
        if base_folder_name:
            try:
                with open(os.path.join(base_folder_name, "ai_error_log.txt"), 'a', encoding='utf-8') as f:
                    f.write(f"[{pd.Timestamp.now()}] AI refinement failed for prompt starting with: {prompt[:100]}...\n")
            except Exception as e: print(f"无法写入 ai_error_log.txt: {e}")
        return keywords # 返回未经AI处理的列表

    if base_folder_name:
        try:
            with open(os.path.join(base_folder_name, "ai_refined_keywords.txt"), 'w', encoding='utf-8') as f:
                for kw in refined_keywords: f.write(f"{kw}\n")
        except Exception as e: print(f"无法写入 ai_refined_keywords.txt: {e}")

    print(f"关键词AI精炼完成: 输入{len(keywords)}个，精炼后{len(refined_keywords)}个")
    return refined_keywords
# --- 结束 AI 函数 ---


# --- 修改：load_keywords 函数增加基础清洗 (保持不变) ---
def load_keywords(filepath):
    """从文件加载关键词，并进行基础清洗（小写，去特殊字符）"""
    keywords = []
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                keyword = line.strip().lower()
                keyword = keyword.replace('|', '') # 移除 '|'
                if keyword:
                    keywords.append(keyword)
        print(f"成功加载并初步清洗 {len(keywords)} 个关键词")
    except FileNotFoundError:
         print(f"错误: 输入文件未找到 {filepath}")
         return []
    except Exception as e:
        print(f"读取文件 {filepath} 时出错: {e}")
        return []
    return keywords
# --- 结束修改 ---

# --- REMOVED: filter_generic_keywords 函数 ---
# 过滤逻辑将整合到 extract_core_keywords


def analyze_keywords_structure(keywords):
    """分析关键词结构 (基本保持不变, 移除 blocklist 依赖)"""
    print("分析关键词结构...")
    if not keywords: return [], []

    parsed_keywords = []
    if 'nlp' not in globals() or nlp is None:
        print("错误: NLP模型未加载，无法分析关键词结构。")
        return [], []

    for keyword in tqdm(keywords, desc="解析关键词"):
        if not keyword or not isinstance(keyword, str): continue
        try:
            doc = nlp(keyword.lower())
            parsed_keywords.append(doc)
        except Exception as e:
            print(f"解析关键词 '{keyword}' 时出错: {e}")
            continue

    product_terms = Counter()
    for doc in parsed_keywords:
        for token in doc:
            if token.text and not token.text.isspace() and token.pos_ in ["NOUN", "PROPN"] and len(token.text) > 2:
                product_terms[token.text.lower()] += 1

    # 核心术语识别稍微调整，不再依赖外部 blocklist
    # 可以考虑在这里过滤掉一些通用的单位词，如果需要的话
    common_units_or_fillers = {'pcs', 'piece', 'pieces', 'pack', 'packs', 'set', 'sets', 'inch', 'cm', 'mm', 'kg', 'lb', 'oz', 'for', 'with', 'and'}
    core_product_terms = [term for term, count in product_terms.most_common()
                          if count >= 3 and term not in common_units_or_fillers and not term.isdigit()]

    print(f"识别出 {len(core_product_terms)} 个核心产品术语")

    # 前缀模式识别逻辑不变
    prefix_patterns = []
    for keyword in keywords:
        if not keyword or not isinstance(keyword, str): continue
        words = keyword.lower().split()
        if len(words) >= 2:
            potential_prefix = words[0]
            if re.match(r'^\d+', potential_prefix) or '-' in potential_prefix:
                prefix_patterns.append(potential_prefix)
            if len(words) >= 3 and re.match(r'^\d+', potential_prefix) and words[1] in ['pcs', 'piece', 'pieces', 'pack', 'in', 'inch', 'inche']:
                prefix_patterns.append(f"{words[0]} {words[1]}")

    common_prefixes = Counter(prefix_patterns).most_common()
    print(f"识别出 {len(common_prefixes)} 种前缀模式")

    return core_product_terms, common_prefixes


# --- 修改：extract_core_keywords 整合单字标签过滤 ---
def extract_core_keywords(keywords, product_labels=None):
    """
    提取核心关键词，移除多样化前缀，并过滤掉与产品标签中单字条目匹配的单字关键词。
    """
    if not keywords: return []

    # --- 新增：获取单字标签用于过滤 ---
    single_word_labels_to_filter = get_single_word_labels(product_labels)
    if single_word_labels_to_filter:
        print(f"\n将尝试过滤与以下单字标签匹配的单字关键词: {list(single_word_labels_to_filter)}")
    # --- 结束新增 ---

    # 1. 分析关键词结构
    core_terms, common_prefixes = analyze_keywords_structure(keywords)

    # 打印上下文信息 (保持不变)
    # category_core_terms = process_product_labels(product_labels) if product_labels else set() # 这个函数现在没了，但可以用 single_word_labels 代替部分信息
    print(f"\n产品标签中的单字词 (可能被过滤): {list(single_word_labels_to_filter)[:10]}")
    print("\n常见前缀示例:")
    for prefix, count in common_prefixes[:15]: print(f"  '{prefix}' (出现{count}次)")
    print("\n核心产品术语示例 (来自关键词分析):")
    print(f"  {', '.join(core_terms[:15])}")

    # 按长度排序关键词
    # 先进行一次简单去重和基础过滤
    initial_filtered = set()
    for kw in keywords:
        k = kw.strip()
        if k and len(k) > 1: # 至少保留长度为2的词
             initial_filtered.add(k)
    sorted_keywords = sorted(list(initial_filtered), key=len, reverse=True)

    processed_keywords = []
    removal_stats = Counter() # 重命名统计计数器
    prefix_patterns = [prefix for prefix, _ in common_prefixes]
    unit_terms = {'pcs', 'piece', 'pieces', 'pack', 'in', 'inch', 'inche', 'in-1', 'set'} # 保留单位词用于前缀识别

    for keyword in tqdm(sorted_keywords, desc="前缀移除与单字标签过滤"):
        original = keyword
        words = keyword.lower().split() # 确保小写

        # --- 修改：整合单字标签过滤逻辑 ---
        is_single_word_keyword = len(words) == 1
        if is_single_word_keyword:
            # 如果当前关键词是单个词，并且这个词存在于从 product_labels 提取的单字标签集合中
            if keyword in single_word_labels_to_filter:
                removal_stats[f"单字标签过滤: {keyword}"] += 1
                continue # 过滤掉这个词，进行下一轮循环
            # 如果是单字，但不在过滤列表里，检查是否是纯数字或过短
            elif keyword.isdigit():
                 removal_stats["纯数字关键词"] += 1
                 continue
            elif len(keyword) < 3: # 可选：过滤掉非常短的单字 (如 's', 'm')
                 removal_stats["过短单字(<3)"] += 1
                 continue
            else:
                 # 保留不在过滤列表中的有效单字关键词
                 processed_keywords.append(keyword)
                 continue # 处理完单字情况
        # --- 结束修改 ---

        # --- 前缀移除逻辑 (基本保持不变, 增加有效性检查) ---
        cleaned = False
        # 策略1: 固定前缀模式移除
        for prefix in prefix_patterns:
            prefix_terms = prefix.split()
            if len(words) > len(prefix_terms) and ' '.join(words[:len(prefix_terms)]) == prefix:
                remaining_keyword = ' '.join(words[len(prefix_terms):]).strip()
                if remaining_keyword and len(remaining_keyword) > 1 and not remaining_keyword.isdigit(): # 允许长度为2
                    keyword = remaining_keyword
                    removal_stats[f"固定前缀: {prefix}"] += 1
                    cleaned = True
                    break
                else:
                    cleaned = False; break # 移除后无效

        if cleaned:
            # 如果清理后变成单字，再次检查是否需要过滤
            if ' ' not in keyword and keyword in single_word_labels_to_filter:
                 removal_stats[f"清理后单字标签过滤: {keyword}"] += 1
                 continue
            elif len(keyword) > 1: # 确保清理后仍然有效
                 processed_keywords.append(keyword)
            continue

        # 策略2: 数字+单位组合识别
        if len(words) >= 3 and re.match(r'^\d+', words[0]) and words[1] in unit_terms:
            remaining_keyword = ' '.join(words[2:]).strip()
            if remaining_keyword and len(remaining_keyword) > 1 and not remaining_keyword.isdigit():
                keyword = remaining_keyword
                # 再次检查清理后是否为需过滤的单字
                if ' ' not in keyword and keyword in single_word_labels_to_filter:
                     removal_stats[f"清理后单字标签过滤: {keyword}"] += 1
                     continue
                removal_stats[f"数字单位前缀: {words[0]} {words[1]}"] += 1
                processed_keywords.append(keyword)
                continue

        # 策略3: 仅数字前缀识别
        if re.match(r'^\d+(-\d+)*$', words[0]) and len(words) > 1:
             remaining_keyword = ' '.join(words[1:]).strip()
             if remaining_keyword and len(remaining_keyword) > 1 and not remaining_keyword.isdigit():
                keyword = remaining_keyword
                if ' ' not in keyword and keyword in single_word_labels_to_filter:
                     removal_stats[f"清理后单字标签过滤: {keyword}"] += 1
                     continue
                removal_stats[f"数字前缀: {words[0]}"] += 1
                processed_keywords.append(keyword)
                continue

        # 策略4: 复杂模式识别 (逻辑不变，增加后续检查)
        if len(words) >= 2:
            first_word = words[0]
            if (re.match(r'^\d+[a-z]+\d*$', first_word) or
                re.match(r'^\d+-[a-z]+(-\d+)?$', first_word)):
                prefix_to_remove = first_word
                idx_to_join = 1
                if len(words) > 2 and words[1] in unit_terms.union(['casual', 'rolling', 'cartoon', 'tier']):
                    prefix_to_remove = f"{words[0]} {words[1]}"
                    idx_to_join = 2
                remaining_keyword = ' '.join(words[idx_to_join:]).strip()
                if remaining_keyword and len(remaining_keyword) > 1 and not remaining_keyword.isdigit():
                    keyword = remaining_keyword
                    if ' ' not in keyword and keyword in single_word_labels_to_filter:
                         removal_stats[f"清理后单字标签过滤: {keyword}"] += 1
                         continue
                    removal_stats[f"复杂模式前缀: {prefix_to_remove}"] += 1
                    processed_keywords.append(keyword)
                    continue

        # 如果没有应用任何清理规则，保留原始关键词 (但要确保它没在单字过滤列表里)
        # 这一步理论上应该不会执行到单字情况，因为前面处理过了
        # 但作为保险，检查一下
        if keyword not in processed_keywords: # 避免重复添加
             if ' ' not in keyword and keyword in single_word_labels_to_filter:
                 # 不应该到这里，但以防万一
                 removal_stats[f"末尾单字标签过滤: {keyword}"] += 1
             elif len(keyword) > 1: # 确保有效
                 processed_keywords.append(keyword)


    # --- 标准化和去重 (TF-IDF 相似度部分保持不变) ---
    print("\n标准化和去重 (TF-IDF)...")
    if not processed_keywords:
        print("没有可处理的关键词进行 TF-IDF 分析。")
        return []

    unique_keywords = sorted(list(set(processed_keywords)), key=len, reverse=True)
    print(f"前缀移除和过滤后剩余 {len(unique_keywords)} 个独立关键词。")
    if len(unique_keywords) < 2: return unique_keywords

    try:
        vectorizer = TfidfVectorizer(min_df=1, analyzer='char', ngram_range=(2, 5))
        tfidf_matrix = vectorizer.fit_transform(unique_keywords)
        similarity_matrix = cosine_similarity(tfidf_matrix)
    except Exception as e:
        print(f"TF-IDF 处理时出错: {e}. 返回简单去重结果。")
        return unique_keywords

    clusters = {}
    processed_indices = set()
    similarity_threshold = 0.85

    for i in tqdm(range(len(unique_keywords)), desc="聚类相似关键词"):
        if i in processed_indices: continue
        similar_indices = np.where(similarity_matrix[i] >= similarity_threshold)[0]
        if len(similar_indices) > 1:
            cluster_keywords = [unique_keywords[j] for j in similar_indices if j < len(unique_keywords)]
            representative = max(cluster_keywords, key=len)
            clusters[representative] = cluster_keywords
            processed_indices.update(similar_indices)
        else:
            representative = unique_keywords[i]
            clusters[representative] = [representative]
            processed_indices.add(i)

    final_keywords = sorted(list(clusters.keys()))

    print("\n移除/过滤统计 (Top 15):")
    for item, count in removal_stats.most_common(15):
        print(f"  移除/过滤 '{item}' {count}次")

    print(f"TF-IDF 相似度去重后剩余 {len(final_keywords)} 个关键词。")
    return final_keywords
# --- 结束 extract_core_keywords 修改 ---


def save_keywords(keywords, filepath):
    """保存最终关键词到文件 (保持不变)"""
    try:
        dir_path = os.path.dirname(filepath)
        if dir_path: os.makedirs(dir_path, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            for keyword in keywords: f.write(f"{keyword}\n")
        print(f"成功保存 {len(keywords)} 个关键词到 {filepath}")
    except Exception as e:
        print(f"保存文件 {filepath} 时出错: {e}")

# --- 主执行流程 ---

# !! 重要：假设 product_labels 在这里被定义 !!
# 示例，使用你提供的例子:
#product_labels = ['travel', 'bag', 'luggage', 'shoes', 'gear', 'luggage travel', 'travel gear', 'duffels', 'clothing', 'jewelry']
# product_labels = ["Simple White T-Shirt", "Cotton Crew Neck Clothing"] # 另一个例子
# product_labels = None # 测试没有标签的情况

print(f"使用的产品标签 (用于上下文和单字过滤): {product_labels}")

os.environ["QWEN_API_KEY"] = "sk-2ea9416b45e04af6b6aa72d3c2ade52f"
# API Key (从环境变量读取)
QWEN_API_KEY = os.environ.get("QWEN_API_KEY")
if not QWEN_API_KEY:
    print("警告: 环境变量 QWEN_API_KEY 未设置。")

print(f"\n--- 开始处理 ---")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print(f"结果文件夹: {base_folder_name}")

# 1. 加载并初步清洗关键词
initial_keywords = load_keywords(input_file)

if initial_keywords:
    # --- 步骤合并：提取核心关键词，同时进行前缀移除和基于标签的单字过滤 ---
    print(f"\n--- 步骤 1: 提取核心关键词 (含前缀移除、单字标签过滤、TF-IDF去重) ---")
    # 现在 extract_core_keywords 内部处理了单字标签的过滤
    core_keywords = extract_core_keywords(initial_keywords, product_labels)

    print(f"\n--- 步骤 2: 使用 AI 精炼关键词 ---")
    # 将上一步处理后的结果送入 AI
    refined_keywords = refine_keywords_with_ai(
        core_keywords,
        api_key=QWEN_API_KEY,
        product_labels=product_labels, # 仍然传递标签给 AI 用于整体上下文
        base_folder_name=base_folder_name
    )

    # 确保最终结果是排序且唯一的列表
    final_keywords_to_save = sorted(list(set(refined_keywords)))

    print(f"\n--- 步骤 3: 保存结果 ---")
    save_keywords(final_keywords_to_save, output_file)

    print("\n--- 关键词处理完成! ---")
else:
    print(f"未能从 {input_file} 加载到任何关键词，程序结束。")

# 清理临时文件的部分保持注释状态
# ...

print("\n--- 程序执行结束 ---")

加载NLP模型...
使用的产品标签 (用于上下文和单字过滤): ['car', 'travel', 'seat', 'dog', 'dogs', 'booster', 'pet', 'car seat', 'seats', 'dog car']

--- 开始处理 ---
输入文件: 生成结果\social_media\keywords_temp1.txt
输出文件: 生成结果\social_media\keywords.txt
结果文件夹: 生成结果\social_media
成功加载并初步清洗 335 个关键词

--- 步骤 1: 提取核心关键词 (含前缀移除、单字标签过滤、TF-IDF去重) ---

将尝试过滤与以下单字标签匹配的单字关键词: ['dogs', 'seat', 'pet', 'booster', 'car', 'dog', 'travel', 'seats']
分析关键词结构...


解析关键词: 100%|███████████████████████████████████████████████████████████████████| 335/335 [00:01<00:00, 265.68it/s]


识别出 41 个核心产品术语
识别出 12 种前缀模式

产品标签中的单字词 (可能被过滤): ['dogs', 'seat', 'pet', 'booster', 'car', 'dog', 'travel', 'seats']

常见前缀示例:
  '25' (出现5次)
  'non-slip' (出现2次)
  '14lbs' (出现1次)
  '20lbs' (出现1次)
  '25lbs-elevated' (出现1次)
  '35lbs' (出现1次)
  '60lbs' (出现1次)
  'anti-collapse' (出现1次)
  'clip-on' (出现1次)
  'medium-sized' (出现1次)
  'pets-pattern' (出现1次)
  'suvs-gray' (出现1次)

核心产品术语示例 (来自关键词分析):
  seat, car, dog, dogs, supplies, booster, travel, bed, seats, pet, puppy, console, safety, pets, carrier


前缀移除与单字标签过滤: 100%|████████████████████████████████████████████████████| 335/335 [00:00<00:00, 167572.07it/s]



标准化和去重 (TF-IDF)...
前缀移除和过滤后剩余 327 个独立关键词。


聚类相似关键词: 100%|████████████████████████████████████████████████████████████| 327/327 [00:00<00:00, 189465.04it/s]



移除/过滤统计 (Top 15):
  移除/过滤 '固定前缀: 25' 5次
  移除/过滤 '固定前缀: non-slip' 2次
  移除/过滤 '固定前缀: pets-pattern' 1次
  移除/过滤 '固定前缀: clip-on' 1次
  移除/过滤 '固定前缀: anti-collapse' 1次
  移除/过滤 '固定前缀: 25lbs-elevated' 1次
  移除/过滤 '固定前缀: 60lbs' 1次
  移除/过滤 '固定前缀: 35lbs' 1次
  移除/过滤 '固定前缀: suvs-gray' 1次
  移除/过滤 '固定前缀: 14lbs' 1次
  移除/过滤 '固定前缀: 20lbs' 1次
  移除/过滤 '固定前缀: medium-sized' 1次
  移除/过滤 '清理后单字标签过滤: dogs' 1次
TF-IDF 相似度去重后剩余 302 个关键词。

--- 步骤 2: 使用 AI 精炼关键词 ---
尝试使用QWEN大模型精炼关键词 (尝试 1/2)...
正在使用 OpenAI 兼容模式 (方法1)...
方法1成功获取精炼关键词，共 248 个
关键词AI精炼完成: 输入302个，精炼后248个

--- 步骤 3: 保存结果 ---
成功保存 248 个关键词到 生成结果\social_media\keywords.txt

--- 关键词处理完成! ---

--- 程序执行结束 ---
