In [1]:
import pandas as pd
import numpy as np
import ast
import re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import defaultdict
import platform
import glob
import os
import ast

# 根据操作系统设置合适的中文字体
system = platform.system()
if system == 'Darwin':  # macOS
    plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'Heiti TC', 'PingFang HK', 'Apple Color Emoji']
elif system == 'Windows':
    plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'SimSun']
else:  # Linux或其他
    plt.rcParams['font.sans-serif'] = ['DejaVu Sans', 'WenQuanYi Micro Hei', 'WenQuanYi Zen Hei']

# 正常显示负号
plt.rcParams['axes.unicode_minus'] = False

print(f"当前操作系统: {system}")
print(f"当前字体设置: {plt.rcParams['font.sans-serif']}")


当前操作系统: Windows
当前字体设置: ['SimHei', 'Microsoft YaHei', 'SimSun']


In [2]:
# --- 配置区域 ---

# 1. 文件路径和模式
matches_dir1 = '生成结果/matches_analysis/'
matches_dir2 = '生成结果/last_quadrant/'
original_data_dir = './Data/'
asin_data_dir = './Data/'

# 2. 具体文件名或模式 (使用 glob 查找)
# 请确保目录下只有一个符合模式的文件，或者修改代码以处理多个文件
complete_file_pattern = os.path.join(matches_dir2, '*plot_quadrant_data_场景匹配.xlsx')
matches_result_pattern = os.path.join(matches_dir1, 'refined_*_matches_analysis.xlsx')
original_data_pattern = os.path.join(original_data_dir, '*VOC数据.xlsx')
asin_data_pattern = os.path.join(asin_data_dir, '*_processed.xlsx')

# 3. 输出文件路径
output_dir = '生成结果/integrated_analysis_matches/'
os.makedirs(output_dir, exist_ok=True) # 创建输出目录
final_merged_output_file = os.path.join(output_dir, 'final_merged_original_data_matches.xlsx')
analysis_output_prefix = os.path.join(output_dir, 'analysis_') # 分析结果文件前缀

# 4. 列名定义 (请根据你的 Excel 文件修改)
# complete_file ('plot_quadrant_data_场景匹配.xlsx') 列名
col_complete_match = '场景匹配' # 用于匹配 'Match' 列
col_complete_quadrant = '图表象限'

# matches_result_file ('refined_*_matches_analysis.xlsx') 列名
col_matches_match = 'Match'
col_matches_indices = 'Review_Indices'

# original_data_file ('*VOC数据.xlsx') 列名
col_original_indices = 'Review_Indices' # 用于匹配 'Review_Indices' 列
col_original_rating = 'Rating'
col_original_asin = 'ASIN'
col_original_site = '站点信息'
col_original_likes = '点赞数'
# 假设原始数据中有一个 ASIN 排名列，如果名称不同请修改
col_original_asin_rank = 'ASIN排名' # 如果没有该列，后续分析会报错或需要调整

# asin_data_file ('*_processed.xlsx') 列名
col_asin_asin = 'asin' # 用于匹配 'ASIN' 列
col_asin_brand = 'brand'
col_asin_seller = 'sellerName'

# 新增到 original_data_file 的列名
col_new_match = 'Match' # 从 matches_result_file 添加
col_new_quadrant = '图表象限' # 从 complete_file 添加

# --- 配置区域结束 ---

# 设置 matplotlib 支持中文显示（选择一种适合你环境的方式）
plt.rcParams['font.sans-serif'] = ['SimHei']  # 指定默认字体为黑体
plt.rcParams['axes.unicode_minus'] = False  # 解决保存图像是负号'-'显示为方块的问题

print("配置完成。")
print(f"象限数据文件模式: {complete_file_pattern}")
print(f"匹配结果文件模式: {matches_result_pattern}")
print(f"原始数据文件模式: {original_data_pattern}")
print(f"ASIN数据文件模式: {asin_data_pattern}")
print(f"输出目录: {output_dir}")

# 辅助函数：查找文件，确保只找到一个
def find_single_file(pattern):
    files = glob.glob(pattern)
    if len(files) == 0:
        raise FileNotFoundError(f"错误：找不到匹配模式的文件: {pattern}")
    if len(files) > 1:
        print(f"警告：找到多个匹配模式的文件: {pattern}。将使用第一个文件: {files[0]}")
        # 或者可以抛出错误： raise ValueError(f"错误：找到多个匹配模式的文件: {pattern}")
    return files[0]

# 查找文件
try:
    complete_file = find_single_file(complete_file_pattern)
    matches_result_file = find_single_file(matches_result_pattern)
    original_data_file = find_single_file(original_data_pattern)
    asin_data_file = find_single_file(asin_data_pattern)
    print("\n成功找到所有需要的文件:")
    print(f"  象限数据文件: {complete_file}")
    print(f"  匹配结果文件: {matches_result_file}")
    print(f"  原始数据文件: {original_data_file}")
    print(f"  ASIN数据文件: {asin_data_file}")
except FileNotFoundError as e:
    print(e)
    # 如果文件找不到，后续代码无法执行，可以考虑退出或采取其他措施
    exit()
except ValueError as e:
    print(e)
    # 如果找到多个文件且不允许，可以考虑退出
    exit()

配置完成。
象限数据文件模式: 生成结果/last_quadrant/*plot_quadrant_data_场景匹配.xlsx
匹配结果文件模式: 生成结果/matches_analysis/refined_*_matches_analysis.xlsx
原始数据文件模式: ./Data/*VOC数据.xlsx
ASIN数据文件模式: ./Data/*_processed.xlsx
输出目录: 生成结果/integrated_analysis_matches/

成功找到所有需要的文件:
  象限数据文件: 生成结果/last_quadrant\plot_quadrant_data_场景匹配.xlsx
  匹配结果文件: 生成结果/matches_analysis\refined_致欧-2025-01-10之后VOC数据_rating_3_to_5_length_10_to_200_words_matches_analysis.xlsx
  原始数据文件: ./Data\致欧-2025-01-10之后VOC数据.xlsx
  ASIN数据文件: ./Data\zhiou_processed.xlsx


In [3]:
# --- 第1步：加载数据 ---

print("\n--- 开始第1步：加载数据 ---")

# 加载象限数据 (complete_file)
try:
    df_complete = pd.read_excel(complete_file)
    # 选择需要的列，并重命名 '场景匹配' 为 'Match' 以便合并
    df_complete_selected = df_complete[[col_complete_match, col_complete_quadrant]].rename(
        columns={col_complete_match: col_matches_match} # 使用 matches_result 文件中的列名作为目标
    )
    print(f"成功加载并选择了 {complete_file} 的数据。")
    print(f"  选择了列: {col_complete_match} (重命名为 {col_matches_match}), {col_complete_quadrant}")
    print(f"  数据预览 (前5行):\n{df_complete_selected.head()}")
except Exception as e:
    print(f"加载或处理 {complete_file} 时出错: {e}")
    exit()

# 加载匹配结果数据 (matches_result_file)
try:
    df_matches_result = pd.read_excel(matches_result_file)
    # 选择需要的列
    df_matches_selected = df_matches_result[[col_matches_match, col_matches_indices]]
    print(f"\n成功加载并选择了 {matches_result_file} 的数据。")
    print(f"  选择了列: {col_matches_match}, {col_matches_indices}")
    print(f"  数据预览 (前5行):\n{df_matches_selected.head()}")
except Exception as e:
    print(f"加载或处理 {matches_result_file} 时出错: {e}")
    exit()

print("\n--- 第1步完成 ---")


--- 开始第1步：加载数据 ---
成功加载并选择了 生成结果/last_quadrant\plot_quadrant_data_场景匹配.xlsx 的数据。
  选择了列: 场景匹配 (重命名为 Match), 图表象限
  数据预览 (前5行):
Empty DataFrame
Columns: [Match, 图表象限]
Index: []

成功加载并选择了 生成结果/matches_analysis\refined_致欧-2025-01-10之后VOC数据_rating_3_to_5_length_10_to_200_words_matches_analysis.xlsx 的数据。
  选择了列: Match, Review_Indices
  数据预览 (前5行):
Empty DataFrame
Columns: [Match, Review_Indices]
Index: []

--- 第1步完成 ---


In [4]:
# --- 第2步：合并象限信息到匹配结果 (增强版) ---
print("\n--- 开始第2步：合并象限信息到匹配结果 ---")

# 检查并清理 Match 列，确保格式一致
def clean_match_text(text):
    if not isinstance(text, str):
        return text
    # 去除额外空格，转换为小写等
    return text.strip().lower()

# 应用清理函数
df_matches_selected[col_matches_match] = df_matches_selected[col_matches_match].apply(clean_match_text)
df_complete_selected[col_matches_match] = df_complete_selected[col_matches_match].apply(clean_match_text)

# 检查匹配情况
matches_in_complete = set(df_complete_selected[col_matches_match].unique())
matches_in_results = set(df_matches_selected[col_matches_match].unique())
common_matches = matches_in_complete.intersection(matches_in_results)

print(f"象限数据中的Match数量: {len(matches_in_complete)}")
print(f"匹配结果中的Match数量: {len(matches_in_results)}")
print(f"两者共有的Match数量: {len(common_matches)}")

if len(common_matches) < min(len(matches_in_complete), len(matches_in_results)):
    print("警告: 部分Match无法在两个数据源之间匹配！")
    # 输出一些不匹配的例子
    mismatched_in_complete = matches_in_complete - common_matches
    mismatched_in_results = matches_in_results - common_matches
    if mismatched_in_complete:
        print(f"象限数据中但不在匹配结果中的Match示例: {list(mismatched_in_complete)[:3]}")
    if mismatched_in_results:
        print(f"匹配结果中但不在象限数据中的Match示例: {list(mismatched_in_results)[:3]}")

# 使用左合并，保留所有 df_matches_selected 的行
df_merged_matches_info = pd.merge(
    df_matches_selected,
    df_complete_selected,
    on=col_matches_match, # 合并键是 'Match' 列
    how='left' # 保留左边（df_matches_selected）的所有行
)

# 检查合并结果
nan_quadrant_count = df_merged_matches_info[col_complete_quadrant].isnull().sum()
if nan_quadrant_count > 0:
    print(f"警告: 有 {nan_quadrant_count} 行未能匹配到 '{col_complete_quadrant}' 信息")
    # 显示一些未匹配到象限的Match值
    unmatched_matches = df_merged_matches_info[df_merged_matches_info[col_complete_quadrant].isnull()][col_matches_match].unique()
    print(f"未匹配到象限的Match示例: {unmatched_matches[:3]}")

print(f"成功将 '{col_complete_quadrant}' 合并到匹配结果数据中。")
print(f"  合并后的数据预览 (前5行):\n{df_merged_matches_info.head()}")
print("\n--- 第2步完成 ---")



--- 开始第2步：合并象限信息到匹配结果 ---
象限数据中的Match数量: 0
匹配结果中的Match数量: 0
两者共有的Match数量: 0
成功将 '图表象限' 合并到匹配结果数据中。
  合并后的数据预览 (前5行):
Empty DataFrame
Columns: [Match, Review_Indices, 图表象限]
Index: []

--- 第2步完成 ---


In [5]:
# --- 第3步：合并匹配和象限信息到原始数据 (增强版) ---
import ast  # 确保导入 ast
import re   # 导入正则表达式模块，用于从文件名提取信息
print("\n--- 开始第3步：合并匹配和象限信息到原始数据 ---")

# 从文件名中提取Rating范围信息
def extract_rating_range(filename):
    # 使用正则表达式匹配rating_X_to_Y模式
    rating_pattern = re.search(r'rating_(\d+)_to_(\d+)', filename.lower())
    if rating_pattern:
        min_rating = int(rating_pattern.group(1))
        max_rating = int(rating_pattern.group(2))
        return min_rating, max_rating
    return None, None  # 如果没有找到匹配项，返回None

# 提取当前使用的matches_result_file中的Rating范围
min_rating, max_rating = extract_rating_range(matches_result_file)
if min_rating is not None and max_rating is not None:
    print(f"从文件名中提取的Rating筛选范围: {min_rating} 到 {max_rating}")
else:
    print("未能从文件名中提取Rating筛选范围，将使用所有Rating数据")

# 加载原始数据 (original_data_file)
try:
    df_original = pd.read_excel(original_data_file)
    # 为原始数据添加行号索引列，用于后续合并
    df_original['original_row_index'] = df_original.index
    print(f"成功加载原始数据文件: {original_data_file} ({len(df_original)} 行)")
except Exception as e:
    print(f"加载或处理 {original_data_file} 时出错: {e}")
    exit()

# 加载品牌数据 (asin_data_file)
try:
    df_asin_data = pd.read_excel(asin_data_file)
    print(f"成功加载ASIN数据文件: {asin_data_file} ({len(df_asin_data)} 行)")
    
    # 检查必要的列是否存在
    required_cols = [col_asin_asin, col_asin_brand, col_asin_seller]
    missing_cols = [col for col in required_cols if col not in df_asin_data.columns]
    if missing_cols:
        print(f"警告: ASIN数据文件中缺少以下列: {missing_cols}")
    
    # 显示品牌数据的统计信息
    if col_asin_brand in df_asin_data.columns:
        brand_counts = df_asin_data[col_asin_brand].value_counts().head(10)
        print(f"品牌数据统计 (前10个):\n{brand_counts}")
except Exception as e:
    print(f"加载或处理 {asin_data_file} 时出错: {e}")
    print("将继续处理，但不会包含品牌信息")
    df_asin_data = None

# --- 处理包含列表索引的 matches 数据 (df_merged_matches_info 来自第2步) ---
# 1. 转换字符串列表为实际列表 (如果需要)
if not df_merged_matches_info.empty:
    print(f"处理 '{col_matches_indices}' 列...")
    
    # 检查列是否存在
    if col_matches_indices not in df_merged_matches_info.columns:
        raise ValueError(f"错误：合并后的匹配信息中缺少列 '{col_matches_indices}'")
    
    # 检查第一个非空值的类型
    first_valid_idx = df_merged_matches_info[col_matches_indices].first_valid_index()
    if first_valid_idx is not None:
        first_valid_value = df_merged_matches_info.loc[first_valid_idx, col_matches_indices]
        print(f"'{col_matches_indices}' 列的第一个有效值类型: {type(first_valid_value)}")
        print(f"值示例: {first_valid_value}")
        
        # 如果是字符串，尝试转换为列表
        if isinstance(first_valid_value, str):
            try:
                print(f"尝试将 '{col_matches_indices}' 列的字符串转换为列表...")
                df_merged_matches_info[col_matches_indices] = df_merged_matches_info[col_matches_indices].apply(
                    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
                )
                print("转换成功")
            except Exception as e:
                print(f"错误：转换 '{col_matches_indices}' 列失败: {e}")
                # 显示一些导致错误的值
                problematic_values = df_merged_matches_info[col_matches_indices].head(3).tolist()
                print(f"问题值示例: {problematic_values}")
                exit()

# 2. 展开 (Explode) df_merged_matches_info 数据框
try:
    print(f"展开前的匹配信息数据形状: {df_merged_matches_info.shape}")
    
    # 处理空列表或非列表项
    df_merged_matches_info = df_merged_matches_info.dropna(subset=[col_matches_indices])
    
    # 确保所有值都是列表
    def ensure_list(item):
        if isinstance(item, list):
            return item
        elif isinstance(item, (int, float)):
            return [int(item)]  # 单个数字转为列表
        else:
            return []  # 其他情况返回空列表
    
    df_merged_matches_info[col_matches_indices] = df_merged_matches_info[col_matches_indices].apply(ensure_list)
    
    # 执行 explode
    df_exploded_matches = df_merged_matches_info.explode(col_matches_indices)
    df_exploded_matches = df_exploded_matches.rename(columns={col_matches_indices: 'target_original_index'})
    
    # 转换索引为整数
    df_exploded_matches = df_exploded_matches.dropna(subset=['target_original_index'])
    df_exploded_matches['target_original_index'] = df_exploded_matches['target_original_index'].astype(int)
    
    print(f"展开后得到 {len(df_exploded_matches)} 行待合并数据。")
    print(f"展开后的匹配信息示例 (前3行):\n{df_exploded_matches.head(3)}")
    
    # 检查索引范围是否在原始数据范围内
    max_original_index = df_original['original_row_index'].max()
    max_target_index = df_exploded_matches['target_original_index'].max()
    if max_target_index > max_original_index:
        print(f"警告: 展开后的索引最大值 ({max_target_index}) 超出了原始数据索引范围 (0-{max_original_index})!")
        # 过滤掉超出范围的索引
        df_exploded_matches = df_exploded_matches[df_exploded_matches['target_original_index'] <= max_original_index]
        print(f"过滤后剩余 {len(df_exploded_matches)} 行数据。")
    
except Exception as e:
    print(f"展开 '{col_matches_indices}' 列时出错: {e}")
    exit()

# 3. 将展开后的信息合并回原始数据 df_original
print("合并展开信息到原始数据...")
df_final_merged = pd.merge(
    df_original,
    df_exploded_matches[['target_original_index', col_matches_match, col_complete_quadrant]],
    left_on='original_row_index',
    right_on='target_original_index',
    how='left' # 保留所有原始行
)

# --- 合并品牌数据 ---
# 在第3步合并品牌数据后添加
if df_asin_data is not None and col_original_asin in df_final_merged.columns and col_asin_asin in df_asin_data.columns:
    print(f"合并品牌数据...")
    
    # 检查ASIN列的唯一值数量
    print(f"原始数据中ASIN列的唯一值数量: {df_final_merged[col_original_asin].nunique()}")
    print(f"品牌数据中ASIN列的唯一值数量: {df_asin_data[col_asin_asin].nunique()}")
    
    # 检查有多少ASIN能够匹配
    common_asins = set(df_final_merged[col_original_asin].astype(str).str.lower()) & set(df_asin_data[col_asin_asin].astype(str).str.lower())
    print(f"两个数据集共有的ASIN数量: {len(common_asins)}")
    
    # 确保ASIN列的格式一致 (转换为小写字符串)
    df_final_merged[col_original_asin] = df_final_merged[col_original_asin].astype(str).str.lower()
    df_asin_data[col_asin_asin] = df_asin_data[col_asin_asin].astype(str).str.lower()
    
    # 合并品牌数据
    df_final_merged_before = df_final_merged.copy()
    df_final_merged = pd.merge(
        df_final_merged,
        df_asin_data[[col_asin_asin, col_asin_brand, col_asin_seller]],
        left_on=col_original_asin,
        right_on=col_asin_asin,
        how='left'
    )
    
    # 检查合并前后的行数变化
    print(f"合并前行数: {len(df_final_merged_before)}, 合并后行数: {len(df_final_merged)}")
    
    # 检查合并结果
    brand_matched_count = df_final_merged[col_asin_brand].notna().sum()
    print(f"成功合并品牌数据，{brand_matched_count} 行有品牌信息 ({brand_matched_count/len(df_final_merged)*100:.2f}%)")
    
    # 如果匹配率很低，输出一些未匹配的ASIN示例
    if brand_matched_count/len(df_final_merged) < 0.5:  # 如果匹配率低于50%
        unmatched_asins = df_final_merged[df_final_merged[col_asin_brand].isna()][col_original_asin].sample(min(5, len(df_final_merged))).tolist()
        print(f"未匹配ASIN示例: {unmatched_asins}")
        print(f"检查这些ASIN是否在品牌数据中: {[asin in df_asin_data[col_asin_asin].values for asin in unmatched_asins]}")

    
    # 如果有品牌数据，显示品牌分布
    if brand_matched_count > 0:
        brand_distribution = df_final_merged[col_asin_brand].value_counts().head(10)
        print(f"品牌分布 (前10个):\n{brand_distribution}")
else:
    print("跳过品牌数据合并，因为缺少必要的列或数据")

# 清理辅助列
df_final_merged = df_final_merged.drop(columns=['original_row_index', 'target_original_index'], errors='ignore')
if col_asin_asin in df_final_merged.columns and col_asin_asin != col_original_asin:
    df_final_merged = df_final_merged.drop(columns=[col_asin_asin], errors='ignore')

# 重命名列 (确保最终列名正确)
rename_dict_final = {}
if col_matches_match in df_final_merged.columns and col_matches_match != col_new_match:
    rename_dict_final[col_matches_match] = col_new_match
if col_complete_quadrant in df_final_merged.columns and col_complete_quadrant != col_new_quadrant:
     rename_dict_final[col_complete_quadrant] = col_new_quadrant
if rename_dict_final:
    df_final_merged = df_final_merged.rename(columns=rename_dict_final)

print(f"成功将 '{col_new_match}' 和 '{col_new_quadrant}' 信息合并到原始数据。")
matched_rows_count = df_final_merged[col_new_match].notna().sum()
print(f"  最终数据 {len(df_final_merged)} 行，其中 {matched_rows_count} 行匹配到信息。")

# 验证合并后的数据
print("\n验证合并后的数据:")
if col_new_quadrant in df_final_merged.columns:
    quadrant_counts = df_final_merged[col_new_quadrant].value_counts()
    print(f"各象限数据量:\n{quadrant_counts}")
    
    # 检查每个象限对应的Match
    for quadrant in df_final_merged[col_new_quadrant].dropna().unique():
        matches = df_final_merged[df_final_merged[col_new_quadrant] == quadrant][col_new_match].unique()
        print(f"象限 '{quadrant}' 包含的Match: {matches}")
else:
    print(f"警告: 合并后的数据中没有 '{col_new_quadrant}' 列!")

# 保存最终合并后的数据
try:
    df_final_merged.to_excel(final_merged_output_file, index=False)
    print(f"最终合并后的数据已保存到: {final_merged_output_file}")
except Exception as e:
    print(f"保存最终合并文件时出错: {e}")

print("\n--- 第3步完成 ---")



--- 开始第3步：合并匹配和象限信息到原始数据 ---
从文件名中提取的Rating筛选范围: 3 到 5


成功加载原始数据文件: ./Data\致欧-2025-01-10之后VOC数据.xlsx (2342 行)
成功加载ASIN数据文件: ./Data\zhiou_processed.xlsx (100 行)
品牌数据统计 (前10个):
brand
SONGMICS           9
Kitsure            5
LANTEFUL           5
OYREL              4
Autonomier         3
Simple Trending    3
VASAGLE            3
HOOBRO             3
VILICK             3
ROJASOP            2
Name: count, dtype: int64
展开前的匹配信息数据形状: (0, 3)
展开后得到 0 行待合并数据。
展开后的匹配信息示例 (前3行):
Empty DataFrame
Columns: [Match, target_original_index, 图表象限]
Index: []
合并展开信息到原始数据...
合并品牌数据...
原始数据中ASIN列的唯一值数量: 77
品牌数据中ASIN列的唯一值数量: 100
两个数据集共有的ASIN数量: 77
合并前行数: 2342, 合并后行数: 2342
成功合并品牌数据，2342 行有品牌信息 (100.00%)
品牌分布 (前10个):
brand
LANTEFUL            233
VTRIN               172
FIDUCIAL HOME       152
Sakugi              140
ROMGUAR CRAFT       114
OYREL               110
Simple Houseware    103
HOOBRO              102
INGIORDAR            93
SONGMICS             81
Name: count, dtype: int64
成功将 'Match' 和 '图表象限' 信息合并到原始数据。
  最终数据 2342 行，其中 0 行匹配到信息。

验证合并后的数据:
各象限数据量:
Series

最终合并后的数据已保存到: 生成结果/integrated_analysis_matches/final_merged_original_data_matches.xlsx

--- 第3步完成 ---


In [6]:
# --- 第4步：按 'Match' 统计分析 ---

print("\n--- 开始第4步：按 'Match' 统计分析 ---")

# 筛选出包含有效 'Match' 信息的行进行分析
df_analysis_match = df_final_merged.dropna(subset=[col_new_match])

if df_analysis_match.empty:
    print("没有找到有效的 'Match' 数据进行分析。")
else:
    # 获取所有唯一的 Match 值
    unique_matches = df_analysis_match[col_new_match].unique()
    print(f"将对以下 {len(unique_matches)} 个 'Defect' 进行分析: {unique_matches}")

    # 创建一个 Excel writer 用于保存所有 Match 的分析结果
    match_analysis_file = f"{analysis_output_prefix}by_matches.xlsx"
    with pd.ExcelWriter(match_analysis_file) as writer:
        print(f"\n分析结果将保存在: {match_analysis_file}")

        for match_value in unique_matches:
            print(f"\n--- 分析 Match: {match_value} ---")
            df_subset = df_analysis_match[df_analysis_match[col_new_match] == match_value]
            sheet_name = str(match_value)[:31] # Excel sheet 名长度限制

            analysis_results = {} # 存储当前 Match 的分析结果

            # 1. Rating 分布
            if col_original_rating in df_subset.columns:
                rating_distribution = df_subset[col_original_rating].value_counts().sort_index()
                analysis_results[f'{col_original_rating}_分布'] = rating_distribution
                print(f"  {col_original_rating} 分布:\n{rating_distribution}")
                # 可视化 Rating 分布 (柱状图)
                plt.figure(figsize=(8, 5))
                sns.countplot(x=col_original_rating, data=df_subset, order=sorted(df_subset[col_original_rating].unique()))
                plt.title(f'Match: {match_value} - {col_original_rating} 分布')
                plt.xlabel(col_original_rating)
                plt.ylabel('数量')
                plot_filename = os.path.join(output_dir, f'match_{match_value}_rating_dist.png')
                plt.savefig(plot_filename)
                plt.close()
                print(f"  Rating 分布图已保存至: {plot_filename}")
            else:
                print(f"  警告: 找不到列 '{col_original_rating}'")

            # 2. ASIN 排名分布 (假设是数值型，用描述性统计)
            if col_original_asin_rank in df_subset.columns:
                 # 检查列是否为数值类型
                if pd.api.types.is_numeric_dtype(df_subset[col_original_asin_rank]):
                    asin_rank_stats = df_subset[col_original_asin_rank].describe()
                    analysis_results[f'{col_original_asin_rank}_统计'] = asin_rank_stats
                    print(f"\n  {col_original_asin_rank} 统计:\n{asin_rank_stats}")
                    # 可视化 ASIN 排名分布 (直方图或箱线图)
                    plt.figure(figsize=(8, 5))
                    sns.histplot(df_subset[col_original_asin_rank].dropna(), kde=True)
                    plt.title(f'Match: {match_value} - {col_original_asin_rank} 分布')
                    plt.xlabel(col_original_asin_rank)
                    plt.ylabel('频率')
                    plot_filename = os.path.join(output_dir, f'match_{match_value}_asin_rank_dist.png')
                    plt.savefig(plot_filename)
                    plt.close()
                    print(f"  ASIN排名 分布图已保存至: {plot_filename}")
                else:
                    print(f"  警告: 列 '{col_original_asin_rank}' 不是数值类型，无法计算统计信息。将显示值计数。")
                    asin_rank_counts = df_subset[col_original_asin_rank].value_counts()
                    analysis_results[f'{col_original_asin_rank}_分布'] = asin_rank_counts
                    print(f"\n  {col_original_asin_rank} 值计数:\n{asin_rank_counts.head()}") # 显示前几个
            else:
                print(f"  警告: 找不到列 '{col_original_asin_rank}'")


            # 3. 站点信息分布
            if col_original_site in df_subset.columns:
                site_distribution = df_subset[col_original_site].value_counts()
                analysis_results[f'{col_original_site}_分布'] = site_distribution
                print(f"\n  {col_original_site} 分布:\n{site_distribution}")
                # 可视化站点分布 (柱状图)
                plt.figure(figsize=(10, 6))
                sns.countplot(y=col_original_site, data=df_subset, order=site_distribution.index)
                plt.title(f'Match: {match_value} - {col_original_site} 分布')
                plt.xlabel('数量')
                plt.ylabel(col_original_site)
                plot_filename = os.path.join(output_dir, f'match_{match_value}_site_dist.png')
                plt.savefig(plot_filename)
                plt.close()
                print(f"  站点信息 分布图已保存至: {plot_filename}")
            else:
                print(f"  警告: 找不到列 '{col_original_site}'")

            # 4. 点赞数分布 (假设是数值型)
            if col_original_likes in df_subset.columns:
                 # 检查列是否为数值类型
                if pd.api.types.is_numeric_dtype(df_subset[col_original_likes]):
                    likes_stats = df_subset[col_original_likes].describe()
                    analysis_results[f'{col_original_likes}_统计'] = likes_stats
                    print(f"\n  {col_original_likes} 统计:\n{likes_stats}")
                    # 可视化点赞数分布 (直方图或箱线图)
                    plt.figure(figsize=(8, 5))
                    # 使用对数刻度可能更好，如果分布很偏斜
                    sns.histplot(df_subset[col_original_likes].dropna(), kde=False, bins=30) # 可以调整 bins 数量
                    plt.title(f'Match: {match_value} - {col_original_likes} 分布')
                    plt.xlabel(col_original_likes)
                    plt.ylabel('频率')
                    # plt.xscale('log') # 如果需要对数刻度
                    plot_filename = os.path.join(output_dir, f'match_{match_value}_likes_dist.png')
                    plt.savefig(plot_filename)
                    plt.close()
                    print(f"  点赞数 分布图已保存至: {plot_filename}")
                else:
                    print(f"  警告: 列 '{col_original_likes}' 不是数值类型，无法计算统计信息。将显示值计数。")
                    likes_counts = df_subset[col_original_likes].value_counts()
                    analysis_results[f'{col_original_likes}_分布'] = likes_counts
                    print(f"\n  {col_original_likes} 值计数:\n{likes_counts.head()}")
            else:
                print(f"  警告: 找不到列 '{col_original_likes}'")

            # 将当前 Match 的分析结果写入 Excel 的不同 sheet
            # 为了写入 Excel，需要将 Series 转换为 DataFrame
            start_row = 0
            for name, data in analysis_results.items():
                df_to_write = data.reset_index()
                # 尝试设置列名
                try:
                    if "分布" in name:
                        df_to_write.columns = [data.index.name if data.index.name else 'Value', 'Count']
                    elif "统计" in name:
                         df_to_write.columns = ['Statistic', 'Value']
                except: # 如果转换失败，使用默认列名
                     pass
                df_to_write.to_excel(writer, sheet_name=sheet_name, startrow=start_row, index=False)
                # 写入标题
                pd.DataFrame([name]).to_excel(writer, sheet_name=sheet_name, startrow=start_row, header=False, index=False)
                start_row += len(df_to_write) + 2 # 增加行数，留出空间

print("\n--- 第4步完成 ---")


--- 开始第4步：按 'Match' 统计分析 ---
没有找到有效的 'Match' 数据进行分析。

--- 第4步完成 ---


In [7]:
# --- 第5步：按 '图表象限' 统计分析 (增强版) ---
print("\n--- 开始第5步：按 '图表象限' 统计分析 ---")
# 检查数据情况
print(f"df_final_merged 形状: {df_final_merged.shape}")
print(f"df_final_merged 中有象限信息的行数: {df_final_merged[col_new_quadrant].notna().sum()}")
print(f"df_final_merged 中有品牌信息的行数: {df_final_merged[col_asin_brand].notna().sum() if col_asin_brand in df_final_merged.columns else 0}")
print(f"df_final_merged 中同时有象限和品牌信息的行数: {df_final_merged.dropna(subset=[col_new_quadrant, col_asin_brand]).shape[0] if col_asin_brand in df_final_merged.columns else 0}")

# 检查象限分布
if col_new_quadrant in df_final_merged.columns:
    quadrant_counts = df_final_merged[col_new_quadrant].value_counts()
    print(f"象限分布:\n{quadrant_counts}")

# 检查品牌分布
if col_asin_brand in df_final_merged.columns:
    brand_counts = df_final_merged[col_asin_brand].value_counts().head(5)
    print(f"前5个品牌分布:\n{brand_counts}")

# 确保只分析有效的象限数据
if col_new_quadrant not in df_final_merged.columns:
    print(f"错误: 合并后的数据中没有 '{col_new_quadrant}' 列，无法进行象限分析")
    print("请检查前面的数据处理步骤是否正确")
    exit()

df_analysis_quadrant = df_final_merged.dropna(subset=[col_new_quadrant])
print(f"筛选出包含有效象限信息的行数: {len(df_analysis_quadrant)}")

# 如果从文件名中提取到了Rating范围，应用这个筛选条件
if min_rating is not None and max_rating is not None and col_original_rating in df_analysis_quadrant.columns:
    # 应用Rating筛选
    df_analysis_quadrant = df_analysis_quadrant[(df_analysis_quadrant[col_original_rating] >= min_rating) & 
                                               (df_analysis_quadrant[col_original_rating] <= max_rating)]
    print(f"应用Rating {min_rating}-{max_rating} 筛选后，剩余 {len(df_analysis_quadrant)} 行数据")

# 获取所有唯一的图表象限值
unique_quadrants = df_analysis_quadrant[col_new_quadrant].unique()
print(f"将对以下 {len(unique_quadrants)} 个 '图表象限' 进行分析: {unique_quadrants}")

# 创建一个 Excel writer 用于保存所有象限的分析结果
quadrant_analysis_file = f"{analysis_output_prefix}by_quadrant.xlsx"
with pd.ExcelWriter(quadrant_analysis_file) as writer:
    print(f"\n分析结果将保存在: {quadrant_analysis_file}")
    
    # 添加一个总览表，显示每个象限的数据量
    quadrant_counts = df_analysis_quadrant[col_new_quadrant].value_counts().reset_index()
    quadrant_counts.columns = ['象限', '数据量']
    quadrant_counts.to_excel(writer, sheet_name='象限数据量概览', index=False)
    print(f"象限数据量概览:\n{quadrant_counts}")
    
    # 添加一个品牌总体分布表
    if col_asin_brand in df_analysis_quadrant.columns:
        brand_overall = df_analysis_quadrant[col_asin_brand].value_counts().reset_index()
        brand_overall.columns = ['品牌', '数量']
        brand_overall['占比'] = brand_overall['数量'] / brand_overall['数量'].sum() * 100
        brand_overall['占比'] = brand_overall['占比'].round(2).astype(str) + '%'
        brand_overall.to_excel(writer, sheet_name='品牌总体分布', index=False)
        print(f"品牌总体分布 (前5个):\n{brand_overall.head()}")
        
        # 创建品牌与象限的交叉表
        brand_quadrant_cross = pd.crosstab(
            df_analysis_quadrant[col_asin_brand], 
            df_analysis_quadrant[col_new_quadrant],
            margins=True,
            margins_name='总计'
        )
        brand_quadrant_cross.to_excel(writer, sheet_name='品牌象限交叉表')
        print("已添加品牌与象限的交叉表")
    
    for quadrant_value in unique_quadrants:
        print(f"\n--- 分析 图表象限: {quadrant_value} ---")
        
        # 正确筛选当前象限的数据
        df_subset = df_analysis_quadrant[df_analysis_quadrant[col_new_quadrant] == quadrant_value]
        print(f"  该象限包含 {len(df_subset)} 行数据")
        
        # 清理象限值作为 sheet 名称 (移除特殊字符等)
        sheet_name = str(quadrant_value).replace('/', '-').replace('\\', '-').replace('?', '').replace('*', '')[:31]
        analysis_results = {} # 存储当前象限的分析结果
        
        # 1. Rating 分布
        if col_original_rating in df_subset.columns:
            rating_distribution = df_subset[col_original_rating].value_counts().sort_index()
            analysis_results[f'{col_original_rating}_分布'] = rating_distribution
            print(f"  {col_original_rating} 分布:\n{rating_distribution}")
            
            # 可视化
            plt.figure(figsize=(8, 5))
            sns.countplot(x=col_original_rating, data=df_subset, order=sorted(df_subset[col_original_rating].unique()))
            plt.title(f'象限: {quadrant_value} - {col_original_rating} 分布')
            plt.xlabel(col_original_rating)
            plt.ylabel('数量')
            plot_filename = os.path.join(output_dir, f'quadrant_{sheet_name}_rating_dist.png')
            plt.savefig(plot_filename)
            plt.close()
            print(f"  Rating 分布图已保存至: {plot_filename}")
        
        # 2. 品牌分布 (新增)
        if col_asin_brand in df_subset.columns:
            # 计算品牌分布
            brand_distribution = df_subset[col_asin_brand].value_counts()
            analysis_results[f'{col_asin_brand}_分布'] = brand_distribution
            print(f"  {col_asin_brand} 分布 (前5个):\n{brand_distribution.head()}")
            
            # 可视化品牌分布 (只显示前10个品牌，其他归为"其他")
            plt.figure(figsize=(10, 6))
            top_brands = brand_distribution.nlargest(10)
            if len(brand_distribution) > 10:
                other_count = brand_distribution[~brand_distribution.index.isin(top_brands.index)].sum()
                top_brands = pd.concat([top_brands, pd.Series({'其他': other_count})])
            
            # 创建横向条形图
            ax = sns.barplot(x=top_brands.values, y=top_brands.index, orient='h')
            plt.title(f'象限: {quadrant_value} - 品牌分布 (前10)')
            plt.xlabel('数量')
            plt.ylabel('品牌')
            
            # 添加数值标签
            for i, v in enumerate(top_brands.values):
                ax.text(v + 0.1, i, str(v), va='center')
            
            plot_filename = os.path.join(output_dir, f'quadrant_{sheet_name}_brand_dist.png')
            plt.tight_layout()
            plt.savefig(plot_filename)
            plt.close()
            print(f"  品牌分布图已保存至: {plot_filename}")
            
            # 计算品牌占比
            brand_percentage = (brand_distribution / brand_distribution.sum() * 100).round(2)
            analysis_results[f'{col_asin_brand}_占比'] = brand_percentage
            print(f"  {col_asin_brand} 占比 (前5个):\n{brand_percentage.head()}")
            
            # 可视化品牌占比 (饼图，只显示前7个品牌，其他归为"其他")
            plt.figure(figsize=(10, 8))
            top_brands_pct = brand_percentage.nlargest(7)
            if len(brand_percentage) > 7:
                other_pct = brand_percentage[~brand_percentage.index.isin(top_brands_pct.index)].sum()
                top_brands_pct = pd.concat([top_brands_pct, pd.Series({'其他': other_pct})])
            
            plt.pie(top_brands_pct.values, labels=top_brands_pct.index, autopct='%1.1f%%', startangle=90)
            plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
            plt.title(f'象限: {quadrant_value} - 品牌占比')
            
            plot_filename = os.path.join(output_dir, f'quadrant_{sheet_name}_brand_pie.png')
            plt.savefig(plot_filename)
            plt.close()
            print(f"  品牌占比饼图已保存至: {plot_filename}")
        else:
            print(f"  警告: 找不到列 '{col_asin_brand}'，无法分析品牌分布")
        
        # 3. ASIN 排名分布
        if col_original_asin_rank in df_subset.columns:
            # 检查列是否为数值类型
            if pd.api.types.is_numeric_dtype(df_subset[col_original_asin_rank]):
                asin_rank_stats = df_subset[col_original_asin_rank].describe()
                analysis_results[f'{col_original_asin_rank}_统计'] = asin_rank_stats
                print(f"\n  {col_original_asin_rank} 统计:\n{asin_rank_stats}")
                
                # 可视化 ASIN 排名分布 (直方图或箱线图)
                plt.figure(figsize=(8, 5))
                sns.histplot(df_subset[col_original_asin_rank].dropna(), kde=True)
                plt.title(f'象限: {quadrant_value} - {col_original_asin_rank} 分布')
                plt.xlabel(col_original_asin_rank)
                plt.ylabel('频率')
                plot_filename = os.path.join(output_dir, f'quadrant_{sheet_name}_asin_rank_dist.png')
                plt.savefig(plot_filename)
                plt.close()
                print(f"  ASIN排名 分布图已保存至: {plot_filename}")
                
                # 添加箱线图
                plt.figure(figsize=(8, 5))
                sns.boxplot(y=df_subset[col_original_asin_rank].dropna())
                plt.title(f'象限: {quadrant_value} - {col_original_asin_rank} 箱线图')
                plt.ylabel(col_original_asin_rank)
                plot_filename = os.path.join(output_dir, f'quadrant_{sheet_name}_asin_rank_box.png')
                plt.savefig(plot_filename)
                plt.close()
                print(f"  ASIN排名 箱线图已保存至: {plot_filename}")
            else:
                print(f"  警告: 列 '{col_original_asin_rank}' 不是数值类型，无法计算统计量")
                asin_rank_counts = df_subset[col_original_asin_rank].value_counts()
                analysis_results[f'{col_original_asin_rank}_分布'] = asin_rank_counts
                print(f"\n  {col_original_asin_rank} 值计数:\n{asin_rank_counts.head()}")
        else:
            print(f"  警告: 找不到列 '{col_original_asin_rank}'")
        
        # 4. 站点信息分布
        if col_original_site in df_subset.columns:
            site_distribution = df_subset[col_original_site].value_counts()
            analysis_results[f'{col_original_site}_分布'] = site_distribution
            print(f"\n  {col_original_site} 分布:\n{site_distribution}")
            
            # 可视化站点分布 (柱状图)
            plt.figure(figsize=(10, 6))
            sns.countplot(y=col_original_site, data=df_subset, order=site_distribution.index)
            plt.title(f'象限: {quadrant_value} - {col_original_site} 分布')
            plt.xlabel('数量')
            plt.ylabel(col_original_site)
            plot_filename = os.path.join(output_dir, f'quadrant_{sheet_name}_site_dist.png')
            plt.savefig(plot_filename)
            plt.close()
            print(f"  站点信息 分布图已保存至: {plot_filename}")
            
            # 添加饼图
            plt.figure(figsize=(10, 8))
            site_percentage = (site_distribution / site_distribution.sum() * 100).round(2)
            plt.pie(site_distribution.values, labels=site_distribution.index, autopct='%1.1f%%', startangle=90)
            plt.axis('equal')
            plt.title(f'象限: {quadrant_value} - {col_original_site} 占比')
            plot_filename = os.path.join(output_dir, f'quadrant_{sheet_name}_site_pie.png')
            plt.savefig(plot_filename)
            plt.close()
            print(f"  站点信息 占比饼图已保存至: {plot_filename}")
        else:
            print(f"  警告: 找不到列 '{col_original_site}'")
        
        # 5. 点赞数分布
        if col_original_likes in df_subset.columns:
            # 检查列是否为数值类型
            if pd.api.types.is_numeric_dtype(df_subset[col_original_likes]):
                likes_stats = df_subset[col_original_likes].describe()
                analysis_results[f'{col_original_likes}_统计'] = likes_stats
                print(f"\n  {col_original_likes} 统计:\n{likes_stats}")
                
                # 可视化点赞数分布 (直方图)
                plt.figure(figsize=(8, 5))
                sns.histplot(df_subset[col_original_likes].dropna(), kde=False, bins=20)
                plt.title(f'象限: {quadrant_value} - {col_original_likes} 分布')
                plt.xlabel(col_original_likes)
                plt.ylabel('频率')
                plot_filename = os.path.join(output_dir, f'quadrant_{sheet_name}_likes_dist.png')
                plt.savefig(plot_filename)
                plt.close()
                print(f"  点赞数 分布图已保存至: {plot_filename}")
                
                # 对于点赞数，可能存在很多0值，可以添加一个非零点赞数的分布
                non_zero_likes = df_subset[df_subset[col_original_likes] > 0][col_original_likes]
                if len(non_zero_likes) > 0:
                    plt.figure(figsize=(8, 5))
                    sns.histplot(non_zero_likes, kde=True, bins=20)
                    plt.title(f'象限: {quadrant_value} - 非零{col_original_likes}分布')
                    plt.xlabel(f'非零{col_original_likes}')
                    plt.ylabel('频率')
                    plot_filename = os.path.join(output_dir, f'quadrant_{sheet_name}_non_zero_likes_dist.png')
                    plt.savefig(plot_filename)
                    plt.close()
                    print(f"  非零点赞数 分布图已保存至: {plot_filename}")
                    
                    # 添加非零点赞数统计
                    non_zero_likes_stats = non_zero_likes.describe()
                    analysis_results[f'非零{col_original_likes}_统计'] = non_zero_likes_stats
                    print(f"\n  非零{col_original_likes} 统计:\n{non_zero_likes_stats}")
            else:
                print(f"  警告: 列 '{col_original_likes}' 不是数值类型，无法计算统计量")
                likes_counts = df_subset[col_original_likes].value_counts()
                analysis_results[f'{col_original_likes}_分布'] = likes_counts
                print(f"\n  {col_original_likes} 值计数:\n{likes_counts.head()}")
        else:
            print(f"  警告: 找不到列 '{col_original_likes}'")
        
        # 6. 交叉分析: Rating与品牌
        if col_original_rating in df_subset.columns and col_asin_brand in df_subset.columns:
            # 创建Rating与品牌的交叉表
            rating_brand_cross = pd.crosstab(
                df_subset[col_original_rating], 
                df_subset[col_asin_brand],
                margins=True,
                margins_name='总计'
            )
            
            # 如果品牌太多，只保留前10个
            if rating_brand_cross.shape[1] > 11:  # 10个品牌+1个总计
                top_brands = df_subset[col_asin_brand].value_counts().nlargest(10).index
                selected_columns = list(top_brands) + ['总计']
                rating_brand_cross = rating_brand_cross[selected_columns]
            
            analysis_results[f'{col_original_rating}与{col_asin_brand}_交叉表'] = rating_brand_cross
            print(f"\n  {col_original_rating}与{col_asin_brand}交叉表 (部分):\n{rating_brand_cross.head()}")
        
        # 7. 交叉分析: 站点与品牌 (如果数据足够)
        if col_original_site in df_subset.columns and col_asin_brand in df_subset.columns and len(df_subset) >= 10:
            # 创建站点与品牌的交叉表
            site_brand_cross = pd.crosstab(
                df_subset[col_original_site], 
                df_subset[col_asin_brand],
                margins=True,
                margins_name='总计'
            )
            
            # 如果品牌太多，只保留前10个
            if site_brand_cross.shape[1] > 11:  # 10个品牌+1个总计
                top_brands = df_subset[col_asin_brand].value_counts().nlargest(10).index
                selected_columns = list(top_brands) + ['总计']
                site_brand_cross = site_brand_cross[selected_columns]
            
            analysis_results[f'{col_original_site}与{col_asin_brand}_交叉表'] = site_brand_cross
            print(f"\n  {col_original_site}与{col_asin_brand}交叉表 (部分):\n{site_brand_cross.head()}")
        
        # 将当前象限的分析结果写入 Excel
        start_row = 0
        for name, data in analysis_results.items():
            df_to_write = data.reset_index() if hasattr(data, 'reset_index') else pd.DataFrame(data)
            try:
                if "分布" in name:
                    df_to_write.columns = [data.index.name if data.index.name else '值', '数量']
                elif "占比" in name:
                    df_to_write.columns = [data.index.name if data.index.name else '值', '占比(%)']
                elif "统计" in name:
                    df_to_write.columns = ['统计量', '值']
                elif "交叉表" in name:
                    # 交叉表已经有列名，不需要修改
                    pass
            except Exception as e:
                print(f"  警告: 设置列名时出错: {e}")
            
            try:
                df_to_write.to_excel(writer, sheet_name=sheet_name, startrow=start_row+1, index=False if "交叉表" not in name else True)
                pd.DataFrame([name]).to_excel(writer, sheet_name=sheet_name, startrow=start_row, index=False, header=False)
                start_row += len(df_to_write) + 3  # 增加一行空行
            except Exception as e:
                print(f"  警告: 写入Excel时出错: {e}")
                # 如果是因为数据太大，尝试写入一个单独的文件
                try:
                    detail_file = os.path.join(output_dir, f'quadrant_{sheet_name}_{name.replace(" ", "_")}.xlsx')
                    df_to_write.to_excel(detail_file, index=False if "交叉表" not in name else True)
                    print(f"  数据太大，已保存到单独文件: {detail_file}")
                except:
                    print(f"  无法保存数据 {name}")

print("\n--- 第5步完成 ---")



--- 开始第5步：按 '图表象限' 统计分析 ---
df_final_merged 形状: (2342, 24)
df_final_merged 中有象限信息的行数: 0
df_final_merged 中有品牌信息的行数: 2342
df_final_merged 中同时有象限和品牌信息的行数: 0
象限分布:
Series([], Name: count, dtype: int64)
前5个品牌分布:
brand
LANTEFUL         233
VTRIN            172
FIDUCIAL HOME    152
Sakugi           140
ROMGUAR CRAFT    114
Name: count, dtype: int64
筛选出包含有效象限信息的行数: 0
应用Rating 3-5 筛选后，剩余 0 行数据
将对以下 0 个 '图表象限' 进行分析: []

分析结果将保存在: 生成结果/integrated_analysis_matches/analysis_by_quadrant.xlsx
象限数据量概览:
Empty DataFrame
Columns: [象限, 数据量]
Index: []
品牌总体分布 (前5个):
Empty DataFrame
Columns: [品牌, 数量, 占比]
Index: []
已添加品牌与象限的交叉表

--- 第5步完成 ---


In [8]:
# --- 第6步：品牌专项分析 ---
print("\n--- 开始第6步：品牌专项分析 ---")

# 检查数据情况
print(f"df_final_merged 形状: {df_final_merged.shape}")
print(f"df_final_merged 中有品牌信息的行数: {df_final_merged[col_asin_brand].notna().sum() if col_asin_brand in df_final_merged.columns else 0}")

# 确保有品牌数据
if col_asin_brand not in df_final_merged.columns:
    print(f"错误: 合并后的数据中没有 '{col_asin_brand}' 列，无法进行品牌分析")
    print("请检查前面的数据处理步骤是否正确")
    exit()

# 筛选有效数据
df_brand_analysis = df_final_merged.dropna(subset=[col_asin_brand])
print(f"筛选出包含有效品牌信息的行数: {len(df_brand_analysis)}")
print(f"品牌数量: {df_brand_analysis[col_asin_brand].nunique()}")
print(f"前5个品牌及其数量:\n{df_brand_analysis[col_asin_brand].value_counts().head()}")

# 如果有象限信息，进行品牌-象限交叉分析
if col_new_quadrant in df_brand_analysis.columns:
    df_brand_with_quadrant = df_brand_analysis.dropna(subset=[col_new_quadrant])
    print(f"同时包含品牌和象限信息的行数: {len(df_brand_with_quadrant)}")
    
    # 创建一个 Excel writer 用于保存品牌分析结果
    brand_analysis_file = f"{analysis_output_prefix}brand_analysis.xlsx"
    with pd.ExcelWriter(brand_analysis_file) as writer:
        print(f"\n分析结果将保存在: {brand_analysis_file}")
        
        # 1. 品牌总体分布
        brand_overall = df_brand_analysis[col_asin_brand].value_counts().reset_index()
        brand_overall.columns = ['品牌', '数量']
        brand_overall['占比'] = brand_overall['数量'] / brand_overall['数量'].sum() * 100
        brand_overall['占比'] = brand_overall['占比'].round(2).astype(str) + '%'
        brand_overall.to_excel(writer, sheet_name='品牌总体分布', index=False)
        print(f"品牌总体分布 (前5个):\n{brand_overall.head()}")
        
        # 2. 品牌-象限交叉表
        brand_quadrant_cross = pd.crosstab(
            df_brand_with_quadrant[col_asin_brand], 
            df_brand_with_quadrant[col_new_quadrant],
            margins=True,
            margins_name='总计'
        )
        brand_quadrant_cross.to_excel(writer, sheet_name='品牌象限交叉表')
        print("已添加品牌与象限的交叉表")
        
        # 3. 品牌-Rating交叉表
        if col_original_rating in df_brand_analysis.columns:
            brand_rating_cross = pd.crosstab(
                df_brand_analysis[col_asin_brand], 
                df_brand_analysis[col_original_rating],
                margins=True,
                margins_name='总计'
            )
            brand_rating_cross.to_excel(writer, sheet_name='品牌Rating交叉表')
            print("已添加品牌与Rating的交叉表")
        
        # 4. 每个品牌的详细分析
        top_brands = df_brand_analysis[col_asin_brand].value_counts().nlargest(10).index
        print(f"将对前10个品牌进行详细分析: {top_brands.tolist()}")
        
        for brand in top_brands:
            print(f"\n--- 分析品牌: {brand} ---")
            df_brand_subset = df_brand_analysis[df_brand_analysis[col_asin_brand] == brand]
            
            # 清理品牌名作为sheet名
            sheet_name = str(brand).replace('/', '-').replace('\\', '-').replace('?', '').replace('*', '')[:31]
            
            # 4.1 象限分布
            if col_new_quadrant in df_brand_subset.columns:
                quadrant_dist = df_brand_subset[col_new_quadrant].value_counts()
                quadrant_dist_df = quadrant_dist.reset_index()
                quadrant_dist_df.columns = ['象限', '数量']
                quadrant_dist_df['占比'] = (quadrant_dist_df['数量'] / quadrant_dist_df['数量'].sum() * 100).round(2)
                quadrant_dist_df.to_excel(writer, sheet_name=sheet_name, startrow=0, index=False)
                print(f"  象限分布:\n{quadrant_dist}")
                
                # 可视化象限分布
                plt.figure(figsize=(10, 6))
                ax = sns.barplot(x=quadrant_dist.values, y=quadrant_dist.index, orient='h')
                plt.title(f'品牌: {brand} - 象限分布')
                plt.xlabel('数量')
                plt.ylabel('象限')
                
                # 添加数值标签
                for i, v in enumerate(quadrant_dist.values):
                    ax.text(v + 0.1, i, str(v), va='center')
                
                plot_filename = os.path.join(output_dir, f'brand_{sheet_name}_quadrant_dist.png')
                plt.tight_layout()
                plt.savefig(plot_filename)
                plt.close()
                print(f"  象限分布图已保存至: {plot_filename}")
            
            # 4.2 Rating分布
            if col_original_rating in df_brand_subset.columns:
                rating_dist = df_brand_subset[col_original_rating].value_counts().sort_index()
                rating_dist_df = rating_dist.reset_index()
                rating_dist_df.columns = ['Rating', '数量']
                rating_dist_df['占比'] = (rating_dist_df['数量'] / rating_dist_df['数量'].sum() * 100).round(2)
                rating_dist_df.to_excel(writer, sheet_name=sheet_name, startrow=len(quadrant_dist)+3 if col_new_quadrant in df_brand_subset.columns else 0, index=False)
                print(f"  Rating分布:\n{rating_dist}")
                
                # 可视化Rating分布
                plt.figure(figsize=(8, 5))
                sns.countplot(x=col_original_rating, data=df_brand_subset, order=sorted(df_brand_subset[col_original_rating].unique()))
                plt.title(f'品牌: {brand} - Rating分布')
                plt.xlabel('Rating')
                plt.ylabel('数量')
                plot_filename = os.path.join(output_dir, f'brand_{sheet_name}_rating_dist.png')
                plt.savefig(plot_filename)
                plt.close()
                print(f"  Rating分布图已保存至: {plot_filename}")
            
            # 4.3 Match分布
            if col_new_match in df_brand_subset.columns:
                match_dist = df_brand_subset[col_new_match].value_counts()
                if not match_dist.empty:
                    match_dist_df = match_dist.reset_index()
                    match_dist_df.columns = ['Match', '数量']
                    match_dist_df['占比'] = (match_dist_df['数量'] / match_dist_df['数量'].sum() * 100).round(2)
                    
                    # 计算起始行
                    start_row = 0
                    if col_new_quadrant in df_brand_subset.columns:
                        start_row += len(quadrant_dist) + 3
                    if col_original_rating in df_brand_subset.columns:
                        start_row += len(rating_dist) + 3
                    
                    match_dist_df.to_excel(writer, sheet_name=sheet_name, startrow=start_row, index=False)
                    print(f"  Match分布 (前3个):\n{match_dist.head(3)}")

print("\n--- 第6步完成 ---")



--- 开始第6步：品牌专项分析 ---
df_final_merged 形状: (2342, 24)
df_final_merged 中有品牌信息的行数: 2342
筛选出包含有效品牌信息的行数: 2342
品牌数量: 52
前5个品牌及其数量:
brand
LANTEFUL         233
VTRIN            172
FIDUCIAL HOME    152
Sakugi           140
ROMGUAR CRAFT    114
Name: count, dtype: int64
同时包含品牌和象限信息的行数: 0

分析结果将保存在: 生成结果/integrated_analysis_matches/analysis_brand_analysis.xlsx
品牌总体分布 (前5个):
              品牌   数量     占比
0       LANTEFUL  233  9.95%
1          VTRIN  172  7.34%
2  FIDUCIAL HOME  152  6.49%
3         Sakugi  140  5.98%
4  ROMGUAR CRAFT  114  4.87%
已添加品牌与象限的交叉表
已添加品牌与Rating的交叉表
将对前10个品牌进行详细分析: ['LANTEFUL', 'VTRIN', 'FIDUCIAL HOME', 'Sakugi', 'ROMGUAR CRAFT', 'OYREL', 'Simple Houseware', 'HOOBRO', 'INGIORDAR', 'SONGMICS']

--- 分析品牌: LANTEFUL ---
  象限分布:
Series([], Name: count, dtype: int64)


  象限分布图已保存至: 生成结果/integrated_analysis_matches/brand_LANTEFUL_quadrant_dist.png
  Rating分布:
Rating
1     56
2     12
3     22
4     23
5    120
Name: count, dtype: int64




  Rating分布图已保存至: 生成结果/integrated_analysis_matches/brand_LANTEFUL_rating_dist.png

--- 分析品牌: VTRIN ---
  象限分布:
Series([], Name: count, dtype: int64)
  象限分布图已保存至: 生成结果/integrated_analysis_matches/brand_VTRIN_quadrant_dist.png
  Rating分布:
Rating
1    38
2    21
3    12
4    16
5    85
Name: count, dtype: int64


  Rating分布图已保存至: 生成结果/integrated_analysis_matches/brand_VTRIN_rating_dist.png

--- 分析品牌: FIDUCIAL HOME ---


  象限分布:
Series([], Name: count, dtype: int64)
  象限分布图已保存至: 生成结果/integrated_analysis_matches/brand_FIDUCIAL HOME_quadrant_dist.png
  Rating分布:
Rating
1    62
2    16
3    15
4    16
5    43
Name: count, dtype: int64




  Rating分布图已保存至: 生成结果/integrated_analysis_matches/brand_FIDUCIAL HOME_rating_dist.png

--- 分析品牌: Sakugi ---
  象限分布:
Series([], Name: count, dtype: int64)
  象限分布图已保存至: 生成结果/integrated_analysis_matches/brand_Sakugi_quadrant_dist.png
  Rating分布:
Rating
1    39
2    21
3    17
4    12
5    51
Name: count, dtype: int64
  Rating分布图已保存至: 生成结果/integrated_analysis_matches/brand_Sakugi_rating_dist.png

--- 分析品牌: ROMGUAR CRAFT ---
  象限分布:
Series([], Name: count, dtype: int64)




  象限分布图已保存至: 生成结果/integrated_analysis_matches/brand_ROMGUAR CRAFT_quadrant_dist.png
  Rating分布:
Rating
1    14
2    10
3    14
4    19
5    57
Name: count, dtype: int64
  Rating分布图已保存至: 生成结果/integrated_analysis_matches/brand_ROMGUAR CRAFT_rating_dist.png

--- 分析品牌: OYREL ---
  象限分布:
Series([], Name: count, dtype: int64)
  象限分布图已保存至: 生成结果/integrated_analysis_matches/brand_OYREL_quadrant_dist.png
  Rating分布:
Rating
1    13
2     4
3     8
4    15
5    70
Name: count, dtype: int64




  Rating分布图已保存至: 生成结果/integrated_analysis_matches/brand_OYREL_rating_dist.png

--- 分析品牌: Simple Houseware ---
  象限分布:
Series([], Name: count, dtype: int64)
  象限分布图已保存至: 生成结果/integrated_analysis_matches/brand_Simple Houseware_quadrant_dist.png
  Rating分布:
Rating
1     9
2     4
3     4
4    10
5    76
Name: count, dtype: int64




  Rating分布图已保存至: 生成结果/integrated_analysis_matches/brand_Simple Houseware_rating_dist.png

--- 分析品牌: HOOBRO ---
  象限分布:
Series([], Name: count, dtype: int64)
  象限分布图已保存至: 生成结果/integrated_analysis_matches/brand_HOOBRO_quadrant_dist.png
  Rating分布:
Rating
1     4
2     6
3     6
4    13
5    73
Name: count, dtype: int64
  Rating分布图已保存至: 生成结果/integrated_analysis_matches/brand_HOOBRO_rating_dist.png

--- 分析品牌: INGIORDAR ---
  象限分布:
Series([], Name: count, dtype: int64)




  象限分布图已保存至: 生成结果/integrated_analysis_matches/brand_INGIORDAR_quadrant_dist.png
  Rating分布:
Rating
1     7
2     5
3     2
4    11
5    68
Name: count, dtype: int64
  Rating分布图已保存至: 生成结果/integrated_analysis_matches/brand_INGIORDAR_rating_dist.png

--- 分析品牌: SONGMICS ---
  象限分布:
Series([], Name: count, dtype: int64)




  象限分布图已保存至: 生成结果/integrated_analysis_matches/brand_SONGMICS_quadrant_dist.png
  Rating分布:
Rating
1     6
2     1
3     4
4     8
5    62
Name: count, dtype: int64
  Rating分布图已保存至: 生成结果/integrated_analysis_matches/brand_SONGMICS_rating_dist.png

--- 第6步完成 ---
