In [1]:
# --------------------------------------------------
# Cell 1: 导入库并设置项目结构
# --------------------------------------------------
import copy
import os
import re
import sys
import warnings
from datetime import datetime
from functools import reduce
from pathlib import Path
from time import perf_counter
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.font_manager import FontProperties

# 忽略来自 openpyxl 的特定 UserWarning
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

font_path = "微软雅黑.ttf"

try:
    chinese_font = FontProperties(fname=font_path)
except FileNotFoundError:
    print("Font file not found. Please provide the correct path.")
    # Fallback to a generic font if the file is not found
    chinese_font = FontProperties()


warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")


# --- 项目路径设置  ---
# .
# ├── 1_data_preprocess.ipynb
# └── 报告数据/
#     ├── 输入/
#     │   ├── 安监数据/              (存放老师给的原始Excel文件；10家公司，10个文件)
#     │   └── basic_data.xlsx          (城市信息、线路信息)
#     ├── 输出/                      (存放所有最终生成的报告)
#     ├── temp/
#     │   ├── 1_待上传猪猪云数据/        (需要逐个手动上传到猪猪云的文件；8家公司，16个文件，排除顺丰和中通)
#     │   ├── 2_猪猪云下载数据/          (【手动放入】存放从猪猪云下载的结果文件；8家公司，16个文件，排除顺丰和中通)
#     │   ├── 3_猪猪云合并数据/         （猪猪云下载数据按公司合并后数据；8家公司，8个文件，排除顺丰和中通）
#     │   ├── 4_logistics数据         （存放logistics数据——提取完整物流信息的时间戳后的数据；8家公司，8个文件，排除顺丰和中通）
#     └── └── 5_中转数据/               (存放中转数据——提取中转城市和平均中转次数后的数据；8家公司，8个文件，排除顺丰和中通)
# 根目录
base_path = Path.cwd()
report_path = base_path / "报告数据"
# 输入路径
input_path = report_path / "输入"
anjian_data_path = input_path / "安监数据"
base_data_path = input_path / "basic_data.xlsx"
# 输出路径
output_path = report_path / "输出"
# 中间过程文件路径（自动创建，用于存放临时文件）
temp_path = report_path / "temp"
upload_split_path = temp_path / "1_待上传猪猪云文件"  # 存放拆分后待上传的文件
zhuzhuyun_download_path = temp_path / "2_猪猪云下载数据"  # 关键：这是手动放置文件的目录
zhuzhuyun_merge_path = temp_path / "3_猪猪云合并数据"
pycharm_input_path = temp_path / "4_logistics数据"
transit_data_path = temp_path / "5_中转数据"

# 创建所有需要的文件夹
for p in [
    report_path,
    input_path,
    anjian_data_path,
    zhuzhuyun_download_path,
    zhuzhuyun_merge_path,
    transit_data_path,
    output_path,
    temp_path,
    upload_split_path,
    pycharm_input_path,
]:
    p.mkdir(exist_ok=True)

In [5]:
# --------------------------------------------------
# Cell 2: 汇总数据
# --------------------------------------------------
# --- 路径设置 ---
base_path = Path.cwd()
report_path = base_path / "报告数据"
input_dir = report_path / "输出" / "data_analysis_result"
output_dir = report_path / "输出" / "1_汇总数据"
output_dir.mkdir(parents=True, exist_ok=True)


def extract_metrics_from_basic_sheet(df, kilo_params):
    """
    从“基础指标”DataFrame中直接提取所有需要的指标。
    """
    if df.empty:
        return pd.Series(dtype="float64")

    metrics_table = df.set_index("项目")
    results = {}

    time_limit_cols = [
        "全程时限",
        "寄出地处理时限",
        "运输时限",
        "寄达地处理时限",
        "投递时限",
    ]
    on_time_cols = ["72小时准时率", "48小时准时率"]

    for col in time_limit_cols + on_time_cols:
        results[col] = metrics_table.loc[col, "mean"]

    for param in kilo_params:
        results[param] = metrics_table.loc["全程时限", param]

    return pd.Series(results)


def main(companies, params):
    """
    主函数，处理所有公司的文件并生成汇总报告。
    (此函数逻辑正确，无需修改)
    """
    all_metrics_cols = [
        "全程时限",
        "寄出地处理时限",
        "运输时限",
        "寄达地处理时限",
        "投递时限",
        "72小时准时率",
        "48小时准时率",
    ] + params

    summary_df = pd.DataFrame(index=companies, columns=all_metrics_cols)

    print(f"⭕️ 开始处理数据，读取目录为：'{input_dir}'")

    for company in companies:
        # 文件名是根据公司名动态生成的
        file_name = f"{company}_data_analysis_result.xlsx"
        # 文件路径是 输入目录 + 文件名
        file_path = input_dir / file_name

        try:
            print(f"   - 正在处理公司: {company}，文件: {file_name}...")

            # 读取“基础指标”sheet
            basic_metrics_df = pd.read_excel(file_path, sheet_name="基础指标")

            # 从基础指标中直接提取
            metrics = extract_metrics_from_basic_sheet(basic_metrics_df, params)

            # 填充到汇总DataFrame
            summary_df.loc[company, metrics.index] = metrics

        except FileNotFoundError:
            print(f"   ⚠️ 警告：未找到文件 {file_path}，跳过该公司。")
        except KeyError as e:
            print(
                f"   ❌ 错误: 在文件 {file_path} 的'基础指标'Sheet中未找到关键项目: {e}。"
            )
        except Exception as e:
            print(f"   ❌ 错误：处理文件 {file_path} 时发生错误: {e}")

    summary_df = summary_df.apply(pd.to_numeric, errors="coerce")
    return summary_df.round(4)


if __name__ == "__main__":
    companies_list = [
        "顺丰",
        "EMS",
        "中通",
        "极兔",
        "韵达",
        "圆通",
        "京东",
        "申通",
        "德邦",
        "邮政",
    ]
    kilometer_params = ["0-600", "600-1500", "1500-2500", "2500以上"]

    final_summary = main(companies_list, kilometer_params)

    output_file = output_dir / "月度汇总数据.xlsx"
    final_summary.to_excel(output_file)

    print("\n" + "=" * 50)
    print(f"✅ 任务完成！")
    print(f"月度汇总数据已保存至: ‘{output_file}’")
    print("=" * 50)

⭕️ 开始处理数据，读取目录为：'/Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/输出/data_analysis_result'
   - 正在处理公司: 顺丰，文件: 顺丰_data_analysis_result.xlsx...
   - 正在处理公司: EMS，文件: EMS_data_analysis_result.xlsx...
   - 正在处理公司: 中通，文件: 中通_data_analysis_result.xlsx...
   - 正在处理公司: 极兔，文件: 极兔_data_analysis_result.xlsx...
   - 正在处理公司: 韵达，文件: 韵达_data_analysis_result.xlsx...
   - 正在处理公司: 圆通，文件: 圆通_data_analysis_result.xlsx...
   - 正在处理公司: 京东，文件: 京东_data_analysis_result.xlsx...
   - 正在处理公司: 申通，文件: 申通_data_analysis_result.xlsx...
   - 正在处理公司: 德邦，文件: 德邦_data_analysis_result.xlsx...
   - 正在处理公司: 邮政，文件: 邮政_data_analysis_result.xlsx...

✅ 任务完成！
月度汇总数据已保存至: ‘/Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/输出/1_汇总数据/月度汇总数据.xlsx’


In [None]:
# --------------------------------------------------
# Cell 3: 通达兔专题分析
# --------------------------------------------------
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# --- 路径设置 ---
ROOT_PATH = Path.cwd()
DATA_ANALYSIS_DIR = ROOT_PATH / "报告数据" / "输出" / "data_analysis_result"
OUTPUT_DIR = ROOT_PATH / "报告数据" / "输出" / "5_通达兔数据"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
BASIC_DATA_PATH = ROOT_PATH / "报告数据" / "输入" / "basic_data.xlsx"


# --- 公司和城市列表 ---
def load_city_list_from_excel(file_path, sheet_name):
    try:
        if not file_path.exists():
            print(
                f"❌ 严重错误：基础数据文件 'basic_data.xlsx' 未在以下路径找到:\n{file_path}"
            )
            return None
        df_cities = pd.read_excel(file_path, sheet_name=sheet_name, header=0)
        if df_cities.columns[0] not in df_cities.columns:
            print(f"❌ 严重错误: 在Sheet '{sheet_name}' 中未找到城市列。")
            return None
        city_list = (
            df_cities.iloc[:, 0].dropna().astype(str).str.strip().unique().tolist()
        )
        if not city_list:
            print(
                f"❌ 严重错误：在 '{file_path.name}' 的 '{sheet_name}' sheet页中未能加载到任何城市数据。"
            )
            return None
        print(f"✅ 成功从 'basic_data.xlsx' 加载 {len(city_list)} 个城市。")
        return city_list
    except Exception as e:
        print(f"❌ 严重错误：加载城市列表时发生意外错误: {e}")
        return None


# 公司列表定义
COMPANY_FILE_MAP = {
    "中通": "中通",
    "圆通": "圆通",
    "极兔": "极兔",
    "申通": "申通",
    "韵达": "韵达",
    "顺丰": "顺丰",
    "京东": "京东",
    "EMS": "EMS",
    "德邦": "德邦",
    "快包": "邮政",
}
COMPANIES_FOR_MACRO_ANALYSIS = [c for c in COMPANY_FILE_MAP.keys() if c != "快包"]
COMPANIES_TONGDATU = ["圆通", "申通", "中通", "极兔", "韵达"]
CITY_LIST = None


# ==============================================================================
# Part 1: 分路线时限分析 (逻辑不变)
# ==============================================================================
def run_route_analysis(city_list):
    print("\n==========================================================")
    print("➡️ Part 1: 开始执行分路线时限分析 (RouteAnalysis)")
    print("==========================================================")
    city_routes = [f"{c1}-{c2}" for c1 in city_list for c2 in city_list if c1 != c2]
    aggregated_results_df = pd.DataFrame({"路线": city_routes})
    required_time_cols = [
        "揽收时间",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
        "签收时间",
        "寄出城市",
        "寄达城市",
    ]
    for company in COMPANIES_TONGDATU:
        file_prefix = COMPANY_FILE_MAP.get(company)
        file_path = DATA_ANALYSIS_DIR / f"{file_prefix}_data_analysis_result.xlsx"
        if not file_path.exists():
            continue
        print(f"  - 正在处理: {company}")
        try:
            df = pd.read_excel(file_path, sheet_name="线路详细数据")
        except Exception as e:
            print(f"  - ⚠️ 警告: 读取文件 {file_path.name} 失败: {e}")
            continue
        if not all(col in df.columns for col in required_time_cols):
            print(
                f"  - ⚠️ 警告: {file_path.name} 的'线路详细数据'sheet缺少必要的时间列，跳过 Part 1 的 {company} 分析。"
            )
            continue
        for col in required_time_cols:
            if "时间" in col:
                df[col] = pd.to_datetime(df[col], errors="coerce")
        df.dropna(
            subset=[col for col in required_time_cols if "时间" in col], inplace=True
        )
        if df.empty:
            continue
        df["路线"] = (
            df["寄出城市"].astype(str).str.strip()
            + "-"
            + df["寄达城市"].astype(str).str.strip()
        )
        df[f"揽收-到达寄出地分拣中心（小时）{company}"] = (
            df["到达分拣中心时间"] - df["揽收时间"]
        ) / np.timedelta64(1, "h")
        df[f"到达寄出地分拣中心-离开寄达地分拣中心（小时）{company}"] = (
            df["离开收件城市分拣中心时间"] - df["到达分拣中心时间"]
        ) / np.timedelta64(1, "h")
        df[f"离开寄达地分拣中心-签收（小时）{company}"] = (
            df["签收时间"] - df["离开收件城市分拣中心时间"]
        ) / np.timedelta64(1, "h")
        company_metrics = [
            f"揽收-到达寄出地分拣中心（小时）{company}",
            f"到达寄出地分拣中心-离开寄达地分拣中心（小时）{company}",
            f"离开寄达地分拣中心-签收（小时）{company}",
        ]
        company_agg = df.groupby("路线")[company_metrics].mean().reset_index()
        aggregated_results_df = pd.merge(
            aggregated_results_df, company_agg, on="路线", how="left"
        )
    metric_templates = [
        "揽收-到达寄出地分拣中心（小时）",
        "到达寄出地分拣中心-离开寄达地分拣中心（小时）",
        "离开寄达地分拣中心-签收（小时）",
    ]
    for metric in metric_templates:
        company_cols = [
            f"{metric}{comp}"
            for comp in COMPANIES_TONGDATU
            if f"{metric}{comp}" in aggregated_results_df.columns
        ]
        if company_cols:
            aggregated_results_df[f"{metric}通达兔均值"] = aggregated_results_df[
                company_cols
            ].mean(axis=1)
            aggregated_results_df[f"{metric}通达兔最优"] = aggregated_results_df[
                company_cols
            ].min(axis=1)
    print("  ✅ [Part 1] 分路线时限分析完成。")
    return aggregated_results_df


# ==============================================================================
# Part 2: 宏观比例/时长分析 (按新原则修改)
# ==============================================================================
def run_macro_ratio_analysis():
    print("\n==========================================================")
    print("➡️ Part 2: 开始执行宏观比例/时长分析 (MacroRatioAnalysis)")
    print("==========================================================")
    all_time_cols = [
        "揽收时间",
        "到达分拣中心时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "离开收件城市分拣中心时间",
        "签收时间",
    ]
    results = []

    for company_key in COMPANIES_FOR_MACRO_ANALYSIS:
        file_prefix = COMPANY_FILE_MAP.get(company_key, company_key)
        file_path = DATA_ANALYSIS_DIR / f"{file_prefix}_data_analysis_result.xlsx"
        if not file_path.exists():
            continue
        print(f"  - 正在处理: {company_key}")
        try:
            df = pd.read_excel(file_path, sheet_name="线路详细数据")
        except Exception as e:
            print(f"  - ⚠️ 警告: 读取文件 {file_path.name} 失败: {e}")
            continue
        if df.empty or not all(col in df.columns for col in all_time_cols):
            print(
                f"  - ⚠️ 警告: {file_path.name} 的'线路详细数据'sheet不完整，跳过 Part 2 的 {company_key} 分析。"
            )
            continue
        for col in all_time_cols:
            df[col] = pd.to_datetime(df[col], errors="coerce")
        df.dropna(subset=all_time_cols, inplace=True)
        if df.empty:
            continue

        company_metrics = {"公司": company_key}
        company_metrics["揽件和进分拨中心同一天比例"] = (
            df["揽收时间"].dt.date == df["到达分拣中心时间"].dt.date
        ).mean()
        company_metrics["从揽件到离开寄出地在12小时之内的比例"] = (
            (df["离开寄件城市时间"] - df["揽收时间"]) / np.timedelta64(1, "h") < 12
        ).mean()
        handling_time_out = (
            df["离开寄件城市时间"] - df["到达分拣中心时间"]
        ) / np.timedelta64(1, "h")
        handling_time_in = (
            df["离开收件城市分拣中心时间"] - df["到达收件城市时间"]
        ) / np.timedelta64(1, "h")
        company_metrics["寄出地分拣中心处理时长"] = handling_time_out.mean()
        company_metrics["寄达地分拣中心处理时长"] = handling_time_in.mean()
        company_metrics["寄达地分拨中心处理超过12小时比例"] = (
            handling_time_in > 12
        ).mean()
        results.append(company_metrics)

    if not results:
        print("  - ❌ 错误: 未能从任何公司文件中计算出宏观指标。")
        return pd.DataFrame()

    df_final = pd.DataFrame(results).set_index("公司").T

    # 计算通达兔均值和最优值
    higher_is_better_metrics = [
        "揽件和进分拨中心同一天比例",
        "从揽件到离开寄出地在12小时之内的比例",
    ]
    lower_is_better_metrics = [
        "寄出地分拣中心处理时长",
        "寄达地分拣中心处理时长",
        "寄达地分拨中心处理超过12小时比例",
    ]

    tongdatu_cols = df_final.columns.intersection(COMPANIES_TONGDATU).tolist()
    if tongdatu_cols:
        df_final["通达兔均值"] = df_final[tongdatu_cols].mean(axis=1)

        # (核心修正) 在赋值前，先筛选出实际存在的指标
        existing_higher_metrics = [
            m for m in higher_is_better_metrics if m in df_final.index
        ]
        if existing_higher_metrics:
            df_final.loc[existing_higher_metrics, "通达兔最优值"] = df_final.loc[
                existing_higher_metrics, tongdatu_cols
            ].max(axis=1)

        existing_lower_metrics = [
            m for m in lower_is_better_metrics if m in df_final.index
        ]
        if existing_lower_metrics:
            df_final.loc[existing_lower_metrics, "通达兔最优值"] = df_final.loc[
                existing_lower_metrics, tongdatu_cols
            ].min(axis=1)

    print("  ✅ [Part 2] 宏观指标分析完成。")
    return df_final


if __name__ == "__main__":
    print("🚀 开始执行数据分析工作流...")
    CITY_LIST = load_city_list_from_excel(BASIC_DATA_PATH, "50_focus_cities")
    if CITY_LIST is None:
        print("\n🛑 由于无法加载城市列表，工作流已中止。")
        sys.exit(1)

    df_detail = run_route_analysis(CITY_LIST)
    df_summary = run_macro_ratio_analysis()

    final_output_path = OUTPUT_DIR / "通达兔分段分析.xlsx"
    print(f"\n... 正在整合结果到Excel文件 ...")
    with pd.ExcelWriter(final_output_path, engine="openpyxl") as writer:
        if not df_summary.empty:
            df_summary.to_excel(writer, sheet_name="summary_tongdatu", index=True)
            print(f"  - Sheet 'summary_tongdatu' 已写入。")
        else:
            print("  - ⚠️ 警告: 宏观分析结果为空，未写入 'summary_tongdatu' sheet。")

        if not df_detail.empty:
            df_detail.to_excel(writer, sheet_name="detail_tongdatu", index=False)
            print(f"  - Sheet 'detail_tongdatu' 已写入。")
        else:
            print("  - ⚠️ 警告: 路线分析结果为空，未写入 'detail_tongdatu' sheet。")
    print(f"\n🎉🎉🎉 所有任务已全部完成！最终报告已保存至: {final_output_path} 🎉🎉🎉")

🚀 开始执行数据分析工作流...
✅ 成功从 'basic_data.xlsx' 加载 50 个城市。

➡️ Part 1: 开始执行分路线时限分析 (RouteAnalysis)
  - 正在处理: 圆通
  - 正在处理: 申通
  - 正在处理: 中通
  - 正在处理: 极兔
  - 正在处理: 韵达
  ✅ [Part 1] 分路线时限分析完成。

➡️ Part 2: 开始执行宏观比例/时长分析 (MacroRatioAnalysis)
  - 正在处理: 中通
  - 正在处理: 圆通
  - 正在处理: 极兔
  - 正在处理: 申通
  - 正在处理: 韵达
  - 正在处理: 顺丰
  - 正在处理: 京东
  - 正在处理: EMS
  - 正在处理: 德邦
  ✅ [Part 2] 宏观指标分析完成。

... 正在整合结果到Excel文件 ...
  - Sheet 'summary_tongdatu' 已写入。
  - Sheet 'detail_tongdatu' 已写入。

🎉🎉🎉 所有任务已全部完成！最终报告已保存至: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/输出/5_通达兔数据/通达兔分段分析.xlsx 🎉🎉🎉


In [None]:
# ==============================================================================
# Cell 4: 数据中台生成 (最终正确版 - 统一口径)
# ==============================================================================
import traceback
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd

# --- 0. 全局配置与辅助函数 ---
COMPANY_MAPPING = {
    "EMS": "EMS",
    "德邦": "德邦",
    "极兔": "极兔",
    "圆通": "圆通",
    "顺丰": "顺丰",
    "中通": "中通",
    "京东": "京东",
    "韵达": "韵达",
    "申通": "申通",
    "邮政": "快包",
    "快包": "快包",
}

# 明确定义各个公司列表的用途
COMPANIES_FOR_INDUSTRY_COMPARISON = [
    "EMS",
    "中通",
    "京东",
    "圆通",
    "德邦",
    "极兔",
    "申通",
    "韵达",
    "顺丰",
]
COMPANIES_NINE_MAJOR = COMPANIES_FOR_INDUSTRY_COMPARISON
COMPANIES_ALL_TEN = COMPANIES_FOR_INDUSTRY_COMPARISON + ["快包"]


def _find_company_key_from_filename(filename):
    name = Path(filename).stem.replace("_data_analysis_result", "")
    sorted_keys = sorted(COMPANY_MAPPING.keys(), key=len, reverse=True)
    for keyword in sorted_keys:
        if keyword in name:
            return COMPANY_MAPPING[keyword]
    return None


# --- 1. 核心功能: 主报告生成 ---
def generate_main_report():
    print("--- 任务开始：生成最终线路明细报告 ---")

    base_path = Path.cwd()
    report_path = base_path / "报告数据"
    output_path = report_path / "输出"
    data_analysis_path = output_path / "data_analysis_result"
    base_data_path = report_path / "输入" / "basic_data.xlsx"

    print("\n[1/4] 动态构建线路全集并合并基础信息...")
    all_routes = set()
    files_in_analysis_result = list(
        data_analysis_path.glob("*_data_analysis_result.xlsx")
    )
    if not files_in_analysis_result:
        print(
            f"🔥🔥🔥 错误：在路径 '{data_analysis_path.resolve()}' 中未找到任何 '*_data_analysis_result.xlsx' 文件。"
        )
        return None
    for file_path in files_in_analysis_result:
        try:
            all_routes.update(
                pd.read_excel(
                    file_path,
                    sheet_name="线路汇总数据",
                    usecols=["路线"],
                    engine="openpyxl",
                )["路线"].unique()
            )
        except zipfile.BadZipFile:
            print(
                f" -> 警告: 文件 {file_path.name} 已损坏或不是有效的Excel文件，已跳过。"
            )
            continue
        except Exception as e:
            print(f" -> 警告: 读取文件 {file_path.name} 路线列表失败: {e}")
    if not all_routes:
        print("🔥🔥🔥 错误：未能从任何文件中构建线路列表，流程终止。")
        return None
    df_result = pd.DataFrame(list(all_routes), columns=["路线"])
    try:
        df_base_info = pd.read_excel(
            base_data_path, sheet_name="inter-city_routes", engine="openpyxl"
        ).rename(columns={"公里": "线路里程", "经济圈": "城市圈"})
        cols_to_merge = [
            "寄出省份",
            "寄出城市",
            "寄达省份",
            "寄达城市",
            "路线",
            "线路里程",
            "城市圈",
        ]
        df_result = pd.merge(
            df_result,
            df_base_info[[c for c in cols_to_merge if c in df_base_info.columns]],
            on="路线",
            how="left",
        )
    except Exception as e:
        print(f" -> 🔥🔥🔥 错误：合并基础信息失败: {e}")

    print("\n[2/4] 从'线路汇总数据'提取所有预计算指标...")
    metrics_to_extract = {
        "快递数量": "快递数量",
        "全程时限": "全程时限",
        "寄出地处理时限": "寄出地处理时限",
        "运输时限": "运输时限",
        "寄达地处理时限": "寄达地处理时限",
        "投递时限": "投递时限",
        "揽收-到达寄出地分拣中心时长": "揽收-到达寄出地分拣中心时长",
        "到达寄出地分拣中心-离开寄出地城市时长": "到达寄出地分拣中心-离开寄出地城市时长",
        "到达寄达地城市-离开寄达地分拣中心时长": "到达寄达地城市-离开寄达地分拣中心时长",
        "离开寄达地分拣中心-派件": "离开寄达地分拣中心-派件",
        "72小时准时率": "72小时准时率",
        "48小时准时率": "48小时准时率",
        "送达天数_80分位": "送达天数",
        "中转次数": "平均中转次数",
    }
    for file_path in files_in_analysis_result:
        company_key = _find_company_key_from_filename(file_path.name)
        if not company_key:
            continue
        try:
            df_summary = pd.read_excel(
                file_path, sheet_name="线路汇总数据", engine="openpyxl"
            )
            for source_col, target_metric in metrics_to_extract.items():
                if source_col in df_summary.columns:
                    if source_col == "中转次数":
                        new_col_name = f"{company_key}{target_metric}"
                    else:
                        new_col_name = f"{target_metric}{company_key}"

                    df_metric = (
                        df_summary[["路线", source_col]]
                        .copy()
                        .rename(columns={source_col: new_col_name})
                    )
                    df_result = pd.merge(df_result, df_metric, on="路线", how="left")
        except zipfile.BadZipFile:
            print(
                f" -> 警告: 文件 {file_path.name} 已损坏或不是有效的Excel文件，已跳过。"
            )
            continue
        except Exception as e:
            print(f" -> 错误: 处理文件 {file_path.name} 时出错: {e}")

    print("\n[3/4] 计算各项指标的统计值...")
    all_transfer_cols = [
        f"{comp}平均中转次数"
        for comp in COMPANIES_ALL_TEN
        if f"{comp}平均中转次数" in df_result.columns
    ]
    if all_transfer_cols:
        cols_for_best_turnover = [c for c in all_transfer_cols if "快包" not in c]
        if cols_for_best_turnover:
            df_result["最优中转次数"] = df_result[cols_for_best_turnover].min(axis=1)

    stat_metrics = [
        "快递数量",
        "48小时准时率",
        "72小时准时率",
        "全程时限",
        "寄出地处理时限",
        "运输时限",
        "寄达地处理时限",
        "投递时限",
        "送达天数",
    ]
    for metric in stat_metrics:
        if metric == "快递数量":
            total_qty_cols = [
                f"快递数量{comp}"
                for comp in COMPANIES_ALL_TEN
                if f"快递数量{comp}" in df_result.columns
            ]
            if total_qty_cols:
                df_result[f"快递数量_total"] = df_result[total_qty_cols].sum(axis=1)
            continue
        metric_cols = [
            f"{metric}{comp}"
            for comp in COMPANIES_FOR_INDUSTRY_COMPARISON
            if f"{metric}{comp}" in df_result.columns
        ]
        if not metric_cols:
            continue
        df_result[f"{metric}_average"] = df_result[metric_cols].mean(axis=1)
        df_result[f"{metric}_minimum"] = df_result[metric_cols].min(axis=1)
        df_result[f"{metric}_maximum"] = df_result[metric_cols].max(axis=1)

    print("\n[4/4] 计算排名...")
    rank_metrics = [
        "48小时准时率",
        "72小时准时率",
        "全程时限",
        "寄出地处理时限",
        "运输时限",
        "寄达地处理时限",
        "投递时限",
        "送达天数",
    ]
    for param in rank_metrics:
        is_desc = "准时率" in param
        ems_cols = [
            f"{param}{comp}"
            for comp in COMPANIES_NINE_MAJOR
            if f"{param}{comp}" in df_result.columns
        ]
        if ems_cols and f"{param}EMS" in df_result.columns:
            df_result[f"{param}_ems排名"] = df_result[ems_cols].rank(
                axis=1, method="min", ascending=not is_desc
            )[f"{param}EMS"]
        kb_cols = [
            f"{param}{comp}"
            for comp in COMPANIES_ALL_TEN
            if comp != "EMS" and f"{param}{comp}" in df_result.columns
        ]
        if kb_cols and f"{param}快包" in df_result.columns:
            df_result[f"{param}_快包排名"] = df_result[kb_cols].rank(
                axis=1, method="min", ascending=not is_desc
            )[f"{param}快包"]

    return df_result


def calculate_weighted_summary(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """对给定的DataFrame按指定列进行分组，并计算加权平均值。"""
    print(f"  - 正在按 '{', '.join(group_cols)}' 进行加权平均聚合...")

    metrics_to_weight = [
        "全程时限",
        "寄出地处理时限",
        "运输时限",
        "寄达地处理时限",
        "投递时限",
        "揽收-到达寄出地分拣中心时长",
        "到达寄出地分拣中心-离开寄出地城市时长",
        "到达寄达地城市-离开寄达地分拣中心时长",
        "离开寄达地分拣中心-派件",
        "72小时准时率",
        "48小时准时率",
        "送达天数",
        "平均中转次数",
    ]

    summary_rows = []
    for name, group in df.groupby(group_cols):
        row = dict(zip(group_cols, name if isinstance(name, tuple) else [name]))
        for comp in COMPANIES_ALL_TEN:
            weight_col = f"快递数量{comp}"
            if weight_col not in group.columns:
                continue
            total_weight = group[weight_col].sum()
            row[weight_col] = total_weight
            if total_weight > 0:
                for metric in metrics_to_weight:
                    metric_col = f"{metric}{comp}"
                    if metric == "平均中转次数":
                        metric_col = f"{comp}{metric}"
                    if metric_col in group.columns:
                        metric_values = pd.to_numeric(
                            group[metric_col], errors="coerce"
                        )
                        weight_values = pd.to_numeric(
                            group[weight_col], errors="coerce"
                        )
                        weighted_sum = (metric_values * weight_values).sum()
                        row[metric_col] = weighted_sum / total_weight
            else:
                for metric in metrics_to_weight:
                    metric_col = f"{metric}{comp}"
                    if metric == "平均中转次数":
                        metric_col = f"{comp}{metric}"
                    if metric_col in group.columns:
                        row[metric_col] = np.nan
        summary_rows.append(row)
    summary_df = pd.DataFrame(summary_rows)

    # 重新计算行业统计值
    for metric in metrics_to_weight:
        source_metric_name = metric
        metric_cols = [
            f"{source_metric_name}{comp}"
            for comp in COMPANIES_FOR_INDUSTRY_COMPARISON
            if f"{source_metric_name}{comp}" in summary_df.columns
        ]

        if not metric_cols:
            continue
        summary_df[f"{source_metric_name}_average"] = summary_df[metric_cols].mean(
            axis=1
        )
        summary_df[f"{source_metric_name}_minimum"] = summary_df[metric_cols].min(
            axis=1
        )
        summary_df[f"{source_metric_name}_maximum"] = summary_df[metric_cols].max(
            axis=1
        )

    # 重新计算排名
    rank_metrics = [
        "48小时准时率",
        "72小时准时率",
        "全程时限",
        "寄出地处理时限",
        "运输时限",
        "寄达地处理时限",
        "投递时限",
        "送达天数",
    ]
    for param in rank_metrics:
        source_param_name = param
        is_desc = "准时率" in param
        ems_cols = [
            f"{source_param_name}{comp}"
            for comp in COMPANIES_NINE_MAJOR
            if f"{source_param_name}{comp}" in summary_df.columns
        ]
        if ems_cols and f"{source_param_name}EMS" in summary_df.columns:
            summary_df[f"{source_param_name}_ems排名"] = summary_df[ems_cols].rank(
                axis=1, method="min", ascending=not is_desc
            )[f"{source_param_name}EMS"]
        kb_cols = [
            f"{source_param_name}{comp}"
            for comp in COMPANIES_ALL_TEN
            if comp != "EMS" and f"{source_param_name}{comp}" in summary_df.columns
        ]
        if kb_cols and f"{source_param_name}快包" in summary_df.columns:
            summary_df[f"{source_param_name}_快包排名"] = summary_df[kb_cols].rank(
                axis=1, method="min", ascending=not is_desc
            )[f"{source_param_name}快包"]
    return summary_df


def main():
    base_path = Path.cwd()
    report_path = base_path / "报告数据"
    output_path = report_path / "输出"
    output_path.mkdir(parents=True, exist_ok=True)
    final_report_path = output_path / "分析总报告.xlsx"
    print(f"◎◎◎ 分析流程启动 ◎◎◎")
    df_main_report = generate_main_report()
    if df_main_report is None:
        print("\n❌❌❌ 由于主报告生成失败，流程终止。")
        return

    print("\n- 正在清理完全为空的指标列...")
    df_main_report.dropna(axis=1, how="all", inplace=True)
    print("- 清理完成。")

    print(f"\n--- 正在生成数据中台报告: {final_report_path} ---")
    try:
        with pd.ExcelWriter(final_report_path, engine="xlsxwriter") as writer:
            print("  - 正在写入'最终线路明细结果' Sheet...")

            df_main_to_write = df_main_report.copy()
            for col in df_main_to_write.columns:
                if (
                    "送达天数" in col
                    and "排名" not in col
                    and df_main_to_write[col].dtype != "object"
                ):
                    df_main_to_write[col] = pd.to_numeric(
                        df_main_to_write[col], errors="coerce"
                    ).round()
                    df_main_to_write[col] = df_main_to_write[col].apply(
                        lambda x: f"T+{int(x)}" if pd.notna(x) else x
                    )

            # 动态生成列顺序
            base_info_cols = [
                "寄出省份",
                "寄出城市",
                "寄达省份",
                "寄达城市",
                "路线",
                "线路里程",
                "城市圈",
            ]
            metrics_ordered = [
                "快递数量",
                "全程时限",
                "寄出地处理时限",
                "运输时限",
                "寄达地处理时限",
                "投递时限",
                "揽收-到达寄出地分拣中心时长",
                "到达寄出地分拣中心-离开寄出地城市时长",
                "到达寄达地城市-离开寄达地分拣中心时长",
                "离开寄达地分拣中心-派件",
                "72小时准时率",
                "48小时准时率",
            ]
            all_companies_ordered_cols = []
            for comp in COMPANIES_ALL_TEN:
                for metric in metrics_ordered:
                    all_companies_ordered_cols.append(f"{metric}{comp}")

            final_cols = base_info_cols + all_companies_ordered_cols
            for comp in COMPANIES_ALL_TEN:
                final_cols.append(f"送达天数{comp}")
            for comp in COMPANIES_ALL_TEN:
                final_cols.append(f"{comp}平均中转次数")
            final_cols.append("最优中转次数")

            stat_rank_cols = [col for col in df_main_to_write.columns if "_" in col]
            final_cols.extend(stat_rank_cols)

            final_cols_exist = [
                col for col in final_cols if col in df_main_to_write.columns
            ]
            final_cols_exist += [
                col for col in df_main_to_write.columns if col not in final_cols_exist
            ]

            df_main_to_write[final_cols_exist].to_excel(
                writer, sheet_name="最终线路明细结果", index=False
            )

            # 寄出地汇总
            origin_group_cols = ["寄出省份", "寄出城市"]
            if all(c in df_main_report.columns for c in origin_group_cols):
                df_origin_summary = calculate_weighted_summary(
                    df_main_report, origin_group_cols
                )
                print("  - 正在写入'寄出地汇总' Sheet...")
                df_origin_summary_to_write = df_origin_summary.copy()
                df_origin_summary_to_write.dropna(axis=1, how="all", inplace=True)
                for col in df_origin_summary_to_write.columns:
                    if (
                        "送达天数" in col
                        and "排名" not in col
                        and df_origin_summary_to_write[col].dtype != "object"
                    ):
                        df_origin_summary_to_write[col] = pd.to_numeric(
                            df_origin_summary_to_write[col], errors="coerce"
                        ).round()
                        df_origin_summary_to_write[col] = df_origin_summary_to_write[
                            col
                        ].apply(lambda x: f"T+{int(x)}" if pd.notna(x) else x)
                df_origin_summary_to_write.to_excel(
                    writer, sheet_name="寄出地汇总", index=False
                )

            # 寄达地汇总
            dest_group_cols = ["寄达省份", "寄达城市"]
            if all(c in df_main_report.columns for c in dest_group_cols):
                # 【修复】: 在main函数内定义metrics_to_weight以解决作用域问题
                metrics_to_weight = [
                    "全程时限",
                    "寄出地处理时限",
                    "运输时限",
                    "寄达地处理时限",
                    "投递时限",
                    "揽收-到达寄出地分拣中心时长",
                    "到达寄出地分拣中心-离开寄出地城市时长",
                    "到达寄达地城市-离开寄达地分拣中心时长",
                    "离开寄达地分拣中心-派件",
                    "72小时准时率",
                    "48小时准时率",
                    "送达天数",
                    "平均中转次数",
                ]

                needed_cols = dest_group_cols.copy()
                all_metrics_and_qty = metrics_to_weight + ["快递数量"]
                for comp in COMPANIES_ALL_TEN:
                    for metric in all_metrics_and_qty:
                        if metric == "平均中转次数":
                            col_name = f"{comp}{metric}"
                        else:
                            col_name = f"{metric}{comp}"
                        if col_name in df_main_report.columns:
                            needed_cols.append(col_name)

                df_dest_subset = df_main_report[needed_cols]
                df_dest_temp = df_dest_subset.rename(
                    columns={"寄达省份": "寄出省份", "寄达城市": "寄出城市"}
                )

                df_dest_summary = calculate_weighted_summary(
                    df_dest_temp, origin_group_cols
                )
                df_dest_summary.rename(
                    columns={"寄出省份": "寄达省份", "寄出城市": "寄达城市"},
                    inplace=True,
                )
                print("  - 正在写入'寄达地汇总' Sheet...")
                df_dest_summary_to_write = df_dest_summary.copy()
                df_dest_summary_to_write.dropna(axis=1, how="all", inplace=True)
                for col in df_dest_summary_to_write.columns:
                    if (
                        "送达天数" in col
                        and "排名" not in col
                        and df_dest_summary_to_write[col].dtype != "object"
                    ):
                        df_dest_summary_to_write[col] = pd.to_numeric(
                            df_dest_summary_to_write[col], errors="coerce"
                        ).round()
                        df_dest_summary_to_write[col] = df_dest_summary_to_write[
                            col
                        ].apply(lambda x: f"T+{int(x)}" if pd.notna(x) else x)
                df_dest_summary_to_write.to_excel(
                    writer, sheet_name="寄达地汇总", index=False
                )

        print("--- ✓ 数据中台生成完毕 (包含汇总表) ---")
    except Exception as e:
        print(f" -> 🔥🔥🔥 写入数据中台失败: {e}")
        print(traceback.format_exc())
    print("\n🎉🎉🎉 恭喜！数据中台任务已全部执行完毕！🎉🎉🎉")


if __name__ == "__main__":
    main()

◎◎◎ 分析流程启动 ◎◎◎
--- 任务开始：生成最终线路明细报告 ---

[1/4] 动态构建线路全集并合并基础信息...

[2/4] 从'线路汇总数据'提取所有预计算指标...

[3/4] 计算各项指标的统计值...

[4/4] 计算排名...

- 正在清理完全为空的指标列...
- 清理完成。

--- 正在生成数据中台报告: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/输出/分析总报告.xlsx ---
  - 正在写入'最终线路明细结果' Sheet...
  - 正在按 '寄出省份, 寄出城市' 进行加权平均聚合...
  - 正在写入'寄出地汇总' Sheet...
  - 正在按 '寄出省份, 寄出城市' 进行加权平均聚合...
  - 正在写入'寄达地汇总' Sheet...
--- ✓ 数据中台生成完毕 (包含汇总表) ---

🎉🎉🎉 恭喜！数据中台任务已全部执行完毕！🎉🎉🎉


In [14]:
# ==============================================================================
# Cell 5: 中通月报生成
# ==============================================================================
from pathlib import Path

import numpy as np
import pandas as pd

# --- 重新定义必要的全局变量和函数 ---
base_path = Path.cwd()
report_path = base_path / "报告数据"
input_path = report_path / "输入"
output_path = report_path / "输出"
base_data_path = input_path / "basic_data.xlsx"

COMPANY_MAPPING = {
    "EMS": "EMS",
    "德邦": "德邦",
    "极兔": "极兔",
    "圆通": "圆通",
    "顺丰": "顺丰",
    "中通": "中通",
    "京东": "京东",
    "韵达": "韵达",
    "申通": "申通",
    "邮政": "快包",
    "快包": "快包",
}
COMPANIES_NINE_MAJOR = [
    "EMS",
    "中通",
    "京东",
    "圆通",
    "德邦",
    "极兔",
    "申通",
    "韵达",
    "顺丰",
]
COMPANIES_TONGDATU = ["圆通", "申通", "中通", "极兔", "韵达"]


def _find_company_key_from_filename(filename):
    name = Path(filename).stem.replace("_data_analysis_result", "")
    for keyword, key in COMPANY_MAPPING.items():
        if keyword in name:
            return key
    return None


# --- 计算所有公司达成率的辅助函数 ---
def _calculate_all_achievement_rates(df_standard):
    """
    计算所有公司在所有标准线路上的达成率
    基于“揽收时间”和“签收时间”的日历日差异计算
    T+0 为当天达，T+1 为次日达。
    """
    print("  -> 开始计算所有公司的达成率 (基于日历日)...")
    data_analysis_path = output_path / "data_analysis_result"
    df_all_rates = df_standard[["路线"]].copy()

    START_DATE_COL = "揽收时间"
    END_DATE_COL = "签收时间"

    for file_path in data_analysis_path.glob("*_data_analysis_result.xlsx"):
        company_key = _find_company_key_from_filename(file_path.name)
        if not company_key:
            continue
        print(f"    - 正在计算 '{company_key}' 的达成率...")
        try:
            df_detail = pd.read_excel(file_path, sheet_name="线路详细数据")
            df_detail["路线"] = df_detail["寄出城市"] + "-" + df_detail["寄达城市"]

            if (
                START_DATE_COL not in df_detail.columns
                or END_DATE_COL not in df_detail.columns
            ):
                print(
                    f"      -> 警告：文件 '{file_path.name}' 缺少 '{START_DATE_COL}' 或 '{END_DATE_COL}' 列，已跳过。"
                )
                continue

            df_detail[START_DATE_COL] = pd.to_datetime(
                df_detail[START_DATE_COL], errors="coerce"
            )
            df_detail[END_DATE_COL] = pd.to_datetime(
                df_detail[END_DATE_COL], errors="coerce"
            )
            df_detail.dropna(subset=[START_DATE_COL, END_DATE_COL], inplace=True)
            if df_detail.empty:
                print(
                    f"      -> 警告：文件 '{file_path.name}' 清理后无有效日期数据，已跳过。"
                )
                continue

            df_detail["实际天数"] = (
                df_detail[END_DATE_COL].dt.normalize()
                - df_detail[START_DATE_COL].dt.normalize()
            ).dt.days

            for std_type in ["普标", "高标"]:
                std_col = f"{std_type}时效标准"
                target_col = f"达成率_{company_key}_{std_type}"
                temp_df = pd.merge(
                    df_detail,
                    df_standard[["路线", f"{std_col}_days"]],
                    on="路线",
                    how="inner",
                )
                if temp_df.empty:
                    continue

                temp_df["is_met"] = temp_df["实际天数"] <= temp_df[f"{std_col}_days"]
                route_rates = temp_df.groupby("路线")["is_met"].mean().reset_index()
                route_rates.rename(columns={"is_met": target_col}, inplace=True)
                df_all_rates = pd.merge(
                    df_all_rates, route_rates, on="路线", how="left"
                )
        except Exception as e:
            print(f"      -> 计算'{company_key}'达成率时出错: {e}")

    print("  -> ✓ 所有公司达成率计算完毕。")
    return df_all_rates


def generate_zto_monthly_report():
    main_report_path = output_path / "分析总报告.xlsx"
    zto_monthly_report_path = output_path / "中通月报.xlsx"

    if not main_report_path.exists():
        print(
            f"🔥🔥🔥 错误：主报告 '{main_report_path.name}' 不存在，无法生成中通报告。请先运行相关步骤。"
        )
        return

    print("\n--- 任务开始：生成中通月报 (基于已生成的主报告) ---")
    try:
        df_main_report = pd.read_excel(main_report_path)
        print("  - ✓ 成功读取主报告。")

        # --- Part 1: 计算所有公司的达成率 ---
        df_standard = pd.read_excel(
            base_data_path, sheet_name="ZTO_standard_time_limit"
        )
        if "线路" in df_standard.columns:
            df_standard.rename(columns={"线路": "路线"}, inplace=True)
        if "路线" not in df_main_report.columns:
            df_main_report["路线"] = (
                df_main_report["寄出城市"] + "-" + df_main_report["寄达城市"]
            )

        for std_type in ["普标", "高标"]:
            df_standard[f"{std_type}时效标准_days"] = (
                df_standard[f"{std_type}时效标准"]
                .str.replace(r"T\+", "", regex=True)
                .astype(int)
            )

        df_all_rates = _calculate_all_achievement_rates(df_standard)

        # --- Part 2: 准备Sheet 2 (中通报告数据) ---
        print("  - [1/5] 正在准备'中通报告数据' Sheet...")
        df_sheet2 = df_standard.copy()
        for std_type in ["普标", "高标"]:
            std_col_name = f"{std_type}时效标准"
            df_sheet2[f"中通达成率_{std_col_name}"] = df_all_rates.get(
                f"达成率_中通_{std_type}"
            )

            tongdatu_cols = [
                f"达成率_{c}_{std_type}"
                for c in COMPANIES_TONGDATU
                if f"达成率_{c}_{std_type}" in df_all_rates.columns
            ]
            if tongdatu_cols:
                df_sheet2[f"通达兔最优_{std_col_name}"] = df_all_rates[
                    tongdatu_cols
                ].max(axis=1)
            else:
                df_sheet2[f"通达兔最优_{std_col_name}"] = np.nan

            industry_cols = [
                f"达成率_{c}_{std_type}"
                for c in COMPANIES_NINE_MAJOR
                if f"达成率_{c}_{std_type}" in df_all_rates.columns
            ]
            if industry_cols:
                df_sheet2[f"行业最优_{std_col_name}"] = df_all_rates[industry_cols].max(
                    axis=1
                )
            else:
                df_sheet2[f"行业最优_{std_col_name}"] = np.nan

        df_sheet2.drop(
            columns=[c for c in df_sheet2.columns if "_days" in c], inplace=True
        )

        # --- Part 3: 准备Sheet 1 (线路详细分析) ---
        print("  - [2/5] 正在准备'线路详细分析' Sheet...")
        additional_info_cols = ["线路里程", "城市圈"]
        existing_additional_cols = [
            c for c in additional_info_cols if c in df_main_report.columns
        ]
        zto_metric_cols = [c for c in df_main_report.columns if "中通" in c]
        cols_to_extract = list(
            dict.fromkeys(["路线"] + existing_additional_cols + zto_metric_cols)
        )
        df_zto_extra_data = df_main_report[cols_to_extract]
        df_sheet1 = pd.merge(df_sheet2.copy(), df_zto_extra_data, on="路线", how="left")

        # --- Part 4: 准备汇总Sheet (寄出/寄达城市汇总) ---
        print("  - [3/5] 正在准备'寄出城市汇总'和'寄达城市汇总' Sheets...")

        source_cols_map = {
            # 中通时效
            "全程时限中通": "全程时限",
            "寄出地处理时限中通": "寄出地处理时限",
            "运输时限中通": "运输时限",
            "寄达地处理时限中通": "寄达地处理时限",
            "投递时限中通": "投递时限",
            # 行业最优 (minimum)
            "全程时限_minimum": "全程时限行业最优",
            "寄出地处理时限_minimum": "寄出地处理时限行业最优",
            "运输时限_minimum": "运输时限行业最优",
            "寄达地处理时限_minimum": "寄达地处理时限行业最优",
            "投递时限_minimum": "投递时限行业最优",
            # 行业均值 (average)
            "全程时限_average": "全程时限行业均值",
            "寄出地处理时限_average": "寄出地处理时限行业均值",
            "运输时限_average": "运输时限行业均值",
            "寄达地处理时限_average": "寄达地处理时限行业均值",
            "投递时限_average": "投递时限行业均值",
        }

        # 提取需要聚合的源列名
        agg_source_cols = list(source_cols_map.keys())

        # 准备Sheet 3: 寄出城市汇总
        origin_group_cols = ["寄出省份", "寄出城市"]
        cols_for_origin_summary = origin_group_cols + agg_source_cols
        # 检查所需列是否存在
        if all(c in df_main_report.columns for c in cols_for_origin_summary):
            df_sheet3_origin = df_main_report.groupby(
                origin_group_cols, as_index=False
            )[agg_source_cols].mean()
            df_sheet3_origin.rename(columns=source_cols_map, inplace=True)
        else:
            print("  -> 警告：主报告中缺少'寄出城市汇总'所需的列，将生成空表。")
            missing_cols = [
                c for c in cols_for_origin_summary if c not in df_main_report.columns
            ]
            print(f"     缺少的源列: {missing_cols}")
            final_cols = origin_group_cols + list(source_cols_map.values())
            df_sheet3_origin = pd.DataFrame(columns=final_cols)

        # 准备Sheet 4: 寄达城市汇总
        dest_group_cols = ["寄达省份", "寄达城市"]
        cols_for_dest_summary = dest_group_cols + agg_source_cols
        # 检查所需列是否存在
        if all(c in df_main_report.columns for c in cols_for_dest_summary):
            df_sheet4_dest = df_main_report.groupby(dest_group_cols, as_index=False)[
                agg_source_cols
            ].mean()
            df_sheet4_dest.rename(columns=source_cols_map, inplace=True)
        else:
            print("  -> 警告：主报告中缺少'寄达城市汇总'所需的列，将生成空表。")
            missing_cols = [
                c for c in cols_for_dest_summary if c not in df_main_report.columns
            ]
            print(f"     缺少的源列: {missing_cols}")
            final_cols = dest_group_cols + list(source_cols_map.values())
            df_sheet4_dest = pd.DataFrame(columns=final_cols)

        print("  - [4/5] ✓ 汇总表准备完毕。")

        # --- Part 5: 写入Excel ---
        print("  - [5/5] 正在写入Excel文件并应用格式...")
        with pd.ExcelWriter(zto_monthly_report_path, engine="xlsxwriter") as writer:
            # 写入Sheet 1
            df_sheet1.to_excel(writer, sheet_name="线路详细分析", index=False)
            # 写入Sheet 2
            df_sheet2.to_excel(writer, sheet_name="中通报告数据", index=False)
            # 写入Sheet 3 和 Sheet 4
            df_sheet3_origin.to_excel(writer, sheet_name="寄出城市汇总", index=False)
            df_sheet4_dest.to_excel(writer, sheet_name="寄达城市汇总", index=False)

            # ---- 应用格式 ----
            workbook = writer.book
            worksheet2 = writer.sheets["中通报告数据"]
            percent_format = workbook.add_format({"num_format": "0.00%"})
            cols_to_format = [
                "中通达成率_普标时效标准",
                "通达兔最优_普标时效标准",
                "行业最优_普标时效标准",
                "中通达成率_高标时效标准",
                "通达兔最优_高标时效标准",
                "行业最优_高标时效标准",
            ]

            # 为Sheet2的达成率列应用格式
            for col_name in cols_to_format:
                if col_name in df_sheet2.columns:
                    col_idx = df_sheet2.columns.get_loc(col_name)
                    worksheet2.set_column(col_idx, col_idx, 18, percent_format)

        print(f"--- ✓ 中通月报生成完毕 --- \n文件已保存至: {zto_monthly_report_path}")

    except Exception as e:
        import traceback

        print(f"    -> 🔥🔥🔥 生成中通月报失败: {e}")
        print(traceback.format_exc())


# --- 执行中通报告生成函数 ---
# 在实际运行前，请确保Cell 1中的路径等变量已正确设置
generate_zto_monthly_report()


--- 任务开始：生成中通月报 (基于已生成的主报告) ---
  - ✓ 成功读取主报告。
  -> 开始计算所有公司的达成率 (基于日历日)...
    - 正在计算 '快包' 的达成率...
    - 正在计算 '京东' 的达成率...
    - 正在计算 '申通' 的达成率...
    - 正在计算 '极兔' 的达成率...
    - 正在计算 '圆通' 的达成率...
    - 正在计算 '顺丰' 的达成率...
    - 正在计算 '韵达' 的达成率...
    - 正在计算 '德邦' 的达成率...
    - 正在计算 'EMS' 的达成率...
    - 正在计算 '中通' 的达成率...
  -> ✓ 所有公司达成率计算完毕。
  - [1/5] 正在准备'中通报告数据' Sheet...
  - [2/5] 正在准备'线路详细分析' Sheet...
  - [3/5] 正在准备'寄出城市汇总'和'寄达城市汇总' Sheets...
  - [4/5] ✓ 汇总表准备完毕。
  - [5/5] 正在写入Excel文件并应用格式...
--- ✓ 中通月报生成完毕 --- 
文件已保存至: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/输出/中通月报.xlsx


In [4]:
# ==============================================================================
# Cell 6: 邮政、极兔月报生成 (最终正确版 - 统一口径)
# ==============================================================================
import sys
import traceback
import zipfile
from functools import reduce
from pathlib import Path

import numpy as np
import pandas as pd

# --- 1. 全局配置与辅助函数 ---
COMPANIES_NINE_MAJOR = [
    "EMS",
    "中通",
    "京东",
    "圆通",
    "德邦",
    "极兔",
    "申通",
    "韵达",
    "顺丰",
]
COMPANIES_EIGHT_OTHERS = [
    "中通",
    "京东",
    "圆通",
    "德邦",
    "极兔",
    "申通",
    "韵达",
    "顺丰",
]
COMPANIES_FOR_INDUSTRY_COMPARISON = COMPANIES_NINE_MAJOR
COMPANIES_ALL_TEN = COMPANIES_FOR_INDUSTRY_COMPARISON + ["快包"]

REPORT_CONFIG = {
    "邮政": {
        "products": ["EMS", "快包"],
        "output_filename": "邮政月报.xlsx",
        "product_to_filename": {"EMS": "EMS", "快包": "邮政"},
    },
    "极兔": {
        "products": ["极兔"],
        "output_filename": "极兔月报.xlsx",
        "product_to_filename": {"极兔": "极兔"},
    },
}

FILENAME_TO_COMPANY_MAP = {
    v: k
    for k, v in {
        "EMS": "EMS",
        "中通": "中通",
        "京东": "京东",
        "圆通": "圆通",
        "德邦": "德邦",
        "极兔": "极兔",
        "申通": "申通",
        "韵达": "韵达",
        "顺丰": "顺丰",
        "快包": "邮政",
    }.items()
}


def _find_company_key_from_filename(filename):
    name = (
        Path(filename)
        .stem.replace("_data_analysis_result", "")
        .replace("_transit_data", "")
    )
    filename_to_company = {v: k for k, v in FILENAME_TO_COMPANY_MAP.items()}
    filename_to_company.update(
        {
            "EMS": "EMS",
            "德邦": "德邦",
            "极兔": "极兔",
            "圆通": "圆通",
            "顺丰": "顺丰",
            "中通": "中通",
            "京东": "京东",
            "韵达": "韵达",
            "申通": "申通",
            "邮政": "快包",
        }
    )
    sorted_keys = sorted(filename_to_company.keys(), key=len, reverse=True)
    for keyword in sorted_keys:
        if keyword in name:
            return filename_to_company[keyword]
    return None


def load_top_cities(file_path: Path, sheet_name: str) -> set:
    try:
        if not file_path.exists():
            print(f"❌ 错误：基础数据文件 'basic_data.xlsx' 未找到: {file_path}")
            return set()
        df_cities = pd.read_excel(file_path, sheet_name=sheet_name, engine="openpyxl")
        if "城市" not in df_cities.columns:
            return set()
        city_list = df_cities["城市"].dropna().astype(str).str.strip().unique().tolist()
        print(f"✅ 成功从 '{sheet_name}' 加载 {len(city_list)} 个城市。")
        return set(city_list)
    except Exception as e:
        print(f"❌ 错误：加载城市列表 '{sheet_name}' 失败: {e}")
        return set()


def auto_adjust_xlsx_columns(writer, df, sheet_name):
    workbook = writer.book
    worksheet = writer.sheets[sheet_name]
    for i, col_name in enumerate(df.columns):
        column_data = df[col_name]
        if isinstance(column_data, pd.DataFrame):
            column_data = column_data.iloc[:, 0]
        if col_name == "完整物流信息":
            width = 50
        else:
            column_len = (
                column_data.astype(str).str.len().max() if not column_data.empty else 0
            )
            width = min(max(column_len, len(str(col_name)), 8) + 2, 40)
        worksheet.set_column(i, i, width)


# --- 2. 各Sheet的生成函数 ---
def create_sheet1_route_details(df_main, products: list):
    """【恢复】此函数已完全恢复到您的原始版本"""
    print(f"  - [1/4] 正在准备 '线路明细' Sheet for {', '.join(products)}...")
    column_mapping = {
        "寄出省份": "寄出省份",
        "寄出城市": "寄出城市",
        "寄达省份": "寄达省份",
        "寄达城市": "寄达城市",
        "线路": "路线",
        "线路里程": "线路里程",
        "城市圈": "城市圈",
        "行业最优_中转次数": "最优中转次数",
        "行业最优_48小时妥投率": "48小时准时率_maximum",
        "行业均值_48小时妥投率": "48小时准时率_average",
        "行业最差_48小时妥投率": "48小时准时率_minimum",
        "行业最优_72小时妥投率": "72小时准时率_maximum",
        "行业均值_72小时妥投率": "72小时准时率_average",
        "行业最差_72小时妥投率": "72小时准时率_minimum",
        "行业最差_全程时限": "全程时限_maximum",
        "行业均值_全程时限": "全程时限_average",
        "行业最优_全程时限": "全程时限_minimum",
        "行业最差_T+N": "送达天数_maximum",
        "行业均值_T+N": "送达天数_average",
        "行业最优_T+N": "送达天数_minimum",
        "行业最差_寄出地处理时长": "寄出地处理时限_maximum",
        "行业均值_寄出地处理时长": "寄出地处理时限_average",
        "行业最优_寄出地处理时长": "寄出地处理时限_minimum",
        "行业最差_运输时长": "运输时限_maximum",
        "行业均值_运输时长": "运输时限_average",
        "行业最优_运输时长": "运输时限_minimum",
        "行业最差_寄达地处理时长": "寄达地处理时限_maximum",
        "行业均值_寄达地处理时长": "寄达地处理时限_average",
        "行业最优_寄达地处理时长": "寄达地处理时限_minimum",
        "行业最差_投递时长": "投递时限_maximum",
        "行业均值_投递时长": "投递时限_average",
        "行业最优_投递时长": "投递时限_minimum",
    }
    source_metric_map = {
        "中转次数": "平均中转次数",
        "48小时妥投率": "48小时准时率",
        "72小时妥投率": "72小时准时率",
        "T+N": "送达天数",
        "寄出地处理时长": "寄出地处理时限",
        "运输时长": "运输时限",
        "寄达地处理时长": "寄达地处理时限",
        "投递时长": "投递时限",
        "全程时限": "全程时限",
    }
    for prod in products:
        for metric, source_metric_name in source_metric_map.items():
            rank_source_suffix = f"_{prod.lower()}排名"
            if prod == "EMS":
                rank_source_suffix = "_ems排名"
            elif prod == "快包":
                rank_source_suffix = "_快包排名"
            if metric == "中转次数":
                column_mapping[f"{prod}_{metric}"] = f"{prod}{source_metric_name}"
            else:
                column_mapping[f"{prod}_{metric}"] = f"{source_metric_name}{prod}"
                column_mapping[f"{prod}排名_{metric}"] = (
                    f"{source_metric_name}{rank_source_suffix}"
                )
    df_sheet1 = pd.DataFrame()
    for new_col, source_col in column_mapping.items():
        if source_col in df_main.columns:
            df_sheet1[new_col] = df_main[source_col]
        else:
            df_sheet1[new_col] = pd.NA
    filter_metrics = [
        "运输时长",
        "寄达地处理时长",
        "寄出地处理时长",
        "投递时长",
        "全程时限",
    ]
    for metric in filter_metrics:
        for prod in products:
            prod_metric_col = f"{prod}_{metric}"
            compare_col = ""
            if prod in ["EMS", "极兔"]:
                compare_col = f"行业最优_{metric}"
            elif prod == "快包":
                compare_col = f"行业均值_{metric}"
            else:
                continue
            filter_col_name = f"{prod}筛选指标_{metric}"
            if (
                prod_metric_col in df_sheet1.columns
                and compare_col in df_sheet1.columns
            ):
                prod_values = pd.to_numeric(df_sheet1[prod_metric_col], errors="coerce")
                compare_values = pd.to_numeric(df_sheet1[compare_col], errors="coerce")
                df_sheet1[filter_col_name] = prod_values - compare_values
            else:
                df_sheet1[filter_col_name] = pd.NA
    original_cols = list(column_mapping.keys())
    new_filter_cols = [col for col in df_sheet1.columns if "筛选指标" in col]
    final_cols_order = original_cols + new_filter_cols
    existing_cols = [col for col in final_cols_order if col in df_sheet1.columns]
    return df_sheet1[existing_cols]


def create_sheet2_mail_details(
    data_analysis_path,
    zhuzhuyun_merge_path,
    products: list,
    product_to_filename: dict,
    top_30_cities: set,
    df_main,
):
    print(f"  - [2/4] 正在准备 '邮件明细' Sheet for {', '.join(products)}...")
    # 步骤1：加载源数据 (此时 df_s2 可能有，也可能没有“完整物流信息”列)
    dfs = []
    for prod_name, filename_part in product_to_filename.items():
        file_path = data_analysis_path / f"{filename_part}_data_analysis_result.xlsx"
        if file_path.exists():
            try:
                df = pd.read_excel(
                    file_path,
                    sheet_name="线路详细数据",
                    dtype={"单号": str},
                    engine="openpyxl",
                )
                if not df.empty:
                    df["单号"] = df["单号"].astype(str).str.strip()
                    df["产品种类"] = prod_name
                    dfs.append(df)
            except Exception as e:
                print(f"      -> 警告: 读取文件 {file_path.name} 失败: {e}")
    if not dfs:
        return pd.DataFrame()

    df_s2 = pd.concat(dfs, ignore_index=True)
    df_s2.rename(
        columns={
            "单号": "邮件号",
            "公里": "线路里程",
            "签收时间": "完成投递时间",
            "寄出地处理时限": "寄出地处理时长",
            "运输时限": "运输时长",
            "寄达地处理时限": "寄达地处理时长",
            "投递时限": "投递时长",
            "揽收时间": "揽件时间",
        },
        inplace=True,
    )

    # 步骤2：始终从“猪猪云”加载并合并数据，这是必须的
    trace_dfs = []
    for product_type, filename in {
        p: f"{product_to_filename.get(p, p)}.xlsx" for p in products
    }.items():
        trace_file_path = zhuzhuyun_merge_path / filename
        if trace_file_path.exists():
            try:
                df_trace_part = pd.read_excel(
                    trace_file_path,
                    usecols=["快递单号", "完整物流信息"],
                    dtype={"快递单号": str},
                    engine="openpyxl",
                )
                if not df_trace_part.empty:
                    df_trace_part["快递单号"] = (
                        df_trace_part["快递单号"].astype(str).str.strip()
                    )
                    df_trace_part["产品种类"] = product_type
                    trace_dfs.append(df_trace_part)
            except Exception as e:
                print(f"      -> 警告: 读取轨迹文件 {filename} 失败: {e}")

    if trace_dfs:
        df_traces = pd.concat(trace_dfs, ignore_index=True).drop_duplicates(
            subset=["快递单号", "产品种类"]
        )
        df_s2 = pd.merge(
            df_s2,
            df_traces,
            how="left",
            left_on=["邮件号", "产品种类"],
            right_on=["快递单号", "产品种类"],
        )
        df_s2.drop(columns=["快递单号"], inplace=True, errors="ignore")

    # ========================= 核心逻辑：兼容性处理 (START) =========================
    #
    if "完整物流信息_y" in df_s2.columns:
        # **处理情况1：源文件和猪猪云都有该列，导致冲突，生成了_x和_y**
        # 策略：优先使用猪猪云(_y)的数据，如果猪猪云没匹配上(值为NaN)，则用源文件(_x)的数据填充
        print("      -> [兼容模式] 检测到'完整物流信息'列名冲突，智能合并中...")
        df_s2["完整物流信息"] = df_s2["完整物流信息_y"].fillna(df_s2["完整物流信息_x"])
        # 清理掉临时的_x和_y列
        df_s2.drop(columns=["完整物流信息_x", "完整物流信息_y"], inplace=True)

    elif "完整物流信息" in df_s2.columns:
        print("      -> [兼容模式] '完整物流信息'列已存在且无冲突，流程继续。")
        pass

    else:
        print("      -> [兼容模式] 未找到'完整物流信息'列，已创建空列作为保障。")
        df_s2["完整物流信息"] = ""

    df_s2["完整物流信息"].fillna("", inplace=True)
    #
    # ========================== 核心逻辑：兼容性处理 (END) ==========================

    df_s2["线路"] = df_s2["寄出城市"] + "-" + df_s2["寄达城市"]
    time_cols = [
        "揽件时间",
        "完成投递时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
    ]
    for col in time_cols:
        if col in df_s2.columns:
            df_s2[col] = pd.to_datetime(df_s2[col], errors="coerce")

    if "揽件时间" in df_s2.columns and "完成投递时间" in df_s2.columns:
        valid_times = df_s2["揽件时间"].notna() & df_s2["完成投递时间"].notna()
        days_diff = (
            df_s2.loc[valid_times, "完成投递时间"].dt.normalize()
            - df_s2.loc[valid_times, "揽件时间"].dt.normalize()
        ).dt.days
        df_s2.loc[valid_times, "T+N"] = "T+" + days_diff.astype(int).astype(str)

    df_s2["是否达成72小时妥投率"] = np.where(df_s2["全程时限"] <= 72, "是", "否")
    top_30_mask = (df_s2["寄出城市"].isin(top_30_cities)) & (
        df_s2["寄达城市"].isin(top_30_cities)
    )
    df_s2["是否达成48小时妥投率"] = "不适用"
    df_s2.loc[top_30_mask, "是否达成48小时妥投率"] = np.where(
        df_s2.loc[top_30_mask, "全程时限"] <= 48, "是", "否"
    )

    # ... 后续代码部分保持不变 ...
    df_main_renamed = df_main.rename(columns={"路线": "线路"})
    cols_to_merge_from_main = ["线路", "送达天数_average", "送达天数_minimum"]
    metrics_for_merge = ["48小时准时率", "72小时准时率", "全程时限"]
    for m in metrics_for_merge:
        for p in products:
            cols_to_merge_from_main.append(f"{m}{p}")
        for agg in ["maximum", "average", "minimum"]:
            cols_to_merge_from_main.append(f"{m}_{agg}")

    if "全程时限_minimum" in df_main_renamed.columns:
        cols_to_merge_from_main.append("全程时限_minimum")
    if "全程时限_maximum" in df_main_renamed.columns:
        cols_to_merge_from_main.append("全程时限_maximum")

    existing_cols_to_merge = [
        c for c in cols_to_merge_from_main if c in df_main_renamed.columns
    ]
    df_s2 = pd.merge(
        df_s2, df_main_renamed[existing_cols_to_merge], on="线路", how="left"
    )
    df_s2["行业均值"] = df_s2.get("送达天数_average", "").fillna("").astype(str)
    df_s2["行业最优"] = df_s2.get("送达天数_minimum", "").fillna("").astype(str)

    rename_dict = {
        "48小时准时率_maximum": "行业最优_48小时妥投率",
        "48小时准时率_average": "行业均值_48小时妥投率",
        "48小时准时率_minimum": "行业最差_48小时妥投率",
        "72小时准时率_maximum": "行业最优_72小时妥投率",
        "72小时准时率_average": "行业均值_72小时妥投率",
        "72小时准时率_minimum": "行业最差_72小时妥投率",
        "全程时限_minimum": "行业最优_线路全程时限",
        "全程时限_average": "行业均值_线路全程时限",
        "全程时限_maximum": "行业最差_线路全程时限",
    }
    for p in products:
        rename_dict[f"48小时准时率{p}"] = f"{p}_48小时妥投率"
        rename_dict[f"72小时准时率{p}"] = f"{p}_72小时妥投率"
        rename_dict[f"全程时限{p}"] = f"{p}_线路全程时限"
    df_s2.rename(columns=rename_dict, inplace=True)

    final_columns_order_s2 = [
        "邮件号",
        "产品种类",
        "寄出省份",
        "寄出城市",
        "寄达省份",
        "寄达城市",
        "线路",
        "线路里程",
        "全程时限",
        "T+N",
        "行业均值",
        "行业最优",
        "是否达成48小时妥投率",
        "是否达成72小时妥投率",
        "寄出地处理时长",
        "运输时长",
        "寄达地处理时长",
        "投递时长",
        "揽件时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "完成投递时间",
    ]

    newly_added_columns = [
        "EMS_48小时妥投率",
        "快包_48小时妥投率",
        "行业最优_48小时妥投率",
        "行业均值_48小时妥投率",
        "行业最差_48小时妥投率",
        "EMS_72小时妥投率",
        "快包_72小时妥投率",
        "行业最优_72小时妥投率",
        "行业均值_72小时妥投率",
        "行业最差_72小时妥投率",
        "EMS_线路全程时限",
        "快包_线路全程时限",
        "行业最优_线路全程时限",
        "行业均值_线路全程时限",
        "行业最差_线路全程时限",
    ]
    if "极兔" in products:
        newly_added_columns = [
            c for c in newly_added_columns if "极兔" in c or "行业" in c
        ]

    final_columns_order_s2.extend(newly_added_columns)
    final_columns_order_s2.append("完整物流信息")

    existing_cols = [col for col in final_columns_order_s2 if col in df_s2.columns]
    return df_s2[existing_cols]


def weighted_agg(group, metrics, company):
    """辅助函数：对单个分组进行加权平均计算"""
    weight_col = f"快递数量{company}"
    if weight_col not in group.columns or group[weight_col].sum() == 0:
        return pd.Series([np.nan] * len(metrics), index=metrics)

    total_weight = group[weight_col].sum()
    results = {}
    for metric in metrics:
        metric_col = f"{metric}{company}"
        if metric_col in group.columns:
            weighted_sum = (group[metric_col] * group[weight_col]).sum()
            results[metric] = weighted_sum / total_weight
        else:
            results[metric] = np.nan
    return pd.Series(results)


def create_regional_report_data(
    df_main: pd.DataFrame, grouping_level: str, final_col_order: list, products: list
):
    """【最终修复】: 恢复原始计算框架，并注入加权平均"""
    print(
        f"  - [{3 if grouping_level == 'city' else 4}/4] 正在准备 '分{grouping_level}明细' (恢复原始计算框架 + 加权平均)..."
    )

    # 1. 恢复“分离式聚合”框架
    metrics_by_destination = [
        "到达寄达地城市-离开寄达地分拣中心时长",
        "离开寄达地分拣中心-派件",
        "寄达地处理时限",
        "投递时限",
    ]
    metrics_by_origin = [
        "揽收-到达寄出地分拣中心时长",
        "到达寄出地分拣中心-离开寄出地城市时长",
        "寄出地处理时限",
        "运输时限",
        "全程时限",
    ]

    all_pivots_origin = []
    group_cols_origin = {"city": ["寄出省份", "寄出城市"], "province": ["寄出省份"]}[
        grouping_level
    ]
    for comp in COMPANIES_ALL_TEN:
        comp_metrics = [m for m in metrics_by_origin if f"{m}{comp}" in df_main.columns]
        if not comp_metrics or f"快递数量{comp}" not in df_main.columns:
            continue

        # ========================= 修改点 1 of 3 =========================
        df_agg = (
            df_main.groupby(group_cols_origin)
            .apply(
                weighted_agg,
                metrics=comp_metrics,
                company=comp,  # <-- 删除 include_groups=False
            )
            .reset_index()
        )
        # ===============================================================
        df_agg["company"] = comp
        all_pivots_origin.append(df_agg)

    if not all_pivots_origin:
        df_pivot_origin = pd.DataFrame(
            index=pd.MultiIndex.from_tuples([], names=group_cols_origin)
        )
    else:
        df_pivot_origin = pd.concat(all_pivots_origin).pivot_table(
            index=group_cols_origin, columns="company", values=metrics_by_origin
        )
        if not df_pivot_origin.empty:
            df_pivot_origin.columns = [f"{v}_{c}" for v, c in df_pivot_origin.columns]

    all_pivots_dest = []
    group_cols_dest = {"city": ["寄达省份", "寄达城市"], "province": ["寄达省份"]}[
        grouping_level
    ]

    needed_cols = group_cols_dest.copy()
    for comp in COMPANIES_ALL_TEN:
        needed_cols.append(f"快递数量{comp}")
        for metric in metrics_by_destination:
            needed_cols.append(f"{metric}{comp}")

    existing_needed_cols = [c for c in needed_cols if c in df_main.columns]

    df_dest_subset = df_main[existing_needed_cols]
    df_dest_temp = df_dest_subset.rename(
        columns={"寄达省份": "寄出省份", "寄达城市": "寄出城市"}
    )

    for comp in COMPANIES_ALL_TEN:
        comp_metrics = [
            m for m in metrics_by_destination if f"{m}{comp}" in df_dest_temp.columns
        ]
        if not comp_metrics or f"快递数量{comp}" not in df_dest_temp.columns:
            continue

        # ========================= 修改点 2 of 3 =========================
        df_agg = (
            df_dest_temp.groupby(group_cols_origin)
            .apply(
                weighted_agg,
                metrics=comp_metrics,
                company=comp,  # <-- 删除 include_groups=False
            )
            .reset_index()
        )
        # ===============================================================
        df_agg["company"] = comp
        all_pivots_dest.append(df_agg)

    if not all_pivots_dest:
        df_pivot_dest = pd.DataFrame(
            index=pd.MultiIndex.from_tuples([], names=group_cols_origin)
        )
    else:
        df_pivot_dest = pd.concat(all_pivots_dest).pivot_table(
            index=group_cols_origin, columns="company", values=metrics_by_destination
        )
        if not df_pivot_dest.empty:
            df_pivot_dest.columns = [f"{v}_{c}" for v, c in df_pivot_dest.columns]

    df_merged_main = pd.merge(
        df_pivot_origin, df_pivot_dest, left_index=True, right_index=True, how="outer"
    )

    # 2. 恢复“双挂全程时限”计算框架 + 注入加权平均
    df_sent = df_main.rename(columns={"寄出省份": "省份", "寄出城市": "城市"})
    df_recv = df_main.rename(columns={"寄达省份": "省份", "寄达城市": "城市"})
    df_dual = pd.concat([df_sent, df_recv]).dropna(subset=["省份"])
    dual_group_cols = ["省份", "城市"] if grouping_level == "city" else ["省份"]

    all_pivots_dual = []
    for comp in COMPANIES_ALL_TEN:
        if (
            f"全程时限{comp}" not in df_dual.columns
            or f"快递数量{comp}" not in df_dual.columns
        ):
            continue

        # ========================= 修改点 3 of 3 =========================
        df_agg = (
            df_dual.groupby(dual_group_cols)
            .apply(
                weighted_agg,
                metrics=["全程时限"],
                company=comp,  # <-- 删除 include_groups=False
            )
            .reset_index()
        )
        # ===============================================================
        df_agg["company"] = comp
        all_pivots_dual.append(df_agg)

    if not all_pivots_dual:
        df_pivot_dual = pd.DataFrame(
            index=pd.MultiIndex.from_tuples([], names=dual_group_cols)
        )
    else:
        df_pivot_dual = pd.concat(all_pivots_dual).pivot_table(
            index=dual_group_cols, columns="company", values="全程时限"
        )
        if not df_pivot_dual.empty:
            df_pivot_dual.columns = [
                f"全程（双挂）时限_{c}" for c in df_pivot_dual.columns
            ]
            df_pivot_dual.index.names = group_cols_origin

    df_merged = pd.merge(
        df_merged_main, df_pivot_dual, left_index=True, right_index=True, how="outer"
    ).reset_index()

    # 【修复】: 恢复原始代码中对列名的处理方式
    final_report_data = df_merged.copy()
    rename_map = {}
    for col in final_report_data.columns:
        if isinstance(col, str) and "_" in col:
            parts = col.split("_")
            metric = parts[0]
            company = parts[1]
            if metric == "全程时限":
                metric = "全程（寄出地）时限"
            if company in COMPANIES_ALL_TEN:
                rename_map[col] = f"{company}_{metric}"
    final_report_data.rename(columns=rename_map, inplace=True)

    # 3. 恢复后续的行业统计、排名和附加计算
    all_metrics_for_ranking = [
        m.replace("全程时限", "全程（寄出地）时限")
        for m in (metrics_by_origin + metrics_by_destination)
    ] + ["全程（双挂）时限"]
    all_metrics_for_ranking = list(set(all_metrics_for_ranking))

    for metric in all_metrics_for_ranking:
        industry_cols = [
            f"{prod}_{metric}"
            for prod in COMPANIES_FOR_INDUSTRY_COMPARISON
            if f"{prod}_{metric}" in final_report_data.columns
        ]
        if not industry_cols:
            continue
        final_report_data[f"行业均值_{metric}"] = final_report_data[industry_cols].mean(
            axis=1
        )
        final_report_data[f"行业最优_{metric}"] = final_report_data[industry_cols].min(
            axis=1
        )
        for prod in products:
            rank_pool = (
                COMPANIES_EIGHT_OTHERS + ["快包"]
                if prod == "快包"
                else list(COMPANIES_NINE_MAJOR)
            )
            rank_cols = [
                f"{p}_{metric}"
                for p in rank_pool
                if f"{p}_{metric}" in final_report_data.columns
            ]
            prod_col_name = f"{prod}_{metric}"
            if rank_cols and prod_col_name in final_report_data.columns:
                final_report_data[f"{prod}排名_{metric}"] = final_report_data[
                    rank_cols
                ].rank(axis=1, method="min", ascending=True)[prod_col_name]

    sum_metric = "寄出地处理时限+寄达地处理时限+投递时限"
    part_metrics = ["寄出地处理时限", "寄达地处理时限", "投递时限"]
    all_report_companies = products + [
        c for c in COMPANIES_FOR_INDUSTRY_COMPARISON if c not in products
    ]
    for company in all_report_companies:
        part_cols = [f"{company}_{p}" for p in part_metrics]
        if all(c in final_report_data.columns for c in part_cols):
            final_report_data[f"{company}_{sum_metric}"] = final_report_data[
                part_cols
            ].sum(axis=1, min_count=3)
    avg_part_cols = [f"行业均值_{p}" for p in part_metrics]
    if all(c in final_report_data.columns for c in avg_part_cols):
        final_report_data[f"行业均值_{sum_metric}"] = final_report_data[
            avg_part_cols
        ].sum(axis=1, min_count=3)
    total_sum_cols = [
        f"{c}_{sum_metric}"
        for c in COMPANIES_FOR_INDUSTRY_COMPARISON
        if f"{c}_{sum_metric}" in final_report_data.columns
    ]
    if total_sum_cols:
        final_report_data[f"行业最优_{sum_metric}"] = final_report_data[
            total_sum_cols
        ].min(axis=1, skipna=True)
    for prod in products:
        rank_pool = (
            COMPANIES_EIGHT_OTHERS + ["快包"]
            if prod == "快包"
            else list(COMPANIES_NINE_MAJOR)
        )
        rank_cols = [
            f"{c}_{sum_metric}"
            for c in rank_pool
            if f"{c}_{sum_metric}" in final_report_data.columns
        ]
        prod_sum_col = f"{prod}_{sum_metric}"
        if rank_cols and prod_sum_col in final_report_data.columns:
            final_report_data[f"{prod}排名_{sum_metric}"] = final_report_data[
                rank_cols
            ].rank(axis=1, method="min", ascending=True)[prod_sum_col]

    # 4. 最后整理
    final_report_data.rename(
        columns={"寄出省份": "省份", "寄出城市": "地市"}, inplace=True
    )
    existing_cols = [c for c in final_col_order if c in final_report_data.columns]
    final_df = final_report_data[existing_cols].copy()
    sort_keys = ["省份", "地市"] if grouping_level == "city" else ["省份"]
    if any(k in final_df.columns for k in sort_keys):
        final_df.sort_values(
            by=[k for k in sort_keys if k in final_df.columns], inplace=True
        )
    if not final_df.empty:
        final_df.insert(0, "序号", range(1, len(final_df) + 1))
    return final_df.reset_index(drop=True)


def _generate_dynamic_column_order(products: list, level: str) -> list:
    """【恢复】此函数已完全恢复到您的原始版本"""
    base_cols = ["序号", "省份"]
    if level == "city":
        base_cols.append("地市")
    order = base_cols.copy()
    sections = {
        "寄出地处理": [
            "揽收-到达寄出地分拣中心时长",
            "到达寄出地分拣中心-离开寄出地城市时长",
            "寄出地处理时限",
        ],
        "寄达地处理": [
            "到达寄达地城市-离开寄达地分拣中心时长",
            "离开寄达地分拣中心-派件",
            "寄达地处理时限",
        ],
        "投递": ["投递时限"],
        "端到端处理": ["寄出地处理时限+寄达地处理时限+投递时限"],
        "寄出地全程": ["全程（寄出地）时限"],
        "双挂全程": ["全程（双挂）时限"],
    }
    for _, metrics in sections.items():
        for prod in products:
            for metric in metrics:
                order.extend([f"{prod}_{metric}", f"{prod}排名_{metric}"])
        for metric in metrics:
            order.append(f"行业均值_{metric}")
        for metric in metrics:
            order.append(f"行业最优_{metric}")
    return order


# --- 3. 主执行函数 ---
def generate_company_monthly_report(report_type: str):
    if report_type not in REPORT_CONFIG:
        print(f"🔥🔥🔥 错误: 未知的报告类型 '{report_type}'。")
        return
    config = REPORT_CONFIG[report_type]
    print(f"\n{'=' * 20} 开始生成: {config['output_filename']} {'=' * 20}")

    base_path = Path.cwd()
    output_path = base_path / "报告数据" / "输出"
    data_analysis_path = output_path / "data_analysis_result"
    zhuzhuyun_merge_path = base_path / "报告数据" / "temp" / "3_猪猪云合并数据"
    basic_data_path = base_path / "报告数据" / "输入" / "basic_data.xlsx"
    final_output_path = output_path / config["output_filename"]
    for p in [output_path, data_analysis_path, zhuzhuyun_merge_path]:
        p.mkdir(parents=True, exist_ok=True)

    print("  - 正在独立加载所有公司的'线路汇总数据'...")
    try:
        all_routes = set()
        files_in_analysis_result = list(
            data_analysis_path.glob("*_data_analysis_result.xlsx")
        )
        if not files_in_analysis_result:
            raise FileNotFoundError(
                f"在路径 '{data_analysis_path.resolve()}' 中未找到任何 '*_data_analysis_result.xlsx' 文件。"
            )

        for file_path in files_in_analysis_result:
            if file_path.name.startswith("~$"):
                continue
            try:
                all_routes.update(
                    pd.read_excel(
                        file_path,
                        sheet_name="线路汇总数据",
                        usecols=["路线"],
                        engine="openpyxl",
                    )["路线"].unique()
                )
            except zipfile.BadZipFile:
                print(
                    f"      -> 警告: 文件 {file_path.name} 已损坏或不是有效的Excel文件，已跳过。"
                )
                continue

        df_main = pd.DataFrame(list(all_routes), columns=["路线"])
        df_base_info = pd.read_excel(
            basic_data_path, sheet_name="inter-city_routes", engine="openpyxl"
        ).rename(columns={"公里": "线路里程", "经济圈": "城市圈"})
        cols_to_merge = [
            "寄出省份",
            "寄出城市",
            "寄达省份",
            "寄达城市",
            "路线",
            "线路里程",
            "城市圈",
        ]
        df_main = pd.merge(
            df_main,
            df_base_info[[c for c in cols_to_merge if c in df_base_info.columns]],
            on="路线",
            how="left",
        )

        metrics_to_extract = [
            "快递数量",
            "全程时限",
            "寄出地处理时限",
            "运输时限",
            "寄达地处理时限",
            "投递时限",
            "揽收-到达寄出地分拣中心时长",
            "到达寄出地分拣中心-离开寄出地城市时长",
            "到达寄达地城市-离开寄达地分拣中心时长",
            "离开寄达地分拣中心-派件",
            "72小时准时率",
            "48小时准时率",
            "送达天数_80分位",
            "中转次数",
        ]

        for file_path in files_in_analysis_result:
            if file_path.name.startswith("~$"):
                continue
            company_key = _find_company_key_from_filename(file_path.name)
            if not company_key:
                continue
            try:
                df_summary = pd.read_excel(
                    file_path, sheet_name="线路汇总数据", engine="openpyxl"
                )
                for metric in metrics_to_extract:
                    if metric in df_summary.columns:
                        if metric == "中转次数":
                            new_col_name = f"{company_key}平均中转次数"
                        else:
                            new_col_name = (
                                f"{metric.replace('_80分位', '')}{company_key}"
                            )

                        df_metric = (
                            df_summary[["路线", metric]]
                            .copy()
                            .rename(columns={metric: new_col_name})
                        )
                        df_main = pd.merge(df_main, df_metric, on="路线", how="left")
            except zipfile.BadZipFile:
                print(
                    f"      -> 警告: 文件 {file_path.name} 已损坏或不是有效的Excel文件，已跳过。"
                )
                continue

        df_main.dropna(axis=1, how="all", inplace=True)

        print("  - 正在计算'最优中转次数'...")
        all_transfer_cols = [
            f"{comp}平均中转次数"
            for comp in COMPANIES_ALL_TEN
            if f"{comp}平均中转次数" in df_main.columns
        ]
        if all_transfer_cols:
            cols_for_best_turnover = [c for c in all_transfer_cols if "快包" not in c]
            if cols_for_best_turnover:
                df_main["最优中转次数"] = df_main[cols_for_best_turnover].min(axis=1)

        # 重新计算行业统计值
        stat_metrics = [
            metric
            for metric in metrics_to_extract
            if metric not in ["快递数量", "中转次数"]
        ]
        for metric in stat_metrics:
            base_metric_name = metric.replace("_80分位", "")
            metric_cols = [
                f"{base_metric_name}{comp}"
                for comp in COMPANIES_FOR_INDUSTRY_COMPARISON
                if f"{base_metric_name}{comp}" in df_main.columns
            ]
            if metric_cols:
                df_main[f"{base_metric_name}_average"] = df_main[metric_cols].mean(
                    axis=1
                )
                df_main[f"{base_metric_name}_minimum"] = df_main[metric_cols].min(
                    axis=1
                )
                df_main[f"{base_metric_name}_maximum"] = df_main[metric_cols].max(
                    axis=1
                )

        # 重新计算排名
        rank_metrics = [
            "48小时准时率",
            "72小时准时率",
            "全程时限",
            "寄出地处理时限",
            "运输时限",
            "寄达地处理时限",
            "投递时限",
            "送达天数",
        ]
        for param in rank_metrics:
            is_desc = "准时率" in param
            ems_cols = [
                f"{param}{comp}"
                for comp in COMPANIES_NINE_MAJOR
                if f"{param}{comp}" in df_main.columns
            ]
            if ems_cols and f"{param}EMS" in df_main.columns:
                df_main[f"{param}_ems排名"] = df_main[ems_cols].rank(
                    axis=1, method="min", ascending=not is_desc
                )[f"{param}EMS"]
            kb_cols = [
                f"{param}{comp}"
                for comp in COMPANIES_ALL_TEN
                if comp != "EMS" and f"{param}{comp}" in df_main.columns
            ]
            if kb_cols and f"{param}快包" in df_main.columns:
                df_main[f"{param}_快包排名"] = df_main[kb_cols].rank(
                    axis=1, method="min", ascending=not is_desc
                )[f"{param}快包"]

        # T+N 格式化
        for col in df_main.columns:
            if (
                "送达天数" in col
                and "排名" not in col
                and df_main[col].dtype != "object"
            ):
                df_main[col] = pd.to_numeric(df_main[col], errors="coerce").round()
                df_main[col] = df_main[col].apply(
                    lambda x: f"T+{int(x)}" if pd.notna(x) else x
                )

    except Exception as e:
        print(f"🔥🔥🔥 独立加载数据失败: {e}")
        traceback.print_exc()
        return

    # top_30_cities 的加载仍然是需要的，因为它可能被其他函数（或未来的需求）使用
    # 但在 create_sheet2_mail_details 中不再直接用于判断单个邮件
    top_30_cities = load_top_cities(basic_data_path, "30_top_volume_city_2024")
    if not top_30_cities:
        print("🔥🔥🔥 错误：未能加载Top 30城市列表，流程中止。")
        return

    try:
        df_s1 = create_sheet1_route_details(df_main, config["products"])
        # 在调用时，不再需要传递 top_30_cities
        df_s2 = create_sheet2_mail_details(
            data_analysis_path,
            zhuzhuyun_merge_path,
            config["products"],
            config["product_to_filename"],
            top_30_cities,
            df_main,
        )

        city_cols_order = _generate_dynamic_column_order(config["products"], "city")
        province_cols_order = _generate_dynamic_column_order(
            config["products"], "province"
        )

        df_s3 = create_regional_report_data(
            df_main, "city", city_cols_order, config["products"]
        )
        df_s4 = create_regional_report_data(
            df_main, "province", province_cols_order, config["products"]
        )

        print(f"\n--- 所有数据计算完成，正在写入最终文件: {final_output_path.name} ---")
        with pd.ExcelWriter(final_output_path, engine="xlsxwriter") as writer:
            sheets_to_write = {
                "线路明细": df_s1,
                "邮件明细": df_s2,
                "分城市明细": df_s3,
                "分省份明细": df_s4,
            }
            for sheet_name, df in sheets_to_write.items():
                print(f"  - 正在写入Sheet: {sheet_name}...")
                if df is not None and not df.empty:
                    df = df.loc[:, ~df.columns.duplicated()]
                    for col in df.select_dtypes(
                        include=["datetime64[ns]", "datetimetz"]
                    ).columns:
                        df[col] = (
                            df[col].dt.strftime("%Y-%m-%d %H:%M:%S").replace("NaT", "")
                        )
                    df.to_excel(
                        writer, sheet_name=sheet_name, index=False, float_format="%.2f"
                    )
                    auto_adjust_xlsx_columns(writer, df, sheet_name)

        print(f"\n🎉🎉🎉 恭喜！已成功生成报告 '{config['output_filename']}'！🎉🎉🎉")

    except Exception as e:
        print(f"🔥🔥🔥 生成报告 '{config['output_filename']}' 时发生严重错误: {e}")
        traceback.print_exc()


# --- 4. 执行主函数 ---
if __name__ == "__main__":
    generate_company_monthly_report("邮政")
    generate_company_monthly_report("极兔")


  - 正在独立加载所有公司的'线路汇总数据'...
  - 正在计算'最优中转次数'...
✅ 成功从 '30_top_volume_city_2024' 加载 30 个城市。
  - [1/4] 正在准备 '线路明细' Sheet for EMS, 快包...
  - [2/4] 正在准备 '邮件明细' Sheet for EMS, 快包...
      -> [兼容模式] 检测到'完整物流信息'列名冲突，智能合并中...
  - [3/4] 正在准备 '分city明细' (恢复原始计算框架 + 加权平均)...
  - [4/4] 正在准备 '分province明细' (恢复原始计算框架 + 加权平均)...

--- 所有数据计算完成，正在写入最终文件: 邮政月报.xlsx ---
  - 正在写入Sheet: 线路明细...
  - 正在写入Sheet: 邮件明细...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = (


  - 正在写入Sheet: 分城市明细...
  - 正在写入Sheet: 分省份明细...

🎉🎉🎉 恭喜！已成功生成报告 '邮政月报.xlsx'！🎉🎉🎉

  - 正在独立加载所有公司的'线路汇总数据'...
  - 正在计算'最优中转次数'...
✅ 成功从 '30_top_volume_city_2024' 加载 30 个城市。
  - [1/4] 正在准备 '线路明细' Sheet for 极兔...
  - [2/4] 正在准备 '邮件明细' Sheet for 极兔...
      -> [兼容模式] 检测到'完整物流信息'列名冲突，智能合并中...
  - [3/4] 正在准备 '分city明细' (恢复原始计算框架 + 加权平均)...
  - [4/4] 正在准备 '分province明细' (恢复原始计算框架 + 加权平均)...

--- 所有数据计算完成，正在写入最终文件: 极兔月报.xlsx ---
  - 正在写入Sheet: 线路明细...
  - 正在写入Sheet: 邮件明细...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = (


  - 正在写入Sheet: 分城市明细...
  - 正在写入Sheet: 分省份明细...

🎉🎉🎉 恭喜！已成功生成报告 '极兔月报.xlsx'！🎉🎉🎉


In [None]:
# ==============================================================================
# Cell 7: 画图
# ==============================================================================
import os
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.font_manager import FontProperties, fontManager

# --- 1. 配置区域 ---

# 路径配置
ROOT_PATH = Path.cwd()
DATA_ANALYSIS_PATH = ROOT_PATH / "报告数据" / "输出" / "data_analysis_result"
OUTPUT_IMAGE_PATH = ROOT_PATH / "报告数据" / "输出" / "4_报告图片"

OUTPUT_IMAGE_PATH.mkdir(parents=True, exist_ok=True)
print(f"数据输入路径: {DATA_ANALYSIS_PATH}")
print(f"图片输出路径: {OUTPUT_IMAGE_PATH}")

# 公司列表与文件名映射
COMPANY_MAPPING = {
    "EMS": "EMS",
    "中通": "中通",
    "京东": "京东",
    "圆通": "圆通",
    "德邦": "德邦",
    "极兔": "极兔",
    "申通": "申通",
    "韵达": "韵达",
    "顺丰": "顺丰",
    "邮政": "邮政",  # "邮政" 对应 "邮政" 文件
}
# 分析对象包含 "快包"，但在文件名层面它由 "邮政" 文件代表
COMPANIES_TO_ANALYZE = [
    "EMS",
    "快包",
    "中通",
    "京东",
    "圆通",
    "德邦",
    "极兔",
    "申通",
    "韵达",
    "顺丰",
]

METRICS_TO_PLOT = [
    "揽收时间",
    "到达分拣中心时间",
    "离开寄件城市时间",
    "到达收件城市时间",
    "派送时间",
    "签收时间",
]

# --- 2. 字体解决方案 ---
font_path_str = "微软雅黑.ttf"
font_path_obj = Path(font_path_str)
if font_path_obj.exists():
    fontManager.addfont(str(font_path_obj))
    chinese_font = FontProperties(fname=font_path_str)
    plt.rcParams["font.sans-serif"] = [chinese_font.get_name()]
    plt.rcParams["axes.unicode_minus"] = False
    print(f"Matplotlib 全局字体已成功设置为: {chinese_font.get_name()}")
else:
    print(f"错误: 在当前目录找不到字体文件 '{font_path_str}'。")

TIME_LABELS = [f"{h:02d}:00-{(h + 1) % 24:02d}:00" for h in range(24)]
COLORS = [
    "olive",
    "grey",
    "yellow",
    "orange",
    "green",
    "palegoldenrod",
    "darkolivegreen",
    "pink",
    "Thistle",
    "steelblue",
    "darkslategrey",
    "slategray",
    "tan",
    "darkolivegreen",
    "grey",
    "pink",
    "goldenrod",
    "mediumslateblue",
    "saddlebrown",
    "olive",
    "navy",
    "sandybrown",
    "moccasin",
    "black",
]

# --- 3. 辅助函数 ---


def calculate_hourly_distribution(time_series: pd.Series) -> list:
    """高效计算给定时间序列中每小时的数据点数量"""
    if time_series.empty or time_series.isna().all():
        return [0] * 24

    dt_series = pd.to_datetime(time_series, errors="coerce").dropna()
    if dt_series.empty:
        return [0] * 24

    counts = dt_series.dt.hour.value_counts().sort_index()
    hourly_counts = [0] * 24
    for hour, count in counts.items():
        if 0 <= hour < 24:
            hourly_counts[hour] = int(count)
    return hourly_counts


def plot_and_save_distribution(
    company_name: str, metric_name: str, hourly_counts: list, output_path: Path
):
    if sum(hourly_counts) == 0:
        print(
            f"    -> {company_name} 的 '{metric_name}' 数据为空或无法处理，不生成图片。"
        )
        return
    plt.figure(figsize=(20, 10))
    bars = plt.bar(TIME_LABELS, hourly_counts, width=0.5, color=COLORS)
    total_count = sum(hourly_counts)
    for bar in bars:
        height = bar.get_height()
        if height > 0:
            percentage = height / total_count
            plt.text(
                bar.get_x() + bar.get_width() / 2.0,
                height,
                f"{percentage:.1%}",
                ha="center",
                va="bottom",
                fontsize=16,
            )
    plt.xlabel("24小时分布", fontsize=16)
    plt.ylabel("快件数量", fontsize=16)
    plt.xticks(rotation=75, fontsize=16)
    plt.yticks(fontsize=16)
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.tight_layout()
    save_path = output_path / f"{company_name}_{metric_name}_分布图.png"
    plt.savefig(save_path)
    plt.close()
    print(f"    -> 图片已保存: {save_path.name}")


# --- 4. 主执行流程 ---


def run_plotting_and_analysis():
    print("🚀 开始执行分析任务...")
    analysis_results = {}

    for company in COMPANIES_TO_ANALYZE:
        print(f"\n--- 正在处理公司: {company} ---")

        file_prefix = (
            "邮政" if company == "快包" else COMPANY_MAPPING.get(company, company)
        )
        file_path = DATA_ANALYSIS_PATH / f"{file_prefix}_data_analysis_result.xlsx"

        if not file_path.exists():
            print(f"  -> 未找到文件: {file_path.name}，跳过该公司。")
            continue

        try:
            # 读取 '线路详细数据' sheet
            df = pd.read_excel(file_path, sheet_name="线路详细数据")
        except Exception as e:
            print(f"  -> 读取文件 {file_path.name} 失败: {e}，跳过该公司。")
            continue

        if df.empty:
            print(f"  -> 文件 {file_path.name} 的 '线路详细数据' sheet 为空，跳过。")
            continue

        analysis_results[company] = {}

        for metric in METRICS_TO_PLOT:
            if metric not in df.columns:
                print(f"    -> '{metric}' 列不存在，跳过。")
                continue

            hourly_counts = calculate_hourly_distribution(df[metric])
            plot_and_save_distribution(
                company, metric, hourly_counts, OUTPUT_IMAGE_PATH
            )

            total_count = sum(hourly_counts)
            if total_count == 0:
                continue

            if metric == "到达分拣中心时间":
                count_16_22 = sum(hourly_counts[16:22])
                analysis_results[company]["分拣中心_16_22_占比"] = (
                    count_16_22 / total_count
                )
            if metric == "离开寄件城市时间":
                count_16_22 = sum(hourly_counts[16:22])
                analysis_results[company]["离开城市_16_22_占比"] = (
                    count_16_22 / total_count
                )
                count_22_24 = sum(hourly_counts[22:24])
                analysis_results[company]["离开城市_22_24_占比"] = (
                    count_22_24 / total_count
                )

    print("\n\n--- 📈 分析结果汇总 ---")

    def print_ranking_results(metric_key: str, description: str):
        print(f"\n--- {description} 占比排名 ---")
        company_ratios = []
        for company, metrics in analysis_results.items():
            if metric_key in metrics:
                company_ratios.append((company, metrics[metric_key]))
        if not company_ratios:
            print("无相关数据可供排名。")
            return
        sorted_ratios = sorted(company_ratios, key=lambda item: item[1], reverse=True)
        print("排名 | 公司   | 占比")
        print("-----|--------|-------")
        for i, (company, ratio) in enumerate(sorted_ratios):
            print(f"{i + 1:<4} | {company:<6} | {ratio:>6.2%}")

    print_ranking_results("分拣中心_16_22_占比", "到达分拣中心时间 (16:00 - 22:00)")
    print_ranking_results("离开城市_16_22_占比", "离开寄件城市时间 (16:00 - 22:00)")
    print_ranking_results("离开城市_22_24_占比", "离开寄件城市时间 (22:00 - 24:00)")
    print("\n🎉 全部任务执行完毕！")


# --- 5. 执行主函数 ---
if __name__ == "__main__":
    run_plotting_and_analysis()

数据输入路径: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/输出/data_analysis_result
图片输出路径: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/输出/4_报告图片
Matplotlib 全局字体已成功设置为: Microsoft YaHei
🚀 开始执行绘图与分析任务 (新版流程)...

--- 正在处理公司: EMS ---
    -> 图片已保存: EMS_揽收时间_分布图.png
    -> 图片已保存: EMS_到达分拣中心时间_分布图.png
    -> 图片已保存: EMS_离开寄件城市时间_分布图.png
    -> 图片已保存: EMS_到达收件城市时间_分布图.png
    -> 图片已保存: EMS_派送时间_分布图.png
    -> 图片已保存: EMS_签收时间_分布图.png

--- 正在处理公司: 快包 ---
    -> 图片已保存: 快包_揽收时间_分布图.png
    -> 图片已保存: 快包_到达分拣中心时间_分布图.png
    -> 图片已保存: 快包_离开寄件城市时间_分布图.png
    -> 图片已保存: 快包_到达收件城市时间_分布图.png
    -> 图片已保存: 快包_派送时间_分布图.png
    -> 图片已保存: 快包_签收时间_分布图.png

--- 正在处理公司: 中通 ---
    -> 图片已保存: 中通_揽收时间_分布图.png
    -> 中通 的 '到达分拣中心时间' 数据为空或无法处理，不生成图片。
    -> 图片已保存: 中通_离开寄件城市时间_分布图.png
    -> 图片已保存: 中通_到达收件城市时间_分布图.png
    -> 图片已保存: 中通_派送时间_分布图.png
    -> 图片已保存: 中通_签收时间_分布图.png

--- 正在处理公司: 京东 ---
    -> 图片已保存: 京东_揽收时间_分布图.png
    -> 图片已保存: 京东_到达分拣中心时间_分布图.png
    -> 图片已保存: 京东_离开寄件城市时间_分布图.