In [1]:
# --------------------------------------------------
# Cell 1: 导入库并设置项目结构
# --------------------------------------------------
import math
import os
import re
import sys
import warnings
from datetime import datetime
from pathlib import Path
from time import perf_counter

import numpy as np
import pandas as pd

# openpyxl emits UserWarnings for some workbook features when reading Excel
# files; they are suppressed here to keep the cell output readable.
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
print("库导入完成。")

# --- Project layout ---
# .
# ├── 1_data_preprocess.ipynb
# └── 报告数据/
#     ├── 输入/
#     │   ├── 安监数据/            (raw Excel files as received; 10 companies, 10 files)
#     │   └── basic_data.xlsx      (city information and route information)
#     ├── 输出/                    (all final generated reports)
#     └── temp/
#         ├── 1_待上传猪猪云文件/   (files to upload to 猪猪云 one by one by hand;
#         │                         8 companies, 16 files, 顺丰 and 中通 excluded)
#         ├── 2_猪猪云下载数据/     (MANUALLY FILLED: result files downloaded from 猪猪云;
#         │                         8 companies, 16 files, 顺丰 and 中通 excluded)
#         ├── 3_猪猪云合并数据/     (downloads merged per company; 8 companies, 8 files)
#         ├── 4_logistics数据/      (logistics data with timestamps extracted from the
#         │                         full tracking text; 8 companies, 8 files)
#         └── 5_中转数据/           (transit cities and average transit counts;
#                                   8 companies, 8 files)
# Project root
base_path = Path.cwd()
report_path = base_path / "报告数据"
# Input locations
input_path = report_path / "输入"
anjian_data_path = input_path / "安监数据"
base_data_path = input_path / "basic_data.xlsx"
# Output location
output_path = report_path / "输出"
# Intermediate files (created automatically)
temp_path = report_path / "temp"
upload_split_path = temp_path / "1_待上传猪猪云文件"  # split files waiting to be uploaded
zhuzhuyun_download_path = temp_path / "2_猪猪云下载数据"  # files are placed here by hand
zhuzhuyun_merge_path = temp_path / "3_猪猪云合并数据"
pycharm_input_path = temp_path / "4_logistics数据"
transit_data_path = temp_path / "5_中转数据"

# Create every folder the notebook needs. parents=True makes the result
# independent of the order of this list: without it, creating a sub-folder of
# temp/ before temp/ itself raises FileNotFoundError on a fresh checkout.
for p in [
    report_path,
    input_path,
    anjian_data_path,
    output_path,
    temp_path,
    upload_split_path,
    zhuzhuyun_download_path,
    zhuzhuyun_merge_path,
    pycharm_input_path,
    transit_data_path,
]:
    p.mkdir(parents=True, exist_ok=True)

print("项目文件夹结构设置/检查完毕。请按以下结构组织文件：")
print(
    f"将10个安监数据Excel文件放入: {anjian_data_path}\n⚠️ 请修改“京邦达”的名字为“京东”"
)
# 计算口径问题
# 1.所有的均值、最优值、排名均不计算快包（和EMS本质是一家公司的不同产品）
# 2.线路的送达天数计算使用的是80分位数
# 3.线路层的均值不加权（计算全部线路均值，需要用∑（单一线路均值*线路快递数量）/快递总数量，或直接对全部线路求均值不延续适用线路均值；但行业层面可直接对公司层面的均值（不包括快包），直接求简单的算术平均（并非全样本真实均值、只是简单对各公司水平求均值）
# 4.“单一数据源原则”：logistics经过
### data[(data["寄出地处理时限"].between(0.1, 48)) &
###      (data["运输时限"].between(0.5, 200)) & (data["寄达地处理时限"].between(0.1, 60)) &
###      (data["投递时限"].between(0, 36))]
### 筛选后得到data_analysis_result；后续的数据只能够基于data_analysis_result进一步计算，不能再重新回到 logistics 数据
# 5.“48小时妥投率”口径：经过筛选后的数据（即data_analysis_result）的基础上，仅使用“Top 30城市互寄”的数据来计算；而“72小时妥投率”直接用data_analysis_result数据，不经过“Top 30城市“筛选
# 6.邮政月报.xlsx中的“分城市明细”和“分省份明细”sheet中
### 全程时限（双挂）是该城市按照所有以该城市为寄出地和寄达地的邮件进行聚合
### 全程时限（寄出地）是对应的城市按照所有以该城市为寄出地的邮件进行聚合
### 寄出地处理时限、揽收-到达寄出地分拣中心时长、到达寄出地分拣中心-离开寄出地城市时长是按照所有该城市作为寄出地的邮件聚合
### 寄达地处理时限、到达寄达地城市-离开寄达地分拣中心时长、离开寄达地分拣中心-派件是按照所有该城市作为寄达地的邮件聚合
### 投递时限是按照所有该城市作为寄达地的邮件聚合
### 寄出地处理时限+寄达地处理时限+投递时限是前面的简单加和，即寄出地处理时限（按寄出地聚合）+寄达地处理时限（按寄达地聚合）+投递时限（按寄达地聚合）

库导入完成。
项目文件夹结构设置/检查完毕。请按以下结构组织文件：
将10个安监数据Excel文件放入: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/输入/安监数据
⚠️ 请修改“京邦达”的名字为“京东”


In [None]:
# --------------------------------------------------
# Cell 2: 拆分安监数据，准备用于猪猪云上传的数据
# --------------------------------------------------
def prepare_data_for_upload(
    source_dir: Path,
    target_dir: Path,
    companies_to_exclude: list,
    chunk_size: int = 50000,
):
    """Split each 安监 workbook into upload-sized files holding only '企业' and '单号'.

    Every ``*.xlsx`` file in ``source_dir`` is matched to a known company by
    its file name, reduced to the '企业' and '单号' columns and written to
    ``target_dir`` as ``<company>1.xlsx``, ``<company>2.xlsx``, ... with at most
    ``chunk_size`` rows per file. Existing ``*.xlsx`` files in ``target_dir``
    are deleted first.

    Args:
        source_dir: Folder containing the source workbooks.
        target_dir: Folder receiving the split files (created if missing).
        companies_to_exclude: Company names that must not be prepared.
        chunk_size: Maximum number of rows per output file; must be positive.

    Returns:
        None. Progress and problems are reported with ``print``.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    # Local imports keep the function usable when the cell is run on its own.
    import math

    import pandas as pd

    # A zero chunk size would divide by zero below and a negative one would
    # silently produce no files at all, so reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Known companies, matched against the file name (case-insensitively).
    known_companies = (
        "EMS",
        "中通",
        "京东",
        "圆通",
        "德邦",
        "极兔",
        "申通",
        "韵达",
        "邮政",
        "顺丰",
    )

    print(">>> 第1部分：开始准备上传数据...")
    # Make sure the target folder exists and drop files from previous runs.
    target_dir.mkdir(parents=True, exist_ok=True)
    for stale_file in target_dir.glob("*.xlsx"):
        stale_file.unlink()

    # "~$..." files are lock files Excel creates while a workbook is open;
    # they are not readable workbooks. Sorting gives a stable processing order.
    anjian_files = sorted(
        f for f in source_dir.glob("*.xlsx") if not f.name.startswith("~$")
    )
    if not anjian_files:
        print(f"错误：在 {source_dir} 中未找到任何安监数据文件。请先放置文件。")
        return

    for file_path in anjian_files:
        stem_lower = file_path.stem.lower()
        company_name = next(
            (c for c in known_companies if c.lower() in stem_lower), None
        )
        if company_name is None:
            print(f"警告: 文件 {file_path.name} 未能匹配到已知公司名，已跳过。")
            continue

        if company_name in companies_to_exclude:
            print(f"公司 '{company_name}' 在排除列表内，跳过准备上传。")
            continue

        print(f"正在处理: {file_path.name} (公司: {company_name})")

        # Read and validate the workbook. Reading is best-effort per file: a
        # broken workbook is reported and skipped instead of aborting the run.
        try:
            # Tracking numbers are read as text so digits are never reformatted.
            df = pd.read_excel(file_path, dtype={"单号": str})
        except Exception as e:
            print(f"  -> 错误: 读取文件 {file_path.name} 时出错: {e}，已跳过。")
            continue
        if "企业" not in df.columns or "单号" not in df.columns:
            print(
                f"  -> 警告: 文件 {file_path.name} 缺少 '企业' 或 '单号' 列，已跳过。"
            )
            continue
        df_to_upload = df[["企业", "单号"]]

        total_rows = len(df_to_upload)
        if total_rows == 0:
            print(f"  -> {company_name} 数据为空，跳过保存。")
            continue

        # Number of files needed so that none exceeds chunk_size rows.
        num_chunks = math.ceil(total_rows / chunk_size)
        for chunk_no, start_row in enumerate(range(0, total_rows, chunk_size), start=1):
            chunk_df = df_to_upload.iloc[start_row : start_row + chunk_size]
            # File names count from 1, e.g. 圆通1.xlsx, 圆通2.xlsx ...
            chunk_df.to_excel(target_dir / f"{company_name}{chunk_no}.xlsx", index=False)

        print(f"  -> {company_name} 数据已拆分为 {num_chunks} 个文件。")

    print("\n" + "=" * 100)
    print("【第一部分完成】数据准备完毕！")
    print(f"请前往文件夹: \n{target_dir.resolve()}")
    # The download folder named here is the one the notebook's path setup
    # defines (报告数据/temp/2_猪猪云下载数据).
    print(
        "将里面的所有Excel文件手动上传到“猪猪快递云”网站，\n\n最后将所有下载结果放入“/报告数据/temp/2_猪猪云下载数据/”文件夹中。"
    )
    print("=" * 100)


# Companies that are left out of the upload preparation step.
companies_to_exclude_from_upload = ["顺丰", "中通"]
prepare_data_for_upload(
    source_dir=anjian_data_path,
    target_dir=upload_split_path,
    companies_to_exclude=companies_to_exclude_from_upload,
)

>>> 第1部分：开始准备上传数据...
正在处理: 2025年7月韵达抽样.xlsx (公司: 韵达)
  -> 韵达 数据已拆分为 2 个文件。
公司 '顺丰' 在排除列表内，跳过准备上传。
正在处理: 2025年7月京东抽样.xlsx (公司: 京东)
  -> 京东 数据已拆分为 2 个文件。
正在处理: 2025年7月极兔抽样.xlsx (公司: 极兔)
  -> 极兔 数据已拆分为 2 个文件。
正在处理: 2025年7月德邦抽样.xlsx (公司: 德邦)
  -> 德邦 数据已拆分为 2 个文件。
正在处理: 2025年7月申通抽样.xlsx (公司: 申通)
  -> 申通 数据已拆分为 2 个文件。
正在处理: 2025年7月EMS抽样.xlsx (公司: EMS)
  -> EMS 数据已拆分为 1 个文件。
正在处理: 2025年7月圆通抽样.xlsx (公司: 圆通)
  -> 圆通 数据已拆分为 2 个文件。
正在处理: 2025年7月中国邮政抽样.xlsx (公司: 邮政)
  -> 邮政 数据已拆分为 2 个文件。
公司 '中通' 在排除列表内，跳过准备上传。

【第一部分完成】数据准备完毕！
请前往文件夹: 
/Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/temp/1_待上传猪猪云文件
将里面的所有Excel文件手动上传到“猪猪快递云”网站，

最后将所有下载结果放入“/报告数据/输入/猪猪云下载数据/”文件夹中。


In [None]:
# --------------------------------------------------
# Cell 3: 合并猪猪云下载数据，生成猪猪云合并数据
# --------------------------------------------------
# Company names recognised in the names of the downloaded files.
companies = [
    "邮政",
    "EMS",
    "中通",
    "京东",
    "圆通",
    "德邦",
    "极兔",
    "申通",
    "韵达",
    "顺丰",
]
# Lower-cased copies so file names are matched case-insensitively.
companies_lower = [comp.lower() for comp in companies]

# company name -> list of downloaded files that belong to it
company_files = {}

# Sorted so the merge order, and therefore which of two duplicate rows
# survives keep="first" below, is the same on every run.
for file_path in sorted(zhuzhuyun_download_path.iterdir()):
    # Only real .xlsx workbooks; "~$..." files are Excel lock files.
    if (
        not file_path.is_file()
        or file_path.suffix.lower() != ".xlsx"
        or file_path.name.startswith("~$")
    ):
        continue

    filename_stem_lower = file_path.stem.lower()
    found_company = next(
        (
            name
            for name, lowered in zip(companies, companies_lower)
            if lowered in filename_stem_lower
        ),
        None,
    )
    if found_company is None:
        print(
            f"  警告: 文件 '{file_path.name}' 未匹配到任何已知公司名，将跳过此文件。"
        )
        continue
    company_files.setdefault(found_company, []).append(file_path)

print("\n开始处理各公司文件...")

# The merge folder may not exist yet when this cell is run on its own.
zhuzhuyun_merge_path.mkdir(parents=True, exist_ok=True)

for company_name, files_list in company_files.items():
    print(f"\n正在合并 {company_name} 的文件 ({len(files_list)} 个)...")

    all_dfs = []
    for file_path in files_list:
        # Best-effort per file: an unreadable download is reported and skipped.
        try:
            # Read tracking numbers as text; letting pandas infer a numeric
            # dtype can drop leading zeros or turn long numbers into floats,
            # which would corrupt both the de-duplication and the saved file.
            df = pd.read_excel(file_path, dtype={"快递单号": str})
        except Exception as e:
            print(f"  读取文件失败: {file_path.name}, 错误: {e}")
            continue
        all_dfs.append(df)
        print(f"  已读取: {file_path.name}")

    if not all_dfs:
        print(f"  没有成功读取 {company_name} 的任何文件，跳过合并。")
        continue

    merged_df = pd.concat(all_dfs, ignore_index=True)
    output_file_path = zhuzhuyun_merge_path / f"{company_name}.xlsx"

    if "快递单号" in merged_df.columns:
        # Keep the first occurrence of every tracking number.
        rows_before_dedup = len(merged_df)
        merged_df = merged_df.drop_duplicates(subset=["快递单号"], keep="first")
        num_duplicates_removed = rows_before_dedup - len(merged_df)
        if num_duplicates_removed > 0:
            print(f"  已基于'快递单号'移除 {num_duplicates_removed} 个重复值。")
        else:
            print("  未发现'快递单号'的重复值。")
    else:
        print("  警告: 合并后的数据中未找到 '快递单号' 列，无法执行去重操作。")

    # Save the merged (and, where possible, de-duplicated) data.
    try:
        merged_df.to_excel(output_file_path, index=False)
    except Exception as e:
        print(f"  保存文件失败: {output_file_path.name}, 错误: {e}")
    else:
        print(
            f"  成功保存合并数据到: {output_file_path.name} (共 {len(merged_df)} 行)"
        )

print("\n所有公司文件合并完成！")


In [None]:
#####-------- logistics数据提取原则-------------#####
# 1. “到达分拣中心时间”本质是“到达寄件城市分拣中心时间”
#### 寄达城市（即收件城市）分拣中心时间以提取到的第一个分拣中心为准
#### 寄出城市（即寄件城市）分拣时间以提取到的最后一个分拣中心为准
# 2. “签收时间”分为3种情况，
#### 1）上门送件，以上门的时间为签收时间；
#### 2）放在取件网点，顾客之后自己取走，以放在取件网点的时间为签收时间；
#### 3）放在快递柜、丰巢等，顾客之后自己取走，以放在快递柜、丰巢的时间为签收时间。
# 3. “转运中心”中只有省份名称，没有城市名称的，认为该转运中心就在该省的省会，如果相应的寄出/寄达城市就是该省的省会，则认为快件仍在该寄出地/寄达地

In [11]:
# --------------------------------------------------
# Cell 4.0: logistics数据提取-通用库导入与项目结构设置
# --------------------------------------------------
import re
import warnings
from pathlib import Path
from time import perf_counter

import pandas as pd

# Keep openpyxl's UserWarnings out of the cell output.
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
print("通用库导入完成。")

# --- Project paths, all relative to the current working directory ---
base_path = Path.cwd()
report_path = base_path.joinpath("报告数据")
input_path = report_path.joinpath("输入")
anjian_data_path = input_path.joinpath("安监数据")
base_data_path = input_path.joinpath("basic_data.xlsx")
temp_path = report_path.joinpath("temp")
zhuzhuyun_merge_path = temp_path.joinpath("3_猪猪云合并数据")
pycharm_input_path = temp_path.joinpath("4_logistics数据")

# Create the working folders (including missing parents) if they are absent.
for p in (anjian_data_path, zhuzhuyun_merge_path, pycharm_input_path):
    p.mkdir(parents=True, exist_ok=True)

print("项目文件夹结构设置/检查完毕。")


# --- 通用辅助函数 ---
def extractor_apply_based(
    df: pd.DataFrame, profile: dict, location_maps: dict, parser_func
) -> pd.DataFrame:
    """Apply ``parser_func`` to every row of ``df`` and return the results.

    ``parser_func`` is called as ``parser_func(row, profile, location_maps)``
    for each row. An empty ``df`` yields an empty DataFrame without calling
    the parser.
    """
    if not df.empty:
        print(f"  -> 使用 '{parser_func.__name__}' 解析器通过 .apply() 运行...")

        def parse_one_row(record):
            return parser_func(record, profile, location_maps)

        return df.apply(parse_one_row, axis=1)
    return pd.DataFrame()

通用库导入完成。
项目文件夹结构设置/检查完毕。


In [None]:
# --------------------------------------------------
# Cell 4.1: EMS logistics数据提取（定稿版）
# --------------------------------------------------
import re
import warnings
from pathlib import Path
from time import perf_counter

import pandas as pd

# --- Initialisation (only needed when this cell is run on its own) ---
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
print("--- [开始] 处理 EMS ---")

# --- Project paths: same layout as in the shared setup cell (Cell 4.0) ---
base_path = Path.cwd()
report_path = base_path.joinpath("报告数据")
input_path = report_path.joinpath("输入")
temp_path = report_path.joinpath("temp")
anjian_data_path = input_path.joinpath("安监数据")
base_data_path = input_path.joinpath("basic_data.xlsx")
zhuzhuyun_merge_path = temp_path.joinpath("3_猪猪云合并数据")
pycharm_input_path = temp_path.joinpath("4_logistics数据")


# --- 通用辅助函数 ---
def extractor_apply_based(
    df: pd.DataFrame, profile: dict, location_maps: dict, parser_func
) -> pd.DataFrame:
    """Apply ``parser_func`` to every row of ``df`` and return the results.

    ``parser_func`` is called as ``parser_func(row, profile, location_maps)``
    for each row. An empty ``df`` yields an empty DataFrame without calling
    the parser.
    """
    if not df.empty:
        print(f"  -> 使用 '{parser_func.__name__}' 解析器通过 .apply() 运行...")

        def parse_one_row(record):
            return parser_func(record, profile, location_maps)

        return df.apply(parse_one_row, axis=1)
    return pd.DataFrame()


# --- 公司档案: EMS ---
# Regex keyword tables used to classify tracking-log lines for EMS.
# process_ems_data copies "default" and overlays the "EMS" entry on top of it,
# so any key present in "EMS" replaces the "default" value of the same name.
COMPANY_PROFILES_EMS = {
    "default": {
        # Parcel picked up from the sender.
        "collect": r"揽收|已收取|揽件|已取件",
        # Out for delivery.
        "delivery": r"派送中|派件员|为您派送|正在为您派件|已安排派送",
        # NOTE: "leave" and "center" are not read by
        # parse_logistics_events_ems_ultimate, which uses the leave_p1 /
        # leave_hard / leave_soft and center_p1 / center_p2 keys instead.
        "leave": r"离开|已发出|发往",
        # Arrival wording; the "EMS" entry does not override this key.
        "arrive": r"到达|抵达",
        "center": r"处理中心|分拨中心|转运场|运营区|枢纽中心|集散中心",
        # Lines matching this are not accepted as sorting-centre events.
        "exclude": r"揽投部|营业部|经营分部|网点|集货点|营销中心",
        # Sign-off patterns, tried in the order sign_p1, sign_p2, sign_fallback.
        "sign_p1": r"已派送至|已投至|已到站|自提柜|智能柜|菜鸟驿站|邮政营业网点",
        "sign_p2": r"本人签收|他人代签收|已代收|放至家门口|指定位置签收",
        "sign_fallback": r"已签收|已妥投|已妥收",
    },
    "EMS": {
        "collect": r"已收取快件|已收寄|已收取邮件|揽收|物流项目组",
        # The negative lookahead keeps "已派送至" (a sign_p1 phrase) from being
        # counted as an out-for-delivery event.
        "delivery": r"已安排派送|正在派送中|已派送(?!至)|为您派件|为您派送",
        # leave_p1 and leave_hard together decide "left the sender city".
        "leave_p1": r"已乘机|已搭乘邮航专机",
        "leave_hard": r"离开",
        # leave_soft is only consulted for the destination sorting centre, and
        # only when no leave_hard line was found there.
        "leave_soft": r"准备发出|已发出",
        # center_p1 and center_p2 are joined into a single sorting-centre pattern.
        "center_p1": r"邮区中心|航空枢纽|处理中心|网路中心|航空中心",
        "center_p2": r"包件车间|集散中心|集散点|航站|快件处理车间|快件处理中心",
        "exclude": r"揽投部|邮政支局|直投中心|揽收部|营销中心",
        "sign_p1": r"已派送至|已投至|到达【拼多多中转仓】|已到站|自提柜|智能柜|菜鸟驿站|邮政营业网点|已暂存至",
        "sign_p2": r"本人签收|他人代签收|已代收|放至家门口|指定位置签收|家门口签收",
        "sign_fallback": r"已签收|已妥投|已妥收",
        # Lines matching this are skipped when looking for the sign-off time.
        "sign_ignore": r"完成取件",
    },
}


# --- 解析器: EMS (终极版) ---
# Timestamp format expected somewhere in every tracking line.
_EMS_DATETIME_RE = re.compile(r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})")


def _ems_first_event_time(events, pattern, ignore_pattern=None):
    """Return the time of the first event matching ``pattern``, else ``pd.NaT``.

    Events whose line matches ``ignore_pattern`` are skipped. An empty or
    missing ``pattern`` never matches.
    """
    if not pattern:
        return pd.NaT
    for event in events:
        line = event["line"]
        if ignore_pattern and re.search(ignore_pattern, line):
            continue
        if re.search(pattern, line):
            return event["dt"]
    return pd.NaT


def _ems_location_pattern(city_key, location_maps):
    """Build a regex alternation of the names under which a city may appear.

    The alternatives are the city's aliases from ``city_alias_map`` (or the
    city name without "市" when the city is unknown) plus its province name
    without "省"/"市". Returns ``None`` when ``city_key`` is not a non-empty
    string.
    """
    if not city_key or not isinstance(city_key, str):
        return None
    # Copy the alias list: it is owned by location_maps and shared by every
    # row, so appending to it directly would grow it by one entry per row.
    aliases = list(
        location_maps["city_alias_map"].get(city_key, [city_key.replace("市", "")])
    )
    province = location_maps["city_to_province_map"].get(city_key)
    if province:
        aliases.append(province.replace("省", "").replace("市", ""))
    # Drop NaN/empty entries and force str so re.escape never sees a non-string.
    clean_aliases = {str(a) for a in aliases if pd.notna(a) and a != ""}
    return "|".join(map(re.escape, sorted(clean_aliases)))


def parse_logistics_events_ems_ultimate(
    row: pd.Series, profile: dict, location_maps: dict
) -> pd.Series:
    """Extract the key timestamps of one EMS shipment from its tracking text.

    Args:
        row: Must provide "完整物流信息" (tracking text, one event per line),
            "寄出城市" and "寄达城市".
        profile: Regex keyword table; the keys read are collect, delivery,
            leave_p1, leave_hard, leave_soft, arrive, exclude, sign_p1,
            sign_p2, sign_fallback, sign_ignore, center_p1 and center_p2.
        location_maps: Dict with "city_alias_map" (city -> list of aliases)
            and "city_to_province_map" (city -> province). Not modified.

    Returns:
        A Series with seven "*_zzy" timestamps; a value is ``pd.NaT`` when the
        corresponding event could not be identified.
    """
    log_text = row["完整物流信息"]
    sender_city_key, dest_city_key = row["寄出城市"], row["寄达城市"]
    time_cols = [
        "揽收时间_zzy",
        "离开寄件城市时间_zzy",
        "到达收件城市时间_zzy",
        "派送时间_zzy",
        "签收时间_zzy",
        "到达分拣中心时间_zzy",
        "离开收件城市分拣中心时间_zzy",
    ]
    extracted_times = {col: pd.NaT for col in time_cols}
    if not isinstance(log_text, str) or not log_text.strip():
        return pd.Series(extracted_times)

    kw_collect, kw_delivery = profile.get("collect"), profile.get("delivery")
    kw_leave_p1 = profile.get("leave_p1")
    kw_leave_hard = profile.get("leave_hard")
    kw_leave_soft = profile.get("leave_soft")
    kw_arrive, kw_exclude = profile.get("arrive"), profile.get("exclude", "")
    kw_sign_ignore = profile.get("sign_ignore")
    kw_all_centers = "|".join(
        filter(None, [profile.get("center_p1"), profile.get("center_p2")])
    )

    # Collect every line carrying a parseable timestamp, oldest first.
    all_events = []
    for line in log_text.strip().split("\n"):
        match = _EMS_DATETIME_RE.search(line)
        if match:
            dt = pd.to_datetime(match.group(1), errors="coerce")
            if pd.notna(dt):
                all_events.append({"dt": dt, "line": line})
    if not all_events:
        return pd.Series(extracted_times)
    all_events.sort(key=lambda x: x["dt"])

    def is_excluded(line):
        return bool(kw_exclude and re.search(kw_exclude, line))

    # Pick-up and out-for-delivery: the earliest matching line.
    extracted_times["揽收时间_zzy"] = _ems_first_event_time(all_events, kw_collect)
    extracted_times["派送时间_zzy"] = _ems_first_event_time(all_events, kw_delivery)

    # Sign-off: try the patterns in priority order and stop at the first hit.
    for kw_sign in (
        profile.get("sign_p1"),
        profile.get("sign_p2"),
        profile.get("sign_fallback"),
    ):
        sign_time = _ems_first_event_time(all_events, kw_sign, kw_sign_ignore)
        if pd.notna(sign_time):
            extracted_times["签收时间_zzy"] = sign_time
            break

    collect_time = extracted_times["揽收时间_zzy"]
    sender_pattern = _ems_location_pattern(sender_city_key, location_maps)

    if pd.notna(collect_time) and sender_pattern:
        # Leaving the sender city: the LAST line after pick-up that mentions
        # both a departure keyword and the sender location.
        true_leave_kws = "|".join(filter(None, [kw_leave_p1, kw_leave_hard]))
        if true_leave_kws:
            for event in all_events:
                if (
                    event["dt"] > collect_time
                    and re.search(true_leave_kws, event["line"])
                    and re.search(sender_pattern, event["line"])
                ):
                    extracted_times["离开寄件城市时间_zzy"] = event["dt"]

        # Sender-side sorting centre: the LAST centre arrival in the sender
        # location after pick-up and before the departure found above.
        leave_time = extracted_times["离开寄件城市时间_zzy"]
        for event in all_events:
            if event["dt"] <= collect_time:
                continue
            if pd.notna(leave_time) and event["dt"] >= leave_time:
                break
            if (
                kw_all_centers
                and re.search(kw_all_centers, event["line"])
                and kw_arrive
                and re.search(kw_arrive, event["line"])
                and re.search(sender_pattern, event["line"])
                and not is_excluded(event["line"])
            ):
                extracted_times["到达分拣中心时间_zzy"] = event["dt"]

    dest_pattern = _ems_location_pattern(dest_city_key, location_maps)
    if dest_pattern:
        # Arrival in the destination city: the FIRST arrival line naming it.
        if kw_arrive:
            for event in all_events:
                if re.search(kw_arrive, event["line"]) and re.search(
                    dest_pattern, event["line"]
                ):
                    extracted_times["到达收件城市时间_zzy"] = event["dt"]
                    break

        # Without a known arrival, only look at events later than 12 hours
        # after pick-up when searching for the destination sorting centre.
        start_time_for_dest_search = extracted_times["到达收件城市时间_zzy"]
        if pd.isna(start_time_for_dest_search) and pd.notna(collect_time):
            start_time_for_dest_search = collect_time + pd.Timedelta(hours=12)

        if pd.notna(start_time_for_dest_search) and kw_all_centers:
            # Prefer the "hard" departure wording; fall back to the "soft" one.
            for kw_leave in (kw_leave_hard, kw_leave_soft):
                if not kw_leave:
                    continue
                for event in all_events:
                    if (
                        event["dt"] > start_time_for_dest_search
                        and re.search(kw_all_centers, event["line"])
                        and re.search(kw_leave, event["line"])
                        and re.search(dest_pattern, event["line"])
                        and not is_excluded(event["line"])
                    ):
                        extracted_times["离开收件城市分拣中心时间_zzy"] = event["dt"]
                        break
                if pd.notna(extracted_times["离开收件城市分拣中心时间_zzy"]):
                    break
    return pd.Series(extracted_times)


# --- 主流程: EMS ---
def _load_ems_location_maps(base_data_file):
    """Build the city alias / capital / province lookups from 'city_hierarchy'.

    Returns empty lookups when the workbook is missing; a failure while
    reading it is reported and whatever was built so far is returned.
    """
    location_maps = {
        "city_alias_map": {},
        "capital_cities_set": set(),
        "city_to_province_map": {},
    }
    if not base_data_file.exists():
        return location_maps
    try:
        df_hierarchy = pd.read_excel(
            base_data_file,
            sheet_name="city_hierarchy",
            dtype={"Province": str, "City": str, "District": str},
        )
        df_hierarchy.dropna(subset=["Province", "City"], inplace=True)
        df_hierarchy["City_clean"] = df_hierarchy["City"].str.replace(
            "市", "", regex=False
        )
        df_hierarchy["District_clean"] = df_hierarchy["District"].str.replace(
            "市", "", regex=False
        )
        # A city's aliases are its own short name plus its districts' short names.
        for city, group in df_hierarchy.groupby("City"):
            aliases = [group.iloc[0]["City_clean"]] + group[
                "District_clean"
            ].unique().tolist()
            location_maps["city_alias_map"][city] = [a for a in aliases if a]
        if "IsCapital" in df_hierarchy.columns:
            df_capitals = df_hierarchy[df_hierarchy["IsCapital"] == 1]
            location_maps["capital_cities_set"] = set(df_capitals["City"].unique())
        city_to_province = df_hierarchy.drop_duplicates("City")[
            ["City", "Province"]
        ].set_index("City")
        location_maps["city_to_province_map"] = city_to_province["Province"].to_dict()
    except Exception as e:
        print(f"[ERROR] 构建地理位置地图失败: {e}。")
    return location_maps


def _load_ems_route_table(base_data_file):
    """Return one row per route ("路线_std") from 'inter-city_routes', or None."""
    if not base_data_file.exists():
        return None
    try:
        df_base = pd.read_excel(base_data_file, sheet_name="inter-city_routes")
        df_base["路线_std"] = (
            df_base["寄出城市"].str.replace("市", "", regex=False)
            + "-"
            + df_base["寄达城市"].str.replace("市", "", regex=False)
        )
        return df_base[["路线_std", "公里", "寄出省份", "寄达省份"]].drop_duplicates(
            "路线_std"
        )
    except Exception as e:
        print(f"[WARNING] 处理 'basic_data.xlsx' 出错: {e}。")
        return None


def process_ems_data(zhuzhuyun_dir, anjian_dir, output_dir, base_data_file):
    """Build ``EMS_logistics_data.xlsx`` from the merged 猪猪云 data for EMS.

    The merged download ``EMS.xlsx`` is joined with the sender/destination
    cities from the 安监 workbooks, every tracking text is parsed into
    timestamps, route information from ``base_data_file`` is attached, and the
    result is written to ``output_dir``.

    Args:
        zhuzhuyun_dir: Folder containing the merged file ``EMS.xlsx``.
        anjian_dir: Folder containing the 安监 workbooks (``*.xlsx``).
        output_dir: Folder receiving the result (created if missing).
        base_data_file: Workbook with the 'city_hierarchy' and
            'inter-city_routes' sheets; optional, lookups stay empty without it.

    Returns:
        None. Returns early, after printing a message, when no 安监 workbook
        or no ``EMS.xlsx`` is found.
    """
    company_name = "EMS"
    zhuzhu_filename = "EMS.xlsx"
    company_start_time = perf_counter()

    location_maps = _load_ems_location_maps(base_data_file)
    df_base_subset = _load_ems_route_table(base_data_file)

    # "~$..." files are Excel lock files, not workbooks.
    all_anjian_files = [
        f for f in anjian_dir.glob("*.xlsx") if not f.name.startswith("~$")
    ]
    if not all_anjian_files:
        print("[ERROR] 安监数据文件夹中未找到有效的Excel文件！")
        return
    # Check for the company file before the expensive read of every 安监
    # workbook, so a missing download is reported immediately.
    company_file = zhuzhuyun_dir / zhuzhu_filename
    if not company_file.exists():
        print(f"[WARNING] 未找到文件: {zhuzhu_filename}，已跳过。")
        return

    all_anjian_df = pd.concat(
        [
            pd.read_excel(f, dtype={"单号": str, "寄出城市": str, "寄达城市": str})
            for f in all_anjian_files
        ],
        ignore_index=True,
    )
    if "快递公司" in all_anjian_df.columns:
        all_anjian_df.rename(columns={"快递公司": "企业"}, inplace=True)
    # Map the company name used in the 安监 data to the internal name; names
    # without a mapping are left as they are.
    anjian_to_internal_map = {"EMS": "EMS"}
    all_anjian_df["企业"] = (
        all_anjian_df["企业"].map(anjian_to_internal_map).fillna(all_anjian_df["企业"])
    )
    # One lookup row per (单号, 企业): duplicate keys on the right side of the
    # left merge below would otherwise duplicate shipment rows.
    base_info_df = all_anjian_df[["单号", "企业", "寄出城市", "寄达城市"]].drop_duplicates(
        subset=["单号", "企业"]
    )

    df_company_zhu = pd.read_excel(company_file, dtype={"快递单号": str})
    df_company_zhu.rename(
        columns={"快递单号": "单号", "快递公司": "企业"}, inplace=True
    )
    df_merged = pd.merge(df_company_zhu, base_info_df, on=["单号", "企业"], how="left")

    # Company-specific keywords override the defaults of the same name.
    profile = {
        **COMPANY_PROFILES_EMS["default"],
        **COMPANY_PROFILES_EMS.get(company_name, {}),
    }
    extracted_df = extractor_apply_based(
        df_merged, profile, location_maps, parse_logistics_events_ems_ultimate
    )
    df_final = pd.concat([df_merged, extracted_df], axis=1)

    # Publish the parsed "*_zzy" columns under their final names; a column the
    # parser did not produce becomes all-NaT.
    time_cols_to_map = [
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
    ]
    for col in time_cols_to_map:
        df_final[col] = df_final.get(f"{col}_zzy", pd.NaT)

    if df_base_subset is not None:
        df_final["路线_std"] = (
            df_final["寄出城市"].astype(str).str.replace("市", "", regex=False)
            + "-"
            + df_final["寄达城市"].astype(str).str.replace("市", "", regex=False)
        )
        df_final = pd.merge(df_final, df_base_subset, on="路线_std", how="left")
    df_final["企业"] = company_name

    output_columns = [
        "企业",
        "单号",
        "寄出城市",
        "寄达城市",
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "公里",
        "寄出省份",
        "寄达省份",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
        "完整物流信息",
    ]
    final_df_to_save = df_final.reindex(columns=output_columns)
    # Create the output folder so a long parsing run cannot fail at the very
    # end merely because the folder is missing.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{company_name}_logistics_data.xlsx"
    final_df_to_save.to_excel(output_file, index=False)
    company_end_time = perf_counter()
    print(
        f"✅ {company_name} 处理完成，耗时 {company_end_time - company_start_time:.2f} 秒。文件已保存至: {output_file.name}"
    )


# --- 运行: EMS ---
process_ems_data(
    zhuzhuyun_dir=zhuzhuyun_merge_path,
    anjian_dir=anjian_data_path,
    output_dir=pycharm_input_path,
    base_data_file=base_data_path,
)

--- [开始] 处理 EMS ---
  -> 使用 'parse_logistics_events_ems_ultimate' 解析器通过 .apply() 运行...
✅ EMS 处理完成，耗时 229.98 秒。文件已保存至: EMS_logistics_data_TEST.xlsx


In [None]:
# --------------------------------------------------
# Cell 4.2: 邮政logistics数据提取（定稿版）
# --------------------------------------------------
import re
import warnings
from pathlib import Path
from time import perf_counter

import pandas as pd

# --- Initialisation (only needed when this cell is run on its own) ---
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
print("--- [开始] 处理 邮政 ---")

# --- Project paths: same layout as in the shared setup cell (Cell 4.0) ---
base_path = Path.cwd()
report_path = base_path.joinpath("报告数据")
input_path = report_path.joinpath("输入")
temp_path = report_path.joinpath("temp")
anjian_data_path = input_path.joinpath("安监数据")
base_data_path = input_path.joinpath("basic_data.xlsx")
zhuzhuyun_merge_path = temp_path.joinpath("3_猪猪云合并数据")
pycharm_input_path = temp_path.joinpath("4_logistics数据")


# --- 通用辅助函数 ---
def extractor_apply_based(
    df: pd.DataFrame, profile: dict, location_maps: dict, parser_func
) -> pd.DataFrame:
    """Apply ``parser_func`` to every row of ``df`` and return the results.

    ``parser_func`` is called as ``parser_func(row, profile, location_maps)``
    for each row. An empty ``df`` yields an empty DataFrame without calling
    the parser.
    """
    if not df.empty:
        print(f"  -> 使用 '{parser_func.__name__}' 解析器通过 .apply() 运行...")

        def parse_one_row(record):
            return parser_func(record, profile, location_maps)

        return df.apply(parse_one_row, axis=1)
    return pd.DataFrame()


# --- 公司档案: 邮政 ---
# Regex keyword tables used to classify tracking-log lines for 邮政 (China Post).
# NOTE(review): presumably "default" is overlaid with the "邮政国内小包" entry
# the same way the EMS cell merges its profiles — confirm in the 邮政
# processing function, which is outside this view.
COMPANY_PROFILES_POSTAL = {
    "default": {
        # Parcel picked up from the sender.
        "collect": r"揽收|已收取|揽件|已取件",
        # Out for delivery.
        "delivery": r"派送中|派件员|为您派送|正在为您派件|已安排派送",
        "leave": r"离开|已发出|发往",
        # Arrival wording; the "邮政国内小包" entry does not define this key.
        "arrive": r"到达|抵达",
        # NOTE(review): the visible part of the 邮政 parser reads center_p1 /
        # center_p2, not "center" — confirm whether this key is used at all.
        "center": r"处理中心|分拨中心|转运场|运营区|枢纽中心|集散中心",
        "exclude": r"揽投部|营业部|经营分部|网点|集货点|营销中心",
        "sign_p1": r"已派送至|已投至|已到站|自提柜|智能柜|菜鸟驿站|邮政营业网点",
        "sign_p2": r"本人签收|他人代签收|已代收|放至家门口|指定位置签收",
        "sign_fallback": r"已签收|已妥投|已妥收",
    },
    "邮政国内小包": {
        "collect": r"已收取快件|已收寄|已收取邮件|揽收|物流项目组",
        "delivery": r"派送中|派件员|为您派件|已安排派送",
        # Departure wording is split into several tiers; unlike the EMS
        # profile, a combined "leave" pattern is provided as well and
        # leave_soft also contains "发往".
        "leave_p1": r"已乘机|已搭乘邮航专机",
        "leave_hard": r"离开",
        "leave_soft": r"准备发出|已发出|发往",
        "leave": r"离开|已发出|发往|准备发出",
        # The parser joins center_p1 and center_p2 into one sorting-centre pattern.
        "center_p1": r"邮区中心|航空枢纽|处理中心|网路中心|航空中心",
        "center_p2": r"包件车间|集散中心|集散点|航站|快件处理车间|快件处理班",
        "exclude": r"揽投部|邮政支局|直投中心|揽收部|营销中心|直投点",
        "sign_p1": r"已派送至|已投至|到达【拼多多中转仓】|已到站|自提柜|智能柜|菜鸟驿站|邮政营业网点|已暂存至",
        "sign_p2": r"本人签收|他人代签收|已代收|放至家门口|指定位置签收|家门口签收",
        "sign_fallback": r"已签收|已妥投|已妥收",
        # NOTE(review): presumably lines matching this are skipped when looking
        # for the sign-off time, as in the EMS parser — confirm.
        "sign_ignore": r"完成取件",
    },
}


# --- Parser: China Post ---
def parse_logistics_events_postal_perfected(
    row: pd.Series, profile: dict, location_maps: dict
) -> pd.Series:
    """Extract milestone timestamps from one postal tracking log.

    The log in ``row["完整物流信息"]`` is split into lines; every line that
    carries a ``YYYY-MM-DD HH:MM:SS`` stamp becomes an event, and events are
    processed in chronological order.

    Args:
        row: Record with ``完整物流信息`` (tracking text), ``寄出城市`` (origin
            city) and ``寄达城市`` (destination city).
        profile: Keyword regexes (``collect``, ``delivery``, ``leave*``,
            ``arrive``, ``center_p1``/``center_p2``, ``exclude``, ``sign_*``).
            Missing keys disable the corresponding rule.
        location_maps: Must provide ``city_alias_map`` and
            ``city_to_province_map``.

    Returns:
        A Series with the seven ``*_zzy`` timestamp fields; a field stays
        ``NaT`` when no line qualifies.
    """
    raw_log = row["完整物流信息"]
    origin_key, target_key = row["寄出城市"], row["寄达城市"]
    milestones = dict.fromkeys(
        (
            "揽收时间_zzy",
            "离开寄件城市时间_zzy",
            "到达收件城市时间_zzy",
            "派送时间_zzy",
            "签收时间_zzy",
            "到达分拣中心时间_zzy",
            "离开收件城市分拣中心时间_zzy",
        ),
        pd.NaT,
    )
    if not isinstance(raw_log, str) or not raw_log.strip():
        return pd.Series(milestones)

    fetch = profile.get
    pat_collect, pat_delivery = fetch("collect"), fetch("delivery")
    pat_leave_air, pat_leave = fetch("leave_p1"), fetch("leave")
    pat_leave_hard, pat_leave_soft = fetch("leave_hard"), fetch("leave_soft")
    pat_arrive, pat_exclude = fetch("arrive"), fetch("exclude", "")
    pat_ignore = fetch("sign_ignore")
    sign_tiers = (fetch("sign_p1"), fetch("sign_p2"), fetch("sign_fallback"))
    pat_center = "|".join(
        part for part in (fetch("center_p1"), fetch("center_p2")) if part
    )

    # Build the chronological timeline of (timestamp, line) pairs.
    stamp_re = r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
    timeline = []
    for text in raw_log.strip().split("\n"):
        found = re.search(stamp_re, text)
        if not found:
            continue
        when = pd.to_datetime(found.group(1), errors="coerce")
        if pd.isna(when):
            continue
        timeline.append((when, text))
    if not timeline:
        return pd.Series(milestones)
    timeline.sort(key=lambda item: item[0])

    def hits(pattern, text):
        # An unset/empty pattern never matches.
        return bool(pattern) and re.search(pattern, text) is not None

    def at_center(text):
        # A sorting-centre line names a centre and no excluded facility.
        return hits(pat_center, text) and not hits(pat_exclude, text)

    def first_time(predicate):
        # Timestamp of the earliest line accepted by ``predicate``, else NaT.
        for when, text in timeline:
            if predicate(text):
                return when
        return pd.NaT

    def location_patterns(city_key):
        # Returns (city regex, province regex); the province regex is dropped
        # when the province name already contains one of the city aliases.
        if not city_key or not isinstance(city_key, str):
            return None, None
        fallback = [city_key.replace("市", "")]
        aliases = location_maps["city_alias_map"].get(city_key, fallback)
        usable = [a for a in aliases if pd.notna(a) and a != ""]
        city_re = "|".join(map(re.escape, {str(a) for a in usable}))
        province = location_maps["city_to_province_map"].get(city_key)
        province_re = None
        if province:
            bare = province.replace("省", "").replace("市", "")
            if bare:
                trimmed = {a.replace("市", "") for a in usable}
                if not any(name in bare for name in trimmed):
                    province_re = re.escape(bare)
        return city_re, province_re

    # Collection and delivery: earliest matching line each.
    milestones["揽收时间_zzy"] = first_time(lambda t: hits(pat_collect, t))
    milestones["派送时间_zzy"] = first_time(lambda t: hits(pat_delivery, t))

    # Sign-off: tiers are tried in priority order; ignored lines never count.
    for tier in sign_tiers:
        if pd.notna(milestones["签收时间_zzy"]):
            break
        if not tier:
            continue
        milestones["签收时间_zzy"] = first_time(
            lambda t: not hits(pat_ignore, t) and re.search(tier, t) is not None
        )

    # Origin side: only lines after collection that mention the origin city,
    # or the origin province when no city line exists.
    origin_city_re, origin_prov_re = location_patterns(origin_key)
    collected_at = milestones["揽收时间_zzy"]
    if pd.notna(collected_at):

        def after_collection(pattern):
            return [
                (when, text)
                for when, text in timeline
                if when > collected_at and re.search(pattern, text)
            ]

        near_origin = after_collection(origin_city_re) if origin_city_re else []
        if not near_origin and origin_prov_re:
            near_origin = after_collection(origin_prov_re)

        departure_re = "|".join(
            part for part in (pat_leave_air, pat_leave) if part
        )
        hub_arrival = hub_departure = None
        # No early exit: the LAST qualifying line wins for both fields.
        for when, text in near_origin:
            if at_center(text):
                if hits(pat_arrive, text):
                    hub_arrival = when
                if hits(departure_re, text):
                    hub_departure = when
            elif hits(pat_leave_air, text):
                hub_departure = when

        if hub_arrival is not None:
            milestones["到达分拣中心时间_zzy"] = hub_arrival
        if hub_departure is not None:
            milestones["离开寄件城市时间_zzy"] = hub_departure

    # Destination arrival: first centre arrival naming the city, else province.
    dest_city_re, dest_prov_re = location_patterns(target_key)

    def centre_arrival_in(pattern):
        return first_time(
            lambda t: at_center(t)
            and hits(pat_arrive, t)
            and re.search(pattern, t) is not None
        )

    if dest_city_re:
        milestones["到达收件城市时间_zzy"] = centre_arrival_in(dest_city_re)
    if pd.isna(milestones["到达收件城市时间_zzy"]) and dest_prov_re:
        milestones["到达收件城市时间_zzy"] = centre_arrival_in(dest_prov_re)

    # Leaving the destination centre: search after the destination arrival,
    # or after collection when no arrival was found.
    window_start = milestones["到达收件城市时间_zzy"]
    if pd.isna(window_start):
        window_start = milestones["揽收时间_zzy"]

    if pd.notna(window_start):

        def in_destination(text):
            return hits(dest_city_re, text) or hits(dest_prov_re, text)

        # Hard wording first, soft wording only as a fallback.
        for leave_re in (pat_leave_hard, pat_leave_soft):
            if pd.notna(milestones["离开收件城市分拣中心时间_zzy"]):
                break
            if not leave_re:
                continue
            for when, text in timeline:
                if (
                    when > window_start
                    and in_destination(text)
                    and at_center(text)
                    and re.search(leave_re, text)
                ):
                    milestones["离开收件城市分拣中心时间_zzy"] = when
                    break

    return pd.Series(milestones)


# --- Main pipeline: China Post ---
def process_postal_data(zhuzhuyun_dir, anjian_dir, output_dir, base_data_file):
    """Build the China Post ("邮政") logistics-timestamp workbook.

    The merged tracking export ``邮政.xlsx`` is joined with the origin and
    destination cities from the regulator workbooks, each tracking log is
    parsed into milestone timestamps, route information from
    ``basic_data.xlsx`` is attached, and the result is written to
    ``<output_dir>/邮政_logistics_data.xlsx``.

    Args:
        zhuzhuyun_dir: Folder (``Path``) containing ``邮政.xlsx``.
        anjian_dir: Folder (``Path``) containing the regulator ``*.xlsx``
            workbooks; Excel lock files (``~$...``) are ignored.
        output_dir: Folder (``Path``) that receives the result; created when
            it does not exist yet.
        base_data_file: ``Path`` of ``basic_data.xlsx``. Optional: when it is
            missing or unreadable, a message is printed and the location maps
            / route columns stay empty.

    Returns:
        None. The function returns early (after printing a message) when the
        tracking export or the regulator workbooks are missing.
    """
    company_name = "邮政"
    company_start_time = perf_counter()
    zhuzhu_filename = "邮政.xlsx"
    internal_name = "邮政国内小包"

    # Validate both inputs before any workbook is loaded, so a missing file is
    # reported immediately instead of after reading every regulator workbook.
    company_file = zhuzhuyun_dir / zhuzhu_filename
    if not company_file.exists():
        print(f"[WARNING] 未找到文件: {zhuzhu_filename}，已跳过。")
        return
    all_anjian_files = [
        f for f in anjian_dir.glob("*.xlsx") if not f.name.startswith("~$")
    ]
    if not all_anjian_files:
        print("[ERROR] 安监数据文件夹中未找到有效的Excel文件！")
        return

    # Regulator company code -> company name used inside this pipeline.
    anjian_to_internal_map = {
        "EMS": "EMS",
        "ZGYZ": "邮政国内小包",
        "JBD": "京东",
        "YTO": "圆通",
        "STO": "申通",
        "YUNDA": "韵达",
        "JT": "极兔",
        "DEPPON": "德邦",
    }

    # --- Location lookup tables (best effort) ---
    location_maps = {
        "city_alias_map": {},
        "capital_cities_set": set(),
        "city_to_province_map": {},
    }
    if base_data_file.exists():
        try:
            # Geographic columns are forced to str so names are never parsed
            # as numbers.
            df_hierarchy = pd.read_excel(
                base_data_file,
                sheet_name="city_hierarchy",
                dtype={"Province": str, "City": str, "District": str},
            )
            # Rows without a district are kept; only province/city are required.
            df_hierarchy = df_hierarchy.dropna(subset=["Province", "City"])
            df_hierarchy["City_clean"] = df_hierarchy["City"].str.replace(
                "市", "", regex=False
            )
            df_hierarchy["District_clean"] = df_hierarchy["District"].str.replace(
                "市", "", regex=False
            )
            for city, group in df_hierarchy.groupby("City"):
                # Aliases = the city's own short name plus all its districts;
                # missing/empty values are filtered out.
                aliases = [group.iloc[0]["City_clean"]] + group[
                    "District_clean"
                ].unique().tolist()
                location_maps["city_alias_map"][city] = [
                    a for a in aliases if pd.notna(a) and a != ""
                ]
            if "IsCapital" in df_hierarchy.columns:
                df_capitals = df_hierarchy[df_hierarchy["IsCapital"] == 1]
                location_maps["capital_cities_set"] = set(df_capitals["City"].unique())
            city_to_province = df_hierarchy.drop_duplicates("City")[
                ["City", "Province"]
            ].set_index("City")
            location_maps["city_to_province_map"] = city_to_province[
                "Province"
            ].to_dict()
        except Exception as e:
            # Deliberate best effort: parsing continues with empty maps.
            print(f"[ERROR] 构建地理位置地图失败: {e}。")

    # --- Route reference data (distance and provinces per city pair) ---
    df_base_subset = None
    if base_data_file.exists():
        try:
            df_base = pd.read_excel(base_data_file, sheet_name="inter-city_routes")
            df_base["路线_std"] = (
                df_base["寄出城市"].str.replace("市", "", regex=False)
                + "-"
                + df_base["寄达城市"].str.replace("市", "", regex=False)
            )
            df_base_subset = df_base[
                ["路线_std", "公里", "寄出省份", "寄达省份"]
            ].drop_duplicates("路线_std")
        except Exception as e:
            print(f"[WARNING] 处理 'basic_data.xlsx' 出错: {e}。")

    # --- Regulator data: waybill number -> origin / destination city ---
    all_anjian_df = pd.concat(
        [
            pd.read_excel(f, dtype={"单号": str, "寄出城市": str, "寄达城市": str})
            for f in all_anjian_files
        ],
        ignore_index=True,
    )
    if "快递公司" in all_anjian_df.columns:
        all_anjian_df = all_anjian_df.rename(columns={"快递公司": "企业"})
    # Unknown company codes are kept as they are.
    all_anjian_df["企业"] = (
        all_anjian_df["企业"].map(anjian_to_internal_map).fillna(all_anjian_df["企业"])
    )
    base_info_df = all_anjian_df[["单号", "企业", "寄出城市", "寄达城市"]].copy()

    # --- Tracking export, joined with the city information ---
    df_company_zhu = pd.read_excel(company_file, dtype={"快递单号": str})
    df_company_zhu = df_company_zhu.rename(
        columns={"快递单号": "单号", "快递公司": "企业"}
    )
    # Every row of this export belongs to the postal parcel product.
    df_company_zhu["企业"] = internal_name
    df_merged = pd.merge(df_company_zhu, base_info_df, on=["单号", "企业"], how="left")

    # --- Parse every tracking log ---
    # Company-specific keywords override the defaults.
    profile = {
        **COMPANY_PROFILES_POSTAL["default"],
        **COMPANY_PROFILES_POSTAL.get(internal_name, {}),
    }
    extracted_df = extractor_apply_based(
        df_merged, profile, location_maps, parse_logistics_events_postal_perfected
    )
    df_final = pd.concat([df_merged, extracted_df], axis=1)

    # Copy the parsed "<name>_zzy" columns to their final names; a column that
    # was not produced becomes all-NaT.
    time_cols_to_map = [
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
    ]
    for col in time_cols_to_map:
        df_final[col] = df_final.get(f"{col}_zzy", pd.NaT)

    if df_base_subset is not None:
        df_final["路线_std"] = (
            df_final["寄出城市"].astype(str).str.replace("市", "", regex=False)
            + "-"
            + df_final["寄达城市"].astype(str).str.replace("市", "", regex=False)
        )
        df_final = pd.merge(df_final, df_base_subset, on="路线_std", how="left")

    # The report uses the short company name, not the internal product name.
    df_final["企业"] = company_name
    output_columns = [
        "企业",
        "单号",
        "寄出城市",
        "寄达城市",
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "公里",
        "寄出省份",
        "寄达省份",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
        "完整物流信息",
    ]
    # reindex also creates (empty) columns that are missing, e.g. the route
    # columns when basic_data.xlsx could not be read.
    final_df_to_save = df_final.reindex(columns=output_columns)

    # The cell may run standalone, before the folder was created elsewhere.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{company_name}_logistics_data.xlsx"
    final_df_to_save.to_excel(output_file, index=False)
    company_end_time = perf_counter()
    print(
        f"✅ {company_name} 处理完成，耗时 {company_end_time - company_start_time:.2f} 秒。文件已保存至: {output_file.name}"
    )


# --- Entry point: China Post ---
# Reads the merged tracking exports and regulator workbooks, writes the
# result into the "4_logistics数据" folder.
process_postal_data(
    zhuzhuyun_dir=zhuzhuyun_merge_path,
    anjian_dir=anjian_data_path,
    output_dir=pycharm_input_path,
    base_data_file=base_data_path,
)

--- [开始] 处理 邮政 ---
  -> 使用 'parse_logistics_events_postal_perfected' 解析器通过 .apply() 运行...
✅ 邮政 处理完成，耗时 203.61 秒。文件已保存至: 邮政_logistics_data_TEST.xlsx


In [None]:
# --------------------------------------------------
# Cell 4.3: 京东logistics数据提取（定稿版）
# --------------------------------------------------
import re
import warnings
from pathlib import Path
from time import perf_counter

import pandas as pd

# --- Standalone bootstrap (only matters when this cell is run on its own) ---
warnings.filterwarnings(action="ignore", category=UserWarning, module="openpyxl")
print("--- [开始] 处理 京东 ---")

# --- Project paths (same folder layout as the one configured in Cell 1) ---
base_path = Path.cwd()
report_path = Path(base_path, "报告数据")
input_path = Path(report_path, "输入")
temp_path = Path(report_path, "temp")
# Inputs: regulator workbooks and the city / route reference workbook.
anjian_data_path = Path(input_path, "安监数据")
base_data_path = Path(input_path, "basic_data.xlsx")
# Intermediate folders: merged tracking exports in, extracted timestamps out.
zhuzhuyun_merge_path = Path(temp_path, "3_猪猪云合并数据")
pycharm_input_path = Path(temp_path, "4_logistics数据")


# --- 通用辅助函数 ---
def extractor_apply_based(
    df: pd.DataFrame, profile: dict, location_maps: dict, parser_func
) -> pd.DataFrame:
    """Parse every row of ``df`` with ``parser_func`` via ``DataFrame.apply``.

    ``parser_func`` is called as ``parser_func(row, profile, location_maps)``
    for each row. An empty input short-circuits to an empty DataFrame without
    printing the progress message.
    """
    has_rows = not df.empty
    if has_rows:
        print(f"  -> 使用 '{parser_func.__name__}' 解析器通过 .apply() 运行...")
        parsed = df.apply(
            lambda record: parser_func(record, profile, location_maps), axis=1
        )
        return parsed
    return pd.DataFrame()


# --- Keyword profiles: JD ---
# Regex alternations read by parse_logistics_events_jd. process_jd_data
# overlays the "京东" entry on top of "default"; the "京东" entry is empty, so
# the defaults are used unchanged.
COMPANY_PROFILES_JD = {
    "default": {
        # Pickup lines -> collection time.
        "collect": r"揽收|已收取|揽件|已取件|揽收完成",
        # Out-for-delivery lines -> delivery time.
        "delivery": r"派送中|派件员|为您派送|正在为您派件|已安排派送",
        # Departure / arrival wording.
        "leave": r"离开|已发出|发往",
        "arrive": r"到达|抵达",
        # "center" and "exclude" are matched against the text inside the first
        # 【...】 of a line: a location is a sorting centre when it matches
        # "center" and does not match "exclude".
        "center": r"处理中心|分拨中心|转运场|运营区|枢纽中心|集散中心|分拣中心|接货仓",
        "exclude": r"揽投部|营业部|经营分部|网点|集货点|营销中心|项目营业点|校园服务站|接驳点",
        # Sign-off tiers, tried in order p1 -> p2 -> fallback; within a tier the
        # parser takes the latest matching line.
        "sign_p1": r"已派送至|已投至|已到站|自提柜|智能柜|菜鸟驿站|邮政营业网点|已送达至|已送至|校园服务站|快递柜|云柜|丰巢柜|便民驿站",
        # NOTE(review): "已送达至" is also part of sign_p1, which is tried first,
        # so that alternative can never be the one that matches here.
        "sign_p2": r"本人签收|他人代签收|已代收|放至家门口|指定位置签收|已送达至|已由.*?代收|已由.*?签收",
        "sign_fallback": r"已签收|已妥投|已妥收|已送达",
        # Lines matching this pattern are skipped when searching for sign-off.
        "sign_ignore": r"完成取件",
    },
    # No JD-specific overrides.
    "京东": {},
}


# --- Parser: JD ---
def parse_logistics_events_jd(
    row: pd.Series, profile: dict, location_maps: dict
) -> pd.Series:
    """Extract milestone timestamps from one JD tracking log.

    Every log line carrying a ``YYYY-MM-DD HH:MM:SS`` stamp becomes an event.
    The text inside the first ``【...】`` of a line is taken as the event's
    location; the location is a sorting centre when it matches
    ``profile["center"]`` and does not match ``profile["exclude"]``.

    Args:
        row: Record with ``完整物流信息`` (tracking text), ``寄出城市`` (origin
            city) and ``寄达城市`` (destination city).
        profile: Keyword regexes (``collect``, ``delivery``, ``leave``,
            ``arrive``, ``center``, ``exclude``, ``sign_p1``, ``sign_p2``,
            ``sign_fallback``, ``sign_ignore``). A missing key disables the
            rules that depend on it instead of raising.
        location_maps: Must provide ``city_alias_map`` and
            ``city_to_province_map``; ``province_capital_map`` is optional.

    Returns:
        A Series with the seven ``*_zzy`` timestamp fields; a field stays
        ``NaT`` when no line qualifies.
    """
    log_text = row["完整物流信息"]
    sender_city_key, dest_city_key = row["寄出城市"], row["寄达城市"]
    time_cols = [
        "揽收时间_zzy",
        "离开寄件城市时间_zzy",
        "到达收件城市时间_zzy",
        "派送时间_zzy",
        "签收时间_zzy",
        "到达分拣中心时间_zzy",
        "离开收件城市分拣中心时间_zzy",
    ]
    extracted_times = {col: pd.NaT for col in time_cols}
    if not isinstance(log_text, str) or not log_text.strip():
        return pd.Series(extracted_times)

    kw = {
        key: profile.get(key)
        for key in (
            "collect",
            "delivery",
            "leave",
            "arrive",
            "center",
            "exclude",
            "sign_p1",
            "sign_p2",
            "sign_fallback",
            "sign_ignore",
        )
    }

    def matches(pattern, text):
        # An unset/empty pattern never matches (and never reaches re.search,
        # which would raise TypeError on None).
        return bool(pattern) and re.search(pattern, text) is not None

    # --- Build the chronological event list ---
    datetime_capture_pattern = r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
    all_events = []
    for line in log_text.strip().split("\n"):
        stamp = re.search(datetime_capture_pattern, line)
        if not stamp:
            continue
        dt = pd.to_datetime(stamp.group(1), errors="coerce")
        if pd.isna(dt):
            continue
        bracket = re.search(r"【(.*?)】", line)
        location_name = bracket.group(1) if bracket else ""
        is_center = (
            bracket is not None
            and matches(kw["center"], location_name)
            and not matches(kw["exclude"], location_name)
        )
        all_events.append(
            {"dt": dt, "line": line, "is_center": is_center, "location": location_name}
        )
    if not all_events:
        return pd.Series(extracted_times)
    all_events.sort(key=lambda e: e["dt"])

    def get_location_pattern(city_key, location_maps):
        # Regex alternation of the city's aliases plus its province name;
        # None for a missing city, "" when no usable alias exists.
        if not city_key or not isinstance(city_key, str):
            return None
        aliases = set(
            location_maps["city_alias_map"].get(city_key, [city_key.replace("市", "")])
        )
        province = location_maps["city_to_province_map"].get(city_key)
        if province:
            aliases.add(province.replace("省", "").replace("市", ""))
            if location_maps.get("province_capital_map", {}).get(province) == city_key:
                aliases.add(province.replace("省", ""))
        return "|".join(map(re.escape, {a for a in aliases if a}))

    sender_pattern = get_location_pattern(sender_city_key, location_maps)
    dest_pattern = get_location_pattern(dest_city_key, location_maps)

    # 1. Collection: first pickup line, else the very first event.
    for event in all_events:
        if matches(kw["collect"], event["line"]):
            extracted_times["揽收时间_zzy"] = event["dt"]
            break
    if pd.isna(extracted_times["揽收时间_zzy"]):
        extracted_times["揽收时间_zzy"] = all_events[0]["dt"]

    # 2. Delivery: first out-for-delivery line.
    for event in all_events:
        if matches(kw["delivery"], event["line"]):
            extracted_times["派送时间_zzy"] = event["dt"]
            break

    # 3. Sign-off: tiers in priority order; within a tier the LATEST line wins
    #    and lines matching sign_ignore are skipped.
    for p_key in ("sign_p1", "sign_p2", "sign_fallback"):
        if pd.notna(extracted_times["签收时间_zzy"]):
            break
        if not kw[p_key]:
            continue
        for event in reversed(all_events):
            if matches(kw["sign_ignore"], event["line"]):
                continue
            if re.search(kw[p_key], event["line"]):
                extracted_times["签收时间_zzy"] = event["dt"]
                break

    # 4. Origin side: the latest departure from an origin-area centre whose
    #    "发往【...】" target lies outside the origin area, plus the arrival at
    #    that same centre.
    collect_time = extracted_times["揽收时间_zzy"]
    if pd.notna(collect_time) and sender_pattern:
        origin_center_events = [
            e
            for e in all_events
            if e["dt"] > collect_time
            and e["is_center"]
            and re.search(sender_pattern, e["location"])
        ]
        for event in reversed(origin_center_events):
            if not matches(kw["leave"], event["line"]):
                continue
            onward = re.search(r"发往【(.*?)】", event["line"])
            if not onward or re.search(sender_pattern, onward.group(1)):
                # No target given, or the parcel stays inside the origin area.
                continue
            extracted_times["离开寄件城市时间_zzy"] = event["dt"]

            # event["location"] is the first bracketed text of the line, i.e.
            # the centre being left.
            hub_name = re.escape(event["location"])
            for arr_event in origin_center_events:
                if (
                    arr_event["dt"] <= event["dt"]
                    and re.search(hub_name, arr_event["line"])
                    # Guarded: a profile without "arrive" must not raise.
                    and matches(kw["arrive"], arr_event["line"])
                ):
                    extracted_times["到达分拣中心时间_zzy"] = arr_event["dt"]
                    break
            break

    # 5. Destination side.
    if dest_pattern:
        # A. Arrival in the destination city, step 1: an explicit arrival at a
        #    non-centre location whose name equals the destination city.
        for event in all_events:
            if not event["is_center"] and matches(kw["arrive"], event["line"]):
                location_name = event["location"].replace("市", "")
                if dest_city_key and location_name == dest_city_key.replace("市", ""):
                    extracted_times["到达收件城市时间_zzy"] = event["dt"]
                    break

        # Step 2 (fallback): first arrival at a centre in the destination area.
        if pd.isna(extracted_times["到达收件城市时间_zzy"]):
            for event in all_events:
                if (
                    event["is_center"]
                    and matches(kw["arrive"], event["line"])
                    and re.search(dest_pattern, event["location"])
                ):
                    extracted_times["到达收件城市时间_zzy"] = event["dt"]
                    break

        # B. Leaving the destination centre: first departure from a
        #    destination-area centre before delivery and not before arrival.
        delivery_time = extracted_times["派送时间_zzy"]
        if pd.notna(delivery_time):
            arrival_time = extracted_times["到达收件城市时间_zzy"]
            dest_center_events = [
                e
                for e in all_events
                if e["dt"] < delivery_time
                and e["is_center"]
                and re.search(dest_pattern, e["location"])
            ]
            for event in dest_center_events:
                if pd.notna(arrival_time) and event["dt"] < arrival_time:
                    continue
                if matches(kw["leave"], event["line"]):
                    extracted_times["离开收件城市分拣中心时间_zzy"] = event["dt"]
                    break

    return pd.Series(extracted_times)


# --- Main pipeline: JD ---
def process_jd_data(zhuzhuyun_dir, anjian_dir, output_dir, base_data_file):
    """Build the JD ("京东") logistics-timestamp workbook.

    The merged tracking export ``京东.xlsx`` is joined with the origin and
    destination cities from the regulator workbooks, rows without tracking
    text are dropped, each tracking log is parsed into milestone timestamps,
    route information from ``basic_data.xlsx`` is attached, and the result is
    written to ``<output_dir>/京东_logistics_data.xlsx``.

    Args:
        zhuzhuyun_dir: Folder (``Path``) containing ``京东.xlsx``.
        anjian_dir: Folder (``Path``) containing the regulator ``*.xlsx``
            workbooks; Excel lock files (``~$...``) are ignored.
        output_dir: Folder (``Path``) that receives the result; created when
            it does not exist yet.
        base_data_file: ``Path`` of ``basic_data.xlsx``. Optional: when it is
            missing or unreadable, a message is printed and the location maps
            / route columns stay empty.

    Returns:
        None. The function returns early (after printing a message) when the
        tracking export or the regulator workbooks are missing.
    """
    company_name = "京东"
    zhuzhu_filename = "京东.xlsx"
    internal_name = "京东"
    anjian_map_key = "JBD"
    company_start_time = perf_counter()

    # Validate both inputs before any workbook is loaded.
    company_file = zhuzhuyun_dir / zhuzhu_filename
    if not company_file.exists():
        print(f"[WARNING] 未找到文件: {zhuzhu_filename}，已跳过。")
        return
    # Skip Excel lock files ("~$name.xlsx"); they are not readable workbooks.
    all_anjian_files = [
        f for f in anjian_dir.glob("*.xlsx") if not f.name.startswith("~$")
    ]
    if not all_anjian_files:
        # pd.concat raises ValueError on an empty list, so stop here instead.
        print("[ERROR] 安监数据文件夹中未找到有效的Excel文件！")
        return

    # --- Location lookup tables (best effort) ---
    location_maps = {
        "city_alias_map": {},
        "capital_cities_set": set(),
        "city_to_province_map": {},
        "province_capital_map": {},
    }
    if base_data_file.exists():
        try:
            # The context manager closes the workbook handle after reading.
            with pd.ExcelFile(base_data_file) as excel_file:
                df_hierarchy = pd.read_excel(
                    excel_file,
                    sheet_name="city_hierarchy",
                    dtype={"Province": str, "City": str, "District": str},
                )

            # .copy(): the columns added below must land on an independent
            # frame, not on a view of df_hierarchy.
            df_hierarchy_no_na = df_hierarchy.dropna(
                subset=["Province", "City"]
            ).copy()
            df_hierarchy_no_na["District"] = df_hierarchy_no_na["District"].fillna("")
            df_hierarchy_no_na["City_clean"] = df_hierarchy_no_na["City"].str.replace(
                "市", "", regex=False
            )
            df_hierarchy_no_na["District_clean"] = df_hierarchy_no_na[
                "District"
            ].str.replace("市", "", regex=False)
            for city, group in df_hierarchy_no_na.groupby("City"):
                # Aliases = the city's own short name plus all its districts.
                aliases = [group.iloc[0]["City_clean"]] + group[
                    "District_clean"
                ].unique().tolist()
                location_maps["city_alias_map"][city] = [a for a in aliases if a]

            city_to_province = df_hierarchy_no_na.drop_duplicates("City")[
                ["City", "Province"]
            ].set_index("City")
            location_maps["city_to_province_map"] = city_to_province[
                "Province"
            ].to_dict()

            if "IsCapital" in df_hierarchy.columns:
                # One capital per province (first flagged row wins).
                df_capitals = df_hierarchy[df_hierarchy["IsCapital"] == 1].copy()
                df_capitals = df_capitals.drop_duplicates(subset=["Province"])
                location_maps["province_capital_map"] = df_capitals.set_index(
                    "Province"
                )["City"].to_dict()
                location_maps["capital_cities_set"] = set(df_capitals["City"].unique())
        except Exception as e:
            # Deliberate best effort: parsing continues with whatever was built.
            print(f"[ERROR] 构建地理位置地图失败: {e}。")

    # --- Route reference data (distance and provinces per city pair) ---
    df_base_subset = None
    if base_data_file.exists():
        try:
            df_base = pd.read_excel(base_data_file, sheet_name="inter-city_routes")
            df_base["路线_std"] = (
                df_base["寄出城市"].str.replace("市", "", regex=False)
                + "-"
                + df_base["寄达城市"].str.replace("市", "", regex=False)
            )
            df_base_subset = df_base[
                ["路线_std", "公里", "寄出省份", "寄达省份"]
            ].drop_duplicates("路线_std")
        except Exception as e:
            print(f"[WARNING] 处理 'basic_data.xlsx' 出错: {e}。")

    # --- Regulator data: waybill number -> origin / destination city ---
    all_anjian_df = pd.concat(
        [pd.read_excel(f, dtype={"单号": str}) for f in all_anjian_files],
        ignore_index=True,
    )
    if "快递公司" in all_anjian_df.columns:
        all_anjian_df = all_anjian_df.rename(columns={"快递公司": "企业"})
    # Translate the regulator's company code; other values are kept as-is.
    anjian_to_internal_map = {anjian_map_key: internal_name}
    all_anjian_df["企业"] = (
        all_anjian_df["企业"].map(anjian_to_internal_map).fillna(all_anjian_df["企业"])
    )
    base_info_df = all_anjian_df[["单号", "企业", "寄出城市", "寄达城市"]].copy()

    # --- Tracking export, joined with the city information ---
    df_company_zhu = pd.read_excel(company_file, dtype={"快递单号": str})
    df_company_zhu = df_company_zhu.rename(
        columns={"快递单号": "单号", "快递公司": "企业"}
    )
    df_merged = pd.merge(df_company_zhu, base_info_df, on=["单号", "企业"], how="left")

    # Rows without tracking text cannot be parsed.
    initial_rows = len(df_merged)
    df_merged = df_merged.dropna(subset=["完整物流信息"])
    rows_dropped = initial_rows - len(df_merged)
    if rows_dropped > 0:
        print(f"  -> 已删除 {rows_dropped} 行缺少'完整物流信息'的数据。")

    # --- Parse every tracking log ---
    # Company-specific keywords override the defaults.
    profile = {
        **COMPANY_PROFILES_JD["default"],
        **COMPANY_PROFILES_JD.get(internal_name, {}),
    }
    extracted_df = extractor_apply_based(
        df_merged, profile, location_maps, parse_logistics_events_jd
    )
    # Both frames share df_merged's index, so the column-wise concat aligns.
    df_final = pd.concat([df_merged, extracted_df], axis=1)

    # Copy the parsed "<name>_zzy" columns to their final names; a column that
    # was not produced becomes all-NaT.
    time_cols_to_map = [
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
    ]
    for col in time_cols_to_map:
        df_final[col] = df_final.get(f"{col}_zzy", pd.NaT)

    if df_base_subset is not None:
        df_final["路线_std"] = (
            df_final["寄出城市"].astype(str).str.replace("市", "", regex=False)
            + "-"
            + df_final["寄达城市"].astype(str).str.replace("市", "", regex=False)
        )
        df_final = pd.merge(df_final, df_base_subset, on="路线_std", how="left")

    df_final["企业"] = company_name
    output_columns = [
        "企业",
        "单号",
        "寄出城市",
        "寄达城市",
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "公里",
        "寄出省份",
        "寄达省份",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
        "完整物流信息",
    ]
    # reindex also creates (empty) columns that are missing, e.g. the route
    # columns when basic_data.xlsx could not be read.
    final_df_to_save = df_final.reindex(columns=output_columns)

    # The cell may run standalone, before the folder was created elsewhere.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{company_name}_logistics_data.xlsx"
    final_df_to_save.to_excel(output_file, index=False)
    company_end_time = perf_counter()
    print(
        f"✅ {company_name} 处理完成，耗时 {company_end_time - company_start_time:.2f} 秒。文件已保存至: {output_file.name}"
    )


# --- Entry point: JD ---
# Reads the merged tracking exports and regulator workbooks, writes the
# result into the "4_logistics数据" folder.
process_jd_data(
    zhuzhuyun_dir=zhuzhuyun_merge_path,
    anjian_dir=anjian_data_path,
    output_dir=pycharm_input_path,
    base_data_file=base_data_path,
)

--- [开始] 处理 京东 ---
  -> 已删除 3082 行缺少'完整物流信息'的数据。
  -> 使用 'parse_logistics_events_jd' 解析器通过 .apply() 运行...
✅ 京东 处理完成，耗时 185.99 秒。文件已保存至: 京东_logistics_data_TEST.xlsx


In [None]:
# --------------------------------------------------
# Cell 4.4: 极兔logistics数据提取（定稿版）
# --------------------------------------------------
import re
import warnings
from pathlib import Path
from time import perf_counter

import pandas as pd

# --- Initialisation (only needed when this cell is run on its own) ---
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
print("--- [开始] 处理 极兔 ---")

# --- Project paths ---
# Unlike the postal and JD cells, this cell does not re-create the path
# variables: the assignments below are commented out, so the paths must
# already have been defined by an earlier cell (Cell 1 sets them up).
# base_path = Path.cwd()
# report_path = base_path / "报告数据"
# input_path = report_path / "输入"
# anjian_data_path = input_path / "安监数据"
# base_data_path = input_path / "basic_data.xlsx"
# temp_path = report_path / "temp"
# zhuzhuyun_merge_path = temp_path / "3_猪猪云合并数据"
# pycharm_input_path = temp_path / "4_logistics数据"


# --- 通用辅助函数 ---
def extractor_apply_based(
    df: pd.DataFrame, profile: dict, location_maps: dict, parser_func
) -> pd.DataFrame:
    """Generic row-wise extractor.

    Calls ``parser_func(row, profile, location_maps)`` for each row of ``df``
    through ``DataFrame.apply(axis=1)``. Returns an empty DataFrame, without
    printing anything, when ``df`` has no data.
    """
    if df.empty:
        return pd.DataFrame()

    parser_label = parser_func.__name__
    print(f"  -> 使用 '{parser_label}' 解析器通过 .apply() 运行...")

    def _run_parser(entry):
        return parser_func(entry, profile, location_maps)

    return df.apply(_run_parser, axis=1)


# --- Keyword profiles: J&T (极兔) ---
# Regex alternations for classifying J&T tracking-log lines.
# parse_logistics_events_jitu reads the keys collect, delivery, leave, arrive,
# sign_p1, sign_p2, sign_fallback, sign_ignore, center, center_exclude and
# dest_arrival_exclude from the profile it is given.
# NOTE(review): presumably the "极兔" entry is overlaid on "default" the same
# way the other company cells do it — confirm in the J&T main pipeline.
COMPANY_PROFILES_JITU = {
    "default": {
        # Pickup / out-for-delivery wording.
        "collect": r"揽收|已收取|揽件|已取件",
        "delivery": r"派送中|派件员|为您派送|正在为您派件|已安排派送",
        # Departure / arrival wording.
        "leave": r"离开|已发出|发往",
        "arrive": r"到达|抵达",
        # Sorting-centre names.
        "center": r"处理中心|分拨中心|转运场|运营区|枢纽中心|集散中心|分拣中心",
        # Sign-off keyword tiers (p1, p2, fallback).
        "sign_p1": r"已暂存至|已投至|已到站|自提柜|智能柜|菜鸟驿站|快递超市|代收点|存放.*(快递柜|驿站|自提点)",
        "sign_p2": r"本人签收|他人代签收|已代收|放至家门口|指定位置签收|已上门",
        "sign_fallback": r"已签收|已妥投|已妥收|已投递",
    },
    "极兔": {
        # J&T-specific centre names.
        "center": r"转运中心|集散中心|集散点",
        # The parser reads these two keys with "" as the default when absent.
        "center_exclude": r"公司|分部|服务部|网格仓|揽投部|营业部|经营分部|网点|集货点|营销中心|站点",
        "dest_arrival_exclude": r"服务部|网点",
        # J&T-specific sign-off wording.
        "sign_p1": r"已存放至.*?【|已送达.*?【|暂由.*?代为保管",
        "sign_p2": r"已按址投递|已由本人签收|客户签收",
        "sign_fallback": r"快件已签收|快件已按约定投递",
        # Neither entry defines "sign_ignore", so profile.get("sign_ignore")
        # yields None for a profile built from these two entries.
    },
}


# --- 解析器: 极兔 (已应用最终修复) ---
def parse_logistics_events_jitu(
    row: pd.Series, profile: dict, location_maps: dict
) -> pd.Series:
    """Extract the key J&T (极兔) milestone timestamps from one tracking log.

    Args:
        row: A record providing "完整物流信息" (the multi-line tracking text),
            "寄出城市" (sender city) and "寄达城市" (destination city).
        profile: Mapping of rule name to regex string. Keys read here:
            collect, delivery, leave, arrive, center, center_exclude,
            dest_arrival_exclude, sign_p1, sign_p2, sign_fallback, sign_ignore.
            Any missing rule simply disables the step that needs it.
        location_maps: May contain "city_alias_map" (city -> list of aliases);
            when a city is absent, the city name without "市" is used.

    Returns:
        A Series with the seven ``*_zzy`` timestamp fields; fields that could
        not be determined are ``pd.NaT``.
    """
    log_text = row["完整物流信息"]
    sender_city_key, dest_city_key = row["寄出城市"], row["寄达城市"]

    time_cols = [
        "揽收时间_zzy",
        "离开寄件城市时间_zzy",
        "到达收件城市时间_zzy",
        "派送时间_zzy",
        "签收时间_zzy",
        "到达分拣中心时间_zzy",
        "离开收件城市分拣中心时间_zzy",
    ]
    extracted_times = {col: pd.NaT for col in time_cols}

    if not isinstance(log_text, str) or not log_text.strip():
        return pd.Series(extracted_times)

    # Load the keyword rules from the profile.
    kw_collect, kw_delivery = profile.get("collect"), profile.get("delivery")
    kw_leave, kw_arrive = profile.get("leave"), profile.get("arrive")
    kw_sign_p1 = profile.get("sign_p1")
    kw_sign_p2 = profile.get("sign_p2")
    kw_sign_fallback = profile.get("sign_fallback")
    kw_sign_ignore = profile.get("sign_ignore")
    kw_all_centers = profile.get("center")
    kw_center_exclude = profile.get("center_exclude", "")
    kw_dest_arrival_exclude = profile.get("dest_arrival_exclude", "")

    # Pre-process the log: keep only lines carrying a parseable timestamp.
    datetime_capture_pattern = r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
    # Everything from the first "；" / "发往" onwards describes the next hop,
    # so only the text before it ("action") is matched for location rules.
    action_split_pattern = r"；|已发往|发往"
    all_events = []
    for line in log_text.strip().split("\n"):
        match = re.search(datetime_capture_pattern, line)
        if not match:
            continue
        dt = pd.to_datetime(match.group(1), errors="coerce")
        if pd.notna(dt):
            all_events.append(
                {
                    "dt": dt,
                    "line": line,
                    "action": re.split(action_split_pattern, line)[0],
                }
            )
    if not all_events:
        return pd.Series(extracted_times)
    all_events.sort(key=lambda x: x["dt"])

    def get_location_pattern(city_key, location_maps):
        """Return a regex alternation of the city's aliases, or None."""
        if not city_key or not isinstance(city_key, str):
            return None
        aliases = location_maps.get("city_alias_map", {}).get(
            city_key, [city_key.replace("市", "")]
        )
        clean_aliases = {
            str(a)
            for a in aliases
            if a is not None and pd.notna(a) and str(a).strip() != ""
        }
        if not clean_aliases:
            return None
        return "|".join(map(re.escape, clean_aliases))

    def is_real_center(action_part):
        """True when the text names a sorting centre that is not excluded."""
        # Guarding on kw_all_centers avoids re.search(None, ...) raising
        # TypeError when the profile defines no "center" rule.
        if not kw_all_centers or not re.search(kw_all_centers, action_part):
            return False
        return not (kw_center_exclude and re.search(kw_center_exclude, action_part))

    def find_sign_event(pattern, events):
        """Return the first event in ``events`` matching ``pattern``."""
        if not pattern:
            return None
        for event in events:
            if kw_sign_ignore and re.search(kw_sign_ignore, event["line"]):
                continue
            if re.search(pattern, event["line"]):
                return event
        return None

    # Collection time: earliest matching event.
    for event in all_events:
        if kw_collect and re.search(kw_collect, event["line"]):
            extracted_times["揽收时间_zzy"] = event["dt"]
            break

    # Delivery (out-for-delivery) time: earliest matching event.
    for event in all_events:
        if kw_delivery and re.search(kw_delivery, event["line"]):
            extracted_times["派送时间_zzy"] = event["dt"]
            break

    # Sign-off time by priority: P1 uses the earliest match, P2 and the
    # fallback use the latest match.
    sign_event = (
        find_sign_event(kw_sign_p1, all_events)
        or find_sign_event(kw_sign_p2, reversed(all_events))
        or find_sign_event(kw_sign_fallback, reversed(all_events))
    )
    if sign_event:
        extracted_times["签收时间_zzy"] = sign_event["dt"]

    # City / sorting-centre related times.
    sender_pattern = get_location_pattern(sender_city_key, location_maps)
    dest_pattern = get_location_pattern(dest_city_key, location_maps)

    # Sender city: last centre departure and last centre arrival after collection.
    collect_time = extracted_times["揽收时间_zzy"]
    if pd.notna(collect_time) and sender_pattern and kw_all_centers:
        last_sender_leave_event = None
        last_sender_center_arrival = None

        for event in all_events:
            if not event["dt"] > collect_time:
                continue
            action_part = event["action"]
            if not re.search(sender_pattern, action_part):
                continue
            if not is_real_center(action_part):
                continue
            if kw_leave and re.search(kw_leave, action_part):
                last_sender_leave_event = event
            if kw_arrive and re.search(kw_arrive, action_part):
                last_sender_center_arrival = event

        if last_sender_leave_event:
            extracted_times["离开寄件城市时间_zzy"] = last_sender_leave_event["dt"]
        if last_sender_center_arrival:
            extracted_times["到达分拣中心时间_zzy"] = last_sender_center_arrival["dt"]

    # Destination city.
    if dest_pattern:
        # First arrival in the destination city (looser exclusion list).
        for event in all_events:
            action_part = event["action"]
            if (
                kw_arrive
                and re.search(kw_arrive, action_part)
                and re.search(dest_pattern, action_part)
                and not (
                    kw_dest_arrival_exclude
                    and re.search(kw_dest_arrival_exclude, action_part)
                )
            ):
                extracted_times["到达收件城市时间_zzy"] = event["dt"]
                break

        # Latest departure from a destination-city sorting centre that
        # happened strictly before the delivery event.
        if pd.notna(extracted_times["派送时间_zzy"]):
            search_end_time = extracted_times["派送时间_zzy"]
            for event in reversed(all_events):
                if not event["dt"] < search_end_time:
                    continue
                action_part = event["action"]
                if (
                    kw_leave
                    and re.search(kw_leave, action_part)
                    and re.search(dest_pattern, action_part)
                    and is_real_center(action_part)
                ):
                    extracted_times["离开收件城市分拣中心时间_zzy"] = event["dt"]
                    break

    return pd.Series(extracted_times)


# --- 主流程: 极兔 (逻辑不变) ---
def process_jitu_data(zhuzhuyun_dir, anjian_dir, output_dir, base_data_file):
    """Build the J&T (极兔) logistics milestone workbook.

    Steps: load the city alias map and the inter-city route table from
    ``base_data_file``, load every safety-supervision (安监) workbook to get
    sender/destination cities per waybill, join them onto the merged
    Zhuzhuyun export, parse each tracking log and write the result to Excel.

    Args:
        zhuzhuyun_dir: Directory holding the merged Zhuzhuyun file "极兔.xlsx".
        anjian_dir: ``Path`` of the directory holding the 安监 ``*.xlsx`` files.
        output_dir: Directory that receives "极兔_logistics_data.xlsx".
        base_data_file: ``Path`` of ``basic_data.xlsx`` (sheets
            "city_hierarchy" and "inter-city_routes").

    Returns:
        None. Problems are reported with ``print`` and end the run early.
    """
    company_name = "极兔"
    company_start_time = perf_counter()

    # Use the directories supplied by the caller rather than notebook globals.
    zhuzhuyun_dir = Path(zhuzhuyun_dir)
    output_dir = Path(output_dir)

    config = {
        "zhuzhu_filename": "极兔.xlsx",
        "internal_name": "极兔",
        "parser": parse_logistics_events_jitu,
    }

    COMPANY_CONFIG = {"极兔": {"internal_name": "极兔", "anjian_map_key": "JT"}}

    # --- City alias map (city -> [city without "市", cleaned district names]) ---
    location_maps = {}
    if base_data_file.exists():
        try:
            df_hierarchy = pd.read_excel(
                base_data_file,
                sheet_name="city_hierarchy",
                dtype={"Province": str, "City": str, "District": str},
            )
            df_hierarchy = df_hierarchy.dropna(subset=["Province", "City"])
            df_hierarchy["City_clean"] = df_hierarchy["City"].str.replace(
                "市", "", regex=False
            )

            # District cleaning strips "区|县|自治州" but deliberately keeps
            # "市", so that names such as "乌市" are not damaged.
            df_hierarchy["District_clean"] = df_hierarchy["District"].str.replace(
                r"区|县|自治州", "", regex=True
            )

            location_maps["city_alias_map"] = {
                city: [group.iloc[0]["City_clean"]]
                + group["District_clean"].unique().tolist()
                for city, group in df_hierarchy.groupby("City")
            }
        except Exception as e:
            # Best effort: the parser falls back to the bare city name.
            print(f"[ERROR] 构建地理位置地图失败: {e}。")

    # --- Route table (distance and provinces per standardised route) ---
    df_base_subset = None
    if base_data_file.exists():
        try:
            df_base = pd.read_excel(base_data_file, sheet_name="inter-city_routes")
            df_base["路线_std"] = (
                df_base["寄出城市"].str.replace("市", "", regex=False)
                + "-"
                + df_base["寄达城市"].str.replace("市", "", regex=False)
            )
            df_base_subset = df_base[
                ["路线_std", "公里", "寄出省份", "寄达省份"]
            ].drop_duplicates("路线_std")
        except Exception as e:
            print(f"[WARNING] 处理 'basic_data.xlsx' 出错: {e}。")

    # --- 安监 data: sender / destination city per waybill ---
    all_anjian_files = [
        f for f in anjian_dir.glob("*.xlsx") if not f.name.startswith("~$")
    ]
    if not all_anjian_files:
        print("[ERROR] 安监数据文件夹中未找到有效的Excel文件！")
        return
    all_anjian_df = pd.concat(
        [pd.read_excel(f, dtype={"单号": str}) for f in all_anjian_files],
        ignore_index=True,
    )
    if "快递公司" in all_anjian_df.columns and "企业" in all_anjian_df.columns:
        # Assign the result back: an in-place fillna on a column selection is
        # chained assignment and does not reliably update the frame.
        all_anjian_df["企业"] = all_anjian_df["企业"].fillna(
            all_anjian_df["快递公司"]
        )
        all_anjian_df = all_anjian_df.drop(columns=["快递公司"])
    elif "快递公司" in all_anjian_df.columns:
        all_anjian_df = all_anjian_df.rename(columns={"快递公司": "企业"})

    anjian_to_internal_map = {
        v["anjian_map_key"]: v["internal_name"]
        for v in COMPANY_CONFIG.values()
        if "anjian_map_key" in v
    }
    if "企业" in all_anjian_df.columns:
        company_stripped = all_anjian_df["企业"].str.strip()
        # Map carrier codes to internal names; unknown values are kept as-is.
        all_anjian_df["企业"] = company_stripped.map(anjian_to_internal_map).fillna(
            company_stripped
        )
        all_anjian_df["单号"] = all_anjian_df["单号"].str.strip()
    else:
        print("[ERROR] 安监数据中未找到 '企业' 或 '快递公司' 列，无法匹配寄送信息！")
        return
    base_info_df = all_anjian_df[["单号", "企业", "寄出城市", "寄达城市"]].copy()

    # --- Merged Zhuzhuyun export for this carrier ---
    company_file = zhuzhuyun_dir / config["zhuzhu_filename"]
    if not company_file.exists():
        print(f"[WARNING] 未找到文件: {config['zhuzhu_filename']}，已跳过。")
        return

    df_company_zhu = pd.read_excel(company_file, dtype={"快递单号": str})
    df_company_zhu = df_company_zhu.rename(columns={"快递单号": "单号"})
    df_company_zhu["企业"] = config["internal_name"]
    df_company_zhu["单号"] = df_company_zhu["单号"].str.strip()
    df_merged = pd.merge(df_company_zhu, base_info_df, on=["单号", "企业"], how="left")

    # --- Effective profile: carrier rules are prepended to the defaults,
    # except the two exclusion lists, which are taken from the carrier as-is.
    profile = COMPANY_PROFILES_JITU["default"].copy()
    company_specific_profile = COMPANY_PROFILES_JITU.get(config["internal_name"], {})
    for key, value in company_specific_profile.items():
        if (
            key in profile
            and value
            and key not in ["center_exclude", "dest_arrival_exclude"]
        ):
            profile[key] = f"{value}|{profile[key]}"
        elif value:
            profile[key] = value

    extracted_df = extractor_apply_based(
        df_merged, profile, location_maps, config["parser"]
    )
    df_final = pd.concat([df_merged, extracted_df], axis=1)

    # Copy the parsed "*_zzy" columns to the final column names.
    time_cols_to_map = [
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
    ]
    for col in time_cols_to_map:
        df_final[col] = df_final.get(f"{col}_zzy", pd.NaT)

    if df_base_subset is not None:
        df_final["路线_std"] = (
            df_final["寄出城市"].astype(str).str.replace("市", "", regex=False)
            + "-"
            + df_final["寄达城市"].astype(str).str.replace("市", "", regex=False)
        )
        df_final = pd.merge(df_final, df_base_subset, on="路线_std", how="left")

    df_final["企业"] = company_name

    output_columns = [
        "企业",
        "单号",
        "寄出城市",
        "寄达城市",
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "公里",
        "寄出省份",
        "寄达省份",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
        "完整物流信息",
    ]
    # reindex also creates any output column that is missing (filled with NaN).
    final_df_to_save = df_final.reindex(columns=output_columns)

    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{company_name}_logistics_data.xlsx"
    final_df_to_save.to_excel(output_file, index=False)

    company_end_time = perf_counter()
    print(
        f"✅ {company_name} 处理完成，耗时 {company_end_time - company_start_time:.2f} 秒。文件已保存至: {output_file.name}"
    )


# --- Run: J&T (the path variables must already be defined) ---
process_jitu_data(
    zhuzhuyun_dir=zhuzhuyun_merge_path,
    anjian_dir=anjian_data_path,
    output_dir=pycharm_input_path,
    base_data_file=base_data_path,
)

--- [开始] 处理 极兔 ---
  -> 使用 'parse_logistics_events_jitu' 解析器通过 .apply() 运行...
✅ 极兔 处理完成，耗时 90.55 秒。文件已保存至: 极兔_logistics_data_TEST.xlsx


In [None]:
# --------------------------------------------------
# Cell 4.5: 韵达logistics数据提取(定稿版)
# --------------------------------------------------
import re
import warnings
from pathlib import Path
from time import perf_counter

import pandas as pd

# --- Initialisation (only required when this cell is executed on its own) ---
warnings.filterwarnings(action="ignore", category=UserWarning, module="openpyxl")
print("--- [开始] 处理 韵达 ---")

# --- Project paths (same layout as the one set up in Cell 1) ---
base_path = Path.cwd()
report_path = base_path.joinpath("报告数据")
input_path = report_path.joinpath("输入")
temp_path = report_path.joinpath("temp")
anjian_data_path = input_path.joinpath("安监数据")
base_data_path = input_path.joinpath("basic_data.xlsx")
zhuzhuyun_merge_path = temp_path.joinpath("3_猪猪云合并数据")
pycharm_input_path = temp_path.joinpath("4_logistics数据")


# --- 通用辅助函数 ---
def extractor_apply_based(
    df: pd.DataFrame, profile: dict, location_maps: dict, parser_func
) -> pd.DataFrame:
    """Apply ``parser_func`` to each row of ``df`` and return the parsed rows.

    The parser is called as ``parser_func(row, profile, location_maps)`` via
    ``DataFrame.apply(axis=1)``; an empty input yields an empty ``DataFrame``.
    """
    has_rows = not df.empty
    if has_rows:
        parser_name = parser_func.__name__
        print(f"  -> 使用 '{parser_name}' 解析器通过 .apply() 运行...")

        def _run_parser(record):
            return parser_func(record, profile, location_maps)

        parsed = df.apply(_run_parser, axis=1)
    else:
        parsed = pd.DataFrame()
    return parsed


# --- 公司档案: 韵达 (引入分场景排除标准) ---
# Keyword rules used by the Yunda parser. Every value is a regex alternation;
# each list element below is one alternative, joined with "|".
COMPANY_PROFILES_YUNDA = {
    # Generic rules shared by all carriers.
    "default": {
        "collect": "|".join(["揽收", "已收取", "揽件", "已取件"]),
        "delivery": "|".join(
            ["派送中", "派件员", "为您派送", "正在为您派件", "已安排派送"]
        ),
        "leave": "|".join(["离开", "已发出", "发往"]),
        "arrive": "|".join(["到达", "抵达"]),
        "center": "|".join(
            ["处理中心", "分拨中心", "转运场", "运营区", "枢纽中心", "集散中心"]
        ),
        "sign_p1": "|".join(
            [
                "已暂存至",
                "已投至",
                "已到站",
                "自提柜",
                "智能柜",
                "菜鸟驿站",
                "邮政营业网点",
            ]
        ),
        "sign_p2": "|".join(
            ["本人签收", "他人代签收", "已代收", "放至家门口", "指定位置签收"]
        ),
        "sign_fallback": "|".join(["已签收", "已妥投", "已妥收", "已投递"]),
    },
    # Yunda-specific rules, with two separate exclusion lists.
    "韵达": {
        "center": "分拨交付中心",
        # Strict list: used to recognise genuine sorting-centre operations.
        "center_exclude": "|".join(
            [
                "公司",
                "分部",
                "服务部",
                "网格仓",
                "揽投部",
                "营业部",
                "经营分部",
                "网点",
                "集货点",
                "营销中心",
            ]
        ),
        # Loose list: used for the first arrival in the destination city;
        # only "服务部" is excluded.
        "dest_arrival_exclude": "服务部",
        "sign_p2": "|".join(["已送货上门签收", "已由邮政派送签收"]),
        "sign_fallback": "|".join(["快件已投递", "快件已按址投递"]),
    },
}


# --- 解析器: 韵达 (分场景调用不同排除标准) ---
def parse_logistics_events_yunda(
    row: pd.Series, profile: dict, location_maps: dict
) -> pd.Series:
    """Extract the key Yunda (韵达) milestone timestamps from one tracking log.

    Args:
        row: A record providing "完整物流信息" (the multi-line tracking text),
            "寄出城市" (sender city) and "寄达城市" (destination city).
        profile: Mapping of rule name to regex string. Keys read here:
            collect, delivery, leave, arrive, center, center_exclude (strict
            list for centre operations), dest_arrival_exclude (loose list for
            the first destination arrival), sign_p1, sign_p2, sign_fallback,
            sign_ignore. A missing rule disables the step that needs it.
        location_maps: May contain "city_alias_map" (city -> list of aliases);
            when the map or the city is absent, the city name without "市"
            is used.

    Returns:
        A Series with the seven ``*_zzy`` timestamp fields; fields that could
        not be determined are ``pd.NaT``.
    """
    log_text = row["完整物流信息"]
    sender_city_key, dest_city_key = row["寄出城市"], row["寄达城市"]
    time_cols = [
        "揽收时间_zzy",
        "离开寄件城市时间_zzy",
        "到达收件城市时间_zzy",
        "派送时间_zzy",
        "签收时间_zzy",
        "到达分拣中心时间_zzy",
        "离开收件城市分拣中心时间_zzy",
    ]
    extracted_times = {col: pd.NaT for col in time_cols}
    if not isinstance(log_text, str) or not log_text.strip():
        return pd.Series(extracted_times)

    kw_collect, kw_delivery = profile.get("collect"), profile.get("delivery")
    kw_leave, kw_arrive = profile.get("leave"), profile.get("arrive")
    kw_sign_p1 = profile.get("sign_p1")
    kw_sign_p2 = profile.get("sign_p2")
    kw_sign_fallback = profile.get("sign_fallback")
    kw_sign_ignore = profile.get("sign_ignore")
    kw_all_centers = profile.get("center")
    # Two exclusion lists: strict for centre operations, loose for arrival.
    kw_center_exclude = profile.get("center_exclude", "")
    kw_dest_arrival_exclude = profile.get("dest_arrival_exclude", "")

    datetime_capture_pattern = r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
    # Text after the first "；" / "发往" describes the next hop, so only the
    # part before it ("action") is matched for location rules.
    action_split_pattern = r"；|发往"
    all_events = []
    for line in log_text.strip().split("\n"):
        match = re.search(datetime_capture_pattern, line)
        if not match:
            continue
        dt = pd.to_datetime(match.group(1), errors="coerce")
        if pd.notna(dt):
            all_events.append(
                {
                    "dt": dt,
                    "line": line,
                    "action": re.split(action_split_pattern, line)[0],
                }
            )
    if not all_events:
        return pd.Series(extracted_times)
    all_events.sort(key=lambda x: x["dt"])

    def get_location_pattern(city_key, location_maps):
        """Return a regex alternation of the city's aliases, or None."""
        if not city_key or not isinstance(city_key, str):
            return None
        # .get() keeps the parser usable when the alias map failed to load.
        aliases = location_maps.get("city_alias_map", {}).get(
            city_key, [city_key.replace("市", "")]
        )
        clean_aliases = {
            str(a)
            for a in aliases
            if a is not None and pd.notna(a) and str(a).strip() != ""
        }
        if not clean_aliases:
            return None
        return "|".join(map(re.escape, clean_aliases))

    def is_real_center(action_part):
        """True when the text names a sorting centre that is not excluded."""
        if not kw_all_centers or not re.search(kw_all_centers, action_part):
            return False
        return not (kw_center_exclude and re.search(kw_center_exclude, action_part))

    def find_sign_event(pattern, events):
        """Return the first event in ``events`` matching ``pattern``."""
        if not pattern:
            return None
        for event in events:
            if kw_sign_ignore and re.search(kw_sign_ignore, event["line"]):
                continue
            if re.search(pattern, event["line"]):
                return event
        return None

    # Collection time: earliest matching event.
    for event in all_events:
        if kw_collect and re.search(kw_collect, event["line"]):
            extracted_times["揽收时间_zzy"] = event["dt"]
            break

    # Delivery (out-for-delivery) time: earliest matching event.
    for event in all_events:
        if kw_delivery and re.search(kw_delivery, event["line"]):
            extracted_times["派送时间_zzy"] = event["dt"]
            break

    # Sign-off time by priority: P1 uses the earliest match, P2 and the
    # fallback use the latest match.
    sign_event = (
        find_sign_event(kw_sign_p1, all_events)
        or find_sign_event(kw_sign_p2, reversed(all_events))
        or find_sign_event(kw_sign_fallback, reversed(all_events))
    )
    if sign_event:
        extracted_times["签收时间_zzy"] = sign_event["dt"]

    sender_pattern = get_location_pattern(sender_city_key, location_maps)
    collect_time = extracted_times["揽收时间_zzy"]

    if pd.notna(collect_time) and sender_pattern and kw_all_centers:
        # Last departure from a sender-city sorting centre after collection.
        last_sender_leave_event = None
        for event in all_events:
            if not event["dt"] > collect_time:
                continue
            action_part = event["action"]
            if (
                kw_leave
                and re.search(kw_leave, action_part)
                and re.search(sender_pattern, action_part)
                and is_real_center(action_part)
            ):
                last_sender_leave_event = event

        if last_sender_leave_event:
            extracted_times["离开寄件城市时间_zzy"] = last_sender_leave_event["dt"]

        # Arrival at the sender-city centre must fall between collection and
        # departure. The dict key always exists (initialised to NaT), so test
        # the value itself: with no departure found, search to the end of the
        # log instead of comparing against NaT (which is never true).
        leave_time = extracted_times["离开寄件城市时间_zzy"]
        if pd.notna(leave_time):
            search_end_time = leave_time
        else:
            search_end_time = all_events[-1]["dt"] + pd.Timedelta(seconds=1)

        last_sender_center_arrival = None
        for event in all_events:
            if not (collect_time < event["dt"] < search_end_time):
                continue
            action_part = event["action"]
            if (
                kw_arrive
                and re.search(kw_arrive, action_part)
                and re.search(sender_pattern, action_part)
                and is_real_center(action_part)
            ):
                last_sender_center_arrival = event

        if last_sender_center_arrival:
            extracted_times["到达分拣中心时间_zzy"] = last_sender_center_arrival["dt"]

    dest_pattern = get_location_pattern(dest_city_key, location_maps)
    if dest_pattern:
        # First arrival in the destination city (loose exclusion list).
        for event in all_events:
            action_part = event["action"]
            if (
                kw_arrive
                and re.search(kw_arrive, action_part)
                and re.search(dest_pattern, action_part)
                and not (
                    kw_dest_arrival_exclude
                    and re.search(kw_dest_arrival_exclude, action_part)
                )
            ):
                extracted_times["到达收件城市时间_zzy"] = event["dt"]
                break

        # Latest departure from a destination-city sorting centre strictly
        # before the delivery event (strict exclusion list).
        delivery_time = extracted_times["派送时间_zzy"]
        if pd.notna(delivery_time):
            for event in reversed(all_events):
                if not event["dt"] < delivery_time:
                    continue
                action_part = event["action"]
                if (
                    kw_leave
                    and re.search(kw_leave, action_part)
                    and re.search(dest_pattern, action_part)
                    and is_real_center(action_part)
                ):
                    extracted_times["离开收件城市分拣中心时间_zzy"] = event["dt"]
                    break

    return pd.Series(extracted_times)


# --- 主流程: 韵达 (整合所有修正) ---
def process_yunda_data(zhuzhuyun_dir, anjian_dir, output_dir, base_data_file):
    """Build the Yunda (韵达) logistics milestone workbook.

    Steps: load the location maps and the inter-city route table from
    ``base_data_file``, load every safety-supervision (安监) workbook to get
    sender/destination cities per waybill, join them onto the merged
    Zhuzhuyun export, parse each tracking log and write the result to Excel.

    Args:
        zhuzhuyun_dir: Directory holding the merged Zhuzhuyun file "韵达.xlsx".
        anjian_dir: ``Path`` of the directory holding the 安监 ``*.xlsx`` files.
        output_dir: Directory that receives "韵达_logistics_data.xlsx".
        base_data_file: ``Path`` of ``basic_data.xlsx`` (sheets
            "city_hierarchy" and "inter-city_routes").

    Returns:
        None. Problems are reported with ``print`` and end the run early.
    """
    company_name = "韵达"
    company_start_time = perf_counter()

    # Use the directories supplied by the caller rather than notebook globals.
    zhuzhuyun_dir = Path(zhuzhuyun_dir)
    output_dir = Path(output_dir)

    # Carrier code used in the 安监 data -> internal carrier name.
    COMPANY_CONFIG = {
        "EMS": {"internal_name": "EMS", "anjian_map_key": "EMS"},
        "邮政": {"internal_name": "邮政国内小包", "anjian_map_key": "ZGYZ"},
        "京东": {"internal_name": "京东", "anjian_map_key": "JBD"},
        "圆通": {"internal_name": "圆通", "anjian_map_key": "YTO"},
        "申通": {"internal_name": "申通", "anjian_map_key": "STO"},
        "韵达": {"internal_name": "韵达", "anjian_map_key": "YUNDA"},
        "极兔": {"internal_name": "极兔", "anjian_map_key": "JT"},
        "德邦": {"internal_name": "德邦", "anjian_map_key": "DEPPON"},
    }
    config = {
        "zhuzhu_filename": "韵达.xlsx",
        "internal_name": "韵达",
        "parser": parse_logistics_events_yunda,
    }

    # --- Location maps built from the "city_hierarchy" sheet ---
    location_maps = {}
    if base_data_file.exists():
        try:
            df_hierarchy = pd.read_excel(
                base_data_file,
                sheet_name="city_hierarchy",
                dtype={"Province": str, "City": str, "District": str},
            )
            df_hierarchy = df_hierarchy.dropna(subset=["Province", "City"])
            df_hierarchy["City_clean"] = df_hierarchy["City"].str.replace(
                "市", "", regex=False
            )
            df_hierarchy["District_clean"] = df_hierarchy["District"].str.replace(
                "市", "", regex=False
            )
            # city -> [city without "市", cleaned district names]
            location_maps["city_alias_map"] = {
                city: [group.iloc[0]["City_clean"]]
                + group["District_clean"].unique().tolist()
                for city, group in df_hierarchy.groupby("City")
            }
            if "IsCapital" in df_hierarchy.columns:
                df_capitals = df_hierarchy[df_hierarchy["IsCapital"] == 1]
                location_maps["capital_cities_set"] = set(df_capitals["City"].unique())
            city_to_province = df_hierarchy.drop_duplicates("City")[
                ["City", "Province"]
            ].set_index("City")
            location_maps["city_to_province_map"] = city_to_province[
                "Province"
            ].to_dict()
        except Exception as e:
            # Best effort: the parser falls back to the bare city name.
            print(f"[ERROR] 构建地理位置地图失败: {e}。")

    # --- Route table (distance and provinces per standardised route) ---
    df_base_subset = None
    if base_data_file.exists():
        try:
            df_base = pd.read_excel(base_data_file, sheet_name="inter-city_routes")
            df_base["路线_std"] = (
                df_base["寄出城市"].str.replace("市", "", regex=False)
                + "-"
                + df_base["寄达城市"].str.replace("市", "", regex=False)
            )
            df_base_subset = df_base[
                ["路线_std", "公里", "寄出省份", "寄达省份"]
            ].drop_duplicates("路线_std")
        except Exception as e:
            print(f"[WARNING] 处理 'basic_data.xlsx' 出错: {e}。")

    # --- 安监 data: sender / destination city per waybill ---
    all_anjian_files = [
        f for f in anjian_dir.glob("*.xlsx") if not f.name.startswith("~$")
    ]
    if not all_anjian_files:
        print("[ERROR] 安监数据文件夹中未找到有效的Excel文件！")
        return
    all_anjian_df = pd.concat(
        [pd.read_excel(f, dtype={"单号": str}) for f in all_anjian_files],
        ignore_index=True,
    )

    if "快递公司" in all_anjian_df.columns and "企业" in all_anjian_df.columns:
        # Assign the result back: an in-place fillna on a column selection is
        # chained assignment and does not reliably update the frame.
        all_anjian_df["企业"] = all_anjian_df["企业"].fillna(
            all_anjian_df["快递公司"]
        )
        all_anjian_df = all_anjian_df.drop(columns=["快递公司"])
    elif "快递公司" in all_anjian_df.columns:
        all_anjian_df = all_anjian_df.rename(columns={"快递公司": "企业"})

    anjian_to_internal_map = {
        v["anjian_map_key"]: v["internal_name"]
        for v in COMPANY_CONFIG.values()
        if "anjian_map_key" in v
    }
    if "企业" in all_anjian_df.columns:
        company_stripped = all_anjian_df["企业"].str.strip()
        # Map carrier codes to internal names; unknown values are kept as-is.
        all_anjian_df["企业"] = company_stripped.map(anjian_to_internal_map).fillna(
            company_stripped
        )
        all_anjian_df["单号"] = all_anjian_df["单号"].str.strip()
    else:
        print("[ERROR] 安监数据中未找到 '企业' 或 '快递公司' 列，无法匹配寄送信息！")
        return

    base_info_df = all_anjian_df[["单号", "企业", "寄出城市", "寄达城市"]].copy()

    # --- Merged Zhuzhuyun export for this carrier ---
    company_file = zhuzhuyun_dir / config["zhuzhu_filename"]
    if not company_file.exists():
        print(f"[WARNING] 未找到文件: {config['zhuzhu_filename']}，已跳过。")
        return

    df_company_zhu = pd.read_excel(company_file, dtype={"快递单号": str})
    df_company_zhu = df_company_zhu.rename(columns={"快递单号": "单号"})
    df_company_zhu["企业"] = config["internal_name"]
    df_company_zhu["单号"] = df_company_zhu["单号"].str.strip()

    df_merged = pd.merge(df_company_zhu, base_info_df, on=["单号", "企业"], how="left")

    # --- Effective profile: carrier rules are prepended to the defaults,
    # except the two exclusion lists, which are taken from the carrier as-is.
    profile = COMPANY_PROFILES_YUNDA["default"].copy()
    company_specific_profile = COMPANY_PROFILES_YUNDA.get(config["internal_name"], {})
    for key, value in company_specific_profile.items():
        if (
            key in profile
            and value
            and key not in ["center_exclude", "dest_arrival_exclude"]
        ):
            profile[key] = f"{value}|{profile[key]}"
        elif value:
            profile[key] = value

    extracted_df = extractor_apply_based(
        df_merged, profile, location_maps, config["parser"]
    )
    df_final = pd.concat([df_merged, extracted_df], axis=1)

    # Copy the parsed "*_zzy" columns to the final column names.
    time_cols_to_map = [
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
    ]
    for col in time_cols_to_map:
        df_final[col] = df_final.get(f"{col}_zzy", pd.NaT)

    if df_base_subset is not None:
        df_final["路线_std"] = (
            df_final["寄出城市"].astype(str).str.replace("市", "", regex=False)
            + "-"
            + df_final["寄达城市"].astype(str).str.replace("市", "", regex=False)
        )
        df_final = pd.merge(df_final, df_base_subset, on="路线_std", how="left")

    df_final["企业"] = company_name

    output_columns = [
        "企业",
        "单号",
        "寄出城市",
        "寄达城市",
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "公里",
        "寄出省份",
        "寄达省份",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
        "完整物流信息",
    ]
    # reindex also creates any output column that is missing (filled with NaN).
    final_df_to_save = df_final.reindex(columns=output_columns)

    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{company_name}_logistics_data.xlsx"
    final_df_to_save.to_excel(output_file, index=False)

    company_end_time = perf_counter()
    print(
        f"✅ {company_name} 处理完成，耗时 {company_end_time - company_start_time:.2f} 秒。文件已保存至: {output_file.name}"
    )


# --- Run: Yunda ---
process_yunda_data(
    zhuzhuyun_dir=zhuzhuyun_merge_path,
    anjian_dir=anjian_data_path,
    output_dir=pycharm_input_path,
    base_data_file=base_data_path,
)

--- [开始] 处理 韵达 ---
  -> 使用 'parse_logistics_events_yunda' 解析器通过 .apply() 运行...
✅ 韵达 处理完成，耗时 92.30 秒。文件已保存至: 韵达_logistics_data_TEST.xlsx


In [None]:
# --------------------------------------------------
# Cell 4.6: 德邦logistics数据提取（定稿版）
# --------------------------------------------------
import re
import warnings
from pathlib import Path
from time import perf_counter
from typing import Any, Dict, List, Optional

import pandas as pd

# --- Initialisation ---
warnings.filterwarnings(action="ignore", category=UserWarning, module="openpyxl")
print("--- [开始] 处理 德邦 ---")

# --- Project paths (make sure they are correct) ---
base_path = Path.cwd()
report_path = base_path.joinpath("报告数据")
input_path = report_path.joinpath("输入")
temp_path = report_path.joinpath("temp")
anjian_data_path = input_path.joinpath("安监数据")
base_data_path = input_path.joinpath("basic_data.xlsx")
zhuzhuyun_merge_path = temp_path.joinpath("3_猪猪云合并数据")
pycharm_input_path = temp_path.joinpath("4_logistics数据")

# --- Module-level pre-compiled regex: captures "YYYY-MM-DD HH:MM:SS" ---
DATETIME_CAPTURE_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})")

# --- 公司档案: 德邦 ---
# Keyword rules used by the Deppon parser. Every value is a regex alternation;
# each list element below is one alternative, joined with "|".
COMPANY_PROFILES_DEPPON = {
    # Generic rules shared by all carriers.
    "default": {
        "collect": "|".join(["揽收", "已收取", "揽件", "已取件"]),
        "delivery": "|".join(["派送中", "为您派送", "正在为您派件", "已安排派送"]),
        "leave": "|".join(["离开", "已发出", "发往", "航班起飞"]),
        "arrive": "|".join(["到达", "抵达", "航班到达"]),
        "center": "|".join(
            [
                "处理中心",
                "分拨中心",
                "转运场",
                "转运中心",
                "运营区",
                "枢纽中心",
                "集散中心",
                "分拨站",
                "空运总调",
                "机场运作部",
                "运作部",
                "集配站",
            ]
        ),
        "exclude": "|".join(
            ["揽投部", "营业部", "经营分部", "网点", "集货点", "营销中心"]
        ),
        # P1: parcel placed in a pick-up station / locker (an action).
        "sign_p1": "|".join(
            [
                "已派送至",
                "已投至",
                "已到站",
                "自提柜",
                "智能柜",
                "菜鸟驿站",
                "邮政营业网点",
                "包裹已存放至",
            ]
        ),
        # P2: signed for by a person.
        "sign_p2": "|".join(
            ["本人签收", "他人代签收", "已代收", "放至家门口", "指定位置签收"]
        ),
        # P3: intent / generic confirmation (Deppon supplies its own rule).
        "sign_p3": "DONT_MATCH_ANYTHING",
        # Fallback.
        "sign_fallback": "|".join(["已签收", "已妥投", "已妥收"]),
    },
    # Deppon-specific rules.
    "德邦": {
        "sign_p1": "|".join(
            [
                "快件已暂存至",
                "包裹已存放至",
                "集运仓签收",
                "菜鸟驿站签收",
                "水电表箱签收",
                "丰巢柜",
                "妈妈驿站",
            ]
        ),
        "sign_p2": "|".join(
            [
                "家门口签收",
                "本人签收",
                "代收",
                "已由同事签收",
                "收发室签收",
                "前台签收",
                "亲属签收",
                "其他签收",
                "门卫签收",
                "物业签收",
                "正常签收",
            ]
        ),
        "sign_p3": "经收货人同意，此件放置在",
        "exclude": "经营分部",
    },
}


# --- 解析器辅助函数 ---
def get_location_pattern(city_key: Optional[str], location_maps: Dict) -> Optional[str]:
    """Build a regex alternation matching any alias of ``city_key``.

    Aliases come from ``location_maps["city_alias_map"]`` (falling back to the
    city name without "市"). When the city is listed in
    ``location_maps["capital_cities_set"]``, the province name from
    ``location_maps["city_to_province_map"]`` (without "省"/"市") is added too.
    Any of the three maps may be missing.

    Args:
        city_key: City name; anything that is not a non-empty string
            (``None``, NaN, "") yields ``None``.
        location_maps: Lookup tables described above.

    Returns:
        The escaped aliases joined with "|" in sorted order, or ``None`` when
        no usable alias exists (an empty pattern would match every string).
    """
    if not city_key or not isinstance(city_key, str):
        return None
    alias_map = location_maps.get("city_alias_map", {})
    aliases = set(alias_map.get(city_key, [city_key.replace("市", "")]))
    if city_key in location_maps.get("capital_cities_set", ()):
        province = location_maps.get("city_to_province_map", {}).get(city_key)
        if isinstance(province, str) and province:
            aliases.add(province.replace("省", "").replace("市", ""))
    # Drop None / NaN / blank aliases (a missing district is read as NaN and
    # would make re.escape fail); sort so the pattern is deterministic.
    valid_aliases = sorted(
        {
            str(alias)
            for alias in aliases
            if alias is not None and pd.notna(alias) and str(alias).strip()
        }
    )
    if not valid_aliases:
        return None
    return "|".join(map(re.escape, valid_aliases))


# --- 主解析器: 德邦 (最终修正版) ---
def parse_logistics_events_deppon_final_logic(
    row: pd.Series, profile: Dict, location_maps: Dict
) -> pd.Series:
    """Extract the seven milestone timestamps from one Deppon tracking log.

    Args:
        row: One merged record. Reads ``完整物流信息`` (newline-separated
            tracking log), ``寄出城市`` and ``寄达城市``.
        profile: Keyword regexes. ``collect``, ``delivery``, ``leave``,
            ``arrive`` and ``center`` are used directly; ``sign_p1`` ..
            ``sign_fallback`` are tried in priority order; ``exclude`` is
            optional.
        location_maps: Lookup tables forwarded to ``get_location_pattern``.

    Returns:
        A Series indexed by the seven ``*_zzy`` column names. Any milestone
        that could not be located stays ``NaT``.
    """
    log_text, sender_city, dest_city = (
        row["完整物流信息"],
        row["寄出城市"],
        row["寄达城市"],
    )
    time_cols = [
        "揽收时间_zzy",
        "离开寄件城市时间_zzy",
        "到达收件城市时间_zzy",
        "派送时间_zzy",
        "签收时间_zzy",
        "到达分拣中心时间_zzy",
        "离开收件城市分拣中心时间_zzy",
    ]
    extracted_times = {col: pd.NaT for col in time_cols}

    if not isinstance(log_text, str) or not log_text.strip():
        return pd.Series(extracted_times)

    # Keep only lines that carry a parseable timestamp, then order them in time.
    all_events = []
    for line in log_text.strip().split("\n"):
        match = DATETIME_CAPTURE_PATTERN.search(line)
        if match:
            dt = pd.to_datetime(match.group(1), errors="coerce")
            if pd.notna(dt):
                all_events.append({"dt": dt, "line": line})

    if not all_events:
        return pd.Series(extracted_times)
    all_events.sort(key=lambda x: x["dt"])

    # --- Basic milestones: earliest collection and earliest delivery line ---
    collect_event = next(
        (e for e in all_events if re.search(profile.get("collect"), e["line"])), None
    )
    if collect_event:
        extracted_times["揽收时间_zzy"] = collect_event["dt"]

    delivery_event = next(
        (e for e in all_events if re.search(profile.get("delivery"), e["line"])),
        None,
    )
    if delivery_event:
        extracted_times["派送时间_zzy"] = delivery_event["dt"]

    # --- Sign time: try each priority tier, taking the latest matching line ---
    sign_event = None
    patterns_to_check = ["sign_p1", "sign_p2", "sign_p3", "sign_fallback"]
    for pattern_key in patterns_to_check:
        pattern = profile.get(pattern_key)
        if pattern:
            sign_event = next(
                (e for e in reversed(all_events) if re.search(pattern, e["line"])),
                None,
            )
        if sign_event:
            break

    if sign_event:
        extracted_times["签收时间_zzy"] = sign_event["dt"]

    # --- Location patterns and keyword regexes ---
    sender_pattern = get_location_pattern(sender_city, location_maps)
    dest_pattern = get_location_pattern(dest_city, location_maps)
    kw_center, kw_leave, kw_arrive, kw_exclude = (
        profile.get("center"),
        profile.get("leave"),
        profile.get("arrive"),
        profile.get("exclude", "DONT_MATCH_ANYTHING"),
    )

    # --- Sender-city phase ---
    if pd.notna(extracted_times["揽收时间_zzy"]) and sender_pattern:
        # Latest non-excluded "leave" line after collection that names the sender city.
        true_leave_event = None
        for event in reversed(all_events):
            if event["dt"] > extracted_times["揽收时间_zzy"]:
                if re.search(kw_leave, event["line"]) and re.search(
                    sender_pattern, event["line"]
                ):
                    if not re.search(kw_exclude, event["line"]):
                        true_leave_event = event
                        break
        if true_leave_event:
            extracted_times["离开寄件城市时间_zzy"] = true_leave_event["dt"]

        window_end = true_leave_event["dt"] if true_leave_event else pd.Timestamp.max

        # Last pure "arrive" line at a sender-city centre, strictly between
        # collection and that departure.
        last_arrival_in_origin = None
        for event in all_events:
            if not (extracted_times["揽收时间_zzy"] < event["dt"] < window_end):
                continue
            if (
                re.search(kw_arrive, event["line"])
                and not re.search(kw_leave, event["line"])
                and re.search(kw_center, event["line"])
                and re.search(sender_pattern, event["line"])
                and not re.search(kw_exclude, event["line"])
            ):
                last_arrival_in_origin = event
        if last_arrival_in_origin:
            extracted_times["到达分拣中心时间_zzy"] = last_arrival_in_origin["dt"]

    # --- Destination-city phase ---
    if dest_pattern:
        # Earliest "arrive" line naming the destination that is not also a "leave" line.
        arrive_dest_event = next(
            (
                e
                for e in all_events
                if (
                    re.search(kw_arrive, e["line"])
                    and re.search(dest_pattern, e["line"])
                    and not re.search(kw_leave, e["line"])
                )
            ),
            None,
        )
        if arrive_dest_event:
            extracted_times["到达收件城市时间_zzy"] = arrive_dest_event["dt"]

    # Last departure from a destination-city centre before delivery started.
    last_leave_dest_center = None  # defined on every path
    if pd.notna(extracted_times["派送时间_zzy"]) and dest_pattern and kw_center:
        leave_dest_center_pattern = re.compile(
            f"({kw_leave}).*?【(?P<location_name>.*?)】"
        )

        # The key always exists (initialised to NaT), so dict.get's default never
        # applied and every NaT comparison below was False. Fall back explicitly
        # to the earliest timestamp when the arrival time is unknown.
        arrive_dest_time = extracted_times["到达收件城市时间_zzy"]
        start_window = (
            arrive_dest_time if pd.notna(arrive_dest_time) else pd.Timestamp.min
        )
        end_window = extracted_times["派送时间_zzy"]

        for event in all_events:
            if not (start_window < event["dt"] < end_window):
                continue

            match = leave_dest_center_pattern.search(event["line"])
            if match:
                location_name = match.group("location_name")

                # Only the bracketed location is tested, not the whole line.
                if (
                    re.search(dest_pattern, location_name)
                    and re.search(kw_center, location_name)
                    and not re.search(kw_exclude, location_name)
                ):
                    last_leave_dest_center = event

    if last_leave_dest_center:
        extracted_times["离开收件城市分拣中心时间_zzy"] = last_leave_dest_center["dt"]

    return pd.Series(extracted_times)


# --- 通用辅助函数 ---
def extractor_apply_based(
    df: pd.DataFrame, profile: dict, location_maps: dict, parser_func
) -> pd.DataFrame:
    """Run ``parser_func`` on every row of ``df`` via ``DataFrame.apply``.

    Args:
        df: Records to parse; an empty frame short-circuits to an empty result.
        profile: Keyword profile handed to the parser unchanged.
        location_maps: Location lookup tables handed to the parser unchanged.
        parser_func: Callable ``(row, profile, location_maps)`` applied per row.

    Returns:
        Whatever ``df.apply(..., axis=1)`` produces for the parser's return
        values, or a fresh empty DataFrame when ``df`` has no rows.
    """
    if not df.empty:
        print(f"  -> 使用 '{parser_func.__name__}' 解析器通过 .apply() 运行...")

        def _parse_row(record):
            return parser_func(record, profile, location_maps)

        return df.apply(_parse_row, axis=1)
    return pd.DataFrame()


# --- 主流程: 德邦 ---
def process_deppon_data(
    zhuzhuyun_dir: Path, anjian_dir: Path, output_dir: Path, base_data_file: Path
):
    """Build the Deppon logistics workbook from tracking and inspection data.

    Steps: load the location lookup tables and the route table from
    ``base_data_file``, concatenate every inspection workbook in
    ``anjian_dir``, left-merge it onto ``德邦.xlsx`` from ``zhuzhuyun_dir``,
    parse each tracking log, attach route distance/province columns and write
    ``德邦_logistics_data.xlsx`` into ``output_dir``.

    Args:
        zhuzhuyun_dir: Folder holding the merged tracking download ``德邦.xlsx``.
        anjian_dir: Folder holding the inspection ``*.xlsx`` workbooks.
        output_dir: Folder the result workbook is written to (created if absent).
        base_data_file: Workbook with the ``city_hierarchy`` and
            ``inter-city_routes`` sheets.

    Returns:
        None. Prints a warning and returns early when no inspection workbook
        or no company workbook is found.
    """
    company_name = "德邦"
    config = {
        "zhuzhu_filename": "德邦.xlsx",
        "internal_name": "德邦",
        "anjian_map_key": "DEPPON",
        "parser": parse_logistics_events_deppon_final_logic,
    }
    company_start_time = perf_counter()

    # --- Location lookup tables (best effort: parsing continues if this fails) ---
    location_maps = {
        "city_alias_map": {},
        "capital_cities_set": set(),
        "city_to_province_map": {},
    }
    try:
        if base_data_file.exists():
            df_hierarchy = pd.read_excel(base_data_file, sheet_name="city_hierarchy")
            df_hierarchy.dropna(subset=["Province", "City", "District"], inplace=True)
            df_hierarchy["City_clean"] = df_hierarchy["City"].str.replace(
                "市", "", regex=False
            )
            # NOTE(review): this removes every 市/区/县 character, not only a
            # trailing suffix, so a district such as "市中区" collapses to the
            # one-character alias "中". Confirm this is intended.
            df_hierarchy["District_clean"] = df_hierarchy["District"].str.replace(
                r"市|区|县", "", regex=True
            )

            for city, group in df_hierarchy.groupby("City"):
                aliases = [group.iloc[0]["City_clean"]] + group[
                    "District_clean"
                ].unique().tolist()
                location_maps["city_alias_map"][city] = [a for a in aliases if a]
            if "IsCapital" in df_hierarchy.columns:
                df_capitals = df_hierarchy[df_hierarchy["IsCapital"] == 1]
                location_maps["capital_cities_set"] = set(df_capitals["City"].unique())
            city_to_province = df_hierarchy.drop_duplicates("City").set_index("City")[
                "Province"
            ]
            location_maps["city_to_province_map"] = city_to_province.to_dict()

    except Exception as e:
        print(f"[ERROR] 构建地理位置地图失败: {e}。")

    # --- Route table keyed by "sender-destination" with "市" stripped (optional) ---
    df_base_subset = None
    try:
        if base_data_file.exists():
            df_base = pd.read_excel(base_data_file, sheet_name="inter-city_routes")
            df_base["路线_std"] = (
                df_base["寄出城市"].str.replace("市", "", regex=False)
                + "-"
                + df_base["寄达城市"].str.replace("市", "", regex=False)
            )
            df_base_subset = df_base[
                ["路线_std", "公里", "寄出省份", "寄达省份"]
            ].drop_duplicates("路线_std")
    except Exception as e:
        print(f"[WARNING] 处理 'basic_data.xlsx' 出错: {e}。")

    # --- Inspection data ---
    # Skip "~$name.xlsx" lock files that Excel creates next to an open workbook;
    # they match the glob but are not readable workbooks.
    anjian_files = [
        f for f in anjian_dir.glob("*.xlsx") if not f.name.startswith("~$")
    ]
    if not anjian_files:
        # pd.concat raises ValueError on an empty list; report the cause instead.
        print(f"[WARNING] 未在 {anjian_dir} 中找到安监数据文件，已跳过。")
        return
    all_anjian_df = pd.concat(
        [pd.read_excel(f, dtype={"单号": str}) for f in anjian_files],
        ignore_index=True,
    )
    if "快递公司" in all_anjian_df.columns:
        all_anjian_df.rename(columns={"快递公司": "企业"}, inplace=True)
    # Map the inspection-side company code to the internal name; other values are kept.
    anjian_to_internal_map = {config["anjian_map_key"]: config["internal_name"]}
    all_anjian_df["企业"] = (
        all_anjian_df["企业"].map(anjian_to_internal_map).fillna(all_anjian_df["企业"])
    )
    base_info_df = all_anjian_df[["单号", "企业", "寄出城市", "寄达城市"]].copy()

    # --- Tracking data for this company ---
    company_file = zhuzhuyun_dir / config["zhuzhu_filename"]
    if not company_file.exists():
        print(f"[WARNING] 未找到文件: {config['zhuzhu_filename']}，已跳过。")
        return
    df_company_zhu = pd.read_excel(company_file, dtype={"快递单号": str})
    df_company_zhu.rename(
        columns={"快递单号": "单号", "快递公司": "企业"}, inplace=True
    )
    df_merged = pd.merge(df_company_zhu, base_info_df, on=["单号", "企业"], how="left")

    # Company-specific keys replace the default ones wholesale.
    base_profile = COMPANY_PROFILES_DEPPON["default"].copy()
    company_specific_profile = COMPANY_PROFILES_DEPPON.get(config["internal_name"], {})
    profile = {**base_profile, **company_specific_profile}
    extracted_df = extractor_apply_based(
        df_merged, profile, location_maps, config["parser"]
    )
    df_final = pd.concat([df_merged, extracted_df], axis=1)

    # Copy each parsed "*_zzy" column into its final column name.
    time_cols_to_map = [
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
    ]
    for col in time_cols_to_map:
        df_final[col] = df_final.get(f"{col}_zzy", pd.NaT)
    if df_base_subset is not None:
        df_final["路线_std"] = (
            df_final["寄出城市"].astype(str).str.replace("市", "", regex=False)
            + "-"
            + df_final["寄达城市"].astype(str).str.replace("市", "", regex=False)
        )
        df_final = pd.merge(df_final, df_base_subset, on="路线_std", how="left")

    df_final["企业"] = company_name
    output_columns = [
        "企业",
        "单号",
        "寄出城市",
        "寄达城市",
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "公里",
        "寄出省份",
        "寄达省份",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
        "完整物流信息",
    ]
    # reindex creates any missing column (e.g. the route columns) as all-NaN.
    final_df_to_save = df_final.reindex(columns=output_columns)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{company_name}_logistics_data.xlsx"
    final_df_to_save.to_excel(output_file, index=False)
    company_end_time = perf_counter()
    # relative_to raises ValueError when output_dir is outside base_path; the
    # workbook is already saved by then, so fall back to the full path.
    try:
        display_path = output_file.relative_to(base_path)
    except ValueError:
        display_path = output_file
    print(
        f"✅ {company_name} 处理完成，耗时 {company_end_time - company_start_time:.2f} 秒。文件已保存至: {display_path}"
    )


# --- 运行主流程 ---
if __name__ == "__main__":
    # Argument order of process_deppon_data:
    # tracking folder, inspection folder, output folder, reference workbook.
    required_paths = (
        zhuzhuyun_merge_path,
        anjian_data_path,
        pycharm_input_path,
        base_data_path,
    )
    if any(not required.exists() for required in required_paths):
        print("[WARNING] 一个或多个所需的数据路径不存在，已跳过执行。请检查路径设置。")
    else:
        process_deppon_data(*required_paths)

--- [开始] 处理 德邦 ---
  -> 使用 'parse_logistics_events_deppon_final_logic' 解析器通过 .apply() 运行...
✅ 德邦 处理完成，耗时 89.17 秒。文件已保存至: 报告数据/temp/4_logistics数据/德邦_logistics_data_TEST.xlsx


In [None]:
# --------------------------------------------------
# Cell 4.7: 申通logistics数据提取
# -------------------------------------------------

In [18]:
# --------------------------------------------------
# Cell: 申通 (STO) 专用代码 (最终版)
# --------------------------------------------------
import re
import warnings
from pathlib import Path
from time import perf_counter

import pandas as pd

# --- Initialisation (only needed when this cell is run on its own) ---
warnings.filterwarnings(action="ignore", category=UserWarning, module="openpyxl")
print("--- [开始] 处理 申通 ---")

# --- Project paths (same layout as the notebook's first cell) ---
base_path = Path.cwd()
report_path = base_path.joinpath("报告数据")
# Input side: raw inspection workbooks plus the city/route reference workbook.
input_path = report_path.joinpath("输入")
anjian_data_path = input_path.joinpath("安监数据")
base_data_path = input_path.joinpath("basic_data.xlsx")
# Intermediate side: merged tracking downloads in, per-company logistics files out.
temp_path = report_path.joinpath("temp")
zhuzhuyun_merge_path = temp_path.joinpath("3_猪猪云合并数据")
pycharm_input_path = temp_path.joinpath("4_logistics数据")


# --- 通用辅助函数 ---
def extractor_apply_based(
    df: pd.DataFrame, profile: dict, location_maps: dict, parser_func
) -> pd.DataFrame:
    """Run ``parser_func`` on every row of ``df`` via ``DataFrame.apply``.

    Args:
        df: Records to parse; an empty frame short-circuits to an empty result.
        profile: Keyword profile handed to the parser unchanged.
        location_maps: Location lookup tables handed to the parser unchanged.
        parser_func: Callable ``(row, profile, location_maps)`` applied per row.

    Returns:
        Whatever ``df.apply(..., axis=1)`` produces for the parser's return
        values, or a fresh empty DataFrame when ``df`` has no rows.
    """
    if not df.empty:
        print(f"  -> 使用 '{parser_func.__name__}' 解析器通过 .apply() 运行...")

        def _parse_row(record):
            return parser_func(record, profile, location_maps)

        return df.apply(_parse_row, axis=1)
    return pd.DataFrame()


# --- Company profile: STO ---
# Keyword regexes used by the STO parser. "default" holds the base set; the
# "申通" entry is layered on top of it in process_sto_data, so its sign_p2
# replaces the default sign_p2 rather than extending it.
COMPANY_PROFILES_STO = {
    "default": {
        "collect": r"揽收|已收取|揽件|已取件",
        "delivery": r"派送中|派件员|为您派送|正在为您派件|已安排派送",
        "leave": r"离开|已发出|发往",
        "arrive": r"到达|抵达",
        "center": r"处理中心|分拨中心|转运场|运营区|枢纽中心|集散中心",
        "exclude": r"揽投部|营业部|经营分部|网点|集货点|营销中心",
        # Sign tiers are tried in order p1 -> p2 -> fallback by the parser.
        "sign_p1": r"已经妥投|已送达|作废处理|将提供送货上门|即将为您安排送货上门|快件已到达\[.*?驿站\]|已抵达.*?公司|包裹已完成签收|已放入|已存放至|到达退货服务站点|被退回|超市|正在验收中|已送货上门|按地址投递|已放入.*?驿站|已投递|已抵达.*?服务点|已派送至|已投至|已到站|自提柜|智能柜|菜鸟驿站|邮政营业网点",
        "sign_p2": r"本人签收|他人代签收|已代收|放至家门口|指定位置签收",
        "sign_fallback": r"已签收|已妥投|已妥收",
    },
    "申通": {"sign_p2": r"已由【.*?】签收|已签收|代收"},
}


# --- 解析器: 申通 (使用稳定的Postal版本) ---
def parse_logistics_events_sto(
    row: pd.Series, profile: dict, location_maps: dict
) -> pd.Series:
    """Extract the seven milestone timestamps from one STO tracking log.

    Args:
        row: One merged record. Reads ``完整物流信息`` (newline-separated
            tracking log), ``寄出城市`` and ``寄达城市``.
        profile: Keyword regexes, all optional: ``collect``, ``delivery``,
            ``leave``, ``arrive``, ``sign_p1``/``sign_p2``/``sign_fallback``
            (tried in that order), ``sign_ignore``, and either ``center`` or
            ``center_p1``/``center_p2``.
        location_maps: Dict with ``city_alias_map``, ``capital_cities_set``
            and ``city_to_province_map``. It is only read, never modified.

    Returns:
        A Series indexed by the seven ``*_zzy`` column names. Any milestone
        that could not be located stays ``NaT``.
    """
    log_text = row["完整物流信息"]
    sender_city_key, dest_city_key = row["寄出城市"], row["寄达城市"]
    time_cols = [
        "揽收时间_zzy",
        "离开寄件城市时间_zzy",
        "到达收件城市时间_zzy",
        "派送时间_zzy",
        "签收时间_zzy",
        "到达分拣中心时间_zzy",
        "离开收件城市分拣中心时间_zzy",
    ]
    extracted_times = {col: pd.NaT for col in time_cols}
    if not isinstance(log_text, str) or not log_text.strip():
        return pd.Series(extracted_times)

    kw_collect, kw_delivery = profile.get("collect"), profile.get("delivery")
    kw_leave, kw_arrive = profile.get("leave"), profile.get("arrive")
    kw_sign_ignore = profile.get("sign_ignore")
    sign_tiers = (
        profile.get("sign_p1"),
        profile.get("sign_p2"),
        profile.get("sign_fallback"),
    )
    kw_center_p1, kw_center_p2 = profile.get("center_p1"), profile.get("center_p2")
    kw_center = profile.get("center")
    # A plain "center" key acts as the primary tier when no tiers are given.
    if not kw_center_p1 and not kw_center_p2 and kw_center:
        kw_center_p1 = kw_center
    kw_all_centers = ((kw_center_p1 or "") + "|" + (kw_center_p2 or "")).strip("|")

    # Keep only lines that carry a parseable timestamp, then order them in time.
    datetime_capture_pattern = r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
    all_events = []
    for line in log_text.strip().split("\n"):
        match = re.search(datetime_capture_pattern, line)
        if match:
            dt = pd.to_datetime(match.group(1), errors="coerce")
            if pd.notna(dt):
                all_events.append({"dt": dt, "line": line})
    if not all_events:
        return pd.Series(extracted_times)
    all_events.sort(key=lambda x: x["dt"])

    def get_location_pattern(city_key, location_maps):
        """Return a regex alternation of the city's aliases, or None."""
        if not city_key or not isinstance(city_key, str):
            return None
        # Copy first: the stored list is shared across all rows, and appending
        # the province to it directly would grow it on every call.
        aliases = list(
            location_maps["city_alias_map"].get(
                city_key, [city_key.replace("市", "")]
            )
        )
        if city_key in location_maps["capital_cities_set"]:
            province = location_maps["city_to_province_map"].get(city_key)
            if province:
                aliases.append(province.replace("省", "").replace("市", ""))
        return "|".join(map(re.escape, set(a for a in aliases if a)))

    # Collection: earliest matching line.
    for event in all_events:
        if kw_collect and re.search(kw_collect, event["line"]):
            extracted_times["揽收时间_zzy"] = event["dt"]
            break

    # Delivery: earliest matching line.
    if kw_delivery:
        for event in all_events:
            if re.search(kw_delivery, event["line"]):
                extracted_times["派送时间_zzy"] = event["dt"]
                break

    # Sign time: first tier with a hit wins; within a tier take the latest line
    # that is not filtered out by sign_ignore.
    for kw_sign in sign_tiers:
        if not kw_sign:
            continue
        for event in reversed(all_events):
            if kw_sign_ignore and re.search(kw_sign_ignore, event["line"]):
                continue
            if re.search(kw_sign, event["line"]):
                extracted_times["签收时间_zzy"] = event["dt"]
                break
        if pd.notna(extracted_times["签收时间_zzy"]):
            break

    sender_pattern = get_location_pattern(sender_city_key, location_maps)
    dest_pattern = get_location_pattern(dest_city_key, location_maps)

    # --- Sender-city phase (primary rule plus fallback) ---
    if pd.notna(extracted_times["揽收时间_zzy"]) and sender_pattern:
        # 1a. Arrival at sorting centre (primary): earliest sender-city centre
        #     line after collection that also carries an "arrive" keyword.
        for event in all_events:
            if (
                event["dt"] > extracted_times["揽收时间_zzy"]
                and re.search(sender_pattern, event["line"])
                and kw_all_centers
                and re.search(kw_all_centers, event["line"])
                and kw_arrive
                and re.search(kw_arrive, event["line"])
            ):
                extracted_times["到达分拣中心时间_zzy"] = event["dt"]
                break

        # 1b. Arrival at sorting centre (fallback): same without the keyword.
        if pd.isna(extracted_times["到达分拣中心时间_zzy"]):
            for event in all_events:
                if (
                    event["dt"] > extracted_times["揽收时间_zzy"]
                    and re.search(sender_pattern, event["line"])
                    and kw_all_centers
                    and re.search(kw_all_centers, event["line"])
                ):
                    extracted_times["到达分拣中心时间_zzy"] = event["dt"]
                    break

        # 2. Leaving the sender city: latest "leave" line naming the sender city.
        for event in reversed(all_events):
            if (
                event["dt"] > extracted_times["揽收时间_zzy"]
                and re.search(sender_pattern, event["line"])
                and kw_leave
                and re.search(kw_leave, event["line"])
            ):
                extracted_times["离开寄件城市时间_zzy"] = event["dt"]
                break

    # --- Arrival in the destination city: earliest "arrive" line naming it ---
    if dest_pattern:
        for event in all_events:
            if (
                kw_arrive
                and re.search(kw_arrive, event["line"])
                and re.search(dest_pattern, event["line"])
            ):
                extracted_times["到达收件城市时间_zzy"] = event["dt"]
                break

    # --- Leaving the destination-city sorting centre, before delivery ---
    if pd.notna(extracted_times["派送时间_zzy"]) and dest_pattern:
        # 3a. Primary: latest "leave" line at a destination centre, trying the
        #     p1 centre keywords first and p2 second.
        for kw_tier in (kw_center_p1, kw_center_p2):
            if not kw_tier:
                continue
            for event in reversed(all_events):
                if (
                    event["dt"] < extracted_times["派送时间_zzy"]
                    and re.search(kw_tier, event["line"])
                    and kw_leave
                    and re.search(kw_leave, event["line"])
                    and re.search(dest_pattern, event["line"])
                ):
                    extracted_times["离开收件城市分拣中心时间_zzy"] = event["dt"]
                    break
            if pd.notna(extracted_times["离开收件城市分拣中心时间_zzy"]):
                break

        # 3b. Fallback: latest destination-centre line of any kind.
        if pd.isna(extracted_times["离开收件城市分拣中心时间_zzy"]):
            for event in reversed(all_events):
                if (
                    event["dt"] < extracted_times["派送时间_zzy"]
                    and re.search(dest_pattern, event["line"])
                    and kw_all_centers
                    and re.search(kw_all_centers, event["line"])
                ):
                    extracted_times["离开收件城市分拣中心时间_zzy"] = event["dt"]
                    break

    return pd.Series(extracted_times)


# --- 主流程: 申通 ---
def process_sto_data(zhuzhuyun_dir, anjian_dir, output_dir, base_data_file):
    """Build the STO logistics workbook from tracking and inspection data.

    Steps: load the location lookup tables and the route table from
    ``base_data_file``, concatenate every inspection workbook in
    ``anjian_dir``, left-merge it onto ``申通.xlsx`` from ``zhuzhuyun_dir``,
    parse each tracking log, attach route distance/province columns and write
    ``申通_logistics_data_TEST.xlsx`` into ``output_dir``.

    Args:
        zhuzhuyun_dir: Path of the folder holding the merged tracking
            download ``申通.xlsx``.
        anjian_dir: Path of the folder holding the inspection ``*.xlsx``
            workbooks.
        output_dir: Path of the folder the result workbook is written to
            (created if absent).
        base_data_file: Path of the workbook with the ``city_hierarchy`` and
            ``inter-city_routes`` sheets.

    Returns:
        None. Prints a warning and returns early when no inspection workbook
        or no company workbook is found.
    """
    company_name = "申通"
    config = {
        "zhuzhu_filename": "申通.xlsx",
        "internal_name": "申通",
        "anjian_map_key": "STO",
        "parser": parse_logistics_events_sto,
    }
    company_start_time = perf_counter()

    # --- Location lookup tables (best effort: parsing continues if this fails) ---
    location_maps = {
        "city_alias_map": {},
        "capital_cities_set": set(),
        "city_to_province_map": {},
    }
    if base_data_file.exists():
        try:
            df_hierarchy = pd.read_excel(base_data_file, sheet_name="city_hierarchy")
            df_hierarchy.dropna(subset=["Province", "City", "District"], inplace=True)
            df_hierarchy["City_clean"] = df_hierarchy["City"].str.replace(
                "市", "", regex=False
            )
            df_hierarchy["District_clean"] = df_hierarchy["District"].str.replace(
                "市", "", regex=False
            )
            for city, group in df_hierarchy.groupby("City"):
                aliases = [group.iloc[0]["City_clean"]] + group[
                    "District_clean"
                ].unique().tolist()
                location_maps["city_alias_map"][city] = [a for a in aliases if a]
            if "IsCapital" in df_hierarchy.columns:
                df_capitals = df_hierarchy[df_hierarchy["IsCapital"] == 1]
                location_maps["capital_cities_set"] = set(df_capitals["City"].unique())
            city_to_province = df_hierarchy.drop_duplicates("City")[
                ["City", "Province"]
            ].set_index("City")
            location_maps["city_to_province_map"] = city_to_province[
                "Province"
            ].to_dict()
        except Exception as e:
            print(f"[ERROR] 构建地理位置地图失败: {e}。")

    # --- Route table keyed by "sender-destination" with "市" stripped (optional) ---
    df_base_subset = None
    if base_data_file.exists():
        try:
            df_base = pd.read_excel(base_data_file, sheet_name="inter-city_routes")
            df_base["路线_std"] = (
                df_base["寄出城市"].str.replace("市", "", regex=False)
                + "-"
                + df_base["寄达城市"].str.replace("市", "", regex=False)
            )
            df_base_subset = df_base[
                ["路线_std", "公里", "寄出省份", "寄达省份"]
            ].drop_duplicates("路线_std")
        except Exception as e:
            print(f"[WARNING] 处理 'basic_data.xlsx' 出错: {e}。")

    # --- Inspection data ---
    # Skip "~$name.xlsx" lock files that Excel creates next to an open workbook;
    # they match the glob but are not readable workbooks.
    anjian_files = [
        f for f in anjian_dir.glob("*.xlsx") if not f.name.startswith("~$")
    ]
    if not anjian_files:
        # pd.concat raises ValueError on an empty list; report the cause instead.
        print(f"[WARNING] 未在 {anjian_dir} 中找到安监数据文件，已跳过。")
        return
    all_anjian_df = pd.concat(
        [pd.read_excel(f, dtype={"单号": str}) for f in anjian_files],
        ignore_index=True,
    )
    if "快递公司" in all_anjian_df.columns:
        all_anjian_df.rename(columns={"快递公司": "企业"}, inplace=True)
    # Map the inspection-side company code to the internal name; other values are kept.
    anjian_to_internal_map = {config["anjian_map_key"]: config["internal_name"]}
    all_anjian_df["企业"] = (
        all_anjian_df["企业"].map(anjian_to_internal_map).fillna(all_anjian_df["企业"])
    )
    base_info_df = all_anjian_df[["单号", "企业", "寄出城市", "寄达城市"]].copy()

    # --- Tracking data for this company ---
    company_file = zhuzhuyun_dir / config["zhuzhu_filename"]
    if not company_file.exists():
        print(f"[WARNING] 未找到文件: {config['zhuzhu_filename']}，已跳过。")
        return
    df_company_zhu = pd.read_excel(company_file, dtype={"快递单号": str})
    df_company_zhu.rename(
        columns={"快递单号": "单号", "快递公司": "企业"}, inplace=True
    )
    df_merged = pd.merge(df_company_zhu, base_info_df, on=["单号", "企业"], how="left")

    # Company-specific keys replace the default ones wholesale.
    base_profile = COMPANY_PROFILES_STO["default"].copy()
    company_specific_profile = COMPANY_PROFILES_STO.get(config["internal_name"], {})
    profile = {**base_profile, **company_specific_profile}
    extracted_df = extractor_apply_based(
        df_merged, profile, location_maps, config["parser"]
    )
    df_final = pd.concat([df_merged, extracted_df], axis=1)

    # Copy each parsed "*_zzy" column into its final column name.
    time_cols_to_map = [
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
    ]
    for col in time_cols_to_map:
        df_final[col] = df_final.get(f"{col}_zzy", pd.NaT)
    if df_base_subset is not None:
        df_final["路线_std"] = (
            df_final["寄出城市"].astype(str).str.replace("市", "", regex=False)
            + "-"
            + df_final["寄达城市"].astype(str).str.replace("市", "", regex=False)
        )
        df_final = pd.merge(df_final, df_base_subset, on="路线_std", how="left")
    df_final["企业"] = company_name
    output_columns = [
        "企业",
        "单号",
        "寄出城市",
        "寄达城市",
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "公里",
        "寄出省份",
        "寄达省份",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
        "完整物流信息",
    ]
    # reindex creates any missing column (e.g. the route columns) as all-NaN.
    final_df_to_save = df_final.reindex(columns=output_columns)

    # Create the output folder first, as the Deppon pipeline does, so that
    # to_excel does not fail when this cell runs before the folder exists.
    output_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): the "_TEST" suffix differs from the Deppon output name
    # ("<company>_logistics_data.xlsx"); confirm which name downstream cells read.
    output_file = output_dir / f"{company_name}_logistics_data_TEST.xlsx"
    final_df_to_save.to_excel(output_file, index=False)
    company_end_time = perf_counter()
    print(
        f"✅ {company_name} 处理完成，耗时 {company_end_time - company_start_time:.2f} 秒。文件已保存至: {output_file.name}"
    )


# --- Run: STO ---
process_sto_data(
    zhuzhuyun_dir=zhuzhuyun_merge_path,
    anjian_dir=anjian_data_path,
    output_dir=pycharm_input_path,
    base_data_file=base_data_path,
)

--- [开始] 处理 申通 ---
  -> 使用 'parse_logistics_events_sto' 解析器通过 .apply() 运行...
✅ 申通 处理完成，耗时 89.94 秒。文件已保存至: 申通_logistics_data_TEST.xlsx


In [None]:
# --------------------------------------------------
# Cell 4.8: 圆通logistics数据提取
# --------------------------------------------------

In [19]:
# --------------------------------------------------
# Cell: 圆通 (YTO) 专用代码 (最终版)
# --------------------------------------------------
import re
import warnings
from pathlib import Path
from time import perf_counter

import pandas as pd

# --- Initialisation (only needed when this cell is run on its own) ---
warnings.filterwarnings(action="ignore", category=UserWarning, module="openpyxl")
print("--- [开始] 处理 圆通 ---")

# --- Project paths (same layout as the notebook's first cell) ---
base_path = Path.cwd()
report_path = base_path.joinpath("报告数据")
# Input side: raw inspection workbooks plus the city/route reference workbook.
input_path = report_path.joinpath("输入")
anjian_data_path = input_path.joinpath("安监数据")
base_data_path = input_path.joinpath("basic_data.xlsx")
# Intermediate side: merged tracking downloads in, per-company logistics files out.
temp_path = report_path.joinpath("temp")
zhuzhuyun_merge_path = temp_path.joinpath("3_猪猪云合并数据")
pycharm_input_path = temp_path.joinpath("4_logistics数据")


# --- 通用辅助函数 ---
def extractor_apply_based(
    df: pd.DataFrame, profile: dict, location_maps: dict, parser_func
) -> pd.DataFrame:
    """Run ``parser_func`` on every row of ``df`` via ``DataFrame.apply``.

    Args:
        df: Records to parse; an empty frame short-circuits to an empty result.
        profile: Keyword profile handed to the parser unchanged.
        location_maps: Location lookup tables handed to the parser unchanged.
        parser_func: Callable ``(row, profile, location_maps)`` applied per row.

    Returns:
        Whatever ``df.apply(..., axis=1)`` produces for the parser's return
        values, or a fresh empty DataFrame when ``df`` has no rows.
    """
    if not df.empty:
        print(f"  -> 使用 '{parser_func.__name__}' 解析器通过 .apply() 运行...")

        def _parse_row(record):
            return parser_func(record, profile, location_maps)

        return df.apply(_parse_row, axis=1)
    return pd.DataFrame()


# --- Company profile: YTO ---
# Keyword regexes used by the YTO parser. "default" holds the base set; the
# "圆通" entry defines every one of the same keys, so each default value is
# replaced by its YTO-specific counterpart.
COMPANY_PROFILES_YTO = {
    "default": {
        "collect": r"揽收|已收取|揽件|已取件",
        "delivery": r"派送中|派件员|为您派送|正在为您派件|已安排派送",
        "leave": r"离开|已发出|发往",
        "arrive": r"到达|抵达",
        "center": r"处理中心|分拨中心|转运场|运营区|枢纽中心|集散中心",
        "exclude": r"揽投部|营业部|经营分部|网点|集货点|营销中心",
        "sign_p1": r"已派送至|已投至|已到站|自提柜|智能柜|菜鸟驿站|邮政营业网点",
        "sign_p2": r"本人签收|他人代签收|已代收|放至家门口|指定位置签收",
        "sign_fallback": r"已签收|已妥投|已妥收",
    },
    "圆通": {
        "collect": r"已揽收",
        "delivery": r"正在派件",
        "leave": r"离开",
        "arrive": r"已经到达",
        "center": r"转运中心",
        "exclude": r"",
        # Sign tiers are tried in order p1 -> p2 -> fallback by the parser.
        "sign_p1": r"已到达\[.*?\]|已到达【.*?(仓|仓库|组|公司)】|已验收成功",
        "sign_p2": r"已签收|签收人|收件人:家门口",
        "sign_fallback": r"快件已投递",
    },
}


# --- 解析器: 圆通 (使用稳定的Postal版本) ---
def parse_logistics_events_yto(
    row: pd.Series, profile: dict, location_maps: dict
) -> pd.Series:
    """Extract the seven milestone timestamps from one YTO tracking log.

    Args:
        row: One merged record. Reads ``完整物流信息`` (newline-separated
            tracking log), ``寄出城市`` and ``寄达城市``.
        profile: Keyword regexes, all optional: ``collect``, ``delivery``,
            ``leave``, ``arrive``, ``sign_p1``/``sign_p2``/``sign_fallback``
            (tried in that order), ``sign_ignore``, and either ``center`` or
            ``center_p1``/``center_p2``.
        location_maps: Dict with ``city_alias_map``, ``capital_cities_set``
            and ``city_to_province_map``. It is only read, never modified.

    Returns:
        A Series indexed by the seven ``*_zzy`` column names. Any milestone
        that could not be located stays ``NaT``.
    """
    log_text = row["完整物流信息"]
    sender_city_key, dest_city_key = row["寄出城市"], row["寄达城市"]
    time_cols = [
        "揽收时间_zzy",
        "离开寄件城市时间_zzy",
        "到达收件城市时间_zzy",
        "派送时间_zzy",
        "签收时间_zzy",
        "到达分拣中心时间_zzy",
        "离开收件城市分拣中心时间_zzy",
    ]
    extracted_times = {col: pd.NaT for col in time_cols}
    if not isinstance(log_text, str) or not log_text.strip():
        return pd.Series(extracted_times)

    kw_collect, kw_delivery = profile.get("collect"), profile.get("delivery")
    kw_leave, kw_arrive = profile.get("leave"), profile.get("arrive")
    kw_sign_ignore = profile.get("sign_ignore")
    sign_tiers = (
        profile.get("sign_p1"),
        profile.get("sign_p2"),
        profile.get("sign_fallback"),
    )
    kw_center_p1, kw_center_p2 = profile.get("center_p1"), profile.get("center_p2")
    kw_center = profile.get("center")
    # A plain "center" key acts as the primary tier when no tiers are given.
    if not kw_center_p1 and not kw_center_p2 and kw_center:
        kw_center_p1 = kw_center
    kw_all_centers = ((kw_center_p1 or "") + "|" + (kw_center_p2 or "")).strip("|")

    # Keep only lines that carry a parseable timestamp, then order them in time.
    datetime_capture_pattern = r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
    all_events = []
    for line in log_text.strip().split("\n"):
        match = re.search(datetime_capture_pattern, line)
        if match:
            dt = pd.to_datetime(match.group(1), errors="coerce")
            if pd.notna(dt):
                all_events.append({"dt": dt, "line": line})
    if not all_events:
        return pd.Series(extracted_times)
    all_events.sort(key=lambda x: x["dt"])

    def get_location_pattern(city_key, location_maps):
        """Return a regex alternation of the city's aliases, or None."""
        if not city_key or not isinstance(city_key, str):
            return None
        # Copy first: the stored list is shared across all rows, and appending
        # the province to it directly would grow it on every call.
        aliases = list(
            location_maps["city_alias_map"].get(
                city_key, [city_key.replace("市", "")]
            )
        )
        if city_key in location_maps["capital_cities_set"]:
            province = location_maps["city_to_province_map"].get(city_key)
            if province:
                aliases.append(province.replace("省", "").replace("市", ""))
        return "|".join(map(re.escape, set(a for a in aliases if a)))

    # Collection: earliest matching line.
    for event in all_events:
        if kw_collect and re.search(kw_collect, event["line"]):
            extracted_times["揽收时间_zzy"] = event["dt"]
            break

    # Delivery: latest matching line (the search runs backwards).
    if kw_delivery:
        for event in reversed(all_events):
            if re.search(kw_delivery, event["line"]):
                extracted_times["派送时间_zzy"] = event["dt"]
                break

    # Sign time: first tier with a hit wins; within a tier take the latest line
    # that is not filtered out by sign_ignore.
    for kw_sign in sign_tiers:
        if not kw_sign:
            continue
        for event in reversed(all_events):
            if kw_sign_ignore and re.search(kw_sign_ignore, event["line"]):
                continue
            if re.search(kw_sign, event["line"]):
                extracted_times["签收时间_zzy"] = event["dt"]
                break
        if pd.notna(extracted_times["签收时间_zzy"]):
            break

    # --- Sender-city phase, plan A: centre lines that name the sender city ---
    plan_a_success = False
    sender_pattern = get_location_pattern(sender_city_key, location_maps)
    if pd.notna(extracted_times["揽收时间_zzy"]) and sender_pattern:
        origin_center_events = [
            e
            for e in all_events
            if e["dt"] > extracted_times["揽收时间_zzy"]
            and kw_all_centers
            and re.search(kw_all_centers, e["line"])
            and re.search(sender_pattern, e["line"])
        ]
        if origin_center_events:
            plan_a_success = True
            last_origin_center_event = origin_center_events[-1]
            # The last sender-city centre line gives the departure time only
            # when it is itself a "leave" line.
            if kw_leave and re.search(kw_leave, last_origin_center_event["line"]):
                extracted_times["离开寄件城市时间_zzy"] = last_origin_center_event["dt"]
            # Arrival: latest "arrive" line at the facility named in the first
            # 【...】 bracket of that last line.
            location_match = re.search(r"【(.*?)】", last_origin_center_event["line"])
            if location_match:
                location_name = re.escape(location_match.group(1))
                for event in reversed(origin_center_events):
                    if (
                        event["dt"] <= last_origin_center_event["dt"]
                        and re.search(location_name, event["line"])
                        and kw_arrive
                        and re.search(kw_arrive, event["line"])
                    ):
                        extracted_times["到达分拣中心时间_zzy"] = event["dt"]
                        break

    # --- Sender-city phase, plan B: first centre arrival regardless of city ---
    if not plan_a_success and pd.notna(extracted_times["揽收时间_zzy"]):
        first_center_arrival_event = None
        for event in all_events:
            if (
                event["dt"] > extracted_times["揽收时间_zzy"]
                and kw_all_centers
                and re.search(kw_all_centers, event["line"])
                and kw_arrive
                and re.search(kw_arrive, event["line"])
            ):
                extracted_times["到达分拣中心时间_zzy"] = event["dt"]
                first_center_arrival_event = event
                break
        if first_center_arrival_event:
            # Departure: earliest later "leave" line at that same facility.
            location_match = re.search(r"【(.*?)】", first_center_arrival_event["line"])
            if location_match:
                location_name = re.escape(location_match.group(1))
                for event in all_events:
                    if (
                        event["dt"] > first_center_arrival_event["dt"]
                        and re.search(location_name, event["line"])
                        and kw_leave
                        and re.search(kw_leave, event["line"])
                    ):
                        extracted_times["离开寄件城市时间_zzy"] = event["dt"]
                        break

    # --- Arrival in the destination city: earliest "arrive" line naming it ---
    dest_pattern = get_location_pattern(dest_city_key, location_maps)
    if dest_pattern:
        for event in all_events:
            if (
                kw_arrive
                and re.search(kw_arrive, event["line"])
                and re.search(dest_pattern, event["line"])
            ):
                extracted_times["到达收件城市时间_zzy"] = event["dt"]
                break

    # --- Leaving the destination-city sorting centre: latest "leave" line at
    #     a destination centre before delivery, p1 keywords first, then p2 ---
    if pd.notna(extracted_times["派送时间_zzy"]) and dest_pattern:
        for kw_tier in (kw_center_p1, kw_center_p2):
            if not kw_tier:
                continue
            for event in reversed(all_events):
                if (
                    event["dt"] < extracted_times["派送时间_zzy"]
                    and re.search(kw_tier, event["line"])
                    and kw_leave
                    and re.search(kw_leave, event["line"])
                    and re.search(dest_pattern, event["line"])
                ):
                    extracted_times["离开收件城市分拣中心时间_zzy"] = event["dt"]
                    break
            if pd.notna(extracted_times["离开收件城市分拣中心时间_zzy"]):
                break
    return pd.Series(extracted_times)


# --- 主流程: 圆通 ---
def process_yto_data(zhuzhuyun_dir, anjian_dir, output_dir, base_data_file):
    """Build the final YTO (圆通) logistics table and save it as an Excel file.

    Pipeline:
      1. Check that the merged Zhuzhuyun file "圆通.xlsx" exists (cheap, so it
         is done before any heavy Excel loading).
      2. Best effort: read sheet "city_hierarchy" of ``base_data_file`` into the
         ``location_maps`` dict (city aliases, capital cities, city->province)
         and sheet "inter-city_routes" into a per-route distance table.
         Failures are printed and processing continues without that data.
      3. Concatenate every ``*.xlsx`` in ``anjian_dir`` and keep the columns
         单号 / 企业 / 寄出城市 / 寄达城市.
      4. Left-join the Zhuzhuyun rows with that info on (单号, 企业), run the
         YTO parser through ``extractor_apply_based`` and copy the resulting
         ``*_zzy`` timestamp columns to their final names.
      5. Attach 公里 / 寄出省份 / 寄达省份 by route and write
         "<company>_logistics_data_FINAL.xlsx" into ``output_dir``.

    Args:
        zhuzhuyun_dir: Path of the folder holding the merged Zhuzhuyun files.
        anjian_dir: Path of the folder holding the 安监 sample workbooks.
        output_dir: Path of the folder the result workbook is written to.
        base_data_file: Path of "basic_data.xlsx".

    Returns:
        None. The result is written to disk; progress is printed.
    """
    company_name = "圆通"
    config = {
        "zhuzhu_filename": "圆通.xlsx",
        "internal_name": "圆通",
        "anjian_map_key": "YTO",
        "parser": parse_logistics_events_yto,
    }
    company_start_time = perf_counter()

    # Fail fast: without the company file nothing below is useful, so do not
    # spend time reading the reference workbooks first.
    company_file = zhuzhuyun_dir / config["zhuzhu_filename"]
    if not company_file.exists():
        print(f"[WARNING] 未找到文件: {config['zhuzhu_filename']}，已跳过。")
        return

    # pd.concat() raises ValueError on an empty list, so an empty 安监 folder
    # is reported explicitly instead of crashing.
    anjian_files = sorted(anjian_dir.glob("*.xlsx"))
    if not anjian_files:
        print(f"[WARNING] 未在 {anjian_dir} 中找到任何安监数据文件 (*.xlsx)，已跳过。")
        return

    # --- Location lookup tables handed to the parser ---
    location_maps = {
        "city_alias_map": {},
        "capital_cities_set": set(),
        "city_to_province_map": {},
    }
    if base_data_file.exists():
        try:
            df_hierarchy = pd.read_excel(base_data_file, sheet_name="city_hierarchy")
            df_hierarchy.dropna(subset=["Province", "City", "District"], inplace=True)
            df_hierarchy["City_clean"] = df_hierarchy["City"].str.replace(
                "市", "", regex=False
            )
            df_hierarchy["District_clean"] = df_hierarchy["District"].str.replace(
                "市", "", regex=False
            )
            # Aliases of a city = its own cleaned name + all its cleaned districts.
            for city, group in df_hierarchy.groupby("City"):
                aliases = [group.iloc[0]["City_clean"]] + group[
                    "District_clean"
                ].unique().tolist()
                location_maps["city_alias_map"][city] = [a for a in aliases if a]
            if "IsCapital" in df_hierarchy.columns:
                df_capitals = df_hierarchy[df_hierarchy["IsCapital"] == 1]
                location_maps["capital_cities_set"] = set(df_capitals["City"].unique())
            city_to_province = df_hierarchy.drop_duplicates("City")[
                ["City", "Province"]
            ].set_index("City")
            location_maps["city_to_province_map"] = city_to_province[
                "Province"
            ].to_dict()
        except Exception as e:
            print(f"[ERROR] 构建地理位置地图失败: {e}。")

    # --- Route table: distance and provinces keyed by "origin-destination" ---
    df_base_subset = None
    if base_data_file.exists():
        try:
            df_base = pd.read_excel(base_data_file, sheet_name="inter-city_routes")
            df_base["路线_std"] = (
                df_base["寄出城市"].str.replace("市", "", regex=False)
                + "-"
                + df_base["寄达城市"].str.replace("市", "", regex=False)
            )
            df_base_subset = df_base[
                ["路线_std", "公里", "寄出省份", "寄达省份"]
            ].drop_duplicates("路线_std")
        except Exception as e:
            print(f"[WARNING] 处理 'basic_data.xlsx' 出错: {e}。")

    # --- 安监 samples: authoritative origin / destination per waybill ---
    all_anjian_df = pd.concat(
        [pd.read_excel(f, dtype={"单号": str}) for f in anjian_files],
        ignore_index=True,
    )
    if "快递公司" in all_anjian_df.columns:
        all_anjian_df.rename(columns={"快递公司": "企业"}, inplace=True)
    # Translate the 安监 company code to the internal name; other values stay as is.
    anjian_to_internal_map = {config["anjian_map_key"]: config["internal_name"]}
    all_anjian_df["企业"] = (
        all_anjian_df["企业"].map(anjian_to_internal_map).fillna(all_anjian_df["企业"])
    )
    base_info_df = all_anjian_df[["单号", "企业", "寄出城市", "寄达城市"]].copy()

    # --- Zhuzhuyun tracking data joined with the 安监 info ---
    df_company_zhu = pd.read_excel(company_file, dtype={"快递单号": str})
    df_company_zhu.rename(
        columns={"快递单号": "单号", "快递公司": "企业"}, inplace=True
    )
    df_merged = pd.merge(df_company_zhu, base_info_df, on=["单号", "企业"], how="left")

    # Company-specific profile entries override the defaults.
    base_profile = COMPANY_PROFILES_YTO["default"].copy()
    company_specific_profile = COMPANY_PROFILES_YTO.get(config["internal_name"], {})
    profile = {**base_profile, **company_specific_profile}
    extracted_df = extractor_apply_based(
        df_merged, profile, location_maps, config["parser"]
    )
    df_final = pd.concat([df_merged, extracted_df], axis=1)

    # Copy "<name>_zzy" columns to their final names; a missing one becomes NaT.
    time_cols_to_map = [
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
    ]
    for col in time_cols_to_map:
        df_final[col] = df_final.get(f"{col}_zzy", pd.NaT)

    if df_base_subset is not None:
        df_final["路线_std"] = (
            df_final["寄出城市"].astype(str).str.replace("市", "", regex=False)
            + "-"
            + df_final["寄达城市"].astype(str).str.replace("市", "", regex=False)
        )
        df_final = pd.merge(df_final, df_base_subset, on="路线_std", how="left")

    df_final["企业"] = company_name
    output_columns = [
        "企业",
        "单号",
        "寄出城市",
        "寄达城市",
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
        "公里",
        "寄出省份",
        "寄达省份",
        "到达分拣中心时间",
        "离开收件城市分拣中心时间",
        "完整物流信息",
    ]
    # reindex() creates any column that is still missing (filled with NaN).
    final_df_to_save = df_final.reindex(columns=output_columns)
    output_file = output_dir / f"{company_name}_logistics_data_FINAL.xlsx"
    final_df_to_save.to_excel(output_file, index=False)
    company_end_time = perf_counter()
    print(
        f"✅ {company_name} 处理完成，耗时 {company_end_time - company_start_time:.2f} 秒。文件已保存至: {output_file.name}"
    )


# --- Run: YTO (圆通) ---
# Reads the merged Zhuzhuyun files and the 安监 samples and writes the result
# workbook into the "4_logistics数据" temp folder (pycharm_input_path).
process_yto_data(
    zhuzhuyun_merge_path, anjian_data_path, pycharm_input_path, base_data_path
)

--- [开始] 处理 圆通 ---
  -> 使用 'parse_logistics_events_yto' 解析器通过 .apply() 运行...
✅ 圆通 处理完成，耗时 94.29 秒。文件已保存至: 圆通_logistics_data_FINAL.xlsx


In [None]:
# --------------------------------------------------
# Cell 5: transit-data generation
# --------------------------------------------------
# --- 1. File paths ---
# Input paths
base_path = Path.cwd()
report_path = base_path / "报告数据"
input_dir = report_path / "输入"
anjian_dir = input_dir / "安监数据"
# Intermediate (temp) paths
temp_path = report_path / "temp"
# Full tracking logs are read from the merged Zhuzhuyun folder.
logistics_dir = temp_path / "3_猪猪云合并数据"
output_dir = temp_path / "5_中转数据"
basic_data_path = input_dir / "basic_data.xlsx"

# Create the folders if they are missing, then echo the layout for the user.
anjian_dir.mkdir(parents=True, exist_ok=True)
logistics_dir.mkdir(parents=True, exist_ok=True)
output_dir.mkdir(parents=True, exist_ok=True)
print(f"基础数据文件应位于: {basic_data_path}")
print(f"安监数据文件夹: {anjian_dir}")
print(f"物流明细文件夹: {logistics_dir}")
print(f"计算结果将输出至: {output_dir}")
print("-" * 40)


# --- 2. 城市加载与匹配逻辑 ---
def load_city_pattern(path_to_basic_data):
    """Compile a regex that matches any target city name from basic_data.xlsx.

    Reads sheet "city_names_complete_2025", keeps the rows whose "行政级别" is
    one of ``target_levels``, strips a trailing 市/省/自治区/特别行政区 from
    "城市" and joins the remaining names into one alternation, longest name
    first so that a longer name wins over a shorter one it starts with.

    Args:
        path_to_basic_data: Path of the basic-data workbook.

    Returns:
        A compiled ``re.Pattern``, or ``None`` when the file or the required
        columns are missing, no usable city name is left, or any error occurs
        while reading (the error is printed).
    """
    try:
        if not path_to_basic_data.exists():
            return None
        df_city = pd.read_excel(
            path_to_basic_data, sheet_name="city_names_complete_2025", engine="openpyxl"
        )
        if "城市" not in df_city.columns or "行政级别" not in df_city.columns:
            return None
        df_city["行政级别"] = df_city["行政级别"].str.strip()
        ####################  !!! Adjust the variable below to suit your needs !!!   #######################
        target_levels = ["地级市", "直辖市", "副省级城市", "省直辖县级市"]
        ####################  !!! Adjust the variable above to suit your needs !!!   #######################
        target_df = df_city[df_city["行政级别"].isin(target_levels)].copy()
        target_df["城市_clean"] = target_df["城市"].str.replace(
            r"(市|省|自治区|特别行政区)$", "", regex=True
        )
        # Drop blank names: an empty alternative would make the pattern match
        # the empty string at every position of every text.
        city_list = [
            city
            for city in target_df["城市_clean"].dropna().unique().tolist()
            if isinstance(city, str) and city.strip()
        ]
        if not city_list:
            print("未找到任何目标城市，无法构建匹配模式。")
            return None
        city_list.sort(key=len, reverse=True)
        print(f"成功加载 {len(city_list)} 个目标城市。")
        # Escape every name so it is matched literally even if it contains a
        # regex metacharacter.
        return re.compile("|".join(re.escape(city) for city in city_list))
    except Exception as e:
        print(f"加载和处理城市数据时出错: {e}")
        return None


# Compiled once at cell execution; None when the city list could not be loaded.
BASE_CITY_PATTERN = load_city_pattern(basic_data_path)
####################  !!! Adjust the variable below to suit your needs !!!   #######################
# Key: a matched city name; value: a context word. When the context word occurs
# in the same text, find_all_valid_cities_in_text() discards that match.
AMBIGUOUS_MAP = {
    "朝阳": "北京",
    "湘潭": "天津",
}  # Hard-coded ambiguous names (e.g. Beijing's 朝阳 district vs. the city 朝阳市)
##############################################################################


def find_all_valid_cities_in_text(text, pattern):
    """Return the distinct city names found in ``text``, each suffixed with "市".

    A match listed in AMBIGUOUS_MAP is dropped when its mapped context word
    also occurs in ``text``. The result keeps first-occurrence order. A
    non-string ``text`` or a falsy ``pattern`` yields an empty list.
    """
    if not (isinstance(text, str) and pattern):
        return []

    # A dict doubles as an insertion-ordered set for de-duplication.
    kept = {}
    for name in pattern.findall(text):
        if name in AMBIGUOUS_MAP and AMBIGUOUS_MAP[name] in text:
            # The context word marks this hit as noise, not a real city mention.
            continue
        kept[name + "市"] = None
    return list(kept)


# --- “锚点切割”算法---
def extract_transit_log_slice(log_entries, origin_city, dest_city):
    """Cut out the tracking entries that lie strictly between origin and destination.

    ``log_entries`` must already be in chronological order (earliest at
    index 0). The start anchor is the LAST entry mentioning the origin city
    (with every "市" removed from the name); the end anchor is the FIRST entry
    mentioning the destination city. The entries between the two anchors are
    returned; an empty list is returned when either anchor is missing or the
    start anchor does not come before the end anchor.
    """
    origin_key = origin_city.replace("市", "")
    dest_key = dest_city.replace("市", "")

    # Last mention of the origin city.
    origin_hits = [pos for pos, line in enumerate(log_entries) if origin_key in line]
    start = origin_hits[-1] if origin_hits else -1

    # First mention of the destination city.
    end = next(
        (pos for pos, line in enumerate(log_entries) if dest_key in line), -1
    )

    if start == -1 or end == -1 or start >= end:
        return []
    return log_entries[start + 1 : end]


# --- ---------------------------------------------------- ---


# --- 3. 核心处理逻辑 (以“锚点切割”为核心) ---
def process_company_data(logistics_path, anjian_path):
    """Compute per-route transit cities for one company and save them to Excel.

    The 安监 workbook supplies the authoritative origin/destination city of
    every waybill; the logistics workbook supplies the full tracking log. For
    each matched waybill the log is split into lines, reversed so the earliest
    entry comes first, cut down to the part between origin and destination
    (``extract_transit_log_slice``) and scanned for city names
    (``find_all_valid_cities_in_text`` with ``BASE_CITY_PATTERN``). Results are
    aggregated per (origin, destination) pair and written to
    "<company>_transit_data.xlsx" in the module-level ``output_dir``.

    Args:
        logistics_path: Path of the company's tracking workbook; needs the
            columns 快递单号 and 完整物流信息.
        anjian_path: Path of the matching 安监 workbook; needs the columns
            单号, 寄出城市 and 寄达城市.

    Returns:
        None. Errors while loading are printed and the company is skipped.
    """
    company_name = logistics_path.stem
    rename_map = {"邮政国内小包": "邮政"}
    company_name = rename_map.get(company_name, company_name)
    print(f"\n--- 正在处理: {company_name} ---")

    try:
        # Read waybill numbers as text. Letting pandas infer a numeric dtype
        # would drop leading zeros and, for float-typed columns, turn a number
        # into "123.0", so the join key would never match.
        df_anjian = pd.read_excel(anjian_path, dtype={"单号": str})
        df_logistics = pd.read_excel(logistics_path, dtype={"快递单号": str})

        df_anjian = df_anjian[["单号", "寄出城市", "寄达城市"]].dropna()
        # Normalise city names to exactly one trailing "市".
        df_anjian["寄出城市"] = (
            df_anjian["寄出城市"].str.replace("市", "", regex=False) + "市"
        )
        df_anjian["寄达城市"] = (
            df_anjian["寄达城市"].str.replace("市", "", regex=False) + "市"
        )
        df_anjian["单号"] = df_anjian["单号"].astype(str)

        df_logistics = df_logistics[["快递单号", "完整物流信息"]].dropna()
        df_logistics["快递单号"] = df_logistics["快递单号"].astype(str)

        merged_df = pd.merge(
            df_logistics, df_anjian, left_on="快递单号", right_on="单号", how="inner"
        )
        print(
            f"在 {logistics_path.name} 和 {anjian_path.name} 中共找到 {len(merged_df)} 条匹配的运单。"
        )

    except Exception as e:
        print(f"处理公司 {company_name} 时发生错误: {e}")
        return

    all_routes = []
    # Iterating the three columns directly avoids building a Series per row,
    # which is what iterrows() does.
    for origin_city_auth, dest_city_auth, full_log in zip(
        merged_df["寄出城市"], merged_df["寄达城市"], merged_df["完整物流信息"]
    ):
        # Non-text cells (e.g. a numeric log or a city that was not a string)
        # cannot be parsed; skip the waybill instead of crashing the whole run.
        if not (
            isinstance(full_log, str)
            and isinstance(origin_city_auth, str)
            and isinstance(dest_city_auth, str)
        ):
            continue

        log_entries = [entry.strip() for entry in full_log.split("\n") if entry.strip()]
        # Reversed so that index 0 is the earliest entry, as the slicing
        # helper requires.
        log_entries.reverse()

        transit_slice = extract_transit_log_slice(
            log_entries, origin_city_auth, dest_city_auth
        )

        endpoints = (origin_city_auth, dest_city_auth)
        transit_cities = set()
        for entry in transit_slice:
            for city in find_all_valid_cities_in_text(entry, BASE_CITY_PATTERN):
                if city not in endpoints:
                    transit_cities.add(city)

        all_routes.append(
            {
                "出发城市": origin_city_auth,
                "到达城市": dest_city_auth,
                "中转城市列表": sorted(transit_cities),
                "中转次数": len(transit_cities),
            }
        )

    if not all_routes:
        print(f"未能在 {company_name} 的数据中提取出任何有效的中转路径。")
        return

    # --- Aggregate per route: mean transit count and union of transit cities ---
    routes_df = pd.DataFrame(all_routes)
    agg_result = (
        routes_df.groupby(["出发城市", "到达城市"])
        .agg(
            平均中转次数=("中转次数", "mean"),
            中转城市=(
                "中转城市列表",
                lambda s: sorted({c for sub in s for c in sub}),
            ),
        )
        .reset_index()
    )
    agg_result["中转城市"] = agg_result["中转城市"].apply(lambda x: ",".join(x))
    agg_result["平均中转次数"] = agg_result["平均中转次数"].round(2)
    agg_result = agg_result.sort_values(by="平均中转次数", ascending=False).reset_index(
        drop=True
    )
    final_columns = ["出发城市", "到达城市", "中转城市", "平均中转次数"]
    agg_result = agg_result[final_columns]

    # --- Save the result ---
    output_filename = f"{company_name}_transit_data.xlsx"
    output_path = output_dir / output_filename
    agg_result.to_excel(output_path, index=False, engine="openpyxl")
    print(f"✔ {company_name} 的中转数据计算完成，已保存至: {output_path}")


# --- 7. 主程序入口 ---
def main():
    """Pair every tracking workbook with its 安监 workbook and process it.

    For each Excel file in ``logistics_dir`` the 安监 folder is searched for a
    file whose name contains the tracking file's stem; the first hit is passed
    to ``process_company_data``. Companies without a match are skipped with a
    warning. Nothing is done when the city pattern failed to load.
    """
    if not BASE_CITY_PATTERN:
        print("由于城市列表未能加载，无法继续处理文件。")
        return

    logistics_files = [*logistics_dir.glob("*.xlsx"), *logistics_dir.glob("*.xls")]
    if not logistics_files:
        print(f"警告：在物流明细文件夹中未找到任何Excel文件！请放入: {logistics_dir}")
        return

    for log_path in logistics_files:
        stem = log_path.stem
        print(f"\n正在为物流文件 [{log_path.name}] 寻找对应的安监数据...")

        # .xlsx candidates first, then .xls, so the "first match" is the same
        # one the file order of the two globs yields.
        candidates = []
        for extension in ("xlsx", "xls"):
            candidates.extend(anjian_dir.glob(f"*{stem}*.{extension}"))

        if candidates:
            chosen = candidates[0]
            print(f"--> 成功匹配: [{chosen.name}]")
            if len(candidates) > 1:
                print(
                    f"    注意：找到多个匹配项，已自动选择第一个。匹配项列表: {[p.name for p in candidates]}"
                )
            process_company_data(log_path, chosen)
        else:
            print(
                f"--> 警告：在安监文件夹中未找到任何包含 '{stem}' 的文件，跳过该公司。"
            )

    print("\n--- 所有文件处理完毕！ ---")


# Entry point of Cell 5: run the transit-data pipeline defined above.
if __name__ == "__main__":
    main()

基础数据文件应位于: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/输入/basic_data.xlsx
安监数据文件夹: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/输入/安监数据
物流明细文件夹: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/temp/3_猪猪云合并数据
计算结果将输出至: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/temp/5_中转数据
----------------------------------------
成功加载 297 个目标城市。

正在为物流文件 [圆通.xlsx] 寻找对应的安监数据...
--> 成功匹配: [2025年6月圆通抽样.xlsx]

--- 正在处理: 圆通 ---
在 圆通.xlsx 和 2025年6月圆通抽样.xlsx 中共找到 82733 条匹配的运单。
✔ 圆通 的中转数据计算完成，已保存至: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/temp/5_中转数据/圆通_transit_data.xlsx

正在为物流文件 [京东.xlsx] 寻找对应的安监数据...
--> 成功匹配: [2025年6月京东抽样.xlsx]

--- 正在处理: 京东 ---
在 京东.xlsx 和 2025年6月京东抽样.xlsx 中共找到 79688 条匹配的运单。
✔ 京东 的中转数据计算完成，已保存至: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis/报告数据/temp/5_中转数据/京东_transit_data.xlsx

正在为物流文件 [申通.xlsx] 寻找对应的安监数据...
--> 成功匹配: [2025年6月申通抽样.xlsx]

--- 正在处理: 申通 ---
在 申通.xlsx 和 2025年6月申通抽样.xlsx 中共找到 830

In [None]:
###################——————【Cell 6 补充】中转数据侦查工具——————#############################
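# If a "transit count" or "transit cities" value in the generated transit data looks unreasonable, use this tool to screen the
# logistics records and print the raw "完整物流信息" traces behind it. Adjust the four marked parameters below:
# 1) target company file name  2) target origin city  3) target destination city  4) the transit city you suspect is wrong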

import re
import sys
import warnings
from pathlib import Path

import pandas as pd

# --- Setup: make sure every path and setting this cell needs is defined ---
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
base_dir = Path.cwd()
input_dir = base_dir / "报告数据" / "输入"
# Folder with the merged Zhuzhuyun workbooks that the tracer scans.
data_dir = base_dir / "报告数据" / "temp" / "3_猪猪云合并数据"
basic_data_path = input_dir / "basic_data.xlsx"


def load_target_cities(path_to_basic_data):
    """Compile a regex matching target city names that are not followed by 区/县.

    Reads sheet "city_names_complete_2025", keeps the rows whose "行政级别" is
    one of ``target_levels``, strips a trailing 市/省/自治区/特别行政区 from
    "城市" and builds one alternation, longest name first. Every alternative
    carries a negative lookahead so that a name directly followed (optionally
    after one whitespace character) by "区" or "县" is not matched.

    Args:
        path_to_basic_data: Path of the basic-data workbook.

    Returns:
        A compiled ``re.Pattern``, or ``None`` when the file or the required
        columns are missing, no usable city name is left, or reading fails
        (the error is printed).
    """
    try:
        if not path_to_basic_data.exists():
            return None
        df_city = pd.read_excel(
            path_to_basic_data, sheet_name="city_names_complete_2025", engine="openpyxl"
        )
        if "城市" not in df_city.columns or "行政级别" not in df_city.columns:
            return None
        df_city["行政级别"] = df_city["行政级别"].str.strip()
        target_levels = ["地级市", "直辖市", "副省级城市", "省直辖县级市"]
        target_df = df_city[df_city["行政级别"].isin(target_levels)].copy()
        target_df["城市_clean"] = target_df["城市"].str.replace(
            r"(市|省|自治区|特别行政区)$", "", regex=True
        )
        # Blank names would produce an alternative that matches at every
        # position, so they are dropped.
        city_list = [
            city
            for city in target_df["城市_clean"].dropna().unique().tolist()
            if isinstance(city, str) and city.strip()
        ]
        if not city_list:
            return None
        city_list.sort(key=len, reverse=True)
        # Names are escaped so they are always matched literally.
        pattern_list = [re.escape(city) + r"(?![\s]?[区县])" for city in city_list]
        return re.compile("|".join(pattern_list))
    except Exception as e:
        # Report the failure instead of hiding it; the caller still gets None.
        print(f"加载和处理城市数据时出错: {e}")
        return None


# Compiled once at cell execution; None when the city list could not be loaded.
TARGET_CITY_PATTERN = load_target_cities(basic_data_path)


def find_all_cities(text, pattern):
    """Return every pattern match in ``text`` with "市" appended, in order of occurrence.

    Duplicates are kept. A non-string ``text`` or a falsy ``pattern`` yields
    an empty list.
    """
    if isinstance(text, str) and pattern:
        return [name + "市" for name in pattern.findall(text)]
    return []


def find_first_city(text, pattern):
    """Return the first pattern match in ``text`` with "市" appended, else ``None``.

    ``None`` is also returned for a non-string ``text`` or a falsy ``pattern``.
    """
    if not (isinstance(text, str) and pattern):
        return None
    hit = pattern.search(text)
    if hit is None:
        return None
    return hit.group() + "市"


def find_destination_city(log_entries, pattern):
    """Return the city of the latest entry (scanning backwards) that mentions one.

    The entry at index 0 is never inspected; the tracer reads the origin city
    from that entry. Returns ``None`` when no other entry contains a city.
    """
    for entry in reversed(log_entries[1:]):
        city = find_first_city(entry, pattern)
        if city:
            return city
    return None


# --- 物流侦探主程序 ---
def route_tracer(
    ################################  !!! Adjust these defaults (or pass arguments) to suit your needs !!!   ################################
    target_company_file="德邦.xlsx",  # target company file name
    target_origin="北京市",  # target origin city
    target_dest="漯河市",  # target destination city
    suspicious_transit="南阳市",  # the transit city you suspect is wrong
    ###################################################################################################################################
):
    """Print every raw tracking log on a given route that mentions a suspicious city.

    The workbook ``target_company_file`` is read from the module-level
    ``data_dir``. For each row the log in "完整物流信息" is split into lines and
    reversed (the rest of this notebook treats the reversed order as
    chronological); the origin city is the first city found in the first
    entry and the destination is the city of the latest entry that mentions
    one (``find_destination_city``). Rows whose inferred route equals
    (``target_origin``, ``target_dest``) and whose full log contains
    ``suspicious_transit`` (with "市" removed) are printed with their waybill
    number and spreadsheet row.

    Args:
        target_company_file: File name of the workbook inside ``data_dir``.
        target_origin: Origin city to trace, including the "市" suffix.
        target_dest: Destination city to trace, including the "市" suffix.
        suspicious_transit: City name to look for in the full log.

    Returns:
        None. All findings are printed.
    """
    print("--- 物流侦探工具已启动 ---")

    if not TARGET_CITY_PATTERN:
        print("错误：无法加载城市列表，追踪中止。")
        return

    file_path = data_dir / target_company_file
    if not file_path.exists():
        print(f"错误：目标文件 '{target_company_file}' 在 '{data_dir}' 中未找到。")
        return

    print(f"\n正在分析文件: {target_company_file}")
    print(
        f"追踪目标: 从 [{target_origin}] 到 [{target_dest}]，且中转了 [{suspicious_transit}] 的包裹"
    )
    print("=" * 60)

    try:
        df = pd.read_excel(file_path, engine="openpyxl")
        df.dropna(subset=["完整物流信息"], inplace=True)
    except Exception as e:
        print(f"读取或处理Excel文件时出错: {e}")
        return

    # Loop-invariant: the bare city name searched for in every log.
    suspicious_key = suspicious_transit.replace("市", "")

    found_count = 0
    for index, row in df.iterrows():
        full_log = row.get("完整物流信息", "")
        if not isinstance(full_log, str):
            continue

        log_entries = [entry.strip() for entry in full_log.split("\n") if entry.strip()]
        # At least two entries are needed to tell origin and destination apart.
        if len(log_entries) < 2:
            continue
        log_entries.reverse()

        origin_city = find_first_city(log_entries[0], TARGET_CITY_PATTERN)
        dest_city = find_destination_city(log_entries, TARGET_CITY_PATTERN)

        # Only rows on the traced route whose log mentions the suspicious city.
        if origin_city != target_origin or dest_city != target_dest:
            continue
        if suspicious_key not in full_log:
            continue

        found_count += 1
        waybill_no = row.get("快递单号", "未找到快递单号列")
        print("\n--- 找到匹配记录! ---")
        print(f"  快递单号: {waybill_no}")
        # +2 converts the 0-based frame index to the spreadsheet row
        # (one header row, 1-based numbering).
        print(f"  原始表格行号: {index + 2}")
        print("【完整的物流信息】:")
        print(full_log)
        print("-" * 20)

    print("=" * 60)
    if found_count == 0:
        print(
            f"在文件中未找到任何从 [{target_origin}] 到 [{target_dest}]，且物流信息中包含 [{suspicious_transit}] 的记录。"
        )
    else:
        print(f"追踪完毕，共找到 {found_count} 条符合条件的记录。")


# --- Run the tracer tool ---
route_tracer()

In [3]:
# ==============================================================================
# Cell 7: 核心数据计算层 (含中转次数和汇总统计)
# ==============================================================================
import sys
import warnings
from pathlib import Path
from time import perf_counter

import numpy as np
import pandas as pd

# Silence openpyxl UserWarnings and numpy RuntimeWarnings for this cell.
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
warnings.filterwarnings("ignore", category=RuntimeWarning, module="numpy")
print("库导入完成。")


# --- 1. Run-mode switch ---
# "ALL" processes every input file; "SINGLE" only the files whose name starts
# with TARGET_COMPANY (see main()).
RUN_MODE = "ALL"
TARGET_COMPANY = "邮政"

# --- 3. Project paths ---
base_path = Path.cwd()
report_path = base_path / "报告数据"
input_data_path = report_path / "temp" / "4_logistics数据"
zhuzhuyun_data_path = report_path / "temp" / "3_猪猪云合并数据"
transit_data_path = report_path / "temp" / "5_中转数据"
output_path = report_path / "输出" / "data_analysis_result"
basic_data_file = report_path / "输入" / "basic_data.xlsx"

# (Added) Mapping from company name to file-name stem, plus the reverse mapping.
COMPANY_TO_FILENAME_MAP = {
    "EMS": "EMS",
    "中通": "中通",
    "京东": "京东",
    "圆通": "圆通",
    "德邦": "德邦",
    "极兔": "极兔",
    "申通": "申通",
    "韵达": "韵达",
    "顺丰": "顺丰",
    "快包": "邮政",
}
FILENAME_TO_COMPANY_MAP = {v: k for k, v in COMPANY_TO_FILENAME_MAP.items()}


def load_top_cities(file_path: Path) -> set:
    """Load the top-30 city names from sheet "30_top_volume_city_2024".

    The values of column "城市" are converted to stripped strings and returned
    as a set. Any failure (missing file, missing column, read error) is
    printed and terminates the program with exit status 1.
    """
    sheet_name, column_name = "30_top_volume_city_2024", "城市"
    print(f"\n正在从 '{file_path.name}' 加载 Top 30 城市列表...")
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"基础数据文件不存在: {file_path}")

        sheet = pd.read_excel(file_path, sheet_name=sheet_name, engine="openpyxl")
        if column_name not in sheet.columns:
            raise ValueError(
                f"在Sheet '{sheet_name}' 中未找到名为 '{column_name}' 的列。"
            )

        names = sheet[column_name].dropna().astype(str).str.strip()
        cities_set = set(names.tolist())
        if cities_set:
            print(f"成功加载 {len(cities_set)} 个 Top 30 城市。")
        else:
            print(f"警告: 从 '{file_path.name}' 中加载的城市列表为空。")
        return cities_set
    except Exception as e:
        # The exceptions raised above land here too, so every failure is
        # reported through the same message before exiting.
        print(f"错误: 加载Top 30城市列表失败！错误信息: {e}")
        sys.exit(1)


def _round_metric(value, ndigits=4):
    """Round float-like scalars to ``ndigits``; return any other value unchanged."""
    if isinstance(value, (float, np.floating)):
        return round(value, ndigits)
    return value


def generate_basic_metrics_df(
    filtered_data: pd.DataFrame,
    original_data_count: int,
    top_30_cities: set,
    total_dropped_count: int,
) -> pd.DataFrame:
    """Build the 8-row summary table of time-limit metrics for one company.

    Rows 0-4 hold mean/max/min of the five time-limit columns, row 5 the share
    of waybills with 全程时限 <= 72, row 6 the share with 全程时限 <= 48 among
    waybills whose origin AND destination are in ``top_30_cities``, and row 7
    a text cell with the data-volume counts. The four right-hand columns hold,
    for rows 0-4, the mean of the same time-limit column per distance bucket
    of "公里". Missing values are shown as empty strings.

    Float results are rounded to 4 decimals value by value. Rounding the
    finished frame with ``DataFrame.round`` has no effect here, because every
    value column also contains "" and is therefore of object dtype, which
    ``DataFrame.round`` leaves untouched.

    Args:
        filtered_data: Cleaned waybill data with the five time-limit columns,
            "公里", "寄出城市" and "寄达城市".
        original_data_count: Number of rows before filtering (report text only).
        top_30_cities: City names used for the 48-hour rate.
        total_dropped_count: Number of rows removed by filtering (report text only).

    Returns:
        DataFrame with the columns 项目, mean, max, min, 0-600, 600-1500,
        1500-2500, 2500以上. For empty input, eight "无有效数据" rows.
    """
    time_cols = ["全程时限", "寄出地处理时限", "运输时限", "寄达地处理时限", "投递时限"]
    bucket_labels = ["0-600", "600-1500", "1500-2500", "2500以上"]

    if filtered_data.empty:
        metrics_data = [["无有效数据", "", "", ""]] * 8
        distance_metrics = [[""] * 4] * 8
    else:
        total_count = filtered_data.shape[0]
        top_30_data_all = filtered_data[
            (filtered_data["寄出城市"].isin(top_30_cities))
            & (filtered_data["寄达城市"].isin(top_30_cities))
        ]
        total_top_30_count = top_30_data_all.shape[0]

        # Rows 0-4: overall mean / max / min per time-limit column.
        metrics_data = []
        for col in time_cols:
            values = filtered_data[col]
            metrics_data.append(
                [
                    col,
                    _round_metric(values.mean()),
                    _round_metric(values.max()),
                    _round_metric(values.min()),
                ]
            )

        # Row 5: 72-hour rate over all valid rows (never empty in this branch).
        rate_72 = (filtered_data["全程时限"] <= 72).mean()
        metrics_data.append(["72小时准时率", _round_metric(rate_72), "", ""])

        # Row 6: 48-hour rate, restricted to routes between top-30 cities.
        rate_48 = (
            (top_30_data_all["全程时限"] <= 48).mean() if total_top_30_count > 0 else 0
        )
        metrics_data.append(["48小时准时率", _round_metric(rate_48), "", ""])

        # Row 7: data-volume summary as one multi-line text cell.
        metrics_data.append(
            [
                f"总数据量 {original_data_count}\n总筛选掉数据量 {total_dropped_count}\n最终有效数据量 {total_count}\n业务量前30的城市间数据量 {total_top_30_count}",
                "",
                "",
                "",
            ]
        )

        # Distance buckets are half-open: lower bound included, upper excluded.
        km = filtered_data["公里"]
        buckets = {
            "0-600": filtered_data[km < 600],
            "600-1500": filtered_data[(km >= 600) & (km < 1500)],
            "1500-2500": filtered_data[(km >= 1500) & (km < 2500)],
            "2500以上": filtered_data[km >= 2500],
        }
        distance_metrics = [
            [_round_metric(buckets[label][col].mean()) for label in bucket_labels]
            for col in time_cols
        ]
        # Rows 5-7 have no per-distance values.
        distance_metrics.extend([[""] * 4 for _ in range(3)])

    metrics_df = pd.DataFrame(metrics_data, columns=["项目", "mean", "max", "min"])
    distance_df = pd.DataFrame(distance_metrics, columns=bucket_labels)
    # NaN (e.g. the mean of an empty bucket) is shown as an empty cell.
    return pd.concat([metrics_df, distance_df], axis=1).fillna("")


def main():
    """Run the metric calculation for the selected files and print a filter summary.

    Creates the working folders, loads the top-30 city set, selects the input
    files according to RUN_MODE / TARGET_COMPANY, calls
    ``process_single_file`` on each of them and finally prints a Markdown
    table with the per-company filtering statistics plus a total row.
    Exits the program for an invalid RUN_MODE (status 1) or when no input
    file is found (status 0).
    """
    working_dirs = (
        report_path,
        input_data_path,
        zhuzhuyun_data_path,
        transit_data_path,
        output_path,
    )
    for folder in working_dirs:
        folder.mkdir(exist_ok=True, parents=True)
    print("项目文件夹结构设置/检查完毕。")
    print(f"输入数据(Logistics数据)应位于: {input_data_path}")
    print(f"分析报告将输出至: {output_path}")

    top_30_cities = load_top_cities(basic_data_file)
    start_time = perf_counter()

    # --- Select the files to process ---
    candidates = [*input_data_path.glob("*.xlsx"), *input_data_path.glob("*.csv")]
    if RUN_MODE not in ("ALL", "SINGLE"):
        print(f"错误: 无效的 RUN_MODE '{RUN_MODE}'。")
        sys.exit(1)
    if RUN_MODE == "ALL":
        selected = candidates
    else:
        selected = [f for f in candidates if f.name.startswith(TARGET_COMPANY)]

    if not selected:
        print("\n警告: 未找到任何待处理的数据文件，程序即将退出。")
        sys.exit(0)

    # --- Process every file once, in sorted order, collecting its statistics ---
    unique_files = sorted(set(selected))
    print(f"\n找到 {len(unique_files)} 个待处理文件，开始分析...")
    summary_stats = []
    for data_file in unique_files:
        file_stats = process_single_file(data_file, top_30_cities)
        if file_stats:
            summary_stats.append(file_stats)

    elapsed = perf_counter() - start_time
    print(f"\n\n所有文件处理完毕！总耗时: {elapsed:.2f} 秒。")

    if not summary_stats:
        return

    # --- Summary table of the filtering steps ---
    rule = "=" * 20
    print("\n" + rule + " 数据筛选流程汇总统计 " + rule)
    summary_df = pd.DataFrame(summary_stats)

    # Derived columns.
    total_removed = (
        summary_df["因时间戳缺失删除"]
        + summary_df["因时限空值删除"]
        + summary_df["因范围不符删除"]
    )
    summary_df["**总计删除**"] = total_removed
    summary_df["最终保留"] = summary_df["原始数据量"] - summary_df["**总计删除**"]
    keep_ratio = summary_df["最终保留"] / summary_df["原始数据量"]
    summary_df["**保留率**"] = keep_ratio.apply("{:.1%}".format)

    # Fixed column order for the report.
    column_order = [
        "公司 (Company)",
        "原始数据量",
        "因时间戳缺失删除",
        "因时限空值删除",
        "因范围不符删除",
        "**总计删除**",
        "最终保留",
        "**保留率**",
    ]
    summary_df = summary_df[column_order]

    # Total row: sum of the numeric columns, with its own label and ratio.
    total_row = summary_df.select_dtypes(include=np.number).sum()
    total_row["公司 (Company)"] = "**总计**"
    overall_ratio = total_row["最终保留"] / total_row["原始数据量"]
    total_row["**保留率**"] = f"{overall_ratio:.1%}"

    summary_df = pd.concat(
        [summary_df, total_row.to_frame().T], ignore_index=True
    )

    # Print the table in Markdown format.
    print(summary_df.to_markdown(index=False))


def _hours_between(end: pd.Series, start: pd.Series) -> pd.Series:
    """Return the elapsed time from *start* to *end* as fractional hours."""
    return (end - start) / np.timedelta64(1, "h")


def process_single_file(file_path: Path, top_30_cities: set):
    """Clean one logistics file, write its analysis workbook and return drop counts.

    The function loads the file, optionally merges the full tracking text from
    the matching file under ``zhuzhuyun_data_path``, derives nine duration
    columns (in hours), drops rows with missing or out-of-range durations,
    adds the ``T+1_achieved`` / ``T+2_achieved`` / ``is_air`` label columns and
    writes a three-sheet Excel workbook into ``output_path``.

    It relies on module-level names defined elsewhere in this file:
    ``zhuzhuyun_data_path``, ``transit_data_path``, ``output_path``,
    ``FILENAME_TO_COMPANY_MAP`` and ``generate_basic_metrics_df``.

    Args:
        file_path: Input ``.xlsx`` file (read with openpyxl); any other suffix
            is read as CSV. The part of the file name before the first ``_``
            is used as the company name.
        top_30_cities: Passed through unchanged to
            ``generate_basic_metrics_df``.

    Returns:
        A dict with the company key, the original row count and the number of
        rows dropped at each filtering stage, or ``None`` when the file cannot
        be read, lacks a required column, or has no rows left after filtering.
    """
    file_name = file_path.name
    print(f"\n--- 正在处理文件: {file_name} ---")

    # --- 1. Load the data ---
    try:
        # Compare the suffix case-insensitively so "X.XLSX" is not parsed as CSV.
        if file_path.suffix.lower() == ".xlsx":
            data = pd.read_excel(
                file_path, header=0, dtype={"单号": str}, engine="openpyxl"
            )
        else:
            data = pd.read_csv(file_path, header=0, dtype={"单号": str})
        original_data_count = data.shape[0]
        print(f"原始数据量: {original_data_count} 条")
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch.
        print(f"读取文件 {file_name} 失败: {e}。跳过此文件。")
        return None

    # --- 2. Check required columns and add missing optional ones ---
    core_required_cols = [
        "单号",
        "寄出省份",
        "寄出城市",
        "寄达省份",
        "寄达城市",
        "公里",
        "揽收时间",
        "离开寄件城市时间",
        "到达收件城市时间",
        "派送时间",
        "签收时间",
    ]
    optional_cols = ["到达分拣中心时间", "离开收件城市分拣中心时间"]
    if not all(c in data.columns for c in core_required_cols):
        print(f"错误: 文件 {file_name} 缺少必要的核心列。跳过此文件。")
        return None
    # After this loop both optional columns always exist, so the code below
    # never needs to test for their presence again.
    for col in optional_cols:
        if col not in data.columns:
            data[col] = pd.NaT

    # --- Merge the tracking text from the matching Zhuzhuyun file ---
    company_name_from_file = file_name.split("_")[0]
    zhuzhuyun_file = zhuzhuyun_data_path / f"{company_name_from_file}.xlsx"
    if zhuzhuyun_file.exists():
        try:
            df_zhuzhu = pd.read_excel(
                zhuzhuyun_file,
                usecols=["快递单号", "完整物流信息"],
                dtype={"快递单号": str},
                engine="openpyxl",
            ).rename(columns={"快递单号": "单号"})
            data = pd.merge(
                data, df_zhuzhu.drop_duplicates(subset=["单号"]), on="单号", how="left"
            )
            if "完整物流信息_y" in data.columns:
                # Both sides carried the column: prefer the Zhuzhuyun value (_y)
                # and fall back to the value already in the main data (_x).
                data["完整物流信息"] = data["完整物流信息_y"].fillna(
                    data["完整物流信息_x"]
                )
                data.drop(columns=["完整物流信息_x", "完整物流信息_y"], inplace=True)

        except Exception as e:
            # Best effort: continue without the merged tracking text.
            print(f"  - 警告: 合并猪猪云数据失败: {e}")
    if "完整物流信息" not in data.columns:
        data["完整物流信息"] = ""

    # --- 3. Parse timestamps and compute durations ---
    core_time_cols = [c for c in core_required_cols if "时间" in c]
    all_time_cols = [c for c in core_required_cols + optional_cols if "时间" in c]
    for col in all_time_cols:
        # Unparseable values become NaT and are removed by the dropna below.
        data[col] = pd.to_datetime(data[col], errors="coerce")

    count_before_time_dropna = len(data)
    data.dropna(subset=core_time_cols, inplace=True)
    time_dropped_count = count_before_time_dropna - len(data)
    print(f"  - 因核心时间戳缺失，删除了 {time_dropped_count} 行。")

    # The five core durations, all in hours.
    data["全程时限"] = _hours_between(data["签收时间"], data["揽收时间"])
    data["寄出地处理时限"] = _hours_between(
        data["离开寄件城市时间"], data["揽收时间"]
    )
    data["寄达地处理时限"] = _hours_between(
        data["派送时间"], data["到达收件城市时间"]
    )
    data["投递时限"] = _hours_between(data["签收时间"], data["派送时间"])
    data["运输时限"] = _hours_between(
        data["到达收件城市时间"], data["离开寄件城市时间"]
    )

    # The four sorting-center durations; they are NaN wherever the optional
    # timestamp is missing (including when the whole column was filled with NaT).
    data["揽收-到达寄出地分拣中心时长"] = _hours_between(
        data["到达分拣中心时间"], data["揽收时间"]
    )
    data["到达寄出地分拣中心-离开寄出地城市时长"] = _hours_between(
        data["离开寄件城市时间"], data["到达分拣中心时间"]
    )
    data["到达寄达地城市-离开寄达地分拣中心时长"] = _hours_between(
        data["离开收件城市分拣中心时间"], data["到达收件城市时间"]
    )
    data["离开寄达地分拣中心-派件"] = _hours_between(
        data["派送时间"], data["离开收件城市分拣中心时间"]
    )

    # --- 4. Filtering, with looser rules for the excluded companies ---
    company_key = FILENAME_TO_COMPANY_MAP.get(
        company_name_from_file, company_name_from_file
    )
    EXCLUDED_COMPANIES = {"中通", "顺丰"}
    is_excluded_company = company_key in EXCLUDED_COMPANIES

    count_before_filter = len(data)

    core_duration_cols = [
        "全程时限",
        "寄出地处理时限",
        "寄达地处理时限",
        "投递时限",
        "运输时限",
    ]
    sorting_duration_cols = [
        "揽收-到达寄出地分拣中心时长",
        "到达寄出地分拣中心-离开寄出地城市时长",
        "到达寄达地城市-离开寄达地分拣中心时长",
        "离开寄达地分拣中心-派件",
    ]
    all_duration_cols = core_duration_cols + sorting_duration_cols

    if is_excluded_company:
        print("  - 正在为中通/顺丰执行宽松筛选(第1步)：仅检查5个核心时限的空值...")
        columns_to_check_for_nan = core_duration_cols
    else:
        print("  - 正在为其他公司执行严格筛选(第1步)：检查所有9个时限的空值...")
        columns_to_check_for_nan = all_duration_cols

    data.dropna(subset=columns_to_check_for_nan, inplace=True)
    count_after_nan_drop = len(data)
    nan_dropped_count = count_before_filter - count_after_nan_drop
    print(f"    -> 因时限存在空值，删除了 {nan_dropped_count} 行。")

    print("  - 正在筛选(第2步)：应用业务逻辑范围条件...")
    # Accepted ranges in hours (bounds inclusive) for four core durations.
    base_mask = (
        data["寄出地处理时限"].between(0.1, 48)
        & data["运输时限"].between(0.5, 200)
        & data["寄达地处理时限"].between(0.1, 60)
        & data["投递时限"].between(0, 36)
    )

    if not is_excluded_company:
        print("    -> 正在为其他公司增加额外筛选：4个分拣中心时长 > 0")
        for col in sorting_duration_cols:
            base_mask &= data[col] > 0

    filtered_data = data.loc[base_mask].copy()

    count_after_value_filter = len(filtered_data)
    value_dropped_count = count_after_nan_drop - count_after_value_filter
    total_dropped_this_stage = nan_dropped_count + value_dropped_count
    print(f"    -> 因时限值不符合业务范围，又删除了 {value_dropped_count} 行。")
    print(f"  - [小计] 时效筛选阶段共删除 {total_dropped_this_stage} 行。")

    total_dropped_count = time_dropped_count + total_dropped_this_stage

    if filtered_data.empty:
        print("筛选后无有效数据，无法生成报告。跳过此文件。")
        return None

    # --- 5. Label columns used by downstream calculations ---
    pickup_day = filtered_data["揽收时间"].dt.normalize()
    signed_day = filtered_data["签收时间"].dt.normalize()
    # T+N: signed for no later than N calendar days after the pickup day.
    filtered_data["T+1_achieved"] = signed_day <= (pickup_day + pd.Timedelta(days=1))
    filtered_data["T+2_achieved"] = signed_day <= (pickup_day + pd.Timedelta(days=2))

    AIR_KEYWORDS_MAP = {"EMS": "飞往|发往航站准备安检", "快包": "飞往|发往航站准备安检"}
    air_keyword = AIR_KEYWORDS_MAP.get(company_key, "")
    if air_keyword:
        # Cast to str so non-string cells cannot produce NaN flags or make the
        # .str accessor fail; the result is always a plain boolean column.
        filtered_data["is_air"] = (
            filtered_data["完整物流信息"]
            .fillna("")
            .astype(str)
            .str.contains(air_keyword, regex=True)
        )
    else:
        filtered_data["is_air"] = False

    # --- 6. Build the three report sheets ---
    basic_metrics_df = generate_basic_metrics_df(
        filtered_data, original_data_count, top_30_cities, total_dropped_count
    )

    # Sheet '线路详细数据': one row per parcel.
    final_detailed_cols = (
        core_required_cols
        + optional_cols
        + ["完整物流信息"]
        + all_duration_cols
        + ["T+1_achieved", "T+2_achieved", "is_air"]
    )
    detailed_routes_df = filtered_data[
        [col for col in final_detailed_cols if col in filtered_data.columns]
    ].round(3)

    # Sheet '线路汇总数据': one row per (origin city, destination city).
    print("  - 正在计算'线路汇总数据'...")
    route_keys = ["寄出城市", "寄达城市"]
    # Calendar days between pickup day and signing day; added before grouping
    # so the single groupby below can also serve the percentile.
    filtered_data["送达天数"] = (signed_day - pickup_day).dt.days
    grouped = filtered_data.groupby(route_keys)

    mean_duration_cols = [
        "全程时限",
        "寄出地处理时限",
        "运输时限",
        "寄达地处理时限",
        "投递时限",
    ] + sorting_duration_cols
    agg_dict = {"快递数量": ("单号", "count")}
    agg_dict.update({col: (col, "mean") for col in mean_duration_cols})
    summary_df = grouped.agg(**agg_dict)

    # On-time rate = share of parcels per route delivered within the limit.
    # The mean of a boolean flag equals count(within limit) / count(all).
    route_key_series = [filtered_data[key] for key in route_keys]
    for limit_hours, rate_col in ((72, "72小时准时率"), (48, "48小时准时率")):
        summary_df[rate_col] = (
            filtered_data["全程时限"]
            .le(limit_hours)
            .groupby(route_key_series)
            .mean()
        )

    summary_df["送达天数_80分位"] = grouped["送达天数"].quantile(
        0.8, interpolation="higher"
    )
    summary_df.reset_index(inplace=True)
    summary_df["路线"] = summary_df["寄出城市"] + "-" + summary_df["寄达城市"]
    summary_df = summary_df.round(4)

    print("  - 正在合并中转次数数据...")
    transit_filename_key = company_name_from_file
    if company_key == "快包":
        # The 快包 company key uses the transit file named after 邮政.
        transit_filename_key = "邮政"
    transfer_file = transit_data_path / f"{transit_filename_key}_transit_data.xlsx"
    if transfer_file.exists():
        try:
            df_transfer = pd.read_excel(transfer_file, engine="openpyxl")
            if "出发城市" in df_transfer.columns and "到达城市" in df_transfer.columns:
                df_transfer["路线"] = (
                    df_transfer["出发城市"] + "-" + df_transfer["到达城市"]
                )
                df_agg = (
                    df_transfer.groupby("路线")[["平均中转次数"]]
                    .mean()
                    .rename(columns={"平均中转次数": "中转次数"})
                )
                summary_df = pd.merge(summary_df, df_agg, on="路线", how="left")
        except Exception as e:
            # Best effort: the report is still written without transfer counts.
            print(f"   -> 警告: 处理中转文件 {transfer_file.name} 失败: {e}")
    else:
        print(f"   -> 提示: 未找到对应的中转文件: {transfer_file.name}")

    if "中转次数" not in summary_df.columns:
        summary_df["中转次数"] = np.nan

    final_summary_cols = (
        ["寄出城市", "寄达城市", "路线", "快递数量"]
        + mean_duration_cols
        + ["72小时准时率", "48小时准时率", "送达天数_80分位", "中转次数"]
    )
    summary_df = summary_df[
        [col for col in final_summary_cols if col in summary_df.columns]
    ]

    # --- 7. Write the Excel workbook ---
    output_file_path = (
        output_path / f"{company_name_from_file}_data_analysis_result.xlsx"
    )
    with pd.ExcelWriter(output_file_path) as writer:
        basic_metrics_df.to_excel(writer, sheet_name="基础指标", index=False)
        detailed_routes_df.to_excel(writer, sheet_name="线路详细数据", index=False)
        summary_df.to_excel(writer, sheet_name="线路汇总数据", index=False)
    print(f"分析完成, 结果已保存至: {output_file_path}")

    # Per-file drop statistics, collected by the caller into a summary table.
    stats_dict = {
        "公司 (Company)": company_key,
        "原始数据量": original_data_count,
        "因时间戳缺失删除": time_dropped_count,
        "因时限空值删除": nan_dropped_count,
        "因范围不符删除": value_dropped_count,
    }
    return stats_dict


# Entry point: call main() only when this module is run directly,
# not when it is imported by another module.
if __name__ == "__main__":
    main()

库导入完成。
项目文件夹结构设置/检查完毕。
输入数据(Logistics数据)应位于: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis_202507/报告数据/temp/4_logistics数据
分析报告将输出至: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis_202507/报告数据/输出/data_analysis_result

正在从 'basic_data.xlsx' 加载 Top 30 城市列表...
成功加载 30 个 Top 30 城市。

找到 10 个待处理文件，开始分析...

--- 正在处理文件: EMS_logistics_data.xlsx ---
原始数据量: 41287 条
  - 因核心时间戳缺失，删除了 247 行。
  - 正在为其他公司执行严格筛选(第1步)：检查所有9个时限的空值...
    -> 因时限存在空值，删除了 645 行。
  - 正在筛选(第2步)：应用业务逻辑范围条件...
    -> 正在为其他公司增加额外筛选：4个分拣中心时长 > 0
    -> 因时限值不符合业务范围，又删除了 1847 行。
  - [小计] 时效筛选阶段共删除 2492 行。
  - 正在计算'线路汇总数据'...
  - 正在合并中转次数数据...
分析完成, 结果已保存至: /Users/lava/Documents/国家邮政局发展研究中心实习/python_data_analysis_202507/报告数据/输出/data_analysis_result/EMS_data_analysis_result.xlsx

--- 正在处理文件: 中通_logistics_data.xlsx ---
原始数据量: 97785 条
  - 因核心时间戳缺失，删除了 0 行。
  - 正在为中通/顺丰执行宽松筛选(第1步)：仅检查5个核心时限的空值...
    -> 因时限存在空值，删除了 0 行。
  - 正在筛选(第2步)：应用业务逻辑范围条件...
    -> 因时限值不符合业务范围，又删除了 23655 行。
  - [小计] 时效筛选阶段共删除 23655 行。
  - 正在计算'线