In [1]:
import pandas as pd
import numpy as np
import os

# ==============================================================================
# 函数库定义区 (保持不变)
# ==============================================================================

def preprocess_panel_data_flexible(panel_df: pd.DataFrame, 
                                   lower_quantile: float = 0.005, 
                                   upper_quantile: float = 0.98) -> pd.DataFrame:
    """Winsorize each numeric column of a wide panel with asymmetric quantile bounds.

    Args:
        panel_df: Wide panel to process. It is copied, never modified in place.
        lower_quantile: Quantile (0-1) of a column's non-missing values used as
            the lower clipping bound.
        upper_quantile: Quantile (0-1) of a column's non-missing values used as
            the upper clipping bound.

    Returns:
        A copy of ``panel_df`` in which every numeric column that has at least
        one non-missing value is clipped to its own [lower, upper] quantile
        bounds. Non-numeric columns and all-missing columns are left as they are.
    """
    print("--- [步骤1] 开始执行非对称数据预处理（Winsorization） ---")
    print(f"处理规则: 下限={lower_quantile*100:.1f}%, 上限={upper_quantile*100:.1f}%")

    winsorized = panel_df.copy()

    # Decide up front which columns are eligible; clipping never turns a
    # numeric column into a non-numeric one, so the selection stays valid.
    numeric_labels = [
        label for label in winsorized.columns
        if pd.api.types.is_numeric_dtype(winsorized[label])
    ]

    for label in numeric_labels:
        values = winsorized[label]
        observed = values.dropna()
        if not observed.empty:
            # Bounds are computed per column, from that column's observed values only.
            floor = observed.quantile(lower_quantile)
            cap = observed.quantile(upper_quantile)
            winsorized[label] = values.clip(lower=floor, upper=cap)

    print("--- [步骤1] 数据预处理完成 ---")
    return winsorized

def calculate_stability_factor(data_df: pd.DataFrame, factor_name: str) -> pd.Series:
    """Compute a stability factor: the cross-sectional z-score of each row's mean.

    Args:
        data_df: Wide panel; every row is averaged across its numeric columns
            (missing values are skipped by pandas' default ``skipna``).
        factor_name: Name given to the returned Series; also shown in the
            progress messages.

    Returns:
        A Series indexed like ``data_df`` holding
        ``(row_mean - mean(row_means)) / std(row_means)``, where ``std`` is
        pandas' default sample standard deviation. When that standard
        deviation is zero or undefined (e.g. fewer than two usable rows) the
        scores come out as NaN/inf rather than raising.
    """
    print(f"\n--- [步骤2] 开始计算 {factor_name} 稳定性因子 ---")

    # numeric_only=True: a panel that still carries a text column (name,
    # industry, ...) would otherwise raise TypeError on pandas >= 2.0.
    historical_avg = data_df.mean(axis=1, numeric_only=True)
    print("步骤2.1: 计算出的各公司历史平均值（基于ROA）:")
    print(historical_avg.head())

    avg_mean, avg_std = historical_avg.mean(), historical_avg.std()
    stability_zscore = (historical_avg - avg_mean) / avg_std
    stability_zscore.name = factor_name
    print(f"\n--- [步骤2] {factor_name} 稳定性因子计算完成 ---")
    return stability_zscore

# ==============================================================================
# 主流程执行区
# ==============================================================================

if __name__ == "__main__":

    # --- Configuration ---
    INPUT_FILE_PATH = '/Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/初步数据/初步数据/stock_financial_indicators_roa_yearly_wide_panel.xlsx'

    # Full path (directory + file name) of the exported workbook.
    OUTPUT_FILE_PATH = '/Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/z-scores/Z值-ROA-平均值.xlsx'

    print(f"====== 开始处理文件: {INPUT_FILE_PATH} ======")

    # --- 1. Load the raw wide panel (first column becomes the index) ---
    # NOTE(review): the variable names, messages and factor name below say
    # "ROE", but the input and output file names refer to ROA. They are left
    # unchanged because the factor name becomes the exported column header;
    # confirm which label is intended.
    try:
        raw_q_roe_df = pd.read_excel(INPUT_FILE_PATH, index_col=0)
        print(f"\n成功加载Q-ROE原始数据，形状: {raw_q_roe_df.shape}")
    except FileNotFoundError:
        raise SystemExit(f"[错误] 文件未找到: {INPUT_FILE_PATH}")

    # --- 2. Asymmetric winsorization of the raw panel ---
    clean_q_roe_df = preprocess_panel_data_flexible(
        raw_q_roe_df,
        lower_quantile=0.005,
        upper_quantile=0.98
    )

    # --- 3. Stability factor computed from the winsorized panel ---
    roe_stability_score = calculate_stability_factor(
        data_df=clean_q_roe_df,
        factor_name='roe_stability_score'  # name of the exported score column
    )

    # --- 4. Save the result ---
    # Create the target directory first instead of saving, swallowing whatever
    # went wrong and retrying: the old flow hid the real cause of a failure
    # (missing Excel engine, permissions, ...) behind a generic "retrying" hint.
    try:
        output_dir = os.path.dirname(OUTPUT_FILE_PATH)
        if output_dir:  # a bare file name has no directory to create
            os.makedirs(output_dir, exist_ok=True)
        roe_stability_score.to_excel(OUTPUT_FILE_PATH, header=True)
        print("-" * 60)
        print(f"\n[成功] 已将最终的因子得分导出到: {OUTPUT_FILE_PATH}")
        print("最终因子数据（前5行）:")
        print(roe_stability_score.sort_values(ascending=False).head())
    except Exception as save_error:
        # Best-effort export: report the actual error instead of crashing the cell.
        print(f"\n[失败] 创建目录后保存仍然失败: {save_error}")


成功加载Q-ROE原始数据，形状: (5413, 29)
--- [步骤1] 开始执行非对称数据预处理（Winsorization） ---
处理规则: 下限=0.5%, 上限=98.0%
--- [步骤1] 数据预处理完成 ---

--- [步骤2] 开始计算 roe_stability_score 稳定性因子 ---
步骤2.1: 计算出的各公司历史平均值（基于ROA）:
ts_code
000001.SZ    0.863386
000002.SZ    1.225217
000004.SZ   -7.043448
000006.SZ    2.329907
000007.SZ   -3.285101
dtype: float64

--- [步骤2] roe_stability_score 稳定性因子计算完成 ---
------------------------------------------------------------

[成功] 已将最终的因子得分导出到: /Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/z-scores/Z值-ROA-平均值.xlsx
最终因子数据（前5行）:
ts_code
603444.SH    3.433465
300628.SZ    3.373967
600519.SH    3.346370
834950.BJ    3.203722
300529.SZ    3.145954
Name: roe_stability_score, dtype: float64


In [2]:
import pandas as pd
import numpy as np
import os

# ==============================================================================
# 函数库定义区 (包含微小升级)
# ==============================================================================

def preprocess_panel_data_flexible(panel_df: pd.DataFrame, 
                                   lower_quantile: float = 0.005, 
                                   upper_quantile: float = 0.98) -> pd.DataFrame:
    """Asymmetrically winsorize the numeric columns of a wide panel.

    NOTE: a function with this same name is also defined in an earlier cell of
    this notebook; when the cells run in order this definition replaces it.

    Args:
        panel_df: Wide panel to process; the input frame is not modified.
        lower_quantile: Per-column quantile (0-1) used as the lower clip bound.
        upper_quantile: Per-column quantile (0-1) used as the upper clip bound.

    Returns:
        A copy of ``panel_df`` where each numeric column with at least one
        non-missing value is clipped to its own quantile bounds. Other columns
        are returned unchanged.
    """
    print("--- [步骤1] 开始执行非对称数据预处理 ---")
    result = panel_df.copy()
    for label in result.columns:
        series = result[label]
        if pd.api.types.is_numeric_dtype(series):
            non_missing = series.dropna()
            # Columns without any observation have no quantiles to clip to.
            if len(non_missing) > 0:
                result[label] = series.clip(
                    lower=non_missing.quantile(lower_quantile),
                    upper=non_missing.quantile(upper_quantile),
                )
    print("--- [步骤1] 数据预处理完成 ---")
    return result

def calculate_stable_growth(row, target_years,
                            base_window_half_size=2,
                            min_periods_in_window=3,
                            latest_window_size=4,
                            periods_per_year=4):
    """Annualized growth between a smoothed base level and a smoothed latest level.

    Both levels are window averages over the row's non-missing observations,
    which dampens the effect of a single outlying period.

    Args:
        row: Series of one entity's observations in chronological order
            (oldest first). Missing values are dropped before any windowing,
            so all offsets count observed periods.
        target_years: Desired look-back horizon in years.
        base_window_half_size: Number of periods on each side of the base
            period's centre included in the base window.
        min_periods_in_window: Minimum number of observations the base window
            must contain to be accepted.
        latest_window_size: Number of most recent observations averaged to form
            the "latest" level.
        periods_per_year: Observations per year (4 = quarterly, 1 = yearly).
            Converts ``target_years`` into an offset and offsets back into years.

    Returns:
        * ``np.nan`` when there are fewer than ``latest_window_size``
          observations or no acceptable base window exists;
        * ``(latest_avg / base_avg) ** (1 / years) - 1`` when both averages are
          positive, where ``years`` is the distance between the last element of
          each window, floored at 1;
        * ``-10`` (sentinel) when either average is non-positive, since a
          compound growth rate is undefined there.
    """
    series = row.dropna()
    n_obs = len(series)
    if n_obs < latest_window_size:
        return np.nan

    latest_avg = series.iloc[-latest_window_size:].mean()

    base_avg, actual_years = np.nan, np.nan
    # Try the nominal base period first, then up to five periods further back.
    for shift in range(6):
        center = -(target_years * periods_per_year) - 1 - shift
        start = center - base_window_half_size
        end = center + base_window_half_size + 1
        # The window must lie inside the series. end == 0 passes this test but
        # slices to an empty window, which the min-period check then rejects.
        if start >= -n_obs and end <= 0:
            base_window = series.iloc[start:end]
            if base_window.count() >= min_periods_in_window:
                base_avg = base_window.mean()
                # Distance between the last element of the latest window
                # (position n_obs - 1) and of the base window (n_obs + end - 1).
                # Computed positionally so duplicate index labels cannot break
                # it, unlike a label lookup via Index.get_loc.
                actual_years = -end / periods_per_year
                break

    if pd.isna(base_avg):
        return np.nan
    if base_avg > 0 and latest_avg > 0:
        # Never annualize over less than one year.
        actual_years = max(actual_years, 1.0)
        return (latest_avg / base_avg) ** (1 / actual_years) - 1
    # Growth rate undefined for non-positive levels: return the sentinel.
    return -10

def build_composite_growth_factor(panel_data: pd.DataFrame, 
                                  factor_prefix: str,
                                  years_list: list = [3, 4, 5],
                                  latest_window_size: int = 4):
    """Build a composite growth factor from a wide panel.

    Steps:
      1. For every horizon in ``years_list`` compute a per-row growth rate
         with ``calculate_stable_growth``.
      2. Z-score each horizon's growth column across rows.
      3. Keep only rows with at least two non-missing z-scores, fill the
         remaining gaps with 0 and average the z-scores.
      4. Z-score that average to obtain the final composite score.

    Args:
        panel_data: Wide panel, one row per entity.
        factor_prefix: Prefix used in every generated column name.
        years_list: Growth horizons (in years) to combine.
        latest_window_size: Forwarded to ``calculate_stable_growth``.

    Returns:
        DataFrame with the growth columns, the z-score columns and
        ``composite_<prefix>_growth_score``, restricted to the retained rows
        and sorted by the composite score in descending order.
    """
    print(f"\n--- [步骤2] 开始构建复合成长因子: {factor_prefix}_growth ---")
    scores = pd.DataFrame(index=panel_data.index)

    # Pass 1: growth rates. Done in a separate pass so that all growth
    # columns precede the z-score columns in the output.
    for horizon in years_list:
        scores[f'{factor_prefix}_cagr_{horizon}yr_stable'] = panel_data.apply(
            calculate_stable_growth, axis=1, target_years=horizon, latest_window_size=latest_window_size
        )

    # Pass 2: cross-sectional z-score per horizon.
    zscore_cols = [f'{factor_prefix}_zscore_{horizon}yr_stable' for horizon in years_list]
    for horizon, z_name in zip(years_list, zscore_cols):
        growth = scores[f'{factor_prefix}_cagr_{horizon}yr_stable']
        scores[z_name] = (growth - growth.mean()) / growth.std()

    # Require a minimum number of horizons with a usable score.
    min_valid_scores = 2
    has_enough = scores[zscore_cols].count(axis=1) >= min_valid_scores
    kept = scores[has_enough].copy()

    # A missing horizon counts as "average" (z = 0) in the composite.
    kept.loc[:, zscore_cols] = kept.loc[:, zscore_cols].fillna(0)

    composite = kept[zscore_cols].mean(axis=1)
    final_score_col = f'composite_{factor_prefix}_growth_score'
    kept.loc[:, final_score_col] = (composite - composite.mean()) / composite.std()

    ranked = kept.sort_values(by=final_score_col, ascending=False)
    print("--- [步骤2] 因子构建完成 ---")
    return ranked

# ==============================================================================
# 主流程执行区
# ==============================================================================

if __name__ == "__main__":
    
    # --- Configuration ---
    INPUT_FILE_PATH = '/Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/初步数据/初步数据/stock_financial_indicators_roa_yearly_wide_panel.xlsx'
    OUTPUT_DIR = '/Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/z-scores/'
    FACTOR_PREFIX = 'roa' # prefix for generated column names and the output file name (ROA here)
    
    print(f"====== 开始为 {FACTOR_PREFIX.upper()} 构建成长性因子 ======")
    
    # --- 1. Load and preprocess the data ---
    # The first column of the workbook becomes the index; a missing input file
    # aborts the run.
    try:
        raw_df = pd.read_excel(INPUT_FILE_PATH, index_col=0)
    except FileNotFoundError:
        raise SystemExit(f"[错误] 输入文件未找到: {INPUT_FILE_PATH}")

    # Asymmetric winsorization: clip each numeric column at its 0.5% / 98% quantiles.
    clean_df = preprocess_panel_data_flexible(
        raw_df, lower_quantile=0.005, upper_quantile=0.98 
    )
    
    # --- 2. Build the growth factor from the preprocessed data ---
    # NOTE(review): the variable is named "roe_..." although FACTOR_PREFIX and
    # the input file name refer to ROA; the name looks like a leftover.
    roe_growth_factor = build_composite_growth_factor(
        panel_data=clean_df,
        factor_prefix=FACTOR_PREFIX,
        latest_window_size=4  # average the 4 most recent observations as the current level
    )

    # --- 3. Save the result ---
    output_filename = f"Z值-{FACTOR_PREFIX.upper()}-成长性.xlsx"
    output_path = os.path.join(OUTPUT_DIR, output_filename)
    
    # Best-effort export: any failure while creating the directory, writing the
    # workbook or printing the preview is reported, not raised.
    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        roe_growth_factor.to_excel(output_path)
        print(f"\n[成功] 已将ROA成长性因子得分导出到: {output_path}")
        print("最终因子数据（前5行）:")
        print(roe_growth_factor.head())
    except Exception as e:
        print(f"\n[失败] 最终文件导出失败: {e}")
        
    print("\n====== 工作流全部执行完毕！ ======")

--- [步骤1] 开始执行非对称数据预处理 ---
--- [步骤1] 数据预处理完成 ---

--- [步骤2] 开始构建复合成长因子: roa_growth ---
--- [步骤2] 因子构建完成 ---

[成功] 已将ROA成长性因子得分导出到: /Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/z-scores/Z值-ROA-成长性.xlsx
最终因子数据（前5行）:
           roa_cagr_3yr_stable  roa_cagr_4yr_stable  roa_cagr_5yr_stable  \
ts_code                                                                    
603725.SH             6.516228             0.107188             0.842760   
600066.SH             3.151974             1.370739             0.474805   
300611.SZ             3.887982             0.133152             0.207654   
600318.SH             3.863930             0.160064             0.052374   
002096.SZ             1.391894             1.102236             1.586860   

           roa_zscore_3yr_stable  roa_zscore_4yr_stable  \
ts_code                                                   
603725.SH               2.110986               0.721339   
600066.SH               1.380203               0.996927