In [1]:
import pandas as pd
import numpy as np
import os

# ==============================================================================
# 函数库定义区 (复用我们之前建立的三个核心函数)
# ==============================================================================

def preprocess_panel_winsorize(panel_df: pd.DataFrame,
                               lower_quantile: float = 0.02,
                               upper_quantile: float = 0.98) -> pd.DataFrame:
    """Winsorize every numeric column of a wide panel, column by column.

    For each numeric column the ``lower_quantile`` and ``upper_quantile``
    values are computed from that column's non-missing observations, and the
    column is clipped to that range. Non-numeric columns and columns with no
    valid observations are passed through unchanged. Missing values stay
    missing.

    Args:
        panel_df: Wide panel to process. It is copied, never modified.
        lower_quantile: Quantile (0-1) used as the lower clipping bound.
        upper_quantile: Quantile (0-1) used as the upper clipping bound.

    Returns:
        A new DataFrame with the same shape and labels as ``panel_df`` whose
        numeric columns have been clipped.
    """
    print(f"\n--- [步骤C] 开始执行对称缩尾处理（Winsorize） ---")
    print(f"处理规则: 下限={lower_quantile*100:.1f}%, 上限={upper_quantile*100:.1f}%")

    clipped_panel = panel_df.copy()
    for label in clipped_panel.columns:
        values = clipped_panel[label]
        if not pd.api.types.is_numeric_dtype(values):
            # Leave text/object columns untouched.
            continue

        observed = values.dropna()
        if observed.empty:
            # Nothing to derive bounds from; keep the all-missing column.
            continue

        floor = observed.quantile(lower_quantile)
        ceiling = observed.quantile(upper_quantile)
        clipped_panel[label] = values.clip(lower=floor, upper=ceiling)

    print(f"--- [步骤C] 对称缩尾处理完成 ---")
    return clipped_panel

def calculate_stability_factor(data_df: pd.DataFrame, factor_name: str) -> pd.Series:
    """Score each row by the z-score of its average across all columns.

    The row-wise mean of ``data_df`` is computed first (missing values are
    skipped by pandas' default), then standardized against the mean and
    sample standard deviation of those row means.

    Args:
        data_df: Wide panel; one row per entity, one column per period.
        factor_name: Name assigned to the returned Series.

    Returns:
        A Series indexed like ``data_df`` and named ``factor_name`` holding
        the standardized row averages.
    """
    print(f"\n--- [分支任务A] 开始计算 {factor_name} 稳定性因子 ---")

    row_means = data_df.mean(axis=1)
    center = row_means.mean()
    spread = row_means.std()

    zscores = (row_means - center) / spread
    zscores.name = factor_name

    print(f"--- [分支任务A] {factor_name} 稳定性因子计算完成 ---")
    return zscores

def build_composite_growth_factor(panel_data: pd.DataFrame, factor_prefix: str, years_list: list = [3, 4, 5], latest_window_size: int = 4):
    """Build a composite growth score from several growth horizons.

    Steps:
      1. For every horizon in ``years_list`` compute a per-row growth rate
         with ``calculate_stable_growth``.
      2. Standardize each horizon's growth rates into a z-score column.
      3. Keep only rows with at least two non-missing z-scores, fill the
         remaining gaps with 0, and average the z-scores per row.
      4. Standardize that average into the final composite score and sort
         the rows by it in descending order.

    Args:
        panel_data: Wide panel; each row is passed to
            ``calculate_stable_growth``.
        factor_prefix: Prefix used to name all generated columns.
        years_list: Growth horizons, in years. Not modified here.
        latest_window_size: Forwarded to ``calculate_stable_growth``.

    Returns:
        A DataFrame containing the per-horizon growth columns, the
        per-horizon z-score columns and
        ``composite_<factor_prefix>_growth_score``, sorted by the composite
        score from highest to lowest.
    """
    print(f"\n--- [分支任务B] 开始构建 {factor_prefix.upper()} 复合成长因子 ---")

    scores = pd.DataFrame(index=panel_data.index)

    # Pass 1: all growth-rate columns are created first so that they precede
    # the z-score columns in the output.
    for horizon in years_list:
        scores[f'{factor_prefix}_cagr_{horizon}yr_stable'] = panel_data.apply(
            calculate_stable_growth,
            axis=1,
            target_years=horizon,
            latest_window_size=latest_window_size,
        )

    # Pass 2: standardize each horizon's growth rate across rows.
    z_columns = []
    for horizon in years_list:
        z_name = f'{factor_prefix}_zscore_{horizon}yr_stable'
        z_columns.append(z_name)
        growth = scores[f'{factor_prefix}_cagr_{horizon}yr_stable']
        scores[z_name] = (growth - growth.mean()) / growth.std()

    # Require a minimum number of usable horizons per row.
    min_valid_scores = 2
    has_enough = scores[z_columns].count(axis=1) >= min_valid_scores
    kept = scores[has_enough].copy()

    # Missing horizons count as a neutral (zero) z-score in the average.
    kept.loc[:, z_columns] = kept.loc[:, z_columns].fillna(0)

    composite_name = f'composite_{factor_prefix}_growth_score'
    row_average = kept[z_columns].mean(axis=1)
    kept.loc[:, composite_name] = (row_average - row_average.mean()) / row_average.std()

    kept = kept.sort_values(by=composite_name, ascending=False)

    print(f"--- [分支任务B] {factor_prefix.upper()} 复合成长因子构建完成 ---")
    return kept

def calculate_stable_growth(row, target_years, base_window_half_size=2, min_periods_in_window=3, latest_window_size=4):
    """Compute an annualized growth rate between two averaged windows of a row.

    Missing values are dropped first, so all positions below refer to the
    remaining observations only. The "latest" level is the mean of the last
    ``latest_window_size`` observations. The "base" level is the mean of a
    window of ``2 * base_window_half_size + 1`` observations centred
    ``target_years * 4 + 1`` observations (plus an optional shift of 0-5)
    before the end of the series. The first shift whose window fits inside
    the series and contains at least ``min_periods_in_window`` values is
    used.

    Args:
        row: A pandas Series of observations in chronological order.
        target_years: Desired distance, in years, between base and latest
            windows. Four observations are treated as one year.
        base_window_half_size: Observations on each side of the base centre.
        min_periods_in_window: Minimum values required in the base window.
        latest_window_size: Number of trailing observations averaged as the
            latest level.

    Returns:
        ``np.nan`` when there are too few observations or no base window
        fits; ``-10`` when either averaged level is not strictly positive;
        otherwise ``(latest / base) ** (1 / years) - 1``, where ``years`` is
        the positional distance between the last elements of the two windows
        divided by 4, floored at 1.
    """
    observed = row.dropna()
    n_obs = len(observed)
    if n_obs < latest_window_size:
        return np.nan

    recent = observed.iloc[-latest_window_size:]
    recent_mean = recent.mean()

    baseline_mean = np.nan
    span_years = np.nan
    for offset in range(6):
        center = -(target_years * 4) - 1 - offset
        start = center - base_window_half_size
        stop = center + base_window_half_size + 1

        # The window must lie entirely inside the series (negative indexing).
        if start < -n_obs or stop > 0:
            continue

        candidate = observed.iloc[start:stop]
        if candidate.count() < min_periods_in_window:
            continue

        baseline_mean = candidate.mean()
        recent_pos = observed.index.get_loc(recent.index[-1])
        base_pos = observed.index.get_loc(candidate.index[-1])
        span_years = (recent_pos - base_pos) / 4.0
        break

    if pd.isna(baseline_mean):
        return np.nan

    if not (baseline_mean > 0 and recent_mean > 0):
        # Sentinel penalty: a growth rate is undefined for non-positive levels.
        return -10

    if span_years < 1:
        span_years = 1.0
    return (recent_mean / baseline_mean) ** (1 / span_years) - 1

# ==============================================================================
# 主流程执行区
# ==============================================================================
# Script entry point: loads two wide panels from Excel, divides the first by
# the second element-wise, winsorizes the resulting ratio, and writes two
# factor files (row-average z-score and composite growth score) to OUTPUT_DIR.
if __name__ == "__main__":
    
    # --- Configuration: input workbooks, output directory, column prefix ---
    GM_FILE_PATH = '/Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/初步数据/初步数据/stock_financial_indicators_gross_margin_wide_panel.xlsx'
    ASSETS_FILE_PATH = '/Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/初步数据/初步数据/stock_balance_sheets_total_assets_wide_panel.xlsx'
    OUTPUT_DIR = '/Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/z-scores/'
    FACTOR_PREFIX = 'gm_ta_ratio'
    
    print(f"====== 开始为新因子 '毛利润/总资产' 构建评分 ======")
    
    # --- Step A: load both input panels; the first column becomes the index ---
    print("\n--- [步骤A] 加载毛利润和总资产数据 ---")
    try:
        gm_df = pd.read_excel(GM_FILE_PATH, index_col=0)
        assets_df = pd.read_excel(ASSETS_FILE_PATH, index_col=0)
    except FileNotFoundError as e:
        # Abort the run with a readable message if either workbook is missing.
        raise SystemExit(f"[错误] 文件未找到: {e.filename}")

    # --- Step B: build the raw ratio (first panel divided by second panel) ---
    print("\n--- [步骤B] 构造 '毛利润/总资产' 原始比率指标 ---")
    gm_df, assets_df = gm_df.align(assets_df, join='inner') # keep only labels present in both panels
    raw_ratio_df = gm_df / assets_df
    # Infinite results (e.g. from a zero denominator) are treated as missing.
    raw_ratio_df.replace([np.inf, -np.inf], np.nan, inplace=True)
    print("原始比率指标构造完成。")

    # --- Step C: symmetric winsorization of the raw ratio ---
    clean_ratio_df = preprocess_panel_winsorize(
        raw_ratio_df,
        lower_quantile=0.02, # symmetric 2%-98% winsorization
        upper_quantile=0.98
    )
    print("\n----------------------------------------------------")
    
    # --- Step D: compute and save the "stability" factor ---
    stability_score = calculate_stability_factor(
        data_df=clean_ratio_df, 
        factor_name='gm_ta_stability_score'
    )
    output_stability_path = os.path.join(OUTPUT_DIR, f"Z值-毛利润比总资产-稳定性.xlsx")
    try:
        # The output directory is created here only; Step E relies on it
        # already existing.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        stability_score.to_excel(output_stability_path, header=True)
        print(f"[成功] 已将“稳定性”因子保存至: {output_stability_path}")
    except Exception as e:
        # Best effort: report the failure and continue with the growth factor.
        print(f"[失败] “稳定性”因子保存失败: {e}")
        
    print("\n----------------------------------------------------")
        
    # --- Step E: compute and save the "growth" factor ---
    growth_score = build_composite_growth_factor(
        panel_data=clean_ratio_df,
        factor_prefix=FACTOR_PREFIX,
        latest_window_size=4
    )
    output_growth_path = os.path.join(OUTPUT_DIR, f"Z值-毛利润比总资产-成长性.xlsx")
    try:
        growth_score.to_excel(output_growth_path)
        print(f"[成功] 已将“成长性”因子保存至: {output_growth_path}")
    except Exception as e:
        # Best effort: report the failure; the final banner is still printed.
        print(f"[失败] “成长性”因子保存失败: {e}")

    print("\n====== '毛利润/总资产'双因子全部计算并保存完毕！ ======")


--- [步骤A] 加载毛利润和总资产数据 ---

--- [步骤B] 构造 '毛利润/总资产' 原始比率指标 ---
原始比率指标构造完成。

--- [步骤C] 开始执行对称缩尾处理（Winsorize） ---
处理规则: 下限=2.0%, 上限=98.0%
--- [步骤C] 对称缩尾处理完成 ---

----------------------------------------------------

--- [分支任务A] 开始计算 gm_ta_stability_score 稳定性因子 ---
--- [分支任务A] gm_ta_stability_score 稳定性因子计算完成 ---
[成功] 已将“稳定性”因子保存至: /Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/z-scores/Z值-毛利润比总资产-稳定性.xlsx

----------------------------------------------------

--- [分支任务B] 开始构建 GM_TA_RATIO 复合成长因子 ---
--- [分支任务B] GM_TA_RATIO 复合成长因子构建完成 ---
[成功] 已将“成长性”因子保存至: /Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/z-scores/Z值-毛利润比总资产-成长性.xlsx

