In [1]:
import pandas as pd
import numpy as np
import os # 引入os模块来处理文件路径

# ==============================================================================
#  Step 0: load the gross-margin wide panel from the Excel file below
# ==============================================================================
# The preview printed by this cell shows one row per `ts_code` and one column
# per quarter-end date (2018-03-31, 2018-06-30, ...).
print("--- 步骤 0: 从Excel文件加载准备好的毛利率宽表数据 ---")

# NOTE(review): absolute, user-specific path; the cell only runs on this machine.
file_path = '/Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/初步数据/毛利率/stock_financial_indicators_grossprofit_margin_wide_panel.xlsx'

try:
    # index_col=0 makes the first spreadsheet column the DataFrame index.
    df_gross_margin = pd.read_excel(file_path, index_col=0)
    print(f"成功从路径 '{file_path}' 加载数据！")
    print("数据预览：")
    print(df_gross_margin.head())

except FileNotFoundError:
    print(f"[错误] 文件未找到！请检查路径是否正确: {file_path}")
    # Fall back to an empty frame so the `.empty` guard below skips the pipeline.
    df_gross_margin = pd.DataFrame()
except Exception as e:
    # Any other read error is reported and also downgraded to an empty frame.
    print(f"读取Excel文件时出错: {e}")
    df_gross_margin = pd.DataFrame()

# ==============================================================================
#  Pre-processing and factor computation
# ==============================================================================

if not df_gross_margin.empty:
    print("\n--- 开始执行数据预处理和因子计算 ---")
    
    # --- Step 1: data sufficiency filter ---
    # Keep only rows that have at least MIN_PERIODS non-null values.
    print("\n--- 步骤 1: 数据充分性筛选 ---")
    valid_counts = df_gross_margin.notna().sum(axis=1)
    MIN_PERIODS = 12
    df_filtered = df_gross_margin[valid_counts >= MIN_PERIODS]
    print(f"原始公司数量: {len(df_gross_margin)}")
    print(f"筛选后剩余公司数量: {len(df_filtered)}")

    if not df_filtered.empty:
        # --- Step 2: outlier handling (winsorization) ---
        # The 1% / 99% bounds are pooled quantiles over every cell of the
        # filtered panel (all rows and all columns stacked together), not
        # per-column bounds; every value is then clipped into that range.
        print("\n--- 步骤 2: 极端值处理 (缩尾) ---")
        lower_bound = df_filtered.stack().quantile(0.01)
        upper_bound = df_filtered.stack().quantile(0.99)
        df_winsorized = df_filtered.clip(lower=lower_bound, upper=upper_bound)
        print(f"已对数据在 [{lower_bound:.2f}, {upper_bound:.2f}] 范围内进行缩尾处理。")

        # --- Step 3: per-row mean of the winsorized values ---
        # mean(axis=1) skips NaN by default, so each row is averaged over the
        # values it actually has.
        print("\n--- 步骤 3: 计算处理后数据的平均值 ---")
        average_gm = df_winsorized.mean(axis=1)
        
        # --- Step 4: z-score standardization ---
        # Standardizes the row means across all remaining rows.
        print("\n--- 步骤 4: Z-Score 标准化 ---")
        z_scores_gm = (average_gm - average_gm.mean()) / average_gm.std()
        print("Z-Score 计算完成。")

        # ==============================================================================
        #  Step 5: export the final result to Excel
        # ==============================================================================
        print("\n--- 步骤 5: 导出Z-score因子值到Excel文件 ---")
        
        # Output directory and full output file path.
        output_dir = '/Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/z-scores'
        output_filename = 'Z值-毛利率-平均值.xlsx'
        output_path = os.path.join(output_dir, output_filename)
        
        try:
            # Make sure the output directory exists.
            os.makedirs(output_dir, exist_ok=True)
            
            # Convert the z_scores_gm Series to a one-column DataFrame named
            # 'z_score_gm' and write it, keeping the index as the first column.
            z_scores_gm.to_frame(name='z_score_gm').to_excel(output_path, index=True)
            
            print(f"\n[成功] 因子已成功导出到: {output_path}")
            
        except Exception as e:
            # Export failures are reported but do not raise.
            print(f"\n[失败] 文件导出失败: {e}")
        # ==============================================================================

    else:
        print("经过数据充分性筛选后，没有剩余的公司可供处理。")
else:
    print("数据加载失败或数据为空，未执行后续计算。")

--- 步骤 0: 从Excel文件加载准备好的毛利率宽表数据 ---
成功从路径 '/Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/初步数据/毛利率/stock_financial_indicators_grossprofit_margin_wide_panel.xlsx' 加载数据！
数据预览：
           2018-03-31  2018-06-30  2018-09-30  2018-12-31  2019-03-31  \
ts_code                                                                 
000002.SZ     34.1720     34.4372     34.7626     37.4816     35.0440   
000004.SZ     79.8477     81.9358     82.4253     81.8792     76.6223   
000006.SZ     22.9650     38.9920     38.9828     38.7866     39.2680   
000007.SZ     79.1646     73.3189     70.7013     69.3740     67.1958   
000008.SZ     57.7922     51.1164     48.5656     48.9184     53.0718   

           2019-06-30  2019-09-30  2019-12-31  2020-03-31  2020-06-30  ...  \
ts_code                                                                ...   
000002.SZ     36.2453     35.9903     36.2451     31.2754     31.8089  ...   
000004.SZ     75.9448     75.5123     73.0102     83.0721    

In [2]:
import pandas as pd
import numpy as np

# ----------------------------------------------------------------
# Column-wise winsorization helper, defined in this cell so the cell is
# self-contained.
# ----------------------------------------------------------------
def preprocess_panel_data(panel_df: pd.DataFrame,
                          winsorize_limits: tuple = (0.02, 0.02)) -> pd.DataFrame:
    """
    Winsorize every numeric column of a wide panel and return the result.

    Args:
        panel_df: Wide panel; each column is winsorized independently.
        winsorize_limits: ``(lower_tail, upper_tail)`` fractions. Values below
            the ``lower_tail`` quantile or above the ``1 - upper_tail``
            quantile of a column are clipped to those quantiles.

    Returns:
        A new DataFrame; ``panel_df`` itself is not modified. Non-numeric
        columns (a warning is printed) and columns without any non-null value
        are passed through unchanged.
    """
    print("--- 开始执行数据预处理（Winsorization） ---")
    result = panel_df.copy()
    for column in result.columns:
        values = result[column]
        if not pd.api.types.is_numeric_dtype(values):
            print(f"警告：列 '{column}' 非数值类型，已跳过处理。")
            continue
        observed = values.dropna()
        if observed.empty:
            # Nothing to estimate quantiles from.
            continue
        floor = observed.quantile(winsorize_limits[0])
        cap = observed.quantile(1 - winsorize_limits[1])
        result[column] = values.clip(lower=floor, upper=cap)
        print(f"已处理列: {column} | "
              f"下边界({winsorize_limits[0]*100:.0f}%): {floor:.4f}, "
              f"上边界({(1-winsorize_limits[1])*100:.0f}%): {cap:.4f}")
    print("--- 数据预处理完成 ---")
    return result
# ----------------------------------------------------------------


# --- Main flow ---

# 1. Path of the Excel file to load.
#    Reading .xlsx needs openpyxl installed (pip install openpyxl).
# NOTE(review): absolute, user-specific path; the cell only runs on this machine.
file_path = '/Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/初步数据/毛利率/stock_financial_indicators_grossprofit_margin_wide_panel.xlsx'

# 2. Load the Excel file into a DataFrame.
try:
    # index_col=0 uses the first spreadsheet column as the DataFrame index
    # (the printed preview shows it is `ts_code`).
    raw_panel_df = pd.read_excel(file_path, index_col=0)
    print(f"成功加载文件: {file_path}")
    print("原始数据（前5行）:")
    print(raw_panel_df.head())
    print("-" * 60)

except FileNotFoundError:
    print(f"[错误] 文件未找到: {file_path}")
    print("请确认文件路径是否正确。")
    # Mark the load as failed so the pre-processing step below is skipped.
    # Only FileNotFoundError is handled; any other read error propagates.
    raw_panel_df = None

# 3. Pre-process only when the load succeeded.
if raw_panel_df is not None:
    # Winsorize each column with the function's default limits.
    clean_panel_df = preprocess_panel_data(raw_panel_df)
    
    print("-" * 60)
    print("预处理后的“干净”数据（前5行）:")
    print(clean_panel_df.head())

成功加载文件: /Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/初步数据/毛利率/stock_financial_indicators_grossprofit_margin_wide_panel.xlsx
原始数据（前5行）:
           2018-03-31  2018-06-30  2018-09-30  2018-12-31  2019-03-31  \
ts_code                                                                 
000002.SZ     34.1720     34.4372     34.7626     37.4816     35.0440   
000004.SZ     79.8477     81.9358     82.4253     81.8792     76.6223   
000006.SZ     22.9650     38.9920     38.9828     38.7866     39.2680   
000007.SZ     79.1646     73.3189     70.7013     69.3740     67.1958   
000008.SZ     57.7922     51.1164     48.5656     48.9184     53.0718   

           2019-06-30  2019-09-30  2019-12-31  2020-03-31  2020-06-30  ...  \
ts_code                                                                ...   
000002.SZ     36.2453     35.9903     36.2451     31.2754     31.8089  ...   
000004.SZ     75.9448     75.5123     73.0102     83.0721     75.8113  ...   
000006.SZ     39.807

In [6]:
import pandas as pd
import numpy as np
import os

# ==============================================================================
# Function library
# ==============================================================================

def preprocess_panel_data(panel_df: pd.DataFrame,
                          winsorize_limits: tuple = (0.02, 0.02)) -> pd.DataFrame:
    """Step 1: winsorize each numeric column of a wide panel.

    Args:
        panel_df: Wide panel; each column is winsorized independently.
        winsorize_limits: ``(lower_tail, upper_tail)`` fractions. A column is
            clipped to its ``lower_tail`` and ``1 - upper_tail`` quantiles,
            both computed on the column's non-null values.

    Returns:
        A winsorized copy of ``panel_df``. Non-numeric columns and columns
        with no non-null value are left as they are.
    """
    print("--- [步骤1] 开始执行数据预处理（Winsorization） ---")
    cleaned = panel_df.copy()
    for label in cleaned.columns:
        values = cleaned[label]
        if pd.api.types.is_numeric_dtype(values):
            observed = values.dropna()
            if not observed.empty:
                floor = observed.quantile(winsorize_limits[0])
                cap = observed.quantile(1 - winsorize_limits[1])
                cleaned[label] = values.clip(lower=floor, upper=cap)
    print("--- [步骤1] 数据预处理完成 ---")
    return cleaned

def calculate_stable_growth(row, target_years,
                            base_window_half_size=2,
                            min_periods_in_window=3,
                            non_positive_penalty=-10):
    """Compound annual growth between a smoothed base level and a smoothed latest level.

    Missing values are dropped first, so every window below is counted in
    *observations*, not calendar quarters. The conversion to years divides
    positions by 4, i.e. it assumes consecutive quarterly observations
    (NOTE(review): rows with gaps get a shorter effective horizon than the
    positions suggest).

    Args:
        row: One panel row as a Series ordered oldest to newest.
        target_years: Desired look-back in years; the base window is centred
            ``target_years * 4 + 1`` observations from the end.
        base_window_half_size: Observations taken on each side of the base
            centre (window length is ``2 * half + 1``).
        min_periods_in_window: Minimum number of values the base window must
            contain to be accepted.
        non_positive_penalty: Value returned when either average is not
            strictly positive, where a growth rate is undefined. The default
            keeps the original ``-10`` sentinel.

    Returns:
        The annualised growth rate, ``non_positive_penalty`` when an average
        is <= 0, or NaN when there are fewer than 3 observations or no base
        window fits within 6 attempts (each attempt moves one observation
        further back).
    """
    series = row.dropna()
    n_obs = len(series)
    if n_obs < 3:
        return np.nan

    # "Latest" level: mean of the three most recent observations.
    latest_avg = series.iloc[-3:].mean()

    base_avg = np.nan
    actual_years = np.nan
    for shift in range(6):
        center = -(target_years * 4) - 1 - shift
        start = center - base_window_half_size
        end = center + base_window_half_size + 1  # exclusive, negative-from-end
        if start < -n_obs or end > 0:
            continue
        # An exclusive end of 0 means "through the last observation"; a literal
        # `iloc[start:0]` would be an empty slice, so map it to an open end.
        base_window = series.iloc[start:(end if end < 0 else None)]
        if base_window.count() >= min_periods_in_window:
            base_avg = base_window.mean()
            # Positional distance between the last observation of the series
            # and the last observation of the base window, in years. Done on
            # positions so duplicate index labels cannot break the lookup.
            actual_years = -end / 4.0
            break

    if pd.isna(base_avg):
        return np.nan
    if base_avg > 0 and latest_avg > 0:
        # Never annualise over less than one year (avoids exploding exponents).
        actual_years = max(actual_years, 1.0)
        return (latest_avg / base_avg) ** (1 / actual_years) - 1
    return non_positive_penalty

def _cross_sectional_zscore(values: pd.Series) -> pd.Series:
    """Standardize a Series to zero mean / unit sample std, keeping NaN as NaN.

    A cross-section with zero dispersion carries no ranking information, so it
    maps to 0 for every non-null entry instead of the NaN that 0/0 would give.
    """
    std = values.std()
    if std == 0:
        return values.where(values.isna(), 0.0)
    return (values - values.mean()) / std


def build_composite_growth_factor(panel_data: pd.DataFrame,
                                  factor_prefix: str,
                                  years_list: list = None,
                                  min_valid_scores: int = 2):
    """Step 2: build a composite growth factor from a wide panel.

    For every horizon in ``years_list`` a smoothed CAGR is computed per row
    with ``calculate_stable_growth`` and standardized across rows. Rows with
    enough valid horizon scores are kept, their missing horizon scores are
    filled with 0 (the cross-sectional mean), and the average of the horizon
    scores is standardized once more to give the final score.

    Args:
        panel_data: Wide panel, one row per entity and one column per period,
            ordered oldest to newest.
        factor_prefix: Prefix used in all generated column names.
        years_list: Horizons in years; defaults to ``[3, 4, 5]``.
        min_valid_scores: Minimum number of non-null horizon z-scores a row
            needs to be kept. It is capped at ``len(years_list)`` so a single
            horizon does not filter out every row.

    Returns:
        DataFrame indexed like the kept rows with the ``*_cagr_*`` columns,
        the ``*_zscore_*`` columns and ``composite_<prefix>_growth_score``,
        sorted by the composite score in descending order.
    """
    horizons = [3, 4, 5] if years_list is None else list(years_list)

    print(f"\n--- [步骤2] 开始构建复合成长因子: {factor_prefix}_growth ---")
    results_df = pd.DataFrame(index=panel_data.index)

    # Raw growth rates first, so the CAGR columns precede the z-score columns.
    for years in horizons:
        cagr_col = f'{factor_prefix}_cagr_{years}yr_stable'
        results_df[cagr_col] = panel_data.apply(
            calculate_stable_growth, axis=1, target_years=years
        )

    zscore_cols = []
    for years in horizons:
        cagr_col = f'{factor_prefix}_cagr_{years}yr_stable'
        zscore_col = f'{factor_prefix}_zscore_{years}yr_stable'
        zscore_cols.append(zscore_col)
        results_df[zscore_col] = _cross_sectional_zscore(results_df[cagr_col])

    # Keep rows with enough valid horizons; never require more than exist.
    required_scores = min(min_valid_scores, len(zscore_cols))
    valid_score_counts = results_df[zscore_cols].count(axis=1)
    results_df_filtered = results_df[valid_score_counts >= required_scores].copy()

    # A missing horizon is treated as average (z = 0) rather than dropped.
    results_df_filtered[zscore_cols] = results_df_filtered[zscore_cols].fillna(0)

    final_score_col = f'composite_{factor_prefix}_growth_score'
    composite_avg = results_df_filtered[zscore_cols].mean(axis=1)
    results_df_filtered[final_score_col] = _cross_sectional_zscore(composite_avg)

    results_df_filtered = results_df_filtered.sort_values(
        by=final_score_col, ascending=False
    )
    print("--- [步骤2] 因子构建完成 ---")
    return results_df_filtered

# ==============================================================================
# Main workflow
# ==============================================================================

if __name__ == "__main__":

    # --- Configuration ---
    # NOTE(review): absolute, user-specific paths; adjust for other machines.
    INPUT_FILE_PATH = '/Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/初步数据/毛利率/stock_financial_indicators_grossprofit_margin_wide_panel.xlsx'

    # Full path of the output workbook (directory and file name in one string).
    OUTPUT_FILE_PATH = '/Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/z-scores/Z值-gm-复合增长因子(完整流程).xlsx'

    FACTOR_PREFIX = 'gm'

    print("====== 工作流开始 ======")

    # Load the raw wide panel; the first spreadsheet column becomes the index.
    try:
        raw_df = pd.read_excel(INPUT_FILE_PATH, index_col=0)
        print(f"成功加载原始数据，形状: {raw_df.shape}")
    except FileNotFoundError:
        # Nothing downstream can run without the input, so stop here.
        raise SystemExit(f"[错误] 输入文件未找到: {INPUT_FILE_PATH}")

    # Step 1: column-wise winsorization.
    clean_df = preprocess_panel_data(raw_df)

    # Step 2: composite growth factor.
    final_factor_df = build_composite_growth_factor(
        panel_data=clean_df,
        factor_prefix=FACTOR_PREFIX
    )

    # --- Step 3: save the result ---
    try:
        # Create the parent directory of the output file if it is missing, so
        # the export cannot fail on a fresh checkout after all the computation.
        # `or '.'` covers a bare file name, whose dirname is the empty string.
        os.makedirs(os.path.dirname(OUTPUT_FILE_PATH) or '.', exist_ok=True)
        final_factor_df.to_excel(OUTPUT_FILE_PATH)

        print(f"\n[成功] 已将最终的因子得分导出到: {OUTPUT_FILE_PATH}")
        print("最终因子数据（前5行）:")
        print(final_factor_df.head())
    except Exception as e:
        # Export problems are reported without discarding the in-memory result.
        print(f"\n[失败] 最终文件导出失败: {e}")

    print("\n====== 工作流全部执行完毕！ ======")

成功加载原始数据，形状: (5316, 29)
--- [步骤1] 开始执行数据预处理（Winsorization） ---
--- [步骤1] 数据预处理完成 ---

--- [步骤2] 开始构建复合成长因子: gm_growth ---
--- [步骤2] 因子构建完成 ---

[成功] 已将最终的因子得分导出到: /Users/alan-hopiy/Documents/quantitative-learning-journey/outputs/z-scores/Z值-gm-复合增长因子(完整流程).xlsx
最终因子数据（前5行）:
           gm_cagr_3yr_stable  gm_cagr_4yr_stable  gm_cagr_5yr_stable  \
ts_code                                                                 
600608.SH            1.605767            1.092978            0.896045   
688221.SH            1.415457            0.603046            0.860012   
002366.SZ            1.580103            0.696888            0.581768   
600004.SH            1.735270            0.817677            0.227037   
603099.SH            1.562059            0.917333            0.212496   

           gm_zscore_3yr_stable  gm_zscore_4yr_stable  gm_zscore_5yr_stable  \
ts_code                                                                       
600608.SH              1.550152              1.106004  