In [1]:
import pandas as pd
from tabulate import tabulate

import warnings
warnings.filterwarnings("ignore")

# Load the partially balanced dataset. FLAG is 1 for fraud, 0 for non-fraud and
# missing (NaN) when the label is unknown.
df = pd.read_csv('output/data_partial_balanced.csv')

# Per-industry row counts for each of the three label states.
fraud_counts = df[df['FLAG'] == 1].groupby('Industry').size().reset_index(name='Fraud_Count')
non_fraud_counts = df[df['FLAG'] == 0].groupby('Industry').size().reset_index(name='Non_Fraud_Count')
unknown_counts = df[df['FLAG'].isna()].groupby('Industry').size().reset_index(name='Unknown_Count')

# Outer-merge the three tables on Industry. An industry that is absent from one table
# gets NaN there, which silently turns that count column into float (e.g. "3013.0").
# Fill the gaps with 0 and cast the counts back to int so they print as whole numbers.
count_columns = ['Fraud_Count', 'Non_Fraud_Count', 'Unknown_Count']
merged = (
    fraud_counts
    .merge(non_fraud_counts, on='Industry', how='outer')
    .merge(unknown_counts, on='Industry', how='outer')
    .fillna(0)
)
merged[count_columns] = merged[count_columns].astype(int)

# Sort by fraud count, largest first.
merged_sorted = merged.sort_values(by='Fraud_Count', ascending=False)

# Print as a text table with every column left-aligned.
print(tabulate(merged_sorted.values, merged_sorted.columns, tablefmt='pretty', colalign=('left',)*len(merged_sorted.columns)))

+----------------------------------+-------------+-----------------+---------------+
| Industry                         | Fraud_Count | Non_Fraud_Count | Unknown_Count |
+----------------------------------+-------------+-----------------+---------------+
| 制造业                           | 3013.0      | 10547           | 2500          |
| 信息传输、软件和信息技术服务业   | 673.0       | 1347            | 325           |
| 批发和零售业                     | 377.0       | 754             | 162           |
| 房地产业                         | 256.0       | 513             | 108           |
| 金融业                           | 238.0       | 476             | 110           |
| 建筑业                           | 202.0       | 405             | 87            |
| 电力、热力、燃气及水生产和供应业 | 148.0       | 520             | 110           |
| 交通运输、仓储和邮政业           | 133.0       | 466             | 102           |
| 科学研究和技术服务业             | 111.0       | 223             | 59            |
| 采矿业                           | 102.0       | 358

In [2]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('output/data_partial_balanced.csv')

# Per-column missing-value count and its share of all rows (percent, 2 decimals)
total_rows = len(df)
null_counts = df.isnull().sum()
missing_stats = pd.DataFrame({
    'Missing_Count': null_counts,
    'Missing_Percentage': (null_counts / total_rows * 100).round(2)
})

# Keep only the columns that actually have missing values, largest count first
has_missing = missing_stats['Missing_Count'] > 0
missing_stats = missing_stats[has_missing].sort_values('Missing_Count', ascending=False)

print("\n缺失值统计情况（只显示有缺失值的列）：")
print(missing_stats)

print("\n数据集总行数：", total_rows)


缺失值统计情况（只显示有缺失值的列）：
      Missing_Count  Missing_Percentage
FLAG           3905               14.86

数据集总行数： 26281


In [3]:
# 财务造假预测：基于交叉验证的Stacking模型 - 修复版
# 分行业逐步运行，降低内存占用，避免数据泄露

## 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import warnings
from tqdm import tqdm
import os
import pickle
import gc  # 垃圾回收
import datetime
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, TimeSeriesSplit
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
import lightgbm as lgb
import xgboost as xgb
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
import random
import joblib
from sklearn.base import clone
import logging

# Configure logging: every record goes both to a log file and to the notebook output.
# The file handler is opened as UTF-8 explicitly because many messages logged in this
# notebook contain Chinese text; without it the platform's locale encoding is used,
# which makes the log file's encoding machine-dependent.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("financial_fraud_model.log", encoding="utf-8"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Global random seed used throughout the notebook for reproducibility
SEED = 42


def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch, and export PYTHONHASHSEED.

    When CUDA is available the CUDA generators are seeded as well and cuDNN is
    switched to deterministic mode with benchmarking disabled.

    Args:
        seed: integer seed applied to every generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Nothing more to do on CPU-only machines
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(SEED)

# Silence library warnings for the rest of the notebook
warnings.filterwarnings('ignore')

# Plot styling.
# sns.set() applies seaborn's theme, which rewrites font-related rcParams (including
# the sans-serif font list). The CJK font is therefore configured AFTER sns.set();
# setting it before, as was done previously, lets seaborn overwrite it and Chinese
# labels then fail to render.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (12, 8)
# NOTE(review): sns.set(font_scale=...) below also sets font sizes, so this value is
# likely superseded — confirm whether it is still needed.
plt.rcParams['font.size'] = 12
sns.set(font_scale=1.2)
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
plt.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly

# Pick the GPU when one is available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logger.info(f"Using device: {device}")

# Create the output directories used by later cells
os.makedirs('models', exist_ok=True)
os.makedirs('plots', exist_ok=True)
os.makedirs('results', exist_ok=True)
os.makedirs('logs', exist_ok=True)

2025-04-10 16:07:55,896 - __main__ - INFO - Using device: cpu


In [4]:
def _plot_industry_statistics(industry_list, timestamp):
    """Save a 2x2 overview figure of the industry statistics to plots/industry_statistics_<timestamp>.png."""
    plt.figure(figsize=(15, 10))

    # Three horizontal bar charts: the 15 industries with the largest value of each metric.
    bar_panels = [
        (1, 'Sample_Count', 'Top 15 Industries by Sample Count', 'Sample Count'),
        (2, 'Fraud_Ratio', 'Top 15 Industries by Fraud Ratio', 'Fraud Ratio'),
        (3, 'Feature_Count', 'Top 15 Industries by Feature Count', 'Feature Count'),
    ]
    for position, column, title, xlabel in bar_panels:
        plt.subplot(2, 2, position)
        top_industries = industry_list.sort_values(column, ascending=False).head(15)
        sns.barplot(x=column, y='Industry', data=top_industries)
        plt.title(title)
        plt.xlabel(xlabel)

    # Scatter of sample count vs. fraud ratio; green = trainable, red = not trainable.
    plt.subplot(2, 2, 4)
    plt.scatter(
        industry_list['Sample_Count'],
        industry_list['Fraud_Ratio'],
        alpha=0.7,
        c=industry_list['Trainable'].map({True: 'green', False: 'red'})
    )
    plt.xscale('log')
    plt.xlabel('Sample Count (log scale)')
    plt.ylabel('Fraud Ratio')
    plt.title('Sample Count vs Fraud Ratio')

    # Label the five industries with the most samples.
    for _, row in industry_list.sort_values('Sample_Count', ascending=False).head(5).iterrows():
        plt.annotate(
            row['Industry'],
            (row['Sample_Count'], row['Fraud_Ratio']),
            xytext=(5, 5),
            textcoords='offset points'
        )

    # Make sure a font able to render Chinese industry names is active when saving.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.tight_layout()
    plt.savefig(f'plots/industry_statistics_{timestamp}.png', dpi=600, bbox_inches='tight')
    plt.close()


def generate_industry_list():
    """
    Build, save and summarise the list of industries available for modelling.

    Reads 'output/data_partial_balanced.csv' and
    'industry_features_results/all_industries_features.csv', keeps the industries that
    have rows with Method == '有监督' in the feature file, and computes for each one the
    number of labelled samples, the number of fraud samples, the fraud ratio and the
    number of rows listed for that industry in the feature file. Two flags are added:
    'Trainable' (>= 30 samples and >= 5 features) and 'Imbalanced' (fraud ratio < 1%
    or > 99%).

    Side effects: writes results/industry_list_<timestamp>.csv and
    plots/industry_statistics_<timestamp>.png, and logs a summary.

    Returns:
        The industry DataFrame (indexed by industry, sorted by sample count descending),
        an empty DataFrame with the same columns when no matching labelled rows exist,
        or None when one of the input files is missing.
    """
    logger.info("Generating industry list...")
    try:
        financial_data = pd.read_csv('output/data_partial_balanced.csv')
        industry_features_df = pd.read_csv('industry_features_results/all_industries_features.csv')
    except FileNotFoundError as e:
        logger.error(f"Error loading data: {e}")
        return None

    # Industries for which supervised feature selection produced features
    supervised_industries = industry_features_df[industry_features_df['Method'] == '有监督']['Industry'].unique()
    logger.info(f"Found {len(supervised_industries)} industries with supervised feature selection")

    # Only labelled rows are usable for training; rows with a missing FLAG are the prediction set
    train_data = financial_data[financial_data['FLAG'].notna()].copy()

    # Sample count and fraud count/ratio per industry
    industry_stats = train_data[train_data['Industry'].isin(supervised_industries)].groupby('Industry')['FLAG'].agg(['count', 'sum'])
    industry_stats['fraud_ratio'] = industry_stats['sum'] / industry_stats['count']
    industry_stats = industry_stats.sort_values('count', ascending=False)

    # Number of feature rows per industry, computed in one pass instead of filtering the
    # feature table once per industry.
    # NOTE(review): as before, rows of every Method are counted, not only '有监督' —
    # confirm that this is the intended feature count.
    feature_counts = industry_features_df['Industry'].value_counts()

    industry_list = pd.DataFrame({
        'Industry': industry_stats.index,
        'Sample_Count': industry_stats['count'],
        'Fraud_Count': industry_stats['sum'],
        'Fraud_Ratio': industry_stats['fraud_ratio'],
        'Feature_Count': feature_counts.reindex(industry_stats.index, fill_value=0).to_numpy()
    })

    # Feasibility flag: too few samples (<30) or too few features (<5) is unlikely to train well
    industry_list['Trainable'] = (industry_list['Sample_Count'] >= 30) & (industry_list['Feature_Count'] >= 5)

    # Extreme class imbalance (<1% or >99% fraud)
    industry_list['Imbalanced'] = (industry_list['Fraud_Ratio'] < 0.01) | (industry_list['Fraud_Ratio'] > 0.99)

    # With no industries there is nothing to plot, and idxmax() below would raise.
    if industry_list.empty:
        logger.warning("No labelled samples found for industries with supervised feature selection")
        return industry_list

    # Persist the detailed list
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    industry_list.to_csv(f'results/industry_list_{timestamp}.csv', index=False)

    _plot_industry_statistics(industry_list, timestamp)

    # Summary statistics
    logger.info("\nAvailable industries summary:")
    logger.info(f"Total industries: {len(industry_list)}")
    logger.info(f"Trainable industries: {industry_list['Trainable'].sum()}")
    logger.info(f"Imbalanced industries: {industry_list['Imbalanced'].sum()}")
    logger.info(f"Industry with most samples: {industry_list.loc[industry_list['Sample_Count'].idxmax(), 'Industry']} ({industry_list['Sample_Count'].max()} samples)")
    logger.info(f"Industry with highest fraud ratio: {industry_list.loc[industry_list['Fraud_Ratio'].idxmax(), 'Industry']} ({industry_list['Fraud_Ratio'].max():.2%})")

    # Top 10 recommended industries: trainable and not extremely imbalanced
    recommended = industry_list[
        industry_list['Trainable'] & ~industry_list['Imbalanced']
    ].sort_values('Sample_Count', ascending=False).head(10)

    logger.info("\nTop 10 recommended industries for modeling:")
    for rank, (_, row) in enumerate(recommended.iterrows(), start=1):
        logger.info(f"{rank}. {row['Industry']}: {row['Sample_Count']} samples, {row['Fraud_Ratio']:.2%} fraud ratio, {row['Feature_Count']} features")

    return industry_list

In [5]:
## 数据验证与清洗函数
def validate_and_clean_data(data, industry_name=None):
    """
    Validate and clean a DataFrame: report missing values, clip extreme outliers and
    drop empty/duplicate rows.

    Steps:
      1. Log the columns that contain missing values (they are not imputed here).
      2. Drop rows in which every column is missing.
      3. For every float64/int64 column except 'FLAG', clip values outside
         [Q1 - 3*IQR, Q3 + 3*IQR] to those bounds. Columns whose IQR is zero or
         undefined are left untouched: with IQR == 0 both bounds collapse to Q1 and
         clipping would overwrite every differing value (e.g. the 1s of a sparse 0/1
         indicator), turning the column into a constant.
      4. Drop duplicate rows.

    The input frame is not modified; a new frame is returned.

    Args:
        data: DataFrame to validate.
        industry_name: optional industry name, used only as a log prefix.

    Returns:
        The cleaned DataFrame.
    """
    prefix = f"[{industry_name}] " if industry_name else ""
    initial_rows = len(data)

    # Report missing values
    missing_stats = data.isnull().sum()
    missing_cols = missing_stats[missing_stats > 0]
    if len(missing_cols) > 0:
        logger.info(f"{prefix}发现缺失值: {missing_cols.to_dict()}")

    # Drop rows that are entirely missing. dropna returns a new frame, so the column
    # replacements below never touch the caller's object.
    data = data.dropna(how='all')

    # Outlier handling for numeric features
    numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
    for col in numeric_cols:
        if col == 'FLAG':  # never touch the target variable
            continue

        # IQR-based bounds with a wide (3x) fence
        q1 = data[col].quantile(0.25)
        q3 = data[col].quantile(0.75)
        iqr = q3 - q1
        if not iqr > 0:  # zero IQR, or NaN for an all-missing column
            continue
        lower_bound = q1 - 3 * iqr
        upper_bound = q3 + 3 * iqr

        outlier_mask = (data[col] < lower_bound) | (data[col] > upper_bound)
        n_outliers = int(outlier_mask.sum())
        if n_outliers > 0:
            logger.info(f"{prefix}特征'{col}'发现{n_outliers}个异常值")

            # Replace extreme values with the nearest bound
            data[col] = data[col].clip(lower=lower_bound, upper=upper_bound)

    # Drop duplicate rows
    data = data.drop_duplicates()

    final_rows = len(data)
    if final_rows < initial_rows:
        logger.info(f"{prefix}数据清洗: 初始行数={initial_rows}, 最终行数={final_rows}, 移除={initial_rows-final_rows}行")

    return data



In [6]:
def process_single_industry(industry, industry_data, industry_features, test_size=0.2, 
                           time_series=False, date_col=None, 
                           use_simplified_models=False, skip_tabnet=False, n_folds=5,
                           target_accuracy=0.89, accuracy_margin=0.01):
    """
    Clean one industry's data, split it into train/validation/test and train the
    stacking model via build_stacking_model.

    Model complexity is reduced automatically for small samples: below 100 rows the
    simplified models are used and TabNet is skipped. Flags passed as True by the
    caller are always respected; the sample-size rules can only switch them on.

    Args:
        industry: industry name (used for logging and passed to the model builder).
        industry_data: DataFrame with this industry's rows; must contain 'FLAG'.
        industry_features: list of feature column names selected for this industry.
        test_size: fraction of rows held out as the final test set.
        time_series: if True (and date_col is a column of the data), rows are sorted
            by date_col and split chronologically instead of randomly.
        date_col: name of the date column; only used when time_series is True.
        use_simplified_models: force the simplified model configuration.
        skip_tabnet: force skipping the TabNet model.
        n_folds: number of cross-validation folds; reduced to the minority-class count
            for samples smaller than 60 rows if that is lower.
        target_accuracy: target accuracy forwarded to build_stacking_model.
        accuracy_margin: tolerance around target_accuracy, forwarded as well.

    Returns:
        Whatever build_stacking_model returns, or None when the industry has no usable
        features, fewer than 5 samples in the minority class, or model building raised.
    """
    logger.info(f"\n\n{'='*80}")
    logger.info(f"Processing industry: {industry}")
    logger.info(f"{'='*80}")
    logger.info(f"Data shape: {industry_data.shape}")
    logger.info(f"Number of features: {len(industry_features)}")
    logger.info(f"Target accuracy: {target_accuracy:.4f} (±{accuracy_margin:.4f})")
    
    # Validate and clean the data
    industry_data = validate_and_clean_data(industry_data, industry)
    
    # Keep only the features that actually exist in the data
    valid_features = [f for f in industry_features if f in industry_data.columns]
    if len(valid_features) < len(industry_features):
        logger.warning(f"Warning: {len(industry_features) - len(valid_features)} features not found for industry {industry}")
        logger.info(f"Using {len(valid_features)} valid features")
    if not valid_features:
        logger.error(f"No usable features for industry {industry}. Skipping industry.")
        return None
    
    # Fill missing feature values with the column mean.
    # NOTE(review): the mean is computed over all rows, i.e. before the train/test
    # split, so test rows influence the imputed values — consider imputing with
    # training-set means only.
    for feature in valid_features:
        if industry_data[feature].isna().any():
            logger.warning(f"Feature '{feature}' contains {industry_data[feature].isna().sum()} NaN values. Filling with mean.")
            industry_data[feature] = industry_data[feature].fillna(industry_data[feature].mean())

    # For a chronological split the rows must be ordered BEFORE X and y are extracted;
    # sorting industry_data afterwards would leave X and y in their original order.
    use_time_split = time_series and date_col is not None and date_col in industry_data.columns
    if use_time_split:
        industry_data = industry_data.sort_values(date_col)
    
    # Separate features and target
    X = industry_data[valid_features]
    y = industry_data['FLAG']
    
    # Adjust model complexity to the sample size
    sample_size = len(industry_data)
    if sample_size < 50:
        logger.warning(f"Very small sample size ({sample_size}) for industry {industry}. Results may be unreliable.")
        logger.warning("Using simplified models and increasing regularization.")
    elif sample_size < 100:
        logger.warning(f"Small sample size ({sample_size}) for industry {industry}. Consider collecting more data.")
        logger.warning("Using moderately complex models.")
    use_simplified_models = use_simplified_models or sample_size < 100
    
    # TabNet is skipped below this many samples (or when the caller asked for it)
    skip_tabnet_threshold = 100
    skip_tabnet = skip_tabnet or sample_size < skip_tabnet_threshold
    
    # Class balance
    fraud_ratio = y.mean()
    logger.info(f"Fraud ratio: {fraud_ratio:.4f} ({y.sum()} fraud cases out of {len(y)} total)")
    
    if fraud_ratio < 0.01 or fraud_ratio > 0.99:
        logger.warning(f"Highly imbalanced data: fraud ratio = {fraud_ratio:.4f}")
        # Imbalance handling (e.g. SMOTE or class weights) could be added here
    
    # Require at least 5 samples in the minority class
    min_class_count = int(min(y.sum(), len(y) - y.sum()))
    if min_class_count < 5:
        logger.error(f"Insufficient samples for minority class ({min_class_count}). Minimum 5 samples required.")
        logger.error("Cannot build reliable model. Skipping industry.")
        return None
    
    # Split strategy
    if use_time_split:
        # Chronological split: the last `test_size` share of rows is the test set
        split_idx = int(len(X) * (1 - test_size))
        
        # Train + validation part
        X_train_val = X.iloc[:split_idx]
        y_train_val = y.iloc[:split_idx]
        
        # Final test set - used for the final evaluation only
        X_test = X.iloc[split_idx:]
        y_test = y.iloc[split_idx:]
        
        # The last 20% of the train + validation part becomes the validation set
        val_split_idx = int(len(X_train_val) * 0.8)
        X_train = X_train_val.iloc[:val_split_idx]
        X_val = X_train_val.iloc[val_split_idx:]
        y_train = y_train_val.iloc[:val_split_idx]
        y_val = y_train_val.iloc[val_split_idx:]
        
        logger.info(f"Time series split: ")
        logger.info(f"  Training set: {len(X_train)} samples ({X_train.index.min()} to {X_train.index.max()})")
        logger.info(f"  Validation set: {len(X_val)} samples ({X_val.index.min()} to {X_val.index.max()})")
        logger.info(f"  Test set: {len(X_test)} samples ({X_test.index.min()} to {X_test.index.max()})")
    else:
        # Random stratified three-way split: first hold out the final test set ...
        X_train_val, X_test, y_train_val, y_test = train_test_split(
            X, y, test_size=test_size, random_state=SEED, stratify=y
        )
        
        # ... then carve a validation set out of the remainder
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val, test_size=0.2, random_state=SEED, stratify=y_train_val
        )
        
        logger.info(f"Random stratified split: ")
        logger.info(f"  Training set: {len(X_train)} samples")
        logger.info(f"  Validation set: {len(X_val)} samples")
        logger.info(f"  Test set: {len(X_test)} samples")
    
    # Label distribution of each part
    logger.info(f"Training set fraud ratio: {y_train.mean():.4f}")
    logger.info(f"Validation set fraud ratio: {y_val.mean():.4f}")
    logger.info(f"Test set fraud ratio: {y_test.mean():.4f}")
    
    # Warn about a tiny test set
    if len(X_test) < 30:
        logger.warning(f"Test set is very small ({len(X_test)} samples). Results may be unreliable.")
    
    # For small samples, never use more folds than there are minority-class samples
    if sample_size < 60:
        n_folds = min(n_folds, min_class_count)
        logger.info(f"Using {n_folds}-fold cross-validation due to small sample size")
    
    # Build the model
    try:
        results = build_stacking_model(
            industry=industry,
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test,
            time_series=time_series,
            n_folds=n_folds,
            X_val=X_val,
            y_val=y_val,
            use_simplified_models=use_simplified_models,
            skip_tabnet=skip_tabnet,
            target_accuracy=target_accuracy,
            accuracy_margin=accuracy_margin
        )
        # Flag suspiciously good results
        if results and results.get('test_auc', 0) > 0.95:
            logger.warning(f"Suspiciously high AUC ({results['test_auc']:.4f}) may indicate data leakage or overfitting.")
            logger.warning("Consider these results with caution and perform additional validation.")
        
        return results
    except Exception as e:
        logger.error(f"Error processing industry {industry}: {e}", exc_info=True)
        return None

In [7]:
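# NOTE(review): this entire cell is commented out, so nested_cv_optimize is not defined at runtime.
# If it is re-enabled: the hold-out StandardScaler below is fit on X_holdout and then reused to
# transform X_opt, which lets hold-out statistics leak into the training features.
# def nested_cv_optimize(X, y, base_model, param_grid, cv_outer=5, cv_inner=3, scoring='roc_auc', time_series=False):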
#     """
#     使用嵌套交叉验证优化模型参数，避免数据泄露
    
#     Args:
#         X: 特征
#         y: 标签
#         base_model: 基础模型
#         param_grid: 参数网格搜索空间
#         cv_outer: 外层交叉验证折数
#         cv_inner: 内层交叉验证折数
#         scoring: 评分指标
#         time_series: 是否使用时间序列分割
    
#     Returns:
#         最佳参数字典、外层交叉验证得分列表和最佳模型
#     """
#     # 修复1: 首先划分一个保留集，避免优化过程接触到最终测试数据
#     # 这个保留集将用于独立验证最终选择的参数
#     if not time_series:
#         X_opt, X_holdout, y_opt, y_holdout = train_test_split(
#             X, y, test_size=0.2, random_state=SEED, stratify=y
#         )
#     else:
#         # 时间序列数据按时间顺序划分
#         split_idx = int(len(X) * 0.8)
#         X_opt, X_holdout = X.iloc[:split_idx], X.iloc[split_idx:]
#         y_opt, y_holdout = y.iloc[:split_idx], y.iloc[split_idx:]
    
#     # 选择交叉验证策略
#     if time_series:
#         cv_outer_split = TimeSeriesSplit(n_splits=cv_outer)
#     else:
#         cv_outer_split = StratifiedKFold(n_splits=cv_outer, shuffle=True, random_state=SEED)
    
#     # 外层交叉验证得分
#     cv_scores = []
#     best_params_list = []
#     best_models = []
    
#     # 外层交叉验证 - 仅使用优化集(X_opt)
#     for train_idx, test_idx in cv_outer_split.split(X_opt, y_opt):
#         X_train, X_test = X_opt.iloc[train_idx], X_opt.iloc[test_idx]
#         y_train, y_test = y_opt.iloc[train_idx], y_opt.iloc[test_idx]
        
#         # 修复2: 确保标准化过程不会泄露信息
#         scaler = StandardScaler()
#         X_train_scaled = pd.DataFrame(
#             scaler.fit_transform(X_train),
#             columns=X_train.columns,
#             index=X_train.index
#         )
#         X_test_scaled = pd.DataFrame(
#             scaler.transform(X_test),
#             columns=X_test.columns,
#             index=X_test.index
#         )
        
#         # 使用Optuna优化参数
#         study = optuna.create_study(direction='maximize')
        
#         # 定义内层交叉验证优化函数
#         def objective(trial):
#             # 构建参数字典
#             params = {}
#             for param, values in param_grid.items():
#                 if isinstance(values, tuple) and len(values) == 3 and values[0] == 'float':
#                     params[param] = trial.suggest_float(param, values[1], values[2])
#                 elif isinstance(values, tuple) and len(values) == 3 and values[0] == 'int':
#                     params[param] = trial.suggest_int(param, values[1], values[2])
#                 elif isinstance(values, tuple) and len(values) == 4 and values[0] == 'float_log':
#                     params[param] = trial.suggest_float(param, values[1], values[2], log=values[3])
#                 elif isinstance(values, list):
#                     params[param] = trial.suggest_categorical(param, values)
            
#             # 创建模型
#             model = clone(base_model)
#             model.set_params(**params)
            
#             # 内层交叉验证 - 使用正确的交叉验证分割器
#             if time_series:
#                 cv_inner_split = TimeSeriesSplit(n_splits=cv_inner)
#             else:
#                 cv_inner_split = StratifiedKFold(n_splits=cv_inner, shuffle=True, random_state=SEED)
            
#             scores = cross_val_score(model, X_train_scaled, y_train, cv=cv_inner_split, scoring=scoring)
#             return scores.mean()
        
#         # 运行参数优化
#         study.optimize(objective, n_trials=50)
        
#         # 获取最佳参数
#         best_params = study.best_params
#         best_params_list.append(best_params)
        
#         # 使用最佳参数训练模型
#         best_model = clone(base_model)
#         best_model.set_params(**best_params)
#         best_model.fit(X_train_scaled, y_train)
#         best_models.append(best_model)
        
#         # 在测试集上评估
#         if hasattr(best_model, 'predict_proba'):
#             y_pred = best_model.predict_proba(X_test_scaled)[:, 1]
#         else:
#             y_pred = best_model.predict(X_test_scaled)
        
#         # 计算得分
#         score = roc_auc_score(y_test, y_pred)
#         cv_scores.append(score)
        
#         logger.info(f"Fold score: {score:.4f}, Best params: {best_params}")
    
#     # 计算最佳参数
#     best_params_combined = {}
#     for param in best_params_list[0].keys():
#         param_values = [bp[param] for bp in best_params_list]
#         if all(isinstance(x, (int, float)) for x in param_values):
#             best_params_combined[param] = sum(param_values) / len(param_values)
#             if all(isinstance(x, int) for x in param_values):
#                 best_params_combined[param] = int(best_params_combined[param])
#         else:
#             # 对于分类参数，使用众数
#             from collections import Counter
#             best_params_combined[param] = Counter(param_values).most_common(1)[0][0]
    
#     # 修复3: 使用保留集进行最终验证
#     # 标准化保留集
#     holdout_scaler = StandardScaler()
#     X_holdout_scaled = pd.DataFrame(
#         holdout_scaler.fit_transform(X_holdout),
#         columns=X_holdout.columns,
#         index=X_holdout.index
#     )
    
#     # 训练一个使用最佳参数的模型
#     final_model = clone(base_model)
#     final_model.set_params(**best_params_combined)
    
#     # 在所有优化数据上训练
#     X_opt_scaled = pd.DataFrame(
#         holdout_scaler.transform(X_opt),  # 使用相同的scaler
#         columns=X_opt.columns,
#         index=X_opt.index
#     )
#     final_model.fit(X_opt_scaled, y_opt)
    
#     # 在保留集上验证
#     if hasattr(final_model, 'predict_proba'):
#         holdout_pred = final_model.predict_proba(X_holdout_scaled)[:, 1]
#     else:
#         holdout_pred = final_model.predict(X_holdout_scaled)
    
#     # 计算保留集得分
#     holdout_score = roc_auc_score(y_holdout, holdout_pred)
#     logger.info(f"Final holdout score: {holdout_score:.4f}")
    
#     # 比较交叉验证平均分数与保留集分数
#     cv_mean = np.mean(cv_scores)
#     logger.info(f"CV mean score: {cv_mean:.4f}, Holdout score: {holdout_score:.4f}")
#     if abs(cv_mean - holdout_score) > 0.1:  # 差异超过0.1
#         logger.warning(f"Large gap between CV ({cv_mean:.4f}) and holdout ({holdout_score:.4f}) scores!")
#         logger.warning("This may indicate potential overfitting or data leakage.")
    
#     # 返回最佳参数、得分和最佳模型
#     return best_params_combined, cv_scores, best_models

In [8]:
def optimize_lgbm(X, y, time_series=False, use_simplified_models=False):
    """
    Tune LightGBM hyper-parameters with Optuna and fit a final model on all of X, y.

    The search space and the number of trials shrink for simplified models and shrink
    further when fewer than 50 samples are available. Each trial is scored by the mean
    cross-validated ROC AUC. The Optuna sampler is seeded with SEED so that repeated
    runs explore the same sequence of trials.

    Args:
        X: feature matrix.
        y: target variable.
        time_series: use TimeSeriesSplit instead of shuffled StratifiedKFold.
        use_simplified_models: use the reduced search space (for small samples).

    Returns:
        (best_params, models): the tuned parameters only (without the fixed
        'objective', 'verbosity' and 'random_state' entries) and a one-element list
        holding the model fitted on all data.
    """
    # Hyper-parameter search space
    if use_simplified_models:
        # Simplified model: shallower trees, fewer leaves, stronger regularisation
        param_grid = {
            'num_leaves': ('int', 8, 31),
            'max_depth': ('int', 3, 5),
            'learning_rate': ('float', 0.01, 0.1),
            'min_child_samples': ('int', 10, 30),
            'subsample': ('float', 0.7, 0.9),
            'colsample_bytree': ('float', 0.7, 0.9),
            'reg_alpha': ('float', 0.1, 1.0),
            'reg_lambda': ('float', 0.1, 1.0),
            'min_split_gain': ('float', 0.1, 0.5)
        }
        n_trials = 20
    else:
        # Full model: regular ranges
        param_grid = {
            'num_leaves': ('int', 15, 255),
            'max_depth': ('int', 3, 12),
            'learning_rate': ('float', 0.01, 0.2),
            'min_child_samples': ('int', 5, 100),
            'subsample': ('float', 0.5, 1.0),
            'colsample_bytree': ('float', 0.5, 1.0),
            'reg_alpha': ('float', 0, 10),
            'reg_lambda': ('float', 0, 10),
            'min_split_gain': ('float', 0, 1)
        }
        n_trials = 50
    # NOTE(review): LightGBM documents that row subsampling only takes effect when the
    # bagging frequency (`subsample_freq`) is non-zero; it is not set here, so tuning
    # `subsample` may have no effect — confirm and, if so, add subsample_freq.

    # Very small samples: simplify even further
    if len(X) < 50:
        param_grid['max_depth'] = ('int', 2, 4)
        param_grid['num_leaves'] = ('int', 4, 16)
        param_grid['min_child_samples'] = ('int', 5, 15)
        param_grid['reg_alpha'] = ('float', 0.5, 2.0)
        param_grid['reg_lambda'] = ('float', 0.5, 2.0)
        n_trials = 15

    # Cross-validation: between 3 and 5 folds depending on the sample size
    n_splits = min(5, max(3, len(X) // 10))
    if time_series:
        cv = TimeSeriesSplit(n_splits=n_splits)
    else:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    # Parameters that are never tuned; shared by every trial and by the final model
    fixed_params = {'objective': 'binary', 'verbosity': -1, 'random_state': SEED}

    def objective(trial):
        """Return the mean cross-validated ROC AUC for one sampled parameter set."""
        params = {}
        for param, values in param_grid.items():
            if values[0] == 'int':
                params[param] = trial.suggest_int(param, values[1], values[2])
            elif values[0] == 'float':
                params[param] = trial.suggest_float(param, values[1], values[2])

        model = lgb.LGBMClassifier(**params, **fixed_params)
        scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
        return scores.mean()

    # Lower Optuna's verbosity before the study is created so that the creation
    # message is suppressed as well.
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    # Seed the sampler: Optuna's default sampler is otherwise randomly initialised, so
    # the search would not be reproducible even though SEED is set everywhere else.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )

    logger.info(f"Optimizing LightGBM with {n_trials} trials{'(simplified model)' if use_simplified_models else ''}")
    study.optimize(objective, n_trials=n_trials)

    # Tuned parameters only; the fixed ones are deliberately not added to this dict
    best_params = study.best_params

    logger.info(f"LightGBM best params: {best_params}")
    logger.info(f"LightGBM best score: {study.best_value:.4f}")

    # Fit the final model on all data with the tuned + fixed parameters
    final_model = lgb.LGBMClassifier(**best_params, **fixed_params)
    final_model.fit(X, y)

    return best_params, [final_model]

In [9]:
def optimize_xgb(X, y, time_series=False, use_simplified_models=False):
    """
    Tune XGBoost hyper-parameters with Optuna and fit a final model on all of X, y.

    Each trial is scored by mean 5-fold cross-validated accuracy (TimeSeriesSplit when
    time_series is True, shuffled StratifiedKFold otherwise). With
    use_simplified_models the search ranges for max_depth, min_child_weight, subsample
    and colsample_bytree are narrowed. The ranges are chosen BEFORE suggesting:
    suggesting the same parameter name twice within one trial makes Optuna keep the
    value of the first call, so a second, narrower suggestion would be ignored.

    If the optimisation fails, a model with default parameters is trained instead; if
    that fails too, a RandomForestClassifier is returned as a last resort.

    Args:
        X: feature matrix.
        y: target variable.
        time_series: use a time-ordered cross-validation split.
        use_simplified_models: use the narrower search ranges (for small samples).

    Returns:
        (params, model): the parameter dict (tuned plus fixed entries, the default
        parameters, or {"fallback_to": "RandomForest"}) and the fitted model.
    """
    import xgboost as xgb

    # Silence XGBoost's global logging; a failure here is logged but not fatal
    try:
        xgb.config.set_config(verbosity=0)
    except Exception as e:
        logger.error(f"设置XGBoost全局配置时出错: {str(e)}")

    # Parameters that are never tuned; shared by every trial and by the final model
    fixed_params = {
        'objective': 'binary:logistic',
        'verbosity': 0,
        'use_label_encoder': False,
        'eval_metric': 'logloss',
        'random_state': 42
    }

    # Search ranges: lower complexity for the simplified configuration
    if use_simplified_models:
        depth_range = (2, 6)
        child_weight_range = (1, 5)
        sampling_floor = 0.7
    else:
        depth_range = (3, 10)
        child_weight_range = (1, 10)
        sampling_floor = 0.5

    # Cross-validation splitter, identical for every trial
    if time_series:
        cv = TimeSeriesSplit(n_splits=5)
    else:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    def objective(trial):
        """Return the mean cross-validated accuracy for one sampled parameter set."""
        params = {
            'max_depth': trial.suggest_int('max_depth', *depth_range),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'min_child_weight': trial.suggest_int('min_child_weight', *child_weight_range),
            'subsample': trial.suggest_float('subsample', sampling_floor, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', sampling_floor, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'alpha': trial.suggest_float('alpha', 0, 10),
            'lambda': trial.suggest_float('lambda', 0, 10),
            **fixed_params
        }

        try:
            model = xgb.XGBClassifier(**params)
            return cross_val_score(model, X, y, cv=cv, scoring='accuracy').mean()
        except Exception as e:
            logger.error(f"交叉验证出错: {str(e)}")
            return 0.0  # worst score, so one failing trial does not abort the search

    try:
        # Seed the sampler so that the search is reproducible
        study = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.TPESampler(seed=42)
        )
        study.optimize(objective, n_trials=50)

        # Tuned parameters plus the fixed ones
        best_params = study.best_params
        best_params.update(fixed_params)

        logger.info(f"XGBoost best params: {best_params}")
        logger.info(f"XGBoost best score: {study.best_value:.4f}")

        # Fit the final model on all data
        final_model = xgb.XGBClassifier(**best_params)
        final_model.fit(X, y)

        return best_params, final_model

    except Exception as e:
        logger.error(f"XGBoost优化过程出错: {str(e)}")
        # Fall back to default parameters
        default_params = {
            'max_depth': 6,
            'learning_rate': 0.1,
            'min_child_weight': 1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'gamma': 0,
            'alpha': 0,
            'lambda': 1,
            **fixed_params
        }
        logger.info("使用默认XGBoost参数")
        try:
            # Build a fresh classifier rather than reusing any earlier instance
            xgb.config.set_config(verbosity=0)
            final_model = xgb.XGBClassifier(**default_params)
            final_model.fit(X, y)
            return default_params, final_model
        except Exception as inner_e:
            logger.error(f"使用默认参数训练XGBoost时出错: {str(inner_e)}")
            # Last resort: a different model family
            from sklearn.ensemble import RandomForestClassifier
            logger.info("回退到RandomForest模型")
            rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
            rf_model.fit(X, y)
            return {"fallback_to": "RandomForest"}, rf_model

In [10]:
def optimize_tabnet(X, y, time_series=False, use_simplified_models=False):
    """
    Tune TabNet hyper-parameters with Optuna using a manual cross-validation loop.

    The search space, trial budget, epoch budget and early-stopping patience are
    shrunk when ``use_simplified_models`` is set, and shrunk again when fewer
    than 50 samples are available.

    Args:
        X: feature matrix (DataFrame or 2-D array)
        y: binary target (Series or 1-D array)
        time_series: use TimeSeriesSplit instead of a shuffled StratifiedKFold
        use_simplified_models: use the reduced search space (small sample sizes)

    Returns:
        (params, models): ``params`` is the best hyper-parameter dict found, or
        a default dict when the search raises. ``models`` is always an empty
        list: the TabNet models themselves are fitted later, per fold, in
        build_stacking_model.
    """
    # Search space: (kind, low, high[, log]) per hyper-parameter.
    if use_simplified_models:
        # Smaller network, fewer decision steps, stronger sparsity penalty.
        param_grid = {
            'n_d': ('int', 8, 32),
            'n_a': ('int', 8, 32),
            'n_steps': ('int', 2, 4),
            'gamma': ('float', 1.0, 1.5),
            'momentum': ('float', 0.01, 0.3),
            'lambda_sparse': ('float_log', 1e-5, 1e-2, True),
            'n_independent': ('int', 1, 2),
            'n_shared': ('int', 1, 2)
        }
        n_trials, epochs, patience = 15, 50, 10
    else:
        param_grid = {
            'n_d': ('int', 16, 64),
            'n_a': ('int', 16, 64),
            'n_steps': ('int', 3, 10),
            'gamma': ('float', 1.0, 2.0),
            'momentum': ('float', 0.01, 0.4),
            'lambda_sparse': ('float_log', 1e-6, 1e-3, True),
            'n_independent': ('int', 1, 5),
            'n_shared': ('int', 1, 5)
        }
        n_trials, epochs, patience = 30, 100, 20

    # Very small samples: shrink the space and the budgets even further.
    if len(X) < 50:
        param_grid['n_d'] = ('int', 4, 16)
        param_grid['n_a'] = ('int', 4, 16)
        param_grid['n_steps'] = ('int', 1, 3)
        param_grid['n_independent'] = ('int', 1, 1)
        param_grid['n_shared'] = ('int', 1, 1)
        param_grid['lambda_sparse'] = ('float_log', 1e-4, 1e-2, True)
        n_trials, epochs, patience = 10, 30, 5

    # Between 3 and 5 folds, depending on how many samples are available.
    n_splits = min(5, max(3, len(X) // 10))
    if time_series:
        cv = TimeSeriesSplit(n_splits=n_splits)
    else:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    # Positional numpy views so fold indices can be applied directly,
    # whether the caller passed pandas objects or arrays.
    X_values = np.asarray(X)
    y_values = np.asarray(y)

    def objective(trial):
        """Mean validation ROC-AUC of one sampled configuration across the CV folds."""
        params = {}
        for name, spec in param_grid.items():
            kind, low, high = spec[0], spec[1], spec[2]
            if kind == 'int':
                params[name] = trial.suggest_int(name, low, high)
            elif kind == 'float':
                params[name] = trial.suggest_float(name, low, high)
            elif kind == 'float_log':
                params[name] = trial.suggest_float(name, low, high, log=spec[3])

        scores = []
        for train_idx, valid_idx in cv.split(X_values, y_values):
            y_train_fold, y_valid_fold = y_values[train_idx], y_values[valid_idx]

            # ROC-AUC is undefined on a single-class fold; skip the fold instead
            # of letting one degenerate split abort the whole study.
            if len(np.unique(y_train_fold)) < 2 or len(np.unique(y_valid_fold)) < 2:
                logger.debug("Skipping a CV fold that contains a single class")
                continue

            # Fit the scaler on the training part only, so validation rows do
            # not leak into the statistics the model is trained with.
            scaler = StandardScaler()
            X_train_fold = scaler.fit_transform(X_values[train_idx])
            X_valid_fold = scaler.transform(X_values[valid_idx])

            model = TabNetClassifier(
                **params,
                optimizer_fn=torch.optim.Adam,
                optimizer_params=dict(lr=0.02),
                scheduler_params={"mode": "min", "factor": 0.7, "patience": 5},
                scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                seed=SEED,
                device_name=device
            )

            n_train = len(X_train_fold)
            model.fit(
                X_train=X_train_fold,
                y_train=y_train_fold,
                eval_set=[(X_valid_fold, y_valid_fold)],
                max_epochs=epochs,
                patience=patience,
                batch_size=min(1024, n_train),
                # Never let the virtual batch size collapse to 0 on tiny folds.
                virtual_batch_size=max(1, min(128, n_train // 4)),
                num_workers=0,
                drop_last=False
            )

            y_pred = model.predict_proba(X_valid_fold)[:, 1]
            scores.append(roc_auc_score(y_valid_fold, y_pred))

            # Free host and GPU memory between folds.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        if not scores:
            # Propagates out of study.optimize and triggers the default-parameter fallback.
            raise ValueError("No cross-validation fold contained both classes")

        return float(np.mean(scores))

    # Silence Optuna before the study is created so its creation message is suppressed too.
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    # Seed the sampler so that repeated runs explore the same configurations.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )

    logger.info(f"Optimizing TabNet with {n_trials} trials{'(simplified model)' if use_simplified_models else ''}")

    try:
        study.optimize(objective, n_trials=n_trials)

        best_params = study.best_params
        logger.info(f"TabNet best params: {best_params}")

        # Each trial value is already a mean over folds; report the best one and the spread.
        trial_values = [t.value for t in study.trials if t.value is not None]
        spread = float(np.std(trial_values, ddof=1)) if len(trial_values) > 1 else 0.0
        logger.info(f"TabNet trial scores: {trial_values}, Best: {study.best_value:.4f}, Std: {spread:.4f}")

        # No model is fitted here; build_stacking_model fits TabNet per fold.
        return best_params, []
    except Exception as e:
        logger.error(f"Error optimizing TabNet: {e}", exc_info=True)
        # Fall back to conservative defaults when the search fails.
        default_params = {
            'n_d': 16 if not use_simplified_models else 8,
            'n_a': 16 if not use_simplified_models else 8,
            'n_steps': 3 if not use_simplified_models else 2,
            'gamma': 1.3,
            'momentum': 0.02,
            'lambda_sparse': 0.001 if not use_simplified_models else 0.01,
            'n_independent': 2 if not use_simplified_models else 1,
            'n_shared': 2 if not use_simplified_models else 1
        }
        logger.warning(f"Using default TabNet parameters: {default_params}")
        return default_params, []

In [11]:
def save_model(model, industry_name):
    """
    Persist a trained model under ``models/`` in two formats (pickle and joblib).

    Args:
        model: the complete model object to serialise
        industry_name: industry name, used as the file-name prefix

    Returns:
        True when both files were written; False if any step raised
        (the error is logged together with its traceback).
    """
    import os
    import pickle
    import joblib

    # Make sure the output directory exists before building the targets.
    os.makedirs('models', exist_ok=True)

    joblib_target = f'models/{industry_name}_stacking_model.joblib'
    pickle_target = f'models/{industry_name}_stacking_model.pkl'

    try:
        # 1) plain pickle
        with open(pickle_target, 'wb') as handle:
            pickle.dump(model, handle)
        logger.info(f"Model saved to: {pickle_target}")

        # 2) joblib copy of the same object
        joblib.dump(model, joblib_target)
        logger.info(f"Model saved to: {joblib_target}")
    except Exception as err:
        logger.error(f"Error saving model for {industry_name}: {err}", exc_info=True)
        return False
    else:
        return True

In [12]:
def build_stacking_model(industry, X_train, y_train, X_test, y_test, time_series=False, n_folds=5,
                   X_val=None, y_val=None, use_simplified_models=False, skip_tabnet=False,
                   target_accuracy=0.89, accuracy_margin=0.01):
    """
    Build, evaluate and save a stacking ensemble for one industry.

    LightGBM, XGBoost and (optionally) TabNet are trained fold by fold. Their
    out-of-fold probabilities are the features of a logistic-regression
    meta-model. Fold training stops early once a fold's out-of-fold accuracy
    reaches ``target_accuracy - accuracy_margin``.

    Leakage safeguards applied here:
      * every fold model only ever sees data scaled with its own fold scaler,
        for the training, validation and test sets alike;
      * the meta-model is trained only on rows that actually received an
        out-of-fold prediction (early stopping and TimeSeriesSplit leave some
        rows without one);
      * the decision threshold is chosen on data the evaluated meta-model was
        not fitted on: the supplied validation set, or a 20% hold-out of the
        out-of-fold rows when none is supplied.

    Args:
        industry: industry name
        X_train: training features (DataFrame)
        y_train: training labels (Series of 0/1)
        X_test: test features (DataFrame)
        y_test: test labels
        time_series: use TimeSeriesSplit instead of a shuffled StratifiedKFold
        n_folds: number of cross-validation folds
        X_val: optional validation features
        y_val: optional validation labels
        use_simplified_models: use reduced model complexity (small samples)
        skip_tabnet: leave TabNet out of the ensemble (small samples)
        target_accuracy: fold accuracy around which fold training stops early
        accuracy_margin: tolerance subtracted from ``target_accuracy``

    Returns:
        dict describing the complete model: the meta-model, per-fold base
        models and scalers, tuned parameters, metrics, the chosen threshold
        and run metadata. The same dict is passed to save_model.
    """
    import xgboost as xgb
    from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
    from sklearn.model_selection import GridSearchCV

    # ------------------------------------------------------------------ helpers
    def _release_memory():
        """Free host memory and, when a GPU is in use, its cached blocks."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def _scale_with(scaler, frame):
        """Apply an already-fitted scaler, keeping the frame's columns and index."""
        return pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)

    def _tabnet_positive_proba(model, frame, batch_size=2048):
        """Positive-class probabilities from TabNet, predicted in batches to bound memory use."""
        chunks = [
            model.predict_proba(frame.iloc[start:start + batch_size].values)[:, 1]
            for start in range(0, frame.shape[0], batch_size)
        ]
        return np.concatenate(chunks) if chunks else np.empty(0)

    def _ensemble_meta_features(frame):
        """Average every fold model's probabilities, scaling with that fold's own scaler."""
        meta = np.zeros((frame.shape[0], n_base_models))
        n_fitted = len(trained_models['lgbm'])
        for pos in range(n_fitted):
            scaled = _scale_with(trained_scalers[pos], frame)
            meta[:, 0] += trained_models['lgbm'][pos].predict_proba(scaled)[:, 1]
            meta[:, 1] += trained_models['xgb'][pos].predict_proba(scaled)[:, 1]
            if not skip_tabnet:
                meta[:, 2] += _tabnet_positive_proba(trained_models['tabnet'][pos], scaled)
        return meta / n_fitted

    def check_and_fill_nan(features, feature_name):
        """Return ``features`` with NaNs imputed: column mean, or 0 for an all-NaN column."""
        nan_mask = np.isnan(features)
        if not nan_mask.any():
            return features

        logger.warning(f"Found {nan_mask.sum()} NaN values in {feature_name}.")
        filled = features.copy()
        all_nan_cols = []
        for col in range(filled.shape[1]):
            col_nan = nan_mask[:, col]
            if col_nan.all():
                logger.warning(f"Column {col} in {feature_name} is ALL NaN! Replacing with zeros.")
                filled[:, col] = 0
                all_nan_cols.append(col)
            elif col_nan.any():
                filled[col_nan, col] = np.nanmean(features[:, col])

        if all_nan_cols:
            logger.warning(f"Replaced {len(all_nan_cols)} columns that were all NaN in {feature_name} with zeros")

        assert not np.isnan(filled).any(), f"NaN values still exist in {feature_name} after imputation!"
        return filled

    # ------------------------------------------------------------------ set-up
    logger.info(f"\n=============== Building Stacking Model for {industry} ===============")
    logger.info(f"Target accuracy: {target_accuracy:.4f} (±{accuracy_margin:.4f})")

    # Class weights are only used for extreme imbalance and always up-weight
    # the minority class, whichever label that is.
    fraud_ratio = y_train.mean()
    class_weights = None
    if fraud_ratio < 0.01 or fraud_ratio > 0.99:
        logger.warning(f"Highly imbalanced data: fraud ratio = {fraud_ratio:.4f}")
        if 0 < fraud_ratio < 1:
            if fraud_ratio < 0.5:
                class_weights = {0: 1, 1: (1 - fraud_ratio) / fraud_ratio}
            else:
                class_weights = {0: fraud_ratio / (1 - fraud_ratio), 1: 1}
            logger.info(f"Using class weights: {class_weights}")
        else:
            # A ratio of exactly 0 or 1 would divide by zero; no weighting is possible.
            logger.warning("Training labels contain a single class; class weights are not used")

    if use_simplified_models:
        logger.info("Using simplified models due to small sample size")
    if skip_tabnet:
        logger.info("Skipping TabNet model due to small sample size")

    # ------------------------------------------------ step 1: tune base learners
    lgbm_params, _ = optimize_lgbm(X_train, y_train, time_series=time_series, use_simplified_models=use_simplified_models)
    xgb_params, _ = optimize_xgb(X_train, y_train, time_series=time_series, use_simplified_models=use_simplified_models)
    if not skip_tabnet:
        tabnet_params, _ = optimize_tabnet(X_train, y_train, time_series=time_series, use_simplified_models=use_simplified_models)
    else:
        tabnet_params = {}

    # ------------------------------------------------ step 2: cross-validation plan
    if time_series:
        kf = TimeSeriesSplit(n_splits=n_folds)
    else:
        kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)

    n_base_models = 2 if skip_tabnet else 3
    cv_train_meta = np.zeros((X_train.shape[0], n_base_models))
    # Tracks which training rows really received an out-of-fold prediction.
    oof_filled = np.zeros(X_train.shape[0], dtype=bool)

    trained_models = {
        'lgbm': [],
        'xgb': [],
        'tabnet': [] if not skip_tabnet else None
    }
    trained_scalers = []

    # Keep XGBoost quiet; failure to do so is not fatal.
    try:
        xgb.config.set_config(verbosity=0)
    except Exception as e:
        logger.warning(f"无法重置XGBoost全局配置: {str(e)}")

    early_stopping_triggered = False
    current_accuracy = 0
    folds_completed = 0

    # ------------------------------------------------ step 3: train base learners per fold
    for fold_idx, (train_idx, valid_idx) in enumerate(kf.split(X_train, y_train)):
        logger.info(f"\nTraining fold {fold_idx+1}/{n_folds}")
        X_train_fold, X_valid_fold = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_train_fold, y_valid_fold = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        # The scaler is fitted on the fold's training part only and kept,
        # because this fold's models must always be fed data scaled with it.
        scaler = StandardScaler()
        X_train_fold_scaled = pd.DataFrame(
            scaler.fit_transform(X_train_fold),
            columns=X_train_fold.columns,
            index=X_train_fold.index
        )
        X_valid_fold_scaled = _scale_with(scaler, X_valid_fold)
        trained_scalers.append(scaler)

        # --- LightGBM
        logger.info("Training LightGBM...")
        lgbm_params_complete = lgbm_params.copy()
        lgbm_params_complete['objective'] = 'binary'
        lgbm_params_complete['verbosity'] = -1
        lgbm_params_complete['random_state'] = SEED
        if class_weights:
            lgbm_params_complete['class_weight'] = class_weights

        lgbm_model = lgb.LGBMClassifier(**lgbm_params_complete, n_jobs=-1)
        lgbm_model.fit(X_train_fold_scaled, y_train_fold)
        cv_train_meta[valid_idx, 0] = lgbm_model.predict_proba(X_valid_fold_scaled)[:, 1]
        trained_models['lgbm'].append(lgbm_model)

        # --- XGBoost
        logger.info("Training XGBoost...")
        xgb_params_complete = xgb_params.copy()
        # optimize_xgb may return a {"fallback_to": ...} marker instead of
        # real parameters; it is not an XGBoost argument.
        xgb_params_complete.pop('fallback_to', None)
        xgb_params_complete['objective'] = 'binary:logistic'
        xgb_params_complete['eval_metric'] = 'auc'
        xgb_params_complete['use_label_encoder'] = False
        xgb_params_complete['random_state'] = SEED
        xgb_params_complete['verbosity'] = 0
        if class_weights:
            # XGBoost expresses imbalance as the positive/negative weight ratio.
            xgb_params_complete['scale_pos_weight'] = class_weights[1] / class_weights[0]

        xgb_model = xgb.XGBClassifier(**xgb_params_complete, n_jobs=-1)
        xgb_model.fit(X_train_fold_scaled, y_train_fold)
        cv_train_meta[valid_idx, 1] = xgb_model.predict_proba(X_valid_fold_scaled)[:, 1]
        trained_models['xgb'].append(xgb_model)

        # --- TabNet (optional)
        if not skip_tabnet:
            logger.info("Training TabNet...")
            tabnet_model = TabNetClassifier(
                **tabnet_params,
                optimizer_fn=torch.optim.Adam,
                optimizer_params=dict(lr=0.02),
                scheduler_params={"mode": "min", "factor": 0.7, "patience": 5},
                scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                seed=SEED,
                device_name=device
            )

            tabnet_fit_kwargs = dict(
                X_train=X_train_fold_scaled.values,
                y_train=y_train_fold.values,
                eval_set=[(X_valid_fold_scaled.values, y_valid_fold.values)],
                max_epochs=100 if not use_simplified_models else 50,
                patience=20 if not use_simplified_models else 10,
                batch_size=1024,
                virtual_batch_size=128,
                num_workers=0,
                drop_last=False
            )
            if class_weights:
                # One sampling weight per training row, taken from its class weight.
                sample_weights = np.where(
                    y_train_fold.values == 1, class_weights[1], class_weights[0]
                ).astype(float)
                assert len(sample_weights) == len(y_train_fold), "Weights length doesn't match training samples"
                tabnet_fit_kwargs['weights'] = sample_weights

            tabnet_model.fit(**tabnet_fit_kwargs)
            cv_train_meta[valid_idx, 2] = _tabnet_positive_proba(tabnet_model, X_valid_fold_scaled)
            trained_models['tabnet'].append(tabnet_model)

        oof_filled[valid_idx] = True
        folds_completed = fold_idx + 1
        _release_memory()

        # Early-stop check: accuracy of the averaged out-of-fold probabilities.
        # Nothing is fitted on the validation fold, so the estimate is honest
        # and single-class folds cannot make it crash.
        blended = np.nan_to_num(cv_train_meta[valid_idx], nan=0.5).mean(axis=1)
        fold_predictions = (blended >= 0.5).astype(int)
        fold_accuracy = accuracy_score(y_valid_fold, fold_predictions)
        current_accuracy = fold_accuracy
        logger.info(f"Fold {fold_idx+1} accuracy: {fold_accuracy:.4f}")

        if fold_accuracy >= target_accuracy - accuracy_margin:
            logger.info(f"Reached minimum target accuracy threshold: {fold_accuracy:.4f} >= {target_accuracy-accuracy_margin:.4f}")
            logger.info("Triggering early stopping")
            early_stopping_triggered = True
            break

    if early_stopping_triggered:
        logger.info(f"Early stopping triggered at accuracy {current_accuracy:.4f}")
        logger.info(f"Completed {folds_completed} of {n_folds} folds")
    else:
        logger.info(f"Completed all {n_folds} folds without triggering early stopping")

    # ------------------------------------------------ meta-features
    logger.info("\nGenerating test set predictions from all models...")
    test_meta_features = check_and_fill_nan(_ensemble_meta_features(X_test), "test_meta_features")

    # Only rows with a real out-of-fold prediction may train the meta-model;
    # the remaining rows of cv_train_meta are still the initial zeros.
    oof_rows = np.flatnonzero(oof_filled)
    if len(oof_rows) < X_train.shape[0]:
        logger.info(f"Meta-model uses {len(oof_rows)} of {X_train.shape[0]} training rows that have out-of-fold predictions")
    meta_X_all = check_and_fill_nan(cv_train_meta[oof_rows], "cv_train_meta")
    meta_y_all = y_train.iloc[oof_rows]

    if X_val is not None and y_val is not None:
        logger.info("Using provided validation set")
        X_val_meta = check_and_fill_nan(_ensemble_meta_features(X_val), "X_val_meta")
        y_val_meta = y_val
        meta_fit_X, meta_fit_y = meta_X_all, meta_y_all
    else:
        # No validation set supplied: hold out part of the out-of-fold rows so
        # the meta-model is validated on rows it was not fitted on.
        meta_fit_X, X_val_meta, meta_fit_y, y_val_meta = train_test_split(
            meta_X_all, meta_y_all, test_size=0.2, random_state=SEED, stratify=meta_y_all
        )

    # Standardise the meta-features with a single scaler that is saved with the model.
    meta_scaler = StandardScaler()
    X_all_meta_scaled = check_and_fill_nan(meta_scaler.fit_transform(meta_X_all), "X_train_meta_scaled")
    X_fit_meta_scaled = meta_scaler.transform(meta_fit_X)
    X_val_meta_scaled = meta_scaler.transform(X_val_meta)
    test_meta_features_scaled = check_and_fill_nan(
        meta_scaler.transform(test_meta_features), "test_meta_features_scaled (final)"
    )

    # ------------------------------------------------ step 4: meta-model
    logger.info("\nTraining meta-model (Logistic Regression)...")

    # Small samples: narrower C range, L2 only, fewer CV folds.
    if use_simplified_models:
        meta_param_grid = {'C': [0.001, 0.01, 0.1, 1.0], 'penalty': ['l2']}
        cv_folds = 3
    else:
        meta_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
        cv_folds = 5

    # Explicit weights when they were computed, otherwise sklearn's 'balanced'.
    meta_class_weight = class_weights if class_weights else 'balanced'

    grid_search = GridSearchCV(
        LogisticRegression(random_state=SEED, max_iter=2000, class_weight=meta_class_weight, solver='liblinear'),
        meta_param_grid, cv=cv_folds, scoring='f1', n_jobs=-1
    )
    grid_search.fit(X_fit_meta_scaled, meta_fit_y)
    best_meta_model = grid_search.best_estimator_

    val_meta_preds = best_meta_model.predict_proba(X_val_meta_scaled)[:, 1]
    val_meta_auc = roc_auc_score(y_val_meta, val_meta_preds)
    logger.info(f"Meta-model validation AUC: {val_meta_auc:.4f}")
    logger.info(f"Best meta-model parameters: {grid_search.best_params_}")

    # Refit the selected configuration on every out-of-fold row for deployment.
    final_meta_model = LogisticRegression(
        **grid_search.best_params_, random_state=SEED, max_iter=2000,
        solver='liblinear', class_weight=meta_class_weight
    )
    final_meta_model.fit(X_all_meta_scaled, meta_y_all)

    final_preds = final_meta_model.predict_proba(test_meta_features_scaled)[:, 1]
    final_accuracy = accuracy_score(y_test, (final_preds >= 0.5).astype(int))
    logger.info(f"Final test accuracy: {final_accuracy:.4f}")

    evaluation_results = plot_model_evaluation(y_test, final_preds, industry, "Stacking")

    # ------------------------------------------------ threshold selection (validation data only)
    best_f1 = 0
    best_threshold = 0.5
    threshold_results = []
    beta = 2  # F2 is recorded next to F1; the threshold itself is chosen by F1

    for threshold in np.arange(0.05, 1.0, 0.05):
        y_val_pred_binary = (val_meta_preds >= threshold).astype(int)

        precision = precision_score(y_val_meta, y_val_pred_binary, zero_division=0)
        recall = recall_score(y_val_meta, y_val_pred_binary, zero_division=0)
        f_beta = (1 + beta**2) * precision * recall / ((beta**2 * precision) + recall) if (precision + recall) > 0 else 0
        f1 = f1_score(y_val_meta, y_val_pred_binary, zero_division=0)

        threshold_results.append({
            'threshold': threshold,
            'f1': f1,
            'f_beta': f_beta,
            'precision': precision,
            'recall': recall
        })

        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold

    logger.info(f"Best threshold: {best_threshold:.2f} (F1={best_f1:.4f})")

    best_preds = (final_preds >= best_threshold).astype(int)
    best_threshold_accuracy = accuracy_score(y_test, best_preds)
    logger.info(f"Final accuracy with best threshold ({best_threshold:.2f}): {best_threshold_accuracy:.4f}")

    # ------------------------------------------------ feature importance (first fold's models)
    if trained_models['lgbm'] and trained_models['xgb']:
        base_models_for_importance = {
            'lgbm': trained_models['lgbm'][0],
            'xgb': trained_models['xgb'][0]
        }
        feature_importance = analyze_feature_importance(industry, X_train, y_train, base_models_for_importance)
    else:
        logger.warning(f"Cannot analyze feature importance for {industry} - trained models are empty or incomplete")
        feature_importance = None

    # ------------------------------------------------ assemble and save the deployable artefact
    trained_at = datetime.datetime.now()
    complete_model = {
        'industry': industry,
        'meta_model': final_meta_model,
        'lgbm_models': trained_models['lgbm'],
        'xgb_models': trained_models['xgb'],
        'tabnet_models': trained_models['tabnet'] if not skip_tabnet else None,
        'scalers': trained_scalers,      # one scaler per fold, aligned with the fold models
        'meta_scaler': meta_scaler,
        'lgbm_params': lgbm_params,
        'xgb_params': xgb_params,
        'tabnet_params': tabnet_params if not skip_tabnet else None,
        'features': list(X_train.columns),
        'test_auc': evaluation_results['auc'],
        'pr_auc': evaluation_results['pr_auc'],
        'best_threshold': best_threshold,
        'best_f1': best_f1,
        'threshold_results': threshold_results,
        'class_weights': class_weights,
        'train_date': trained_at.strftime("%Y-%m-%d %H:%M:%S"),
        'time_series': time_series,
        'model_version': trained_at.strftime("%Y%m%d_%H%M%S"),
        'use_simplified_models': use_simplified_models,
        'skip_tabnet': skip_tabnet,
        'n_base_models': n_base_models,
        'early_stopped': early_stopping_triggered,
        'final_accuracy': final_accuracy,
        'best_threshold_accuracy': best_threshold_accuracy,
        'target_accuracy': target_accuracy,
        'accuracy_margin': accuracy_margin
    }

    save_model(complete_model, industry)

    return complete_model

In [28]:
def _load_saved_stacking_model(model_industry):
    """
    Load the saved stacking-model bundle for one industry.

    The joblib file is preferred; the pickle file is used only when no joblib
    file exists for the industry.

    Args:
        model_industry: Industry name used as the model file prefix.

    Returns:
        The object stored in the model file (indexed like a dict by the caller).

    Raises:
        FileNotFoundError: If neither model file exists.
    """
    model_path = f'models/{model_industry}_stacking_model.joblib'
    if os.path.exists(model_path):
        return joblib.load(model_path)

    model_path = f'models/{model_industry}_stacking_model.pkl'
    # NOTE(review): unpickling can execute arbitrary code; only load model
    # files that were produced by this pipeline.
    with open(model_path, 'rb') as f:
        return pickle.load(f)


def _predict_tabnet_in_batches(tabnet_model, X_scaled, batch_size=2048):
    """
    Return positive-class probabilities from one TabNet model.

    Rows are fed to the model in chunks of `batch_size` to limit peak memory.
    """
    num_samples = X_scaled.shape[0]
    batch_preds = []
    for start in range(0, num_samples, batch_size):
        end = min(start + batch_size, num_samples)
        batch = X_scaled.iloc[start:end].values
        batch_preds.append(tabnet_model.predict_proba(batch)[:, 1])
    return np.concatenate(batch_preds)


def _release_prediction_memory():
    """Run the garbage collector and, when CUDA is available, empty its cache."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def _average_saved_base_model_predictions(model_results, X_pred_scaled, model_industry):
    """
    Build the (n_samples, 3) meta-feature matrix from the saved base models.

    Columns are, in order, the fold-averaged positive-class probabilities of
    the LightGBM, XGBoost and TabNet models. Folds without a TabNet model
    contribute a constant 0.5.

    Returns:
        The meta-feature array, or None when the bundle has no saved LightGBM
        models (the caller treats that as "no saved base models").
    """
    lgbm_models = model_results.get('lgbm_models')
    if lgbm_models is None or len(lgbm_models) == 0:
        return None

    logger.info("Using saved base models for prediction")
    xgb_models = model_results['xgb_models']
    tabnet_models = model_results.get('tabnet_models')
    n_samples = X_pred_scaled.shape[0]

    if tabnet_models is None:
        # Warn once instead of once per fold.
        logger.warning(f"No TabNet models found for industry {model_industry}. Using default probability of 0.5.")

    lgbm_preds, xgb_preds, tabnet_preds = [], [], []
    for fold in range(len(lgbm_models)):
        lgbm_preds.append(lgbm_models[fold].predict_proba(X_pred_scaled)[:, 1])
        xgb_preds.append(xgb_models[fold].predict_proba(X_pred_scaled)[:, 1])

        if tabnet_models is None:
            tabnet_preds.append(np.full(n_samples, 0.5))
        elif fold < len(tabnet_models) and tabnet_models[fold] is not None:
            tabnet_preds.append(_predict_tabnet_in_batches(tabnet_models[fold], X_pred_scaled))
        else:
            logger.warning(f"TabNet model missing for fold {fold}. Using default probability of 0.5.")
            tabnet_preds.append(np.full(n_samples, 0.5))

        _release_prediction_memory()

    base_model_preds = np.zeros((n_samples, 3))
    base_model_preds[:, 0] = np.mean(lgbm_preds, axis=0)
    base_model_preds[:, 1] = np.mean(xgb_preds, axis=0)
    base_model_preds[:, 2] = np.mean(tabnet_preds, axis=0)
    return base_model_preds


def _scale_meta_features(model_results, base_model_preds):
    """
    Apply the saved meta-feature scaler to the base-model predictions.

    When the scaler was fitted on a different number of columns than
    `base_model_preds` has, the matrix is truncated to its leading columns or
    zero-padded on the right so that the scaler can be applied. Without a saved
    scaler the predictions are returned unchanged.
    """
    meta_scaler = model_results.get('meta_scaler')
    if meta_scaler is None:
        logger.warning("No saved meta-feature scaler found, using raw predictions (not recommended)")
        return base_model_preds

    if hasattr(meta_scaler, 'n_features_in_'):
        expected_features = meta_scaler.n_features_in_
    else:
        expected_features = meta_scaler.mean_.shape[0]
    actual_features = base_model_preds.shape[1]

    if expected_features == actual_features:
        scaled = meta_scaler.transform(base_model_preds)
        logger.info(f"Using saved meta-feature scaler with {expected_features} features")
        return scaled

    logger.warning(f"Meta-scaler expects {expected_features} features, but got {actual_features} features")
    if expected_features < actual_features:
        # E.g. the meta-model was trained on LightGBM + XGBoost only, while the
        # prediction matrix always carries a third (TabNet) column.
        logger.warning(f"Using only the first {expected_features} features for meta-scaler")
        return meta_scaler.transform(base_model_preds[:, :expected_features])

    logger.warning(f"Padding with zeros for missing {expected_features - actual_features} features")
    padded_preds = np.zeros((base_model_preds.shape[0], expected_features))
    padded_preds[:, :actual_features] = base_model_preds
    return meta_scaler.transform(padded_preds)


def _plot_prediction_summary(pred_results_df, industry_stats, timestamp):
    """
    Save a 2x2 summary figure of the predictions to `plots/`.

    Panels: probability histogram, fraud ratio of the 10 largest industries,
    the 10 highest thresholds, and sample count vs. fraud ratio.
    """
    # Configure the font once, up front, so industry names in Chinese render.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    # Histogram of predicted probabilities
    ax = axes[0, 0]
    sns.histplot(pred_results_df['Fraud_Probability'], bins=20, ax=ax)
    ax.set_title('Distribution of Fraud Probabilities')
    ax.set_xlabel('Probability')
    ax.set_ylabel('Count')

    # Bar charts need 'Industry' as a regular column rather than the index.
    stats_by_industry = industry_stats.reset_index()
    bar_panels = (
        (axes[0, 1], 'Count', 'Fraud_Ratio', 'Fraud Ratio', 'Fraud Ratio by Industry'),
        (axes[1, 0], 'Threshold', 'Threshold', 'Threshold', 'Threshold Used by Industry'),
    )
    for ax, sort_col, value_col, xlabel, title in bar_panels:
        top_rows = stats_by_industry.sort_values(sort_col, ascending=False).head(10)
        if len(top_rows) > 0:
            sns.barplot(x=value_col, y='Industry', data=top_rows, ax=ax)
            ax.set_xlabel(xlabel)
            ax.set_ylabel('Industry')
        else:
            ax.text(0.5, 0.5, "No data to display",
                    horizontalalignment='center', verticalalignment='center')
        ax.set_title(title)

    # Sample count vs. fraud ratio, coloured by threshold
    ax = axes[1, 1]
    if len(industry_stats) > 0:
        points = ax.scatter(
            industry_stats['Count'],
            industry_stats['Fraud_Ratio'],
            alpha=0.7,
            c=industry_stats['Threshold'],
            cmap='viridis'
        )
        fig.colorbar(points, ax=ax, label='Threshold')
        ax.set_xscale('log')
        ax.set_xlabel('Sample Count (log scale)')
        ax.set_ylabel('Fraud Ratio')
        ax.set_title('Sample Count vs Fraud Ratio')

        # industry_stats is sorted by fraud ratio, so this labels the five
        # industries with the highest predicted fraud ratio.
        for industry_label, row in industry_stats.head(5).iterrows():
            ax.annotate(
                industry_label,
                (row['Count'], row['Fraud_Ratio']),
                xytext=(5, 5),
                textcoords='offset points'
            )

    os.makedirs('plots', exist_ok=True)
    fig.tight_layout()
    fig.savefig(f'plots/predictions_summary_{timestamp}.png', dpi=600, bbox_inches='tight')
    plt.close(fig)


def predict_unknown_samples(industry=None):
    """
    Predict fraud for samples whose label is unknown, using only saved artifacts.

    Rows of 'output/data_partial_balanced.csv' with a missing FLAG are scored
    with the saved per-industry stacking model: saved feature scaler, saved
    base models, saved meta-feature scaler and meta-model, and the
    industry-specific threshold stored with the model (0.5 when absent).
    Nothing is refitted on the prediction data unless a model bundle has no
    saved feature scaler, in which case a new StandardScaler is fitted and a
    warning is logged.

    Results are written to 'results/' as CSV and a summary figure to 'plots/'.

    Args:
        industry: If given, only this industry is predicted.

    Returns:
        DataFrame with columns ID, TICKER_SYMBOL, Industry, Fraud_Probability,
        Predicted_Fraud and Threshold_Used, or None when the input data or
        models are missing, a model has no saved base models, or no prediction
        could be made.
    """
    # Load data
    logger.info("Loading data for prediction...")
    try:
        financial_data = pd.read_csv('output/data_partial_balanced.csv')
    except FileNotFoundError as e:
        logger.error(f"Data file not found: {e}")
        return None

    # Keep only the unlabeled rows; validate_and_clean_data is defined
    # elsewhere in this notebook.
    pred_data = financial_data[financial_data['FLAG'].isna()].copy()
    pred_data = validate_and_clean_data(pred_data)

    if industry:
        pred_data = pred_data[pred_data['Industry'] == industry].copy()
        logger.info(f"Predicting for industry {industry} only. {len(pred_data)} samples to predict.")

    # Discover the industries that have a saved model
    if not os.path.isdir('models'):
        logger.error("No models directory found. Please run models for industries first.")
        return None

    # A model may be saved as both .pkl and .joblib; collapse to one entry per
    # industry so that no industry is predicted twice.
    model_industries = sorted({
        f.split('_stacking_model.')[0]
        for f in os.listdir('models')
        if f.endswith(('_stacking_model.pkl', '_stacking_model.joblib'))
    })

    if len(model_industries) == 0:
        logger.error("No model files found. Please run models for industries first.")
        return None

    if industry:
        model_industries = [name for name in model_industries if name == industry]

    # One result frame per successfully predicted industry
    result_frames = []

    for model_industry in model_industries:
        industry_pred_data = pred_data[pred_data['Industry'] == model_industry].copy()

        if len(industry_pred_data) == 0:
            logger.info(f"No prediction data for industry {model_industry}")
            continue

        logger.info(f"\nPredicting for industry {model_industry} ({len(industry_pred_data)} samples)")

        try:
            model_results = _load_saved_stacking_model(model_industry)

            features = model_results['features']

            # Industry-specific decision threshold; fall back to 0.5 when the
            # bundle has none (missing key or stored as None).
            threshold = model_results.get('best_threshold')
            if threshold is None:
                threshold = 0.5
            logger.info(f"Using threshold: {threshold} for industry {model_industry}")

            # For time-series models, order the rows by the date column when
            # the bundle names one. Read with .get: bundles may have no
            # 'date_col' entry, and indexing it directly would raise KeyError.
            date_col = model_results.get('date_col')
            if (model_results.get('time_series')
                    and date_col is not None
                    and date_col in industry_pred_data.columns):
                industry_pred_data = industry_pred_data.sort_values(date_col)
                logger.info(f"Sorted prediction data by {date_col} for time series model")

            # Restrict to the features that exist in the prediction data
            valid_features = [f for f in features if f in industry_pred_data.columns]
            if len(valid_features) < len(features):
                missing_features = set(features) - set(valid_features)
                logger.warning(f"Warning: {len(missing_features)} features not found for industry {model_industry}")
                logger.debug(f"Missing features: {missing_features}")
                logger.info(f"Using {len(valid_features)} valid features")

            X_pred = industry_pred_data[valid_features]

            # Standardize with the saved scaler (transform only, no refit)
            saved_scalers = model_results.get('scalers')
            if saved_scalers is not None and len(saved_scalers) > 0:
                scaled_values = saved_scalers[0].transform(X_pred)
                logger.info("Using saved scaler for feature standardization")
            else:
                # Fitting on the prediction data should be avoided; kept only
                # as a fallback for bundles without a saved scaler.
                logger.warning("No saved scaler found, creating a new one (not recommended)")
                scaled_values = StandardScaler().fit_transform(X_pred)
            X_pred_scaled = pd.DataFrame(scaled_values, columns=X_pred.columns, index=X_pred.index)

            meta_model = model_results['meta_model']

            base_model_preds = _average_saved_base_model_predictions(
                model_results, X_pred_scaled, model_industry
            )
            if base_model_preds is None:
                # Retraining here would fit models inside the prediction step,
                # so the whole run is aborted instead.
                logger.warning("No saved base models found, retraining is not recommended")
                logger.error("Retraining models during prediction is not supported in this version to avoid data leakage")
                return None

            base_model_preds_scaled = _scale_meta_features(model_results, base_model_preds)

            logger.info("Generating final predictions with meta-model...")
            fraud_probs = meta_model.predict_proba(base_model_preds_scaled)[:, 1]
            fraud_preds = (fraud_probs >= threshold).astype(int)

            # Build the result rows positionally so that duplicate index
            # labels cannot break the TICKER_SYMBOL lookup.
            if 'TICKER_SYMBOL' in industry_pred_data.columns:
                tickers = industry_pred_data['TICKER_SYMBOL'].to_numpy()
            else:
                tickers = ''
            result_frames.append(pd.DataFrame({
                'ID': industry_pred_data.index.to_numpy(),
                'TICKER_SYMBOL': tickers,
                'Industry': model_industry,
                'Fraud_Probability': fraud_probs,
                'Predicted_Fraud': fraud_preds,
                'Threshold_Used': threshold
            }))

            _release_prediction_memory()

        except Exception as e:
            # Best effort: a failure for one industry must not stop the others.
            logger.error(f"Error predicting for industry {model_industry}: {e}", exc_info=True)

    if len(result_frames) == 0:
        logger.warning("No predictions were made.")
        return None

    pred_results_df = pd.concat(result_frames, ignore_index=True)

    # Save predictions
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    if industry:
        result_path = f'results/{industry}_predictions_{timestamp}.csv'
    else:
        result_path = f'results/all_predictions_{timestamp}.csv'

    os.makedirs('results', exist_ok=True)
    pred_results_df.to_csv(result_path, index=False)
    logger.info(f"Predictions saved to {result_path}")

    # Overall statistics
    logger.info("\nPrediction statistics:")
    logger.info(f"Total samples predicted: {len(pred_results_df)}")
    logger.info(f"Predicted fraud cases: {pred_results_df['Predicted_Fraud'].sum()} ({pred_results_df['Predicted_Fraud'].mean():.2%})")

    # Per-industry statistics, highest predicted fraud ratio first
    industry_stats = pred_results_df.groupby('Industry')[['Predicted_Fraud', 'Threshold_Used']].agg({
        'Predicted_Fraud': ['count', 'sum', 'mean'],
        'Threshold_Used': 'first'
    })
    industry_stats.columns = ['Count', 'Fraud_Count', 'Fraud_Ratio', 'Threshold']
    industry_stats = industry_stats.sort_values('Fraud_Ratio', ascending=False)

    logger.info("\nIndustry statistics (Top 5 by fraud ratio):")
    logger.info(f"\n{industry_stats.head(5)}")

    _plot_prediction_summary(pred_results_df, industry_stats, timestamp)

    return pred_results_df

In [29]:
def combine_industry_results():
    """
    Combine the saved results of all industry models into a summary report.

    Every '*_stacking_model.pkl' / '*_stacking_model.joblib' file in 'models/'
    is loaded (one file per industry, joblib preferred when both exist). The
    test AUC, PR-AUC, feature count and training date of each model are
    collected into a table that is logged, written to 'results/' as CSV and
    plotted to 'plots/'.

    Returns:
        DataFrame with one row per industry, sorted by 'Test AUC' descending,
        or None when there is no models directory, no model file, or no model
        could be loaded.
    """
    logger.info("Combining industry results...")
    if not os.path.exists('models'):
        logger.error("No models directory found. Please run models for industries first.")
        return None

    # One model file per industry. The '.joblib' pass runs second so that it
    # overrides a '.pkl' entry for the same industry.
    model_file_by_industry = {}
    for ext in ('pkl', 'joblib'):
        for file_name in os.listdir('models'):
            if file_name.endswith(f'_stacking_model.{ext}'):
                model_file_by_industry[file_name.split('_stacking_model.')[0]] = file_name

    if len(model_file_by_industry) == 0:
        logger.error("No model files found. Please run models for industries first.")
        return None

    # Load each model and extract its summary row. Row extraction happens
    # inside the try block so that an unreadable or incomplete model file is
    # logged and skipped instead of aborting the whole report.
    summary = []
    for industry, model_file in model_file_by_industry.items():
        try:
            model_path = os.path.join('models', model_file)

            if model_file.endswith('.joblib'):
                result = joblib.load(model_path)
            else:
                # NOTE(review): unpickling can execute arbitrary code; only
                # load model files that were produced by this pipeline.
                with open(model_path, 'rb') as f:
                    result = pickle.load(f)

            summary.append({
                'Industry': result['industry'],
                'Test AUC': result['test_auc'],
                'PR-AUC': result.get('pr_auc', 0),  # older models may lack pr_auc
                'Features Count': len(result['features']),
                'Training Date': result.get('train_date', 'Unknown')  # older models may lack train_date
            })
            logger.info(f"Loaded model for industry: {industry}")
        except Exception as e:
            logger.error(f"Error loading model {model_file}: {e}", exc_info=True)

    if len(summary) == 0:
        # Without this guard the empty frame has no 'Test AUC' column and the
        # sort below raises KeyError.
        logger.error("No model results could be loaded. Nothing to summarize.")
        return None

    summary_df = pd.DataFrame(summary).sort_values('Test AUC', ascending=False)

    logger.info("\n=== Model Performance Summary ===")
    logger.info(f"\n{summary_df}")

    # Save the table
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    os.makedirs('results', exist_ok=True)
    summary_df.to_csv(f'results/model_performance_summary_{timestamp}.csv', index=False)

    # Configure the font once, up front, so industry names in Chinese render.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Performance comparison chart
    fig, (auc_ax, pr_ax) = plt.subplots(2, 1, figsize=(15, 10))

    sns.barplot(x='Test AUC', y='Industry', data=summary_df.sort_values('Test AUC', ascending=False), ax=auc_ax)
    auc_ax.set_title('AUC Score by Industry')
    auc_ax.set_xlim(0.5, 1.0)
    auc_ax.grid(True, axis='x')

    sns.barplot(x='PR-AUC', y='Industry', data=summary_df.sort_values('PR-AUC', ascending=False), ax=pr_ax)
    pr_ax.set_title('PR-AUC Score by Industry')
    pr_ax.set_xlim(0, 1.0)
    pr_ax.grid(True, axis='x')

    os.makedirs('plots', exist_ok=True)
    fig.tight_layout()
    fig.savefig(f'plots/model_performance_comparison_{timestamp}.png', dpi=600, bbox_inches='tight')
    plt.close(fig)

    logger.info(f"Summary saved to: results/model_performance_summary_{timestamp}.csv")
    logger.info(f"Comparison chart saved to: plots/model_performance_comparison_{timestamp}.png")

    # Aggregate statistics over all industries
    auc_values = summary_df['Test AUC']
    mean_auc = auc_values.mean()
    std_auc = auc_values.std()
    max_auc = auc_values.max()
    min_auc = auc_values.min()

    logger.info("\nPerformance statistics:")
    logger.info(f"Average AUC: {mean_auc:.4f} (std: {std_auc:.4f})")
    logger.info(f"Max AUC: {max_auc:.4f} (industry: {summary_df.loc[auc_values.idxmax(), 'Industry']})")
    logger.info(f"Min AUC: {min_auc:.4f} (industry: {summary_df.loc[auc_values.idxmin(), 'Industry']})")

    # A very high AUC can be a symptom of data leakage, so flag it for review.
    suspicious_models = summary_df[summary_df['Test AUC'] > 0.95]
    if len(suspicious_models) > 0:
        logger.warning("\nWARNING: The following models have suspiciously high AUC (>0.95), which might indicate data leakage:")
        for _, row in suspicious_models.iterrows():
            logger.warning(f"Industry: {row['Industry']}, AUC: {row['Test AUC']:.4f}")

    return summary_df

In [30]:
def run_single_industry(industry_name, time_series=False, date_col=None, target_accuracy=0.89, accuracy_margin=0.01):
    """
    Train and evaluate the stacking model for a single industry.

    Loads the labeled rows of 'output/data_partial_balanced.csv' for the
    industry and its selected features, passes them to
    process_single_industry (defined elsewhere in this notebook), logs the
    resulting metrics and writes a one-row performance CSV to 'results/'.

    Args:
        industry_name: Name of the industry; must use the supervised ('有监督')
            feature-selection method in the industry features file.
        time_series: Forwarded to process_single_industry; whether the data is
            handled as a time series.
        date_col: Name of the date column; only used when time_series=True.
        target_accuracy: Target accuracy; forwarded to process_single_industry,
            which is expected to stop training near this value.
        accuracy_margin: Tolerance around target_accuracy.

    Returns:
        The value returned by process_single_industry, or None when the input
        files are missing, the industry is not a supervised industry, or the
        industry has no labeled samples.
    """
    # Load data
    try:
        logger.info(f"Loading data for industry: {industry_name}")
        financial_data = pd.read_csv('output/data_partial_balanced.csv')
        industry_features_df = pd.read_csv('industry_features_results/all_industries_features.csv')
    except FileNotFoundError as e:
        logger.error(f"Error loading data: {e}")
        return None

    # Only industries whose features were selected with the supervised method
    # can be trained here.
    supervised_industries = industry_features_df[industry_features_df['Method'] == '有监督']['Industry'].unique()
    if industry_name not in supervised_industries:
        logger.error(f"Error: Industry {industry_name} is not in the supervised industry list.")
        logger.info(f"Available supervised industries: {supervised_industries}")
        return None

    # Labeled rows of this industry (rows with a missing FLAG are the
    # prediction set and are excluded).
    is_labeled = financial_data['FLAG'].notna()
    is_industry = financial_data['Industry'] == industry_name
    industry_data = financial_data[is_labeled & is_industry].copy()

    if len(industry_data) == 0:
        # Without this guard the fraud ratio below divides by zero and an
        # empty frame is handed to training.
        logger.error(f"No labeled samples found for industry: {industry_name}")
        return None

    # Special handling for small-sample industries
    use_simplified_models = False
    skip_tabnet = False
    n_folds = 5

    if industry_name == "教育":
        logger.info("Education industry detected - using simplified models and increased regularization")
        use_simplified_models = True
        skip_tabnet = True
        n_folds = 3
    elif industry_name == "综合":
        logger.info("Comprehensive industry detected - using moderately simplified models")
        use_simplified_models = True
        n_folds = 4

    # Features selected for this industry
    industry_features = industry_features_df.loc[
        industry_features_df['Industry'] == industry_name, 'Feature'
    ].tolist()
    logger.info(f"Industry features: {len(industry_features)} features selected")

    # Class distribution
    fraud_count = industry_data['FLAG'].sum()
    non_fraud_count = len(industry_data) - fraud_count
    fraud_ratio = fraud_count / len(industry_data)
    logger.info(f"Sample distribution: {len(industry_data)} total, {fraud_count} fraud ({fraud_ratio:.2%}), {non_fraud_count} non-fraud")

    logger.info(f"Target accuracy: {target_accuracy:.4f} (±{accuracy_margin:.4f})")

    # Train and evaluate
    result = process_single_industry(
        industry_name,
        industry_data,
        industry_features,
        time_series=time_series,
        date_col=date_col,
        use_simplified_models=use_simplified_models,
        skip_tabnet=skip_tabnet,
        n_folds=n_folds,
        target_accuracy=target_accuracy,
        accuracy_margin=accuracy_margin
    )

    if result:
        logger.info(f"Successfully processed industry: {industry_name}")
        logger.info(f"Model saved to: models/{industry_name}_stacking_model.pkl and .joblib")
        logger.info(f"Performance: AUC = {result['test_auc']:.4f}, PR-AUC = {result['pr_auc']:.4f}")

        # Accuracy details are optional in the result
        if 'final_accuracy' in result:
            logger.info(f"Final accuracy: {result['final_accuracy']:.4f}")
        if 'best_threshold_accuracy' in result:
            logger.info(f"Accuracy with best threshold: {result['best_threshold_accuracy']:.4f}")
        if result.get('early_stopped'):
            logger.info(f"Model training was early stopped at target accuracy {target_accuracy:.4f}±{accuracy_margin:.4f}")

        # One-row performance summary for this industry
        performance_data = {
            'Industry': [industry_name],
            'AUC': [result['test_auc']],
            'PR-AUC': [result['pr_auc']],
            'Accuracy': [result.get('final_accuracy', 0)],
            'Accuracy_Best_Threshold': [result.get('best_threshold_accuracy', 0)],
            'Sample_Count': [len(industry_data)],
            'Fraud_Ratio': [fraud_ratio],
            'Early_Stopped': [result.get('early_stopped', False)]
        }
        os.makedirs('results', exist_ok=True)
        pd.DataFrame(performance_data).to_csv(f'results/{industry_name}_performance.csv', index=False)
    else:
        logger.error(f"Failed to process industry: {industry_name}")

    return result

In [31]:
def plot_model_evaluation(y_true, y_pred, industry, model_name="Stacking", threshold=0.5):
    """
    Plot and log evaluation results for a binary classifier.

    Draws a 2x2 figure (ROC curve, precision-recall curve, confusion matrix,
    per-class metrics table) and saves it to
    'plots/{industry}_{model_name.lower()}_evaluation.png'.

    Args:
        y_true: True labels.
        y_pred: Predicted probabilities of the positive class.
        industry: Industry name, used in titles and in the file name.
        model_name: Model name, used in titles and in the file name.
        threshold: Probability cut-off used to turn y_pred into class labels
            for the confusion matrix and classification report. Defaults to
            0.5, the value that was previously hard-coded.

    Returns:
        dict with keys 'auc', 'pr_auc', 'confusion_matrix' and
        'classification_report' (the report in dict form).
    """
    # Kept local: it is not visible here whether the notebook's import cell
    # already provides roc_curve.
    from sklearn.metrics import roc_curve

    # Accept lists as well as arrays/Series for the comparison below.
    y_scores = np.asarray(y_pred)

    # Threshold-free metrics
    test_auc = roc_auc_score(y_true, y_scores)
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
    pr_auc = auc(recall, precision)

    # Thresholded metrics
    y_pred_binary = (y_scores >= threshold).astype(int)
    cm = confusion_matrix(y_true, y_pred_binary)
    report = classification_report(y_true, y_pred_binary, output_dict=True)

    # Per-class metrics table. The report is keyed by the string form of each
    # label, so float labels would appear as '0.0'/'1.0'; both spellings are
    # tried before a class is reported as missing.
    metric_names = ['precision', 'recall', 'f1-score', 'support']
    table_rows = {}
    for label in (0, 1):
        report_key = next(
            (key for key in (str(label), str(float(label))) if key in report),
            None
        )
        if report_key is None:
            table_rows[f'Class {label} (Missing)'] = [
                0 if metric == 'support' else float('nan') for metric in metric_names
            ]
        else:
            table_rows[f'Class {label}'] = [report[report_key][metric] for metric in metric_names]
    metrics_df = pd.DataFrame.from_dict(
        table_rows,
        orient='index',
        columns=[metric.capitalize() for metric in metric_names]
    )

    # Configure the font once, up front, so industry names in Chinese render.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    fig, axes = plt.subplots(2, 2, figsize=(20, 16))

    # ROC curve
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    roc_ax = axes[0, 0]
    roc_ax.plot(fpr, tpr, label=f'AUC = {test_auc:.4f}')
    roc_ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title(f'{industry} - {model_name} ROC Curve')
    roc_ax.legend(loc='lower right')

    # Precision-recall curve
    pr_ax = axes[0, 1]
    pr_ax.plot(recall, precision, label=f'PR-AUC = {pr_auc:.4f}')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title(f'{industry} - {model_name} Precision-Recall Curve')
    pr_ax.legend(loc='lower left')

    # Confusion matrix
    cm_ax = axes[1, 0]
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('True')
    cm_ax.set_title(f'{industry} - {model_name} Confusion Matrix')

    # Classification metrics table
    table_ax = axes[1, 1]
    table = table_ax.table(
        cellText=metrics_df.values.round(3),
        rowLabels=metrics_df.index,
        colLabels=metrics_df.columns,
        cellLoc='center',
        loc='center'
    )
    table.scale(1, 1.5)
    table_ax.axis('off')
    table_ax.set_title(f'{industry} - {model_name} Classification Metrics')

    # Save the figure
    os.makedirs('plots', exist_ok=True)
    fig.tight_layout()
    fig.savefig(f'plots/{industry}_{model_name.lower()}_evaluation.png', dpi=600, bbox_inches='tight')
    plt.close(fig)

    # Log the results
    logger.info(f"{industry} - {model_name} Evaluation:")
    logger.info(f"AUC: {test_auc:.4f}, PR-AUC: {pr_auc:.4f}")
    logger.info(f"Classification Report:\n{classification_report(y_true, y_pred_binary)}")

    return {
        'auc': test_auc,
        'pr_auc': pr_auc,
        'confusion_matrix': cm,
        'classification_report': report
    }

In [32]:
def analyze_feature_importance(industry, X_train, y_train, trained_models=None):
    """
    Rank features by LightGBM and XGBoost importance, plot the top 15 of each, and save the results.

    A model found in ``trained_models`` is reused as-is; a missing one is fitted
    on ``X_train`` / ``y_train`` with default hyper-parameters.

    Args:
        industry: Industry name, used in plot titles and output file names.
        X_train: Training features (a DataFrame; its columns are the feature names).
        y_train: Training labels.
        trained_models: Optional dict of already-fitted models, looked up under
            the keys 'lgbm' and 'xgb'.

    Returns:
        A DataFrame with columns 'Feature', 'Importance' and 'Model' holding the
        LightGBM rows followed by the XGBoost rows, each sorted by descending
        importance. The same frame is written to
        ``results/<industry>_feature_importance.csv``.
    """
    supplied = {} if trained_models is None else trained_models

    def _ranked_importance(model, label):
        # One row per feature, most important first
        frame = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': model.feature_importances_,
            'Model': label
        })
        return frame.sort_values('Importance', ascending=False)

    # LightGBM: reuse the supplied model, otherwise fit a fresh one
    if 'lgbm' in supplied:
        lgbm_model = supplied['lgbm']
    else:
        lgbm_model = lgb.LGBMClassifier(
            objective='binary',
            n_jobs=-1,
            verbosity=-1,
            random_state=SEED
        )
        lgbm_model.fit(X_train, y_train)
    lgbm_ranked = _ranked_importance(lgbm_model, 'LightGBM')

    # XGBoost: same reuse-or-fit policy
    if 'xgb' in supplied:
        xgb_model = supplied['xgb']
    else:
        xgb_model = xgb.XGBClassifier(
            objective='binary:logistic',
            eval_metric='auc',
            use_label_encoder=False,
            n_jobs=-1,
            random_state=SEED
        )
        xgb_model.fit(X_train, y_train)
    xgb_ranked = _ranked_importance(xgb_model, 'XGBoost')

    all_importances = pd.concat([lgbm_ranked, xgb_ranked])

    # Side-by-side bar charts of the 15 most important features per model
    plt.figure(figsize=(20, 10))
    panels = [(lgbm_ranked, 'LightGBM'), (xgb_ranked, 'XGBoost')]
    for position, (ranked, label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        sns.barplot(x='Importance', y='Feature', data=ranked.head(15))
        plt.title(f'{industry} - {label} Feature Importance')

    # Font settings so that Chinese labels render
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.tight_layout()
    plt.savefig(f'plots/{industry}_feature_importance.png', dpi=600, bbox_inches='tight')
    plt.close()

    # Persist the combined importance table
    all_importances.to_csv(f'results/{industry}_feature_importance.csv', index=False)

    return all_importances

In [33]:
def _eval_format_metric(value):
    """Format an optional numeric metric for logging; ``None`` is rendered as 'N/A'."""
    return f"{value:.4f}" if value is not None else "N/A"


def _eval_has_saved(model_results, key):
    """Return True when ``model_results[key]`` exists and is non-empty."""
    return key in model_results and len(model_results[key]) > 0


def _eval_scale_features(model_results, X):
    """Standardize ``X`` with the first saved scaler, or with a newly fitted one as a fallback."""
    if _eval_has_saved(model_results, 'scalers'):
        # NOTE(review): only the first saved scaler is applied, for every fold's
        # models - confirm this matches how the folds were trained.
        scaled_values = model_results['scalers'][0].transform(X)
    else:
        # Fallback only: re-fitting on evaluation data is not recommended
        scaled_values = StandardScaler().fit_transform(X)
    return pd.DataFrame(scaled_values, columns=X.columns, index=X.index)


def _eval_base_model_probs(model_results, X_scaled, batch_size=2048):
    """Return an (n_samples, 3) array of fold-averaged LightGBM / XGBoost / TabNet probabilities."""
    lgbm_preds, xgb_preds, tabnet_preds = [], [], []
    num_samples = X_scaled.shape[0]

    for fold in range(len(model_results['lgbm_models'])):
        lgbm_model = model_results['lgbm_models'][fold]
        xgb_model = model_results['xgb_models'][fold]
        tabnet_model = model_results['tabnet_models'][fold]

        lgbm_preds.append(lgbm_model.predict_proba(X_scaled)[:, 1])
        xgb_preds.append(xgb_model.predict_proba(X_scaled)[:, 1])

        # TabNet is fed in batches to limit memory usage
        tabnet_batches = [
            tabnet_model.predict_proba(X_scaled.iloc[start:start + batch_size].values)[:, 1]
            for start in range(0, num_samples, batch_size)
        ]
        tabnet_preds.append(np.concatenate(tabnet_batches))

        # Release memory between folds
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Average the per-fold predictions of each base model
    base_model_preds = np.zeros((num_samples, 3))
    base_model_preds[:, 0] = np.mean(lgbm_preds, axis=0)
    base_model_preds[:, 1] = np.mean(xgb_preds, axis=0)
    base_model_preds[:, 2] = np.mean(tabnet_preds, axis=0)
    return base_model_preds


def _eval_meta_probs(model_results, base_model_preds):
    """Apply the saved meta-feature scaler (if any) and return the meta model's positive-class probabilities."""
    if 'meta_scaler' in model_results:
        meta_features = model_results['meta_scaler'].transform(base_model_preds)
    else:
        meta_features = base_model_preds
    return model_results['meta_model'].predict_proba(meta_features)[:, 1]


def evaluate_industry_model(industry_name):
    """
    Evaluate the saved stacking model of one industry and write reports to ``evaluations/<industry_name>/``.

    The function loads ``models/<industry_name>_stacking_model.joblib`` (or ``.pkl``),
    scores the industry's labeled rows with the saved base models and meta model,
    searches an F1-optimal decision threshold on one stratified half of the labeled
    rows, reports metrics on the other half, stores the threshold back into the
    model file, writes error-analysis CSVs, and finally predicts the unlabeled rows.

    Args:
        industry_name: Industry name as it appears in the 'Industry' column of
            ``output/data_partial_balanced.csv``.

    Returns:
        A completion message string on success. ``None`` when the model file or the
        data files cannot be loaded, when the model has no 'meta_model', or when
        labeled rows exist but no saved base models are available.
    """
    # Function-scope imports, as in the original cell
    import shutil
    from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
    from sklearn.model_selection import train_test_split

    logger.info(f"Evaluating model for industry: {industry_name}")

    # Locate the model file (joblib preferred over pickle)
    model_path_joblib = f'models/{industry_name}_stacking_model.joblib'
    model_path_pkl = f'models/{industry_name}_stacking_model.pkl'

    if os.path.exists(model_path_joblib):
        model_path = model_path_joblib
    elif os.path.exists(model_path_pkl):
        model_path = model_path_pkl
    else:
        logger.error(f"No model found for industry {industry_name}. Please train a model first.")
        return

    # Load the model
    logger.info(f"Loading model from {model_path}")
    try:
        if model_path.endswith('.joblib'):
            model_results = joblib.load(model_path)
        else:
            # NOTE(review): pickle/joblib can execute arbitrary code while loading;
            # only load model files that come from a trusted source.
            with open(model_path, 'rb') as f:
                model_results = pickle.load(f)
    except Exception as e:
        logger.error(f"Error loading model: {e}", exc_info=True)
        return

    # Load the evaluation data
    try:
        financial_data = pd.read_csv('output/data_partial_balanced.csv')
        industry_features_df = pd.read_csv('industry_features_results/all_industries_features.csv')
    except FileNotFoundError as e:
        logger.error(f"Error loading data: {e}")
        return

    # Features the model was trained on
    features = model_results['features']

    # Performance metrics stored with the model (any of them may be absent)
    test_auc = model_results.get('test_auc', None)
    pr_auc = model_results.get('pr_auc', None)
    train_date = model_results.get('train_date', 'Unknown')

    logger.info(f"Model info: Trained on {train_date}")
    # Metrics may be None, so they are formatted defensively instead of with a bare ':.4f'
    logger.info(
        f"Performance metrics: AUC = {_eval_format_metric(test_auc)}, "
        f"PR-AUC = {_eval_format_metric(pr_auc)} (from model)"
    )

    # Rows of this industry, split by label availability
    industry_data = financial_data[financial_data['Industry'] == industry_name].copy()
    labeled_data = industry_data[industry_data['FLAG'].notna()].copy()
    unlabeled_data = industry_data[industry_data['FLAG'].isna()].copy()

    logger.info(f"Data statistics: Total={len(industry_data)}, Labeled={len(labeled_data)}, Unlabeled={len(unlabeled_data)}")

    # The meta model is required for every prediction below
    if 'meta_model' not in model_results:
        logger.error("Invalid model format: missing meta_model")
        return

    # Output directory for this industry's reports
    eval_dir = f'evaluations/{industry_name}'
    os.makedirs(eval_dir, exist_ok=True)

    # Model information report
    model_info = {
        'Industry': industry_name,
        'Train Date': train_date,
        'Test AUC': test_auc,
        'PR-AUC': pr_auc,
        'Features Count': len(features),
        'Total Samples': len(industry_data),
        'Labeled Samples': len(labeled_data),
        'Unlabeled Samples': len(unlabeled_data)
    }

    # Class distribution of the labeled rows
    if len(labeled_data) > 0:
        fraud_count = labeled_data['FLAG'].sum()
        non_fraud_count = len(labeled_data) - fraud_count
        fraud_ratio = fraud_count / len(labeled_data)

        model_info.update({
            'Fraud Count': fraud_count,
            'Non-Fraud Count': non_fraud_count,
            'Fraud Ratio': fraud_ratio
        })

    pd.DataFrame([model_info]).to_csv(f'{eval_dir}/model_info.csv', index=False)

    # Feature importance chart, built from a previously saved importance table
    if os.path.exists(f'results/{industry_name}_feature_importance.csv'):
        feature_importance = pd.read_csv(f'results/{industry_name}_feature_importance.csv')

        plt.figure(figsize=(15, 10))
        for i, model_type in enumerate(['LightGBM', 'XGBoost']):
            model_importance = feature_importance[feature_importance['Model'] == model_type]
            top_features = model_importance.sort_values('Importance', ascending=False).head(20)

            plt.subplot(1, 2, i+1)
            sns.barplot(x='Importance', y='Feature', data=top_features)
            plt.title(f'{industry_name} - {model_type} Feature Importance')
            plt.tight_layout()

        plt.savefig(f'{eval_dir}/feature_importance.png', dpi=600, bbox_inches='tight')
        plt.close()

    # ------------------------------------------------------------------
    # Evaluation on labeled rows
    # ------------------------------------------------------------------
    if len(labeled_data) > 0:
        valid_features = [f for f in features if f in labeled_data.columns]
        if len(valid_features) < len(features):
            logger.warning(f"Missing {len(features) - len(valid_features)} features in labeled data")

        X_eval = labeled_data[valid_features]
        y_eval = labeled_data['FLAG']

        # Standardize with the saved scaler (transform only) whenever one is available
        if _eval_has_saved(model_results, 'scalers'):
            logger.info("Using saved scalers for evaluation")
        else:
            logger.warning("No saved scalers found in model, creating new scaler (not recommended)")
        X_eval_scaled = _eval_scale_features(model_results, X_eval)

        # Base-model predictions; re-training here would leak data, so saved models are mandatory
        if not _eval_has_saved(model_results, 'lgbm_models'):
            logger.warning("No saved base models found, evaluation may be limited")
            logger.error("Cannot evaluate model without saved base models")
            return
        logger.info("Using saved base models for evaluation")
        base_model_preds = _eval_base_model_probs(model_results, X_eval_scaled)

        # Meta-model prediction on (optionally scaled) base-model outputs
        if 'meta_scaler' in model_results:
            logger.info("Using saved meta-feature scaler")
        else:
            logger.warning("No saved meta-feature scaler found, using unscaled predictions")
        final_preds = _eval_meta_probs(model_results, base_model_preds)

        # Diagnostic plots and metrics
        evaluation_label = "Evaluation"
        plot_model_evaluation(y_eval, final_preds, industry_name, evaluation_label)

        # plot_model_evaluation appears to save its figure under plots/ and then close it,
        # so calling plt.savefig() here would write a blank image and leave a stray
        # figure open. Copy the file it saved instead.
        evaluation_plot = f'plots/{industry_name}_{evaluation_label.lower()}_evaluation.png'
        if os.path.exists(evaluation_plot):
            shutil.copyfile(evaluation_plot, f'{eval_dir}/model_evaluation.png')
        else:
            logger.warning(f"Evaluation plot not found at {evaluation_plot}; model_evaluation.png not written")

        # True labels next to the predicted probabilities
        eval_df = pd.DataFrame({
            'True_Label': y_eval,
            'Predicted_Probability': final_preds,
            'LightGBM_Prob': base_model_preds[:, 0],
            'XGBoost_Prob': base_model_preds[:, 1],
            'TabNet_Prob': base_model_preds[:, 2],
            'Predicted_Label': (final_preds >= 0.5).astype(int)
        })

        # Index of each row in the original data
        eval_df['Original_Index'] = labeled_data.index

        # First 20 model features, for manual inspection
        for feature in features[:20]:
            if feature in labeled_data.columns:
                eval_df[feature] = labeled_data[feature].values

        eval_df.to_csv(f'{eval_dir}/prediction_analysis.csv', index=False)

        # Split the labeled rows (stratified): one half to choose the threshold,
        # the other half to report metrics at that threshold
        threshold_indices, final_indices = train_test_split(
            np.arange(len(y_eval)),
            test_size=0.5,
            random_state=SEED,
            stratify=y_eval
        )

        y_threshold = y_eval.iloc[threshold_indices]
        y_final = y_eval.iloc[final_indices]
        final_preds_threshold = final_preds[threshold_indices]
        final_preds_final = final_preds[final_indices]

        # Threshold grid search on the threshold subset
        thresholds = np.arange(0.05, 1.0, 0.05)
        threshold_metrics = []

        best_f1 = 0
        best_threshold = 0.5

        for threshold in thresholds:
            y_pred_binary = (final_preds_threshold >= threshold).astype(int)

            accuracy = accuracy_score(y_threshold, y_pred_binary)
            precision = precision_score(y_threshold, y_pred_binary, zero_division=0)
            recall = recall_score(y_threshold, y_pred_binary, zero_division=0)
            f1 = f1_score(y_threshold, y_pred_binary, zero_division=0)

            # Confusion-matrix cells
            TP = np.sum((y_threshold == 1) & (y_pred_binary == 1))
            FP = np.sum((y_threshold == 0) & (y_pred_binary == 1))
            TN = np.sum((y_threshold == 0) & (y_pred_binary == 0))
            FN = np.sum((y_threshold == 1) & (y_pred_binary == 0))

            # F2 weighs recall more heavily; recorded for reference on imbalanced data
            beta = 2
            f_beta = (1 + beta**2) * precision * recall / ((beta**2 * precision) + recall) if (precision + recall) > 0 else 0

            threshold_metrics.append({
                'Threshold': threshold,
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1_Score': f1,
                'F2_Score': f_beta,
                'TP': TP,
                'FP': FP,
                'TN': TN,
                'FN': FN
            })

            # Selection criterion: F1 (f_beta could be used instead)
            score_to_optimize = f1
            if score_to_optimize > best_f1:
                best_f1 = score_to_optimize
                best_threshold = threshold

        threshold_df = pd.DataFrame(threshold_metrics)
        threshold_df.to_csv(f'{eval_dir}/threshold_analysis.csv', index=False)

        logger.info(f"Best threshold (on threshold subset): {best_threshold:.2f} (F1={best_f1:.4f})")

        # Metrics at the chosen threshold on the held-out half
        y_final_pred_binary = (final_preds_final >= best_threshold).astype(int)
        final_accuracy = accuracy_score(y_final, y_final_pred_binary)
        final_precision = precision_score(y_final, y_final_pred_binary, zero_division=0)
        final_recall = recall_score(y_final, y_final_pred_binary, zero_division=0)
        final_f1 = f1_score(y_final, y_final_pred_binary, zero_division=0)

        logger.info("Final evaluation (with best threshold):")
        logger.info(f"Accuracy: {final_accuracy:.4f}, Precision: {final_precision:.4f}, Recall: {final_recall:.4f}, F1: {final_f1:.4f}")

        # Threshold performance curves
        plt.figure(figsize=(15, 10))

        plt.subplot(2, 2, 1)
        plt.plot(threshold_df['Threshold'], threshold_df['Accuracy'], marker='o')
        plt.title('Accuracy vs Threshold')
        plt.xlabel('Threshold')
        plt.ylabel('Accuracy')
        plt.grid(True)
        plt.axvline(x=best_threshold, color='r', linestyle='--', label=f'Best Threshold ({best_threshold:.2f})')
        plt.legend()

        plt.subplot(2, 2, 2)
        plt.plot(threshold_df['Threshold'], threshold_df['Precision'], marker='o', label='Precision')
        plt.plot(threshold_df['Threshold'], threshold_df['Recall'], marker='s', label='Recall')
        plt.title('Precision & Recall vs Threshold')
        plt.xlabel('Threshold')
        plt.ylabel('Score')
        plt.legend()
        plt.grid(True)
        plt.axvline(x=best_threshold, color='r', linestyle='--')

        plt.subplot(2, 2, 3)
        plt.plot(threshold_df['Threshold'], threshold_df['F1_Score'], marker='o')
        plt.title('F1 Score vs Threshold')
        plt.xlabel('Threshold')
        plt.ylabel('F1 Score')
        plt.grid(True)
        plt.axvline(x=best_threshold, color='r', linestyle='--')

        # Confusion-matrix counts at the best threshold. The row is located by nearest
        # value: an exact float lookup could fail when no threshold improved on F1 = 0
        # and best_threshold is still the literal default 0.5.
        best_idx = int(np.argmin(np.abs(threshold_df['Threshold'].values - best_threshold)))
        best_row = threshold_df.iloc[best_idx]

        plt.subplot(2, 2, 4)
        plt.bar(['TP', 'FP', 'TN', 'FN'],
                [best_row['TP'], best_row['FP'], best_row['TN'], best_row['FN']])
        plt.title(f'Confusion Matrix at Best Threshold ({best_threshold:.2f})')

        plt.tight_layout()
        plt.savefig(f'{eval_dir}/threshold_performance.png', dpi=600, bbox_inches='tight')
        plt.close()

        # Store the best threshold in the model file, in the format it was loaded from
        model_results['best_threshold'] = best_threshold
        model_results['best_f1'] = best_f1
        model_results['evaluation_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if model_path.endswith('.joblib'):
            joblib.dump(model_results, model_path)
        else:
            with open(model_path, 'wb') as f:
                pickle.dump(model_results, f)
        logger.info(f"Updated model file with best threshold: {model_path}")

        # Error analysis at the best threshold (over all labeled rows)
        eval_df['Best_Threshold_Prediction'] = (final_preds >= best_threshold).astype(int)
        eval_df['Error'] = (eval_df['Best_Threshold_Prediction'] != eval_df['True_Label']).astype(int)

        false_positives = eval_df[(eval_df['True_Label'] == 0) & (eval_df['Best_Threshold_Prediction'] == 1)]
        false_negatives = eval_df[(eval_df['True_Label'] == 1) & (eval_df['Best_Threshold_Prediction'] == 0)]

        logger.info(f"Error analysis: False Positives={len(false_positives)}, False Negatives={len(false_negatives)}")

        if len(false_positives) > 0:
            false_positives.to_csv(f'{eval_dir}/false_positives.csv', index=False)
        if len(false_negatives) > 0:
            false_negatives.to_csv(f'{eval_dir}/false_negatives.csv', index=False)

    # ------------------------------------------------------------------
    # Prediction on unlabeled rows
    # ------------------------------------------------------------------
    if len(unlabeled_data) > 0:
        valid_features = [f for f in features if f in unlabeled_data.columns]
        if len(valid_features) < len(features):
            logger.warning(f"Missing {len(features) - len(valid_features)} features in unlabeled data")
        X_pred = unlabeled_data[valid_features]

        # Standardize with the saved scaler (transform only) whenever one is available
        if not _eval_has_saved(model_results, 'scalers'):
            logger.warning("No saved scalers found, using new scaler (not recommended)")
        X_pred_scaled = _eval_scale_features(model_results, X_pred)

        if _eval_has_saved(model_results, 'lgbm_models'):
            logger.info("Predicting for unlabeled data")

            base_model_preds = _eval_base_model_probs(model_results, X_pred_scaled)
            final_preds = _eval_meta_probs(model_results, base_model_preds)

            # Use the stored best threshold when available
            threshold = model_results.get('best_threshold', 0.5)
            pred_labels = (final_preds >= threshold).astype(int)

            unlabeled_pred_df = pd.DataFrame({
                'Original_Index': unlabeled_data.index,
                'Predicted_Probability': final_preds,
                'Predicted_Label': pred_labels,
                'Threshold_Used': threshold,
                'LightGBM_Prob': base_model_preds[:, 0],
                'XGBoost_Prob': base_model_preds[:, 1],
                'TabNet_Prob': base_model_preds[:, 2]
            })

            # First 20 model features, for manual inspection
            for feature in features[:20]:
                if feature in unlabeled_data.columns:
                    unlabeled_pred_df[feature] = unlabeled_data[feature].values

            unlabeled_pred_df.to_csv(f'{eval_dir}/unlabeled_predictions.csv', index=False)

            # Distribution of the predictions
            plt.figure(figsize=(15, 6))

            plt.subplot(1, 2, 1)
            sns.histplot(final_preds, bins=20)
            plt.axvline(x=threshold, color='r', linestyle='--', label=f'Threshold ({threshold:.2f})')
            plt.title('Prediction Probability Distribution (Unlabeled Data)')
            plt.xlabel('Predicted Probability')
            plt.ylabel('Count')
            plt.legend()

            plt.subplot(1, 2, 2)
            fraud_count = int(pred_labels.sum())
            non_fraud_count = len(pred_labels) - fraud_count
            plt.pie([fraud_count, non_fraud_count],
                    labels=['Predicted Fraud', 'Predicted Non-Fraud'],
                    autopct='%1.1f%%')
            plt.title('Prediction Distribution')

            plt.tight_layout()
            plt.savefig(f'{eval_dir}/unlabeled_predictions_dist.png', dpi=600, bbox_inches='tight')
            plt.close()
        else:
            logger.error("No saved models found, cannot predict for unlabeled data")

    logger.info(f"Evaluation for industry {industry_name} completed. Results saved to {eval_dir}/")
    if test_auc is not None:
        # pr_auc may still be None even when test_auc is present
        logger.info(f"AUC: {_eval_format_metric(test_auc)}, PR-AUC: {_eval_format_metric(pr_auc)}")

    return f"Evaluation completed for {industry_name}. Results saved to {eval_dir}/"

In [37]:
def main_menu():
    """
    主菜单，提供不同的功能选项，增加了目标准确率设置选项
    """
    # 默认目标准确率和容差
    target_accuracy = 0.89
    accuracy_margin = 0.01
    
    while True:
        print("\n\n" + "="*60)
        print("  财务造假预测模型 - 分行业逐步运行优化版  ")
        print("="*60)
        print("\n当前目标准确率: {:.2f}% (±{:.2f}%)".format(target_accuracy*100, accuracy_margin*100))
        print("\n请选择要执行的操作:")
        print("1. 生成可用行业列表和统计分析")
        print("2. 运行单个行业的模型")
        print("3. 运行单个行业的模型 (时间序列处理)")
        print("4. 合并所有行业的结果并生成汇总报告")
        print("5. 对未知样本进行预测")
        print("6. 对特定行业的未知样本进行预测")
        print("7. 模型性能评估与可视化")
        print("8. 数据探索性分析")
        print("9. 单个行业模型详细评估")
        print("10. 设置目标准确率和容差范围")
        print("0. 退出")
        
        try:
            choice = input("\n请输入选项 (0-10): ")
            
            if choice == '0':
                logger.info("\n程序结束，感谢使用！")
                break
            
            elif choice == '1':
                # 生成行业列表
                generate_industry_list()
            
            elif choice == '2':
                # 运行单个行业模型
                industry_list_path = None
                for f in sorted(os.listdir('results'), reverse=True):
                    if f.startswith('industry_list_') and f.endswith('.csv'):
                        industry_list_path = os.path.join('results', f)
                        break
                
                if industry_list_path:
                    industry_list = pd.read_csv(industry_list_path)
                    print("\n可用行业列表:")
                    
                    # 显示可用行业，并标记推荐的行业
                    for i, row in enumerate(industry_list.iterrows()):
                        idx, data = row
                        industry = data['Industry']
                        sample_count = data['Sample_Count']
                        fraud_ratio = data['Fraud_Ratio']
                        
                        # 标记推荐行业
                        is_trainable = data.get('Trainable', sample_count >= 30)
                        is_imbalanced = data.get('Imbalanced', fraud_ratio < 0.01 or fraud_ratio > 0.99)
                        
                        marker = ''
                        if is_trainable and not is_imbalanced:
                            marker = ' (推荐)'
                        elif not is_trainable:
                            marker = ' (样本过少)'
                        elif is_imbalanced:
                            marker = ' (严重不平衡)'
                        
                        print(f"{i+1}. {industry}{marker} - 样本数: {sample_count}, 造假率: {fraud_ratio:.2%}")
                    
                    idx = input("\n请输入行业编号，或直接输入行业名称: ")
                    
                    if idx.isdigit() and int(idx) <= len(industry_list):
                        selected_industry = industry_list.iloc[int(idx)-1]['Industry']
                    else:
                        selected_industry = idx
                    
                    run_single_industry(selected_industry, time_series=False, 
                                        target_accuracy=target_accuracy, 
                                        accuracy_margin=accuracy_margin)
                else:
                    industry_name = input("\n请输入要处理的行业名称: ")
                    run_single_industry(industry_name, time_series=False, 
                                        target_accuracy=target_accuracy, 
                                        accuracy_margin=accuracy_margin)
            
            elif choice == '3':
                # 运行单个行业模型（时间序列处理）
                industry_name = input("\n请输入要处理的行业名称: ")
                date_col = input("请输入日期列名称 (默认为'report_date'): ") or "report_date"
                run_single_industry(industry_name, time_series=True, date_col=date_col,
                                   target_accuracy=target_accuracy, 
                                   accuracy_margin=accuracy_margin)
            
            elif choice == '4':
                # 合并结果并生成报告
                combine_industry_results()
                # 在main_menu函数中的选项4中添加
                try:
                    combine_industry_results()
                except Exception as e:
                    logger.error(f"Error combining industry results: {e}", exc_info=True)
                    print(f"Error: {e}")
                    print("Some results may still be available.")
            
            elif choice == '5':
                # 预测未知样本
                predict_unknown_samples()
            
            elif choice == '6':
                # 预测特定行业的未知样本
                industry_list_path = None
                for f in sorted(os.listdir('results'), reverse=True):
                    if f.startswith('industry_list_') and f.endswith('.csv'):
                        industry_list_path = os.path.join('results', f)
                        break
                
                if industry_list_path:
                    industry_list = pd.read_csv(industry_list_path)
                    print("\n可用行业列表:")
                    for i, ind in enumerate(industry_list['Industry']):
                        print(f"{i+1}. {ind}")
                    
                    idx = input("\n请输入行业编号，或直接输入行业名称: ")
                    
                    if idx.isdigit() and int(idx) <= len(industry_list):
                        selected_industry = industry_list.iloc[int(idx)-1]['Industry']
                    else:
                        selected_industry = idx
                    
                    predict_unknown_samples(selected_industry)
                else:
                    industry_name = input("\n请输入要预测的行业名称: ")
                    predict_unknown_samples(industry_name)
            
            elif choice == '7':
                # 模型性能评估与可视化
                # 获取所有性能报告
                performance_files = [f for f in os.listdir('results') if f.endswith('_performance.csv')]
                if performance_files:
                    all_performance = []
                    for f in performance_files:
                        try:
                            perf = pd.read_csv(os.path.join('results', f))
                            all_performance.append(perf)
                        except:
                            pass
                    
                    if all_performance:
                        performance_df = pd.concat(all_performance)
                        
                        # 创建性能对比可视化
                        plt.figure(figsize=(15, 10))
                        
                        # AUC对比
                        plt.subplot(2, 2, 1)
                        performance_df = performance_df.sort_values('AUC', ascending=False)
                        sns.barplot(x='AUC', y='Industry', data=performance_df)
                        plt.title('Model Performance by AUC')
                        plt.xlabel('AUC')
                        plt.ylabel('Industry')
                        plt.grid(True, axis='x')
                        # 显示汉字
                        plt.rcParams['font.sans-serif']=['SimHei']   
                        plt.rcParams['axes.unicode_minus'] = False
                        
                        # PR-AUC对比
                        plt.subplot(2, 2, 2)
                        performance_df = performance_df.sort_values('PR-AUC', ascending=False)
                        sns.barplot(x='PR-AUC', y='Industry', data=performance_df)
                        plt.title('Model Performance by PR-AUC')
                        plt.xlabel('PR-AUC')
                        plt.ylabel('Industry')
                        plt.grid(True, axis='x')
                        
                        # 如果包含准确率列，则显示准确率对比
                        if 'Accuracy' in performance_df.columns:
                            plt.subplot(2, 2, 3)
                            performance_df = performance_df.sort_values('Accuracy', ascending=False)
                            sns.barplot(x='Accuracy', y='Industry', data=performance_df)
                            plt.title('Model Performance by Accuracy')
                            plt.xlabel('Accuracy')
                            plt.ylabel('Industry')
                            plt.grid(True, axis='x')
                            
                        # 如果包含使用最佳阈值的准确率列，则显示对比
                        if 'Accuracy_Best_Threshold' in performance_df.columns:
                            plt.subplot(2, 2, 4)
                            performance_df = performance_df.sort_values('Accuracy_Best_Threshold', ascending=False)
                            sns.barplot(x='Accuracy_Best_Threshold', y='Industry', data=performance_df)
                            plt.title('Model Performance by Accuracy (Best Threshold)')
                            plt.xlabel('Accuracy with Best Threshold')
                            plt.ylabel('Industry')
                            plt.grid(True, axis='x')
                        
                        # 显示汉字
                        plt.rcParams['font.sans-serif']=['SimHei']   
                        plt.rcParams['axes.unicode_minus'] = False
                        
                        plt.tight_layout()
                        
                        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                        plt.savefig(f'plots/all_models_performance_{timestamp}.png', dpi=600, bbox_inches='tight')
                        plt.show()
                        
                        # 保存汇总报告
                        performance_df.to_csv(f'results/all_models_performance_{timestamp}.csv', index=False)
                        
                        print(f"\n性能汇总报告已保存至 results/all_models_performance_{timestamp}.csv")
                        print(f"性能对比图已保存至 plots/all_models_performance_{timestamp}.png")
                    else:
                        print("\n未找到性能报告。请先运行模型。")
                else:
                    print("\n未找到性能报告。请先运行模型。")
            
            elif choice == '8':
                # 数据探索性分析
                print("\n数据探索性分析选项:")
                print("1. 查看数据基本统计信息")
                print("2. 查看特征重要性分析")
                print("3. 查看预测结果分析")
                
                analysis_choice = input("\n请选择分析类型 (1-3): ")
                
                if analysis_choice == '1':
                    # 基本统计信息
                    try:
                        financial_data = pd.read_csv('output/data_partial_balanced.csv')
                        
                        # 输出基本统计信息
                        print("\n数据总体信息:")
                        print(f"总样本数: {len(financial_data)}")
                        print(f"标记样本数: {financial_data['FLAG'].notna().sum()}")
                        print(f"未标记样本数: {financial_data['FLAG'].isna().sum()}")
                        
                        if 'Industry' in financial_data.columns:
                            industry_counts = financial_data['Industry'].value_counts()
                            print(f"\n行业分布 (前10):")
                            print(industry_counts.head(10))
                        
                        # 创建统计图表
                        plt.figure(figsize=(15, 10))
                        
                        # 样本分布饼图
                        plt.subplot(2, 2, 1)
                        labeled = financial_data['FLAG'].notna().sum()
                        unlabeled = financial_data['FLAG'].isna().sum()
                        plt.pie([labeled, unlabeled], labels=['已标记', '未标记'], autopct='%1.1f%%')
                        plt.title('数据标记分布')
                        # 显示汉字
                        plt.rcParams['font.sans-serif']=['SimHei']   
                        plt.rcParams['axes.unicode_minus'] = False
                        
                        # 已标记样本中的造假比例
                        if 'FLAG' in financial_data.columns:
                            plt.subplot(2, 2, 2)
                            fraud_data = financial_data[financial_data['FLAG'].notna()]
                            fraud = fraud_data['FLAG'].sum()
                            non_fraud = len(fraud_data) - fraud
                            plt.pie([fraud, non_fraud], labels=['造假', '非造假'], autopct='%1.1f%%')
                            plt.title('已标记样本中的造假比例')
                            # 显示汉字
                            plt.rcParams['font.sans-serif']=['SimHei']   
                            plt.rcParams['axes.unicode_minus'] = False
                        
                        plt.tight_layout()
                        
                        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                        plt.savefig(f'plots/data_statistics_{timestamp}.png', dpi=600)
                        plt.show()
                        
                    except Exception as e:
                        print(f"分析数据时出错: {e}")
                
                elif analysis_choice == '2':
                    # 查看特征重要性
                    feature_files = [f for f in os.listdir('results') if f.endswith('_feature_importance.csv')]
                    
                    if feature_files:
                        print("\n可用的特征重要性文件:")
                        for i, f in enumerate(feature_files):
                            print(f"{i+1}. {f}")
                        
                        file_idx = input("\n请选择要查看的文件编号: ")
                        
                        if file_idx.isdigit() and int(file_idx) <= len(feature_files):
                            selected_file = feature_files[int(file_idx)-1]
                            try:
                                importance_df = pd.read_csv(os.path.join('results', selected_file))
                                
                                # 查看前N个重要特征
                                n = input("要查看多少个最重要的特征? (默认为20): ") or "20"
                                n = int(n)
                                
                                # 按模型分组显示重要特征
                                for model in importance_df['Model'].unique():
                                    model_data = importance_df[importance_df['Model'] == model]
                                    model_data = model_data.sort_values('Importance', ascending=False).head(n)
                                    
                                    plt.figure(figsize=(10, 8))
                                    sns.barplot(x='Importance', y='Feature', data=model_data)
                                    plt.title(f'Top {n} Features - {model}')
                                    # 显示汉字
                                    plt.rcParams['font.sans-serif']=['SimHei']   
                                    plt.rcParams['axes.unicode_minus'] = False
                                    plt.tight_layout()
                                    plt.show()
                                
                            except Exception as e:
                                print(f"查看特征重要性时出错: {e}")
                    else:
                        print("\n未找到特征重要性文件。请先运行模型。")
                
                elif analysis_choice == '3':
                    # 查看预测结果分析
                    prediction_files = [f for f in os.listdir('results') if 'predictions_' in f and f.endswith('.csv')]
                    
                    if prediction_files:
                        print("\n可用的预测结果文件:")
                        for i, f in enumerate(prediction_files):
                            print(f"{i+1}. {f}")
                        
                        file_idx = input("\n请选择要分析的文件编号: ")
                        
                        if file_idx.isdigit() and int(file_idx) <= len(prediction_files):
                            selected_file = prediction_files[int(file_idx)-1]
                            try:
                                pred_df = pd.read_csv(os.path.join('results', selected_file))
                                
                                # 创建预测分析图表
                                plt.figure(figsize=(15, 12))
                                
                                # 预测概率分布
                                plt.subplot(2, 2, 1)
                                sns.histplot(pred_df['Fraud_Probability'], bins=20)
                                plt.title('欺诈概率分布')
                                plt.xlabel('概率')
                                plt.ylabel('样本数')
                                
                                # 按行业的欺诈率
                                plt.subplot(2, 2, 2)
                                industry_stats = pred_df.groupby('Industry')['Predicted_Fraud'].mean().sort_values(ascending=False)
                                industry_stats = industry_stats.head(15)  # 显示前15个行业
                                
                                sns.barplot(x=industry_stats.values, y=industry_stats.index)
                                plt.title('各行业欺诈率')
                                plt.xlabel('欺诈率')
                                
                                # 样本量与欺诈率关系图
                                plt.subplot(2, 2, 3)
                                industry_counts = pred_df.groupby('Industry').size()
                                industry_frauds = pred_df.groupby('Industry')['Predicted_Fraud'].mean()
                                
                                plt.scatter(industry_counts, industry_frauds, alpha=0.7)
                                plt.xscale('log')
                                plt.xlabel('样本数 (对数)')
                                plt.ylabel('欺诈率')
                                plt.title('样本数与欺诈率关系')
                                # 显示汉字
                                plt.rcParams['font.sans-serif']=['SimHei']   
                                plt.rcParams['axes.unicode_minus'] = False
                                
                                # 标记一些极端点
                                for ind in industry_frauds.index:
                                    if industry_frauds[ind] > 0.7 or industry_frauds[ind] < 0.05 or industry_counts[ind] > 100:
                                        plt.annotate(
                                            ind, 
                                            (industry_counts[ind], industry_frauds[ind]),
                                            xytext=(5, 5),
                                            textcoords='offset points'
                                        )
                                        
                                # 显示汉字
                                plt.rcParams['font.sans-serif']=['SimHei']   
                                plt.rcParams['axes.unicode_minus'] = False
                                plt.tight_layout()
                                plt.show()
                                
                            except Exception as e:
                                print(f"分析预测结果时出错: {e}")
                    else:
                        print("\n未找到预测结果文件。请先运行预测。")

            elif choice == '9':
                # 单个行业模型评估
                # 创建评估目录
                os.makedirs('evaluations', exist_ok=True)

                # 获取所有可用的模型
                model_files = []
                for ext in ['pkl', 'joblib']:
                    model_files.extend([f for f in os.listdir('models') if f.endswith(f'_stacking_model.{ext}')])

                if not model_files:
                    print("\n未找到任何训练好的模型。请先训练模型。")
                    continue

                # 从模型文件中提取行业名称
                industries = []
                for model_file in model_files:
                    industry = model_file.split('_stacking_model.')[0]
                    if industry not in industries:
                        industries.append(industry)

                print("\n可用的行业模型:")
                for i, industry in enumerate(industries):
                    print(f"{i+1}. {industry}")

                idx = input("\n请输入要评估的行业编号，或直接输入行业名称: ")

                if idx.isdigit() and int(idx) <= len(industries):
                    selected_industry = industries[int(idx)-1]
                else:
                    selected_industry = idx

                # 评估所选行业的模型
                result = evaluate_industry_model(selected_industry)
                print(result)
            
            elif choice == '10':
                # 设置目标准确率和容差范围
                print("\n当前目标准确率: {:.2f}% (±{:.2f}%)".format(target_accuracy*100, accuracy_margin*100))
                
                try:
                    new_target = input("请输入新的目标准确率 (百分比，例如 89): ")
                    if new_target:
                        target_accuracy = float(new_target) / 100  # 转换为小数
                        if target_accuracy < 0 or target_accuracy > 1:
                            print("目标准确率应该在0-100之间，请重新设置")
                            target_accuracy = 0.89  # 重置为默认值
                    
                    new_margin = input("请输入新的容差范围 (百分比，例如 1): ")
                    if new_margin:
                        accuracy_margin = float(new_margin) / 100  # 转换为小数
                        if accuracy_margin < 0 or accuracy_margin > 0.1:
                            print("容差范围应该在0-10之间，请重新设置")
                            accuracy_margin = 0.01  # 重置为默认值
                    
                    print("\n目标准确率已更新为: {:.2f}% (±{:.2f}%)".format(target_accuracy*100, accuracy_margin*100))
                    
                except ValueError as e:
                    print(f"输入错误: {e}")
                    print("目标准确率未更改")
                
            else:
                print("\n无效选项，请重新选择")

                
        except Exception as e:
            logger.error(f"执行操作时出错: {e}", exc_info=True)
            print(f"执行过程中出现错误: {e}")
            print("请查看日志文件了解详情。")


if __name__ == "__main__":
    # Script entry point: record a startup banner in the log, then hand control
    # to main_menu(), which drives the interactive console menu.
    logger.info("=== 启动财务造假预测系统 ===")
    main_menu()

2025-04-10 16:19:34,901 - __main__ - INFO - === 启动财务造假预测系统 ===




  财务造假预测模型 - 分行业逐步运行优化版  

当前目标准确率: 89.00% (±1.00%)

请选择要执行的操作:
1. 生成可用行业列表和统计分析
2. 运行单个行业的模型
3. 运行单个行业的模型 (时间序列处理)
4. 合并所有行业的结果并生成汇总报告
5. 对未知样本进行预测
6. 对特定行业的未知样本进行预测
7. 模型性能评估与可视化
8. 数据探索性分析
9. 单个行业模型详细评估
10. 设置目标准确率和容差范围
0. 退出

请输入选项 (0-10): 5


2025-04-10 16:22:12,007 - __main__ - INFO - Loading data for prediction...
2025-04-10 16:22:13,416 - __main__ - INFO - 发现缺失值: {'FLAG': 3905}
2025-04-10 16:22:13,426 - __main__ - INFO - 特征'END_DATE_REP'发现49个异常值
2025-04-10 16:22:13,435 - __main__ - INFO - 特征'CASH_C_EQUIV'发现186个异常值
2025-04-10 16:22:13,440 - __main__ - INFO - 特征'NOTES_RECEIV'发现46个异常值
2025-04-10 16:22:13,448 - __main__ - INFO - 特征'AR'发现45个异常值
2025-04-10 16:22:13,455 - __main__ - INFO - 特征'PREPAYMENT'发现145个异常值
2025-04-10 16:22:13,463 - __main__ - INFO - 特征'OTH_RECEIV'发现258个异常值
2025-04-10 16:22:13,470 - __main__ - INFO - 特征'INVENTORIES'发现126个异常值
2025-04-10 16:22:13,476 - __main__ - INFO - 特征'OTH_CA'发现92个异常值
2025-04-10 16:22:13,483 - __main__ - INFO - 特征'T_CA'发现173个异常值
2025-04-10 16:22:13,492 - __main__ - INFO - 特征'AVAIL_FOR_SALE_FA'发现301个异常值
2025-04-10 16:22:13,500 - __main__ - INFO - 特征'LT_EQUITY_INVEST'发现273个异常值
2025-04-10 16:22:13,506 - __main__ - INFO - 特征'INVEST_REAL_ESTATE'发现126个异常值
2025-04-10 16:22:13,514 - __main__ - 

2025-04-10 16:22:15,754 - __main__ - INFO - Using saved base models for prediction
2025-04-10 16:22:16,174 - __main__ - INFO - Using saved meta-feature scaler with 3 features
2025-04-10 16:22:16,175 - __main__ - INFO - Generating final predictions with meta-model...
2025-04-10 16:22:16,366 - __main__ - INFO - 
Predicting for industry 农、林、牧、渔业 (41 samples)
2025-04-10 16:22:16,650 - __main__ - INFO - Using threshold: 0.15000000000000002 for industry 农、林、牧、渔业
2025-04-10 16:22:16,655 - __main__ - INFO - Using saved scaler for feature standardization
2025-04-10 16:22:16,656 - __main__ - INFO - Using saved base models for prediction
2025-04-10 16:22:16,870 - __main__ - INFO - Using saved meta-feature scaler with 3 features
2025-04-10 16:22:16,872 - __main__ - INFO - Generating final predictions with meta-model...
2025-04-10 16:22:17,055 - __main__ - INFO - 
Predicting for industry 制造业 (2500 samples)
2025-04-10 16:22:17,245 - __main__ - INFO - Using threshold: 0.7000000000000001 for industry 

2025-04-10 16:22:27,380 - __main__ - INFO - Predicted fraud cases: 64 (1.67%)
2025-04-10 16:22:27,392 - __main__ - INFO - 
Industry statistics (Top 5 by fraud ratio):
2025-04-10 16:22:27,398 - __main__ - INFO - 
               Count  Fraud_Count  Fraud_Ratio  Threshold
Industry                                                 
教育                 9            2     0.222222       0.45
文化、体育和娱乐业         57           12     0.210526       0.90
水利、环境和公共设施管理业     68            8     0.117647       0.40
农、林、牧、渔业          41            3     0.073171       0.15
金融业              110            8     0.072727       0.55




  财务造假预测模型 - 分行业逐步运行优化版  

当前目标准确率: 89.00% (±1.00%)

请选择要执行的操作:
1. 生成可用行业列表和统计分析
2. 运行单个行业的模型
3. 运行单个行业的模型 (时间序列处理)
4. 合并所有行业的结果并生成汇总报告
5. 对未知样本进行预测
6. 对特定行业的未知样本进行预测
7. 模型性能评估与可视化
8. 数据探索性分析
9. 单个行业模型详细评估
10. 设置目标准确率和容差范围
0. 退出

请输入选项 (0-10): 0


2025-04-10 16:22:36,615 - __main__ - INFO - 
程序结束，感谢使用！


In [38]:
import os
import pandas as pd

# Path to the combined predictions CSV (file name is stamped 2025-04-10 16:22:27).
file_path = os.path.join(
    'results',
    'all_predictions_20250410_162227.csv',
)

# Load the predictions; the bare trailing expression makes the notebook render the frame.
df = pd.read_csv(file_path)
df

Unnamed: 0,ID,TICKER_SYMBOL,Industry,Fraud_Probability,Predicted_Fraud,Threshold_Used
0,25450,21990.0,交通运输、仓储和邮政业,0.532549,0,0.55
1,25451,43817.0,交通运输、仓储和邮政业,0.476269,0,0.55
2,25452,362828.0,交通运输、仓储和邮政业,0.476808,0,0.55
3,25453,403833.0,交通运输、仓储和邮政业,0.470437,0,0.55
4,25454,453498.0,交通运输、仓储和邮政业,0.488001,0,0.55
...,...,...,...,...,...,...
3823,25731,4267744.0,金融业,0.454094,0,0.55
3824,25732,4564086.0,金融业,0.455946,0,0.55
3825,25733,4589754.0,金融业,0.537824,0,0.55
3826,25734,3348662.0,金融业,0.455946,0,0.55


In [39]:
# Keep only the rows predicted as fraud (Predicted_Fraud == 1) and count them
# per industry. Industries with no predicted fraud do not appear in the result.
fraud_count_by_industry = (
    df.loc[df['Predicted_Fraud'] == 1]
    .groupby('Industry')
    .size()
)

# Turn the per-industry counts into a two-column frame and order it so the
# industries with the most predicted fraud cases come first.
fraud_count_df = (
    fraud_count_by_industry
    .reset_index(name='Fraud_Count')
    .sort_values(by='Fraud_Count', ascending=False)
)

# Emit the heading and the table on consecutive lines.
print("各行业预测为欺诈的数量统计:", fraud_count_df, sep="\n")

# Optional overall totals, left disabled by the author:
# total_fraud = df['Predicted_Fraud'].sum()
# print(f"\n总欺诈预测数量: {total_fraud}")
# print(f"总记录数: {len(df)}")
# print(f"欺诈比例: {total_fraud/len(df):.2%}")

各行业预测为欺诈的数量统计:
           Industry  Fraud_Count
8         文化、体育和娱乐业           12
9     水利、环境和公共设施管理业            8
12              金融业            8
3               制造业            6
4               建筑业            6
0       交通运输、仓储和邮政业            4
6            批发和零售业            4
1   信息传输、软件和信息技术服务业            3
2          农、林、牧、渔业            3
5              房地产业            3
10       科学研究和技术服务业            3
7                教育            2
11              采矿业            2
