In [None]:
# !unzip /home/jovyan/input/agent/agentA.zip -d /home/jovyan/output/

In [None]:
# !honghucli --env=honghu --token=xxxxxxxxxxxxxxxxxxxx --type=download --source=agentB.zip

In [None]:
# !unzip /home/jovyan/project/agentB.zip -d /home/jovyan/output/

In [None]:
import pandas as pd
import numpy as np

# Load the data.
# Leaderboard A: the training set (the A-board test set is left as a commented toggle).
train_df = pd.read_csv("/home/jovyan/output/agentA/train.csv")
# test_df = pd.read_csv("/home/jovyan/output/agentA/testA.csv")

# Leaderboard B: the test set that is actually scored below.
test_df = pd.read_csv("/home/jovyan/output/agentB/testB.csv")

# Duplicate / outlier handling on the training rows only:
#   1. drop rows that are identical on every column except `id` and `label`;
#   2. drop rows where innet_dura / 12 is greater than age. The test is written
#      as "not greater than" so rows where the comparison involves NaN are kept.
train_df = (
    train_df
    .drop_duplicates(
        subset=[name for name in train_df.columns if name not in ('id', 'label')],
        keep='first',
    )
    .loc[lambda frame: ~((frame['innet_dura'] / 12) > frame['age'])]
)

# Stack train and test into one frame so features are engineered on both at once.
df_all = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [None]:
# Mobile-data usage features.
# Every ratio adds 1e-6 to its denominator so a zero denominator cannot divide by zero.
df_all = df_all.assign(
    flux_usage_ratio=lambda d: d['cm_flux_use'].div(d['cm_flux_tot_cnt'] + 1e-6),
    base_plan_usage_ratio=lambda d: d['cm_base_plan_flux_use'].div(d['cm_base_plan_flux'] + 1e-6),
    # 1 when any out-of-plan data usage is recorded, else 0.
    is_over_flux=lambda d: d['out_gprs'].gt(0).astype(int),
    over_flux_ratio=lambda d: d['out_gprs'].div(d['cm_flux_use'] + 1e-6),
    night_flux_ratio=lambda d: d['wday_night_flux'].add(d['nwday_night_flux']).div(d['cm_flux_use'] + 1e-6),
    weekend_flux_ratio=lambda d: d['nwday_day_flux'].add(d['nwday_night_flux']).div(d['cm_flux_use'] + 1e-6),
    flux_4g_ratio=lambda d: d['flux_4g_use'].div(d['cm_flux_use'] + 1e-6),
)

# App usage features.
# Durations are divided by 3600 (presumably seconds -> hours; confirm the source unit).
df_all = df_all.assign(
    video_intensity=lambda d: (
        d['long_vid_use_dur'] + d['shrt_vid_use_dur'] + d['wtch_liv_use_dur']
    ).div(3600),
    realtime_app_intensity=lambda d: (
        d['gm_use_dur'] + d['anchor_use_dur'] + d['wtch_liv_use_dur']
    ).div(3600),
    high_bandwidth_score=lambda d: d['video_cnt_m'] + d['game_cnt_m'] + d['netdisk_use_dur'].div(3600),
)

# Home-broadband activity features (per-day and per-session averages), plus
# ARPU-based flags. Denominators again carry a 1e-6 guard.
df_all = df_all.assign(
    bd_daily_flux=lambda d: d['bd_flux_m'].div(d['open_day_m'] + 1e-6),
    bd_daily_duration=lambda d: d['bd_dur_m'].div(d['open_day_m'] + 1e-6),
    bd_avg_session_duration=lambda d: d['bd_dur_m'].div(d['bd_cnt_m'] + 1e-6),
    bd_daily_sessions=lambda d: d['bd_cnt_m'].div(d['open_day_m'] + 1e-6),
    arpu_per_flux=lambda d: d['arpu'].div(d['cm_flux_tot_cnt'] / 1024 + 1e-6),
    # 1 when both ARPU and data use are above their medians (medians taken over train + test).
    high_arpu_heavy_user=lambda d: (
        d['arpu'].gt(d['arpu'].median()) & d['cm_flux_use'].gt(d['cm_flux_use'].median())
    ).astype(int),
    # Sum of three service flags.
    multi_service_user=lambda d: d['is_bd_tv'] + d['is_fam_vnet_user'] + d['is_ent_vnet_user'],
)

# User-value features.
df_all = df_all.assign(
    # Number of the three columns (0-3) in which the row exceeds the 70th percentile.
    premium_user_potential=lambda d: sum(
        d[name].gt(d[name].quantile(0.7)).astype(int)
        for name in ('arpu', 'video_intensity', 'bd_flux_m')
    ),
    # Count (0-3) of: plan usage above 80%, any out-of-plan data, above-median video intensity.
    network_strain_indicator=lambda d: (
        d['flux_usage_ratio'].gt(0.8).astype(int)
        + d['out_gprs'].gt(0).astype(int)
        + d['video_intensity'].gt(d['video_intensity'].median()).astype(int)
    ),
    # is_10g_pon minus an "above-median broadband traffic" flag.
    coverage_usage_gap=lambda d: d['is_10g_pon'] - d['bd_flux_m'].gt(d['bd_flux_m'].median()).astype(int),
    # Larger of the two weekday slots as a share of total use
    # (np.maximum keeps NaN if either slot is NaN).
    peak_flux_concentration=lambda d: np.maximum(d['wday_day_flux'], d['wday_night_flux'])
    / (d['cm_flux_use'] + 1e-6),
)

# The four time-slot traffic columns used for the balance ratio below.
flux_columns = ['wday_day_flux', 'wday_night_flux', 'nwday_day_flux', 'nwday_night_flux']

df_all = df_all.assign(
    # Row-wise std / mean across the four slots (coefficient of variation).
    flux_balance_ratio=lambda d: d[flux_columns].std(axis=1).div(d[flux_columns].mean(axis=1) + 1e-6),
    arpu_per_innet_month=lambda d: d['arpu'].div(d['innet_dura'] + 1e-6),
    arpu_per_voice_minute=lambda d: d['arpu'].div(d['l3m_avg_mou'] + 1e-6),
    # Fixed-weight blend of the raw (unscaled) columns.
    comprehensive_value_score=lambda d: d['arpu'] * 0.4 + d['innet_dura'] * 0.3 + d['cm_flux_use'] * 0.3,
    # Two usage ratios plus two "went over plan" flags (data and voice).
    plan_utilization_score=lambda d: (
        d['flux_usage_ratio']
        + d['base_plan_usage_ratio']
        + d['out_gprs'].gt(0).astype(int)
        + d['out_call'].gt(0).astype(int)
    ),
)
# Tenure stage 1-5 from innet_dura, with edges at 12 / 36 / 60 / 120.
# Two robustness fixes over the previous binning, which produce identical
# labels whenever the old code ran successfully:
#   * include_lowest=True puts innet_dura == 0 into stage 1; with the default
#     left-open first bin it became NaN and `.astype(int)` raised.
#   * the last edge is np.inf instead of the column maximum, so the bins stay
#     strictly increasing even when the maximum is <= 120.
# Negative or missing innet_dura still ends up NaN and will fail the int cast.
df_all['customer_life_stage'] = pd.cut(
    df_all['innet_dura'],
    bins=[0, 12, 36, 60, 120, np.inf],
    labels=[1, 2, 3, 4, 5],
    include_lowest=True,
).astype(int)

# 1 when innet_dura exceeds 24 and data use is above the (train + test) median.
df_all['mature_user_high_usage'] = (
    (df_all['innet_dura'] > 24) &
    (df_all['cm_flux_use'] > df_all['cm_flux_use'].median())
).astype(int)

# Spending-potential feature: fixed-weight blend of tenure, ARPU and the
# plan-utilisation score computed earlier.
df_all = df_all.assign(
    loyalty_upgrade_potential=lambda d: (
        d['innet_dura'] * 0.3 + d['arpu'] * 0.4 + d['plan_utilization_score'] * 0.3
    ),
)

# Integer conversions (these overwrite the source columns).
# NOTE: hi_flux_usr_lbl is replaced by its own `== 0` test, so running this
# cell a second time on the same frame inverts that flag again.
df_all = df_all.assign(
    arpu=lambda d: d['arpu'].astype('int'),
    edu_time_m=lambda d: d['edu_time_m'].gt(0).astype('int'),
    hi_flux_usr_lbl=lambda d: d['hi_flux_usr_lbl'].eq(0).astype('int'),
)

In [None]:
# Categorical features
cat_cols = [
    'is_10g_pon', 'is_bd_status_abnormal', 'is_ent_vnet_user',
    'is_fam_vnet_user', 'sev_vid_usr_lbl', 'liv_usr_lbl', 'is_bd_tv',
    'if_high_games_cust', 'vid_usr_lbl', 'read_usr_lbl', 'gm_usr_lbl',
    'netdisk_usr_lbl', 'sex', 'if_like_video_cust', 'if_nulim_prod',
    'if_like_games_cust', 'msc_usr_lbl',
]

# Label-encode each categorical column with integer codes from `factorize`,
# then shift so the smallest code is 0 (factorize codes missing values as -1,
# so a column with missing values ends up with missing -> 0 and the rest 1+).
for cat_col in cat_cols:
    codes = pd.Series(df_all[cat_col].factorize()[0], index=df_all.index)
    df_all[cat_col] = codes - codes.min()


# 目标编码（KFold 防止泄漏）
from sklearn.model_selection import KFold,StratifiedKFold

def kfold_target_encode(trn, col, target, n_fold=5, alpha=5):
    """Out-of-fold, smoothed mean encoding of `col` against `target`.

    The rows of `trn` are split with a shuffled KFold (random_state=42). For
    every fold, per-category mean and count of `target` are computed on the
    other folds and shrunk towards the global target mean:

        (mean * count + global_mean * alpha) / (count + alpha)

    The held-out rows are then mapped through that table; categories that are
    unseen (or whose smoothed value is NaN) receive the global mean.

    Args:
        trn: DataFrame holding both `col` and `target`.
        col: name of the column to encode.
        target: name of the target column.
        n_fold: number of KFold splits.
        alpha: smoothing strength (pseudo-count given to the global mean).

    Returns:
        float32 numpy array of length len(trn), aligned positionally with `trn`.
    """
    prior = trn[target].mean()
    encoded = np.zeros(len(trn), dtype='float32')
    splitter = KFold(n_splits=n_fold, shuffle=True, random_state=42)

    for fit_idx, apply_idx in splitter.split(trn):
        # Category statistics from the fitting part only.
        stats = trn.iloc[fit_idx].groupby(col)[target].agg(['mean', 'count'])
        smoothed = (stats['mean'] * stats['count'] + prior * alpha) / (stats['count'] + alpha)

        # Encode the held-out part; fall back to the prior where no value is available.
        held_out_keys = trn.iloc[apply_idx][col]
        encoded[apply_idx] = held_out_keys.map(smoothed).fillna(prior).values

    return encoded
    
# Label column
target_col = 'label'

# Each listed column gets an out-of-fold target-encoded companion named
# `<col>_te_label`. Encoding runs on df_all (train + test rows together).
te_cols = ['arpu', 'arpu_per_flux']
df_all = df_all.assign(**{
    f'{te_col}_te_label': kfold_target_encode(df_all, col=te_col, target=target_col, n_fold=5, alpha=5)
    for te_col in te_cols
})

In [None]:
# Cross-statistics: for each row, the mean of a broadband column over all rows
# that share the same bd_cnt_m value.
for value_col in ('bd_flux_m', 'bd_dur_m'):
    mean_by_cnt = df_all.groupby(['bd_cnt_m'])[value_col].mean()
    df_all[f'bd_cnt_m_{value_col}_mean'] = df_all['bd_cnt_m'].map(mean_by_cnt)

# 组合特征
from itertools import combinations

bd_cols = ['bd_flux_m', 'bd_dur_m', 'bd_cnt_m']

# Pairwise ratios. A zero denominator is turned into NaN first, so the ratio is
# NaN rather than +/-inf for those rows.
for numerator, denominator in combinations(bd_cols, 2):
    safe_denominator = df_all[denominator].replace(0, np.nan)
    df_all[f'{numerator}/{denominator}'] = df_all[numerator].div(safe_denominator)

# Row-wise sum and product of the three columns; the new column names are the
# three names joined by '+' and '*'.
df_all['+'.join(bd_cols)] = df_all[bd_cols].sum(axis=1)
df_all['*'.join(bd_cols)] = df_all[bd_cols].prod(axis=1)


# Quantile binning: each source column gets a `<col>_bin` companion holding its
# quintile index (fewer bins if duplicate edges have to be dropped).
for source_col in ('arpu', 'arpu_per_flux', 'cm_flux_tot_cnt', 'innet_dura', 'arpu_per_innet_month'):
    df_all[f'{source_col}_bin'] = pd.qcut(df_all[source_col], q=5, labels=False, duplicates='drop')

df_all = df_all.assign(
    # Ratio features: traffic per unit of ARPU (+1 in the denominator avoids division by zero).
    flux_arpu_ratio=lambda d: d['cm_flux_tot_cnt'].div(d['arpu'] + 1),
    bd_flux_arpu_ratio=lambda d: d['bd_flux_m'].div(d['arpu'] + 1),
    # Standardisation: z-scores using mean / std over train + test.
    arpu_std=lambda d: d['arpu'].sub(d['arpu'].mean()).div(d['arpu'].std()),
    flux_std=lambda d: d['cm_flux_tot_cnt'].sub(d['cm_flux_tot_cnt'].mean()).div(d['cm_flux_tot_cnt'].std()),
)

In [None]:
# Split back into train / test. Infinite values are zeroed first; rows with a
# label are training rows, rows without one are test rows.
df_all = df_all.replace([-np.inf, np.inf], 0)
has_label = df_all[target_col].notna()
train_df = df_all.loc[has_label].reset_index(drop=True)
test_df = df_all.loc[~has_label].reset_index(drop=True)

# Feature groups that are excluded from one or both models.
adv_feats = [
    'cm_flux_use', 'flux_4g_use', 'flux_usage_ratio', 'night_flux_ratio',
    'weekend_flux_ratio', 'flux_4g_ratio', 'peak_flux_concentration',
    'comprehensive_value_score',
]

drop_feats = [
    'fashion_time_m', 'if_high_games_cust', 'if_like_video_cust',
    'cm_chos_plan_flux', 'if_like_games_cust', 'cm_chos_plan_flux_use',
]

customer_value_feats = [
    'arpu_bin', 'arpu_per_flux_bin', 'cm_flux_tot_cnt_bin',
    'innet_dura_bin', 'arpu_per_innet_month_bin', 'arpu_std',
    'flux_arpu_ratio', 'bd_flux_arpu_ratio', 'flux_std',
]

bd_feats = [
    'bd_flux_m/bd_dur_m', 'bd_flux_m/bd_cnt_m', 'bd_dur_m/bd_cnt_m',
    'bd_flux_m+bd_dur_m+bd_cnt_m', 'bd_flux_m*bd_dur_m*bd_cnt_m',
]

# Training features, in train_df column order.
# feats1 leaves out the customer-value group; feats2 leaves out the broadband
# combination group. Both leave out id, the label, adv_feats and drop_feats.
always_excluded = ['id', target_col] + adv_feats + drop_feats
excluded_from_feats1 = set(always_excluded + customer_value_feats)
excluded_from_feats2 = set(always_excluded + bd_feats)

feats1 = [name for name in train_df.columns if name not in excluded_from_feats1]
feats2 = [name for name in train_df.columns if name not in excluded_from_feats2]

print('feats1_num:', len(feats1))
print('feats2_num:', len(feats2))

In [None]:
import gc
import warnings
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

# Custom evaluation function
def c_eval(preds, dtrain):
    """Evaluation callback passed to `lgb.train` as `feval`.

    Thresholds `preds` at 0.5 (inclusive) and scores them against the labels
    stored in `dtrain` as 0.7 * accuracy + 0.3 * binary F1.

    Returns:
        ('c_score', score, True) - the last element marks the metric as
        higher-is-better.
    """
    y_true = dtrain.get_label().astype(int)
    y_hat = (preds >= 0.5).astype(int)

    blended = 0.7 * accuracy_score(y_true, y_hat) + 0.3 * f1_score(y_true, y_hat, average='binary')
    return 'c_score', blended, True
   
# Evaluation metric
def c_score(y_true, y_pred, average: str = "binary"):
    """Return 0.7 * accuracy + 0.3 * F1 for hard (already thresholded) predictions.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        average: forwarded to `f1_score` as its `average` argument.
    """
    accuracy_part = 0.7 * accuracy_score(y_true, y_pred)
    f1_part = 0.3 * f1_score(y_true, y_pred, average=average)
    return accuracy_part + f1_part

# Model definition
def lgb_mdoel(train, test, target, feats = [], seed = 2025):
    """Train LightGBM with 5-fold cross-validation.

    For every fold a binary-objective booster is trained for 300 rounds on the
    training part, evaluated with `c_eval` on the validation part, and used to
    predict both the validation rows (out-of-fold) and the full test set.

    Args:
        train: DataFrame containing `feats` and `target`.
        test: DataFrame containing `feats`.
        target: name of the label column in `train`.
        feats: list of feature column names.
        seed: used for both the KFold shuffle and the LightGBM `seed` parameter.

    Returns:
        (oof, pred, importance_df):
            oof - array of len(train) with each row's validation-fold probability;
            pred - array of len(test), test probabilities averaged over folds;
            importance_df - DataFrame with columns Feature / Importance (gain
            importance averaged over folds), sorted descending.
    """
    flods = 5
    score = 0
    feat_imp = 0
    kf = KFold(n_splits=flods, shuffle=True, random_state=seed)

    oof = np.zeros(len(train))
    pred = np.zeros(len(test))

    # Loop-invariant inputs, built once instead of once per fold.
    x_all = train[feats]
    y_all = train[target].astype(int)
    x_test = test[feats].copy()

    # Parameters ('metric': 'None' disables built-in metrics; only c_eval is reported).
    model_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.05,
        'metric': 'None',
        'num_leaves': 64,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.9,
        'bagging_freq': 4,
        'verbose': -1,
        'seed': seed,
        'n_jobs': 8,
    }

    for i, (train_index, val_index) in enumerate(kf.split(train)):

        print(f"### LightGBM Fold {i+1}")

        # KFold.split yields *positional* indices, so select with .iloc.
        # (Label-based .loc only gave the same rows when the index was 0..n-1.)
        x_train, y_train = x_all.iloc[train_index], y_all.iloc[train_index]
        x_valid, y_valid = x_all.iloc[val_index], y_all.iloc[val_index]

        train_set = lgb.Dataset(x_train, label=y_train, categorical_feature=[])
        valid_set = lgb.Dataset(x_valid, label=y_valid, categorical_feature=[], reference=train_set)

        # Training: fixed 300 rounds, no early-stopping callback is registered.
        model = lgb.train(
            dict(model_params),
            train_set,
            num_boost_round=300,
            valid_sets=[valid_set],
            feval=c_eval,
            callbacks=[lgb.log_evaluation(period=50)],
        )

        # Prediction. NOTE(review): with no early stopping, best_iteration is
        # expected to make predict() use all rounds - confirm for the installed version.
        pred_val = model.predict(x_valid, num_iteration=model.best_iteration)
        pred += model.predict(x_test, num_iteration=model.best_iteration)
        oof[val_index] = pred_val

        # Same inclusive 0.5 cut-off as c_eval, so the printed fold score and
        # the logged validation metric agree.
        score_kold = c_score(y_valid, (pred_val >= 0.5).astype(int))
        score += score_kold
        print('Score = ',score_kold)

        # Feature importance (gain), accumulated over folds.
        feat_imp += model.feature_importance(importance_type='gain')

    pred /= flods
    score /= flods
    feat_imp /= flods

    print(f"\nMean Score  = ", score)

    importance_df = pd.DataFrame({
        "Feature": feats,
        "Importance": feat_imp
    }).sort_values(by="Importance", ascending=False)

    # Release per-fold objects before returning.
    del x_train, y_train, x_valid, y_valid, x_test, train_set, valid_set
    del model_params, model
    gc.collect()

    return oof, pred, importance_df

# Train the LightGBM model on the feats1 feature set.
oof_lgb, pred_lgb, imp_lgb = lgb_mdoel(
    train=train_df,
    test=test_df,
    target=target_col,
    feats=feats1,
    seed=9999,
)

In [None]:
import catboost
from catboost import CatBoostRegressor, CatBoostClassifier, Pool

def cat_mdoel(train, test, target, feats = [], categorical_features = [], seed = 2025):
    """Train CatBoost with 5-fold stratified cross-validation.

    For every fold a CatBoostClassifier (Logloss, 800 iterations) is fitted on
    the training part with the validation part as eval set, then used to
    predict the validation rows (out-of-fold) and the full test set.

    Args:
        train: DataFrame containing `feats` and `target`.
        test: DataFrame containing `feats`.
        target: name of the label column in `train`.
        feats: list of feature column names.
        categorical_features: columns passed to every Pool as `cat_features`.
        seed: used for both the StratifiedKFold shuffle and CatBoost `random_seed`.

    Returns:
        (oof, pred, importance_df):
            oof - array of len(train) with each row's validation-fold probability;
            pred - array of len(test), test probabilities averaged over folds;
            importance_df - DataFrame with columns Feature / Importance
            (`feature_importances_` averaged over folds), sorted descending.
    """
    flods = 5
    score = 0
    feat_imp = 0
    skf = StratifiedKFold(n_splits=flods, shuffle=True, random_state=seed)

    oof = np.zeros(len(train))
    pred = np.zeros(len(test))

    # Loop-invariant inputs, built once instead of once per fold.
    x_all = train[feats]
    y_all = train[target].astype(int)
    x_test = test[feats].copy()
    test_pool = Pool(x_test, cat_features=categorical_features)

    # Parameters
    model_params = {
        'task_type': 'CPU',
        'bootstrap_type': 'Bayesian',
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'learning_rate': 0.1,
        'iterations': 800,
        'verbose': 100,
        'random_seed': seed,
    }

    for i, (train_index, val_index) in enumerate(skf.split(train, train[target])):

        print(f"### CatBoost Fold {i+1}")

        # StratifiedKFold.split yields *positional* indices, so select with .iloc.
        # (Label-based .loc only gave the same rows when the index was 0..n-1.)
        x_train, y_train = x_all.iloc[train_index], y_all.iloc[train_index]
        x_valid, y_valid = x_all.iloc[val_index], y_all.iloc[val_index]

        train_pool = Pool(x_train, y_train, cat_features=categorical_features)
        valid_pool = Pool(x_valid, y_valid, cat_features=categorical_features)

        # Training: all iterations are kept (use_best_model=False).
        model = CatBoostClassifier(**model_params)
        model.fit(train_pool, eval_set=valid_pool, use_best_model=False)

        # Prediction: probability of the positive class.
        pred_val = model.predict_proba(valid_pool)[:,1]
        pred += model.predict_proba(test_pool)[:,1]
        oof[val_index] = pred_val

        # Inclusive 0.5 cut-off, matching c_eval and the later threshold search.
        score_kold = c_score(y_valid, (pred_val >= 0.5).astype(int))
        score += score_kold
        print('Score = ',score_kold)

        # Feature importance, accumulated over folds.
        feat_imp += model.feature_importances_

    pred /= flods
    score /= flods
    feat_imp /= flods

    print(f"\nMean Score  = ", score)

    importance_df = pd.DataFrame({
        "Feature": feats,
        "Importance": feat_imp
    }).sort_values(by="Importance", ascending=False)

    # Release per-fold objects before returning.
    del x_train, y_train, x_valid, y_valid, x_test, train_pool, valid_pool, test_pool
    del model_params, model
    gc.collect()

    return oof, pred, importance_df
    
# Treat every feats2 column with at most 5 distinct training values as categorical.
distinct_counts = train_df[feats2].nunique()
categorical_features = [name for name in feats2 if distinct_counts[name] <= 5]

# Train the CatBoost model on the feats2 feature set.
oof_cat, pred_cat, imp_cat = cat_mdoel(
    train_df,
    test_df,
    target_col,
    feats=feats2,
    categorical_features=categorical_features,
    seed=6666,
)

In [None]:
# Joint grid search over the blend weight `w` (LightGBM share) and the decision
# threshold `t`, scored with c_score on the out-of-fold predictions.
results = []
for w in np.arange(0.1, 0.9, 0.05):
    blended_oof = oof_lgb * w + oof_cat * (1-w)
    for t in np.arange(0.1, 0.9, 0.01):
        hard_labels = (blended_oof >= t).astype(int)
        results.append((w, t, c_score(train_df[target_col], hard_labels)))

# First (w, t) pair reaching the highest score.
best_weight, best_threshold, best_score = max(results, key=lambda entry: entry[2])

# Blend OOF and test predictions with the selected weight.
f_oof = oof_lgb * best_weight + oof_cat * (1 - best_weight)
f_pred = pred_lgb * best_weight + pred_cat * (1 - best_weight)

print(f"best_weight: {best_weight:.3f}, best_threshold: {best_threshold:.3f}, best_score: {best_score:.5f}")

In [None]:
# Build the submission file from the blended test predictions.
# NOTE(review): this assumes the rows of submitB.csv are in the same order as
# testB.csv - confirm against the competition files.
submission = pd.read_csv('/home/jovyan/output/agentB/submitB.csv')
# Apply the threshold with `>=`, the same comparison the weight/threshold
# search used when it selected best_threshold (it was previously applied with `>`).
submission['label'] = (f_pred >= best_threshold).astype(int)
submission.to_csv('/home/jovyan/output/submission.csv', index=False)

In [None]:
# Shell escape: runs honghucli with the generated submission.csv as --source
# (presumably uploads it for scoring - confirm with the CLI's help).
# The token shown is a placeholder; supply the real one at run time and keep it out of the notebook.
!honghucli --env=honghu --token=xxxxxxxxxxxxxxxxxxxx --source=/home/jovyan/output/submission.csv