In [None]:
# !unzip /home/jovyan/input/data/dataA.zip -d /home/jovyan/output/data/

In [None]:
import pandas as pd
import numpy as np

# Load the training and test tables
train_df = pd.read_csv("/home/jovyan/output/data/dataA/train.csv")
test_df = pd.read_csv("/home/jovyan/output/data/dataA/testA.csv")

# Stack train and test row-wise (train rows first) under a fresh 0..n-1 index
df_all = pd.concat(
    [train_df, test_df],
    ignore_index=True,
)

In [None]:
# Label column
target_col = 'is_positive'

# Location identifier columns (encoded as categorical codes below)
location_info = ['residence_base_station_id','residence_cell_id']

# Date feature column
date_col = 'registration_date'

# Parse the date column, then expose year / month / day as separate columns
df_all[date_col] = pd.to_datetime(df_all[date_col], format='mixed')
registration = df_all[date_col].dt
df_all[f"{date_col}_year"] = registration.year
df_all[f"{date_col}_month"] = registration.month
df_all[f"{date_col}_day"] = registration.day

# Category encoding: factorize each location column into integer codes,
# shift them so the smallest code is 0 (factorize marks missing values
# with -1), and store the result with the pandas "category" dtype
for col in location_info:
    encoded_col = f'{col}_category'
    codes = pd.Series(df_all[col].factorize()[0], index=df_all.index)
    df_all[encoded_col] = (codes - codes.min()).astype("category")

In [None]:
# Split the combined frame back into train and test.
# Infinite values are replaced with 0 first; rows whose label is missing are
# treated as the test set, all remaining rows as the training set.
df_all = df_all.replace([-np.inf, np.inf], 0)
label_missing = df_all[target_col].isna()
train_df = df_all.loc[~label_missing].reset_index(drop=True)
test_df = df_all.loc[label_missing].reset_index(drop=True)

# Model features: every column except the id, the raw date, the label and
# the raw location identifiers
excluded_cols = ['user_id', date_col, target_col] + location_info
feats = [col for col in train_df.columns if col not in excluded_cols]
print('feats_num:',len(feats))

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Sample weights (two-level: class-balanced within each age group, then a positive-class boost)
def compute_age_weights_positive_bias(y, age, positive_boost=1.5):
    """Compute per-sample weights that are class-balanced inside each age group.

    For every distinct value in ``age`` the samples of that group receive
    ``compute_sample_weight('balanced', ...)`` weights computed on the group's
    labels only; the weights of the group's positive samples (label == 1) are
    then multiplied by ``positive_boost``.

    Samples keep the default weight 1.0 when their age group contains fewer
    than two distinct labels. Samples whose age is NaN also keep 1.0, because
    NaN never compares equal to a group value.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Class labels; the positive class is the value 1.
    age : array-like of shape (n_samples,)
        Grouping value for each sample, in the same order as ``y``.
    positive_boost : float, default 1.5
        Multiplier applied to the balanced weight of positive samples.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Float weights in the same order as the inputs.

    Raises
    ------
    ValueError
        If ``y`` and ``age`` do not have the same length.
    """
    # Work on plain arrays so every mask below is positional. Mixing pandas
    # index alignment with NumPy indexing breaks on lists and on Series whose
    # indexes differ.
    y_arr = np.asarray(y)
    age_arr = np.asarray(age)
    if y_arr.shape[0] != age_arr.shape[0]:
        raise ValueError(
            f"y and age must have the same length, got {y_arr.shape[0]} and {age_arr.shape[0]}"
        )

    sample_weights = np.ones(y_arr.shape[0], dtype=float)

    for current_age in np.unique(age_arr):
        age_mask = age_arr == current_age
        y_age = y_arr[age_mask]

        # Balancing needs at least two classes in the group; otherwise keep 1.0
        if len(np.unique(y_age)) < 2:
            continue

        # Base weights: class-balanced within this age group
        group_weights = np.asarray(compute_sample_weight('balanced', y_age), dtype=float)

        # Boost the positive samples of this group
        group_weights[y_age == 1] *= positive_boost

        sample_weights[age_mask] = group_weights

    return sample_weights

# Custom evaluation function passed to LightGBM as `feval`
def c_eval(preds, dtrain):
    """Evaluate predictions with the competition score during LightGBM training.

    Parameters
    ----------
    preds : numpy.ndarray
        Predicted scores for the evaluation set; they are thresholded at 0.5,
        so they are assumed to be probabilities (TODO confirm for custom
        objectives).
    dtrain : object with a ``get_label()`` method
        Evaluation dataset; its labels are cast to int.

    Returns
    -------
    tuple
        ``('c_score', score, True)`` where ``score`` is ``c_score`` computed on
        the thresholded predictions. The trailing ``True`` marks the metric as
        higher-is-better.
    """
    labels = dtrain.get_label().astype(int)
    # Strict '>' matches the threshold used for the per-fold score and the
    # submission, so early stopping optimizes the metric that is reported.
    pred_labels = (preds > 0.5).astype(int)

    return 'c_score', c_score(labels, pred_labels), True
   
# Competition metric
def c_score(y_true, y_pred, average: str = "binary"):
    """Return the blended score ``0.7 * accuracy + 0.3 * F1``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels (already thresholded, not probabilities).
    average : str, default "binary"
        Forwarded to ``f1_score`` as its ``average`` argument.

    Returns
    -------
    float
        The weighted sum of accuracy (weight 0.7) and F1 (weight 0.3).
    """
    accuracy_part = 0.7 * accuracy_score(y_true, y_pred)
    f1_part = 0.3 * f1_score(y_true, y_pred, average=average)
    return accuracy_part + f1_part

# Model definition: stratified K-fold LightGBM with per-fold sample weighting
def lgb_mdoel(train, test, target, feats, seed, n_splits=5, positive_boost=4):
    """Train LightGBM with stratified K-fold CV and return OOF/test predictions.

    For each fold a model is trained on the training part, using sample weights
    from ``compute_age_weights_positive_bias``, and early-stopped on the
    validation part with the custom ``c_eval`` metric. Test predictions are
    averaged over the folds.

    Parameters
    ----------
    train : pandas.DataFrame
        Labelled data; must contain ``target`` and every column in ``feats``.
    test : pandas.DataFrame
        Unlabelled data; must contain every column in ``feats``.
    target : str
        Name of the binary label column in ``train``.
    feats : list of str
        Feature columns. Must include ``'age'``, which drives the sample
        weighting.
    seed : int
        Random seed for the fold split and for LightGBM.
    n_splits : int, default 5
        Number of stratified folds.
    positive_boost : float, default 4
        Extra multiplier for positive-sample weights, forwarded to
        ``compute_age_weights_positive_bias``.

    Returns
    -------
    oof : numpy.ndarray of shape (len(train),)
        Out-of-fold model outputs, in the row order of ``train``.
    pred : numpy.ndarray of shape (len(test),)
        Mean of the per-fold model outputs on ``test``.
    importance_df : pandas.DataFrame
        Columns ``Feature`` and ``Importance`` (gain importance averaged over
        all folds), sorted by importance in descending order.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    oof = np.zeros(len(train))
    pred = np.zeros(len(test))
    gain_sum = np.zeros(len(feats))
    fold_scores = []

    # The test matrix is the same for every fold, so build it once.
    x_test = test[feats].copy()

    # Parameters (identical for every fold)
    model_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        # Built-in metrics are disabled; evaluation relies on `feval` only
        'metric': 'None',
        'learning_rate': 0.08,
        # NOTE(review): with max_depth=3 a tree can have at most 8 leaves, so
        # num_leaves=31 is presumably never reached -- confirm intent
        'num_leaves': 31,
        'max_depth': 3,
        'min_child_samples': 10,
        # NOTE(review): per the LightGBM parameter docs, row subsampling only
        # takes effect when the bagging frequency (subsample_freq) is non-zero;
        # none is set here, so this value looks like a no-op. Left unchanged to
        # avoid altering the tuned results -- confirm intent
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_alpha': 0.2,
        'reg_lambda': 0.2,
        # 'scale_pos_weight': 6.7 , # 2.3
        'n_jobs': 8,
        'verbose': -1,
        'random_state': seed,
    }

    for i, (train_index, val_index) in enumerate(skf.split(train, train[target])):

        print(f"### LightGBM Fold {i+1}")

        # skf.split yields positional indices, so select rows with .iloc;
        # .loc would pick the wrong rows if `train` had a non-default index.
        fold_train = train.iloc[train_index]
        fold_valid = train.iloc[val_index]

        x_train = fold_train[feats].copy()
        y_train = fold_train[target].astype(int)
        x_valid = fold_valid[feats].copy()
        y_valid = fold_valid[target].astype(int)

        # Two-level sample weighting (training part only; validation is unweighted)
        w_train = compute_age_weights_positive_bias(
            y_train, x_train['age'], positive_boost=positive_boost
        )

        # An explicit empty categorical_feature list is passed; presumably so
        # that pandas 'category' columns are not auto-treated as categorical
        # -- TODO confirm
        train_set = lgb.Dataset(x_train, label=y_train, weight=w_train, categorical_feature=[])
        valid_set = lgb.Dataset(x_valid, label=y_valid, categorical_feature=[], reference=train_set)

        # Train with early stopping on the custom metric
        model = lgb.train(
            dict(model_params),
            train_set,
            num_boost_round=1000,
            valid_sets=[valid_set],
            feval=c_eval,
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False),
                lgb.log_evaluation(period=50)
            ]
        )

        # Predict with the best iteration found by early stopping
        pred_val = model.predict(x_valid, num_iteration=model.best_iteration)
        pred += model.predict(x_test, num_iteration=model.best_iteration)

        oof[val_index] = pred_val

        # Accumulate importance from every fold, not just the last model
        gain_sum += model.feature_importance(importance_type='gain')

        score_fold = c_score(y_valid, (pred_val > 0.5).astype(int))
        fold_scores.append(score_fold)
        print('Score = ',score_fold)

    pred /= n_splits
    score = sum(fold_scores) / n_splits

    print("\nMean Score  = ", score)

    # Feature importance: mean gain over all folds
    importance_df = pd.DataFrame({
        "Feature": feats,
        "Importance": gain_sum / n_splits
    }).sort_values(by="Importance", ascending=False)

    return oof, pred, importance_df
    
# Train the cross-validated LightGBM model
oof_lgb, pred_lgb, imp_lgb = lgb_mdoel(
    train=train_df,
    test=test_df,
    target=target_col,
    feats=feats,
    seed=9999,
)

In [None]:
# Save: fill the submission template with thresholded test predictions.
# Predictions are assigned by position, so the template rows are assumed to
# be in the same order as the test set -- TODO confirm
submission = pd.read_csv('/home/jovyan/output/data/dataA/submitA.csv')
predicted_labels = (pred_lgb > 0.5).astype(int)
submission['is_positive'] = predicted_labels
submission.to_csv('/home/jovyan/output/submission.csv', index=False)

In [None]:
# Runs the honghucli command-line tool on the saved submission file
# (presumably uploads it to the competition platform -- confirm).
# NOTE(review): the token is passed in plain text on the command line; the
# value here is a placeholder. Do not commit a real token with the notebook.
!honghucli --env=honghu --token=xxxxxxxxxxxxxxxxxxx --source=/home/jovyan/output/submission.csv