In [None]:
import pandas as pd
import numpy as np
import json
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import warnings
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')



In [3]:
# Read the training and test CSVs (train first, then test) from the relative ../data directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/testA_data.csv')
)


In [6]:
# Stack the train rows on top of the test rows in a new frame for joint inspection.
# Each input keeps its own index labels; columns absent from one input are filled with NaN.
full_df = pd.concat([df_train, df_test], axis='index')

In [21]:
# Display the combined frame; with a negative n, head() returns every row except the last 5.
full_df.head(-5)

Unnamed: 0,mid,eid,did,device_brand,ntt,operator,common_country,common_province,common_city,appver,channel,common_ts,os_type,udmap,is_new_did,ts,day,hour,dayofweek
0,42,76,19bd901f8c4d5f78,160,3,3,81,40,287,36,0,1741431265743,1,{},0.0,2025-03-08 10:54:25.743,8,10,5
1,31,116,051570c1561380f120796926e7f304c80ec07e21,65,3,0,81,79,438,69,5,1742032710624,0,{},1.0,2025-03-15 09:58:30.624,15,9,5
2,13,227,20a3de203fcbeeac7f841831da86de07c,178,3,0,81,53,451,84,11,1741179277363,1,{},0.0,2025-03-05 12:54:37.363,5,12,2
3,21,52,0d741ff751b6d5d6,28,3,1,81,195,123,54,12,1742695445286,1,{},1.0,2025-03-23 02:04:05.286,23,2,6
4,13,26,019a95365cb1471f86ea0884df45f43458de4b75,65,3,3,81,35,409,99,5,1742953002867,0,{},0.0,2025-03-26 01:36:42.867,26,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1143299,13,28,08146441dc70a2f9040aa34cfb0f441c18c32068,65,3,3,81,58,453,83,5,1742426730660,0,{},,2025-03-19 23:25:30.660,19,23,2
1143300,13,227,2131422dc5dc94a8e0ff789b9e915b248,58,3,1,81,196,16,69,1,1741761816168,1,{},,2025-03-12 06:43:36.168,12,6,2
1143301,31,116,004c063b0ac16109,24,3,1,81,224,338,5,11,1742913958961,1,{},,2025-03-25 14:45:58.961,25,14,1
1143302,13,227,2192fbe907ce2a533398f6b072297bec8,178,3,1,81,195,10,69,11,1741742231974,1,{},,2025-03-12 01:17:11.974,12,1,2


In [None]:
# Missing-value count for every column of the combined frame.
print(full_df.isnull().sum())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
full_df.info()

In [None]:
# Derive calendar features from the millisecond epoch column `common_ts`,
# modifying each of the three frames in place.
for frame in (df_train, df_test, full_df):
    # Temporary datetime column; it is removed again at the end of the iteration.
    frame['ts'] = pd.to_datetime(frame['common_ts'], unit='ms')
    stamp = frame['ts'].dt
    frame['day'] = stamp.day              # day of the month
    frame['hour'] = stamp.hour
    frame['dayofweek'] = stamp.dayofweek  # Monday=0 ... Sunday=6
    frame.drop(columns='ts', inplace=True)

In [19]:
%%time
# Count the distinct `did` values that occur in both the train and the test set.
train_dids = set(df_train['did'].unique())
test_dids = set(df_test['did'].unique())
overlap_dids = train_dids.intersection(test_dids)

num_overlap, num_train, num_test = len(overlap_dids), len(train_dids), len(test_dids)

# An empty id set yields a ratio of 0 instead of a ZeroDivisionError.
ratio_in_train = num_overlap / num_train if num_train else 0
ratio_in_test = num_overlap / num_test if num_test else 0

# Report the overlap size and its share of each set's distinct ids.
print(f"重叠 did 数量: {num_overlap}")
print(f"占 train 比例: {ratio_in_train:.4f} ({num_overlap}/{num_train})")
print(f"占 test 比例: {ratio_in_test:.4f} ({num_overlap}/{num_test})")

重叠 did 数量: 192393
占 train 比例: 0.7104 (192393/270837)
占 test 比例: 0.9324 (192393/206342)
CPU times: user 509 ms, sys: 151 ms, total: 660 ms
Wall time: 1.01 s


In [26]:
# Columns that are re-coded to integer labels before modelling.
cat_features = [
    'device_brand', 'ntt', 'operator', 'common_country',
    'common_province', 'common_city', 'appver', 'channel',
    'os_type', 'udmap'
]
# Fitted encoder per column, kept so the mapping can be reused or inverted later.
label_encoders = {}

# NOTE: this cell overwrites the columns of df_train / df_test in place, so running it a
# second time would re-encode the already-encoded integers; reload the CSVs before re-running.
for feature in cat_features:
    # Stringify each frame exactly once and reuse these very strings for both fit and
    # transform. Casting *after* concatenation can produce different strings when the two
    # frames hold the column under different dtypes (e.g. int64 vs float64 -> '160' vs
    # '160.0'), which would make transform() fail on labels the encoder never saw.
    train_values = df_train[feature].astype(str)
    test_values = df_test[feature].astype(str)

    # Fit on the union of train and test values so neither transform meets an unseen label.
    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values]))

    # Keep the fitted encoder for this column.
    label_encoders[feature] = le

    # Replace the original values with integer codes in 0..n_classes-1.
    df_train[feature] = le.transform(train_values)
    df_test[feature] = le.transform(test_values)

In [28]:
# Columns fed to the model, in this order.
features = (
    # Raw columns
    [
        'mid', 'eid', 'device_brand', 'ntt', 'operator',
        'common_country', 'common_province', 'common_city',
        'appver', 'channel', 'os_type', 'udmap',
    ]
    # Time features (plus the raw millisecond timestamp itself)
    + ['hour', 'dayofweek', 'day', 'common_ts']
)

# Training inputs and target (`is_new_did`), and the test inputs with the same columns.
X_train, y_train = df_train[features], df_train['is_new_did']
X_test = df_test[features]

In [43]:
import sys

# Make the project's src/ directory importable so the threshold-search helper can be loaded.
# NOTE(review): this is an absolute, machine-specific path — consider deriving it from the
# project location instead.
SRC_DIR = '/Users/abcd15455/xf-ai-user-growth/src'
if SRC_DIR not in sys.path:  # avoid stacking duplicate entries when the cell is re-run
    sys.path.append(SRC_DIR)
from find_optimal_threshold import find_optimal_threshold


seed=42  # fixed seed, shared by the fold split and LightGBM
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 12,  # integer hyper-parameter (was passed as the string '12')
    'num_leaves': 63,
    'learning_rate': 0.1,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_child_samples': 10,
    'verbose': -1,
    'n_jobs':8,
    'seed': seed
}

# Stratified 5-fold cross-validation
n_folds=5
kf=StratifiedKFold(n_folds,shuffle=True,random_state=seed)
test_preds=np.zeros(len(X_test))     # running average of the fold models' test predictions
fold_thresholds=[]                   # threshold chosen on each fold
fold_f1_scores=[]                    # F1 of each fold at its own threshold
models=[]                            # trained booster of each fold
oof_preds = np.zeros(len(X_train))   # out-of-fold hard labels
oof_probas = np.zeros(len(X_train))  # out-of-fold predicted scores

print("\n开始训练模型...")
for fold,(train_idx,val_idx) in enumerate(kf.split(X_train,y_train)):
    print(f"\n开始第{fold+1}折交叉验证")
    X_tr,X_val=X_train.iloc[train_idx],X_train.iloc[val_idx]
    y_tr,y_val=y_train.iloc[train_idx],y_train.iloc[val_idx]

    train_set=lgb.Dataset(X_tr,y_tr)
    # Tie the validation data to the training Dataset explicitly so both are
    # constructed against the same reference.
    val_set=lgb.Dataset(X_val,y_val,reference=train_set)

    model=lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[train_set,val_set],
        callbacks=[
            # Stop once the validation metric has not improved for 50 rounds.
            lgb.early_stopping(stopping_rounds=50,verbose=False),
            # Log the evaluation metrics every 100 rounds.
            lgb.log_evaluation(period=100)
        ]
    )

    models.append(model)
    val_pred_proba = model.predict(X_val)
    oof_probas[val_idx]=val_pred_proba

    # The helper returns a (threshold, F1) pair; its search strategy is defined in src/.
    # NOTE(review): the threshold is picked on the same fold it is scored on below, so the
    # per-fold F1 is an optimistic estimate.
    best_threshold,best_f1=find_optimal_threshold(y_val,val_pred_proba)
    fold_thresholds.append(best_threshold)

    val_pred_labels=(val_pred_proba>=best_threshold).astype(int)
    fold_f1=f1_score(y_val,val_pred_labels)
    fold_f1_scores.append(fold_f1)
    oof_preds[val_idx]=val_pred_labels

    print(f"Fold {fold+1} Optimal Threshold: {best_threshold:.4f}")
    print(f"Fold {fold+1} F1 Score: {fold_f1:.5f}")

    # Each fold contributes 1/n_folds of the final averaged test prediction.
    test_preds+=model.predict(X_test)/n_folds
    







开始训练模型...

开始第1折交叉验证
[100]	training's binary_logloss: 0.320916	valid_1's binary_logloss: 0.321336
[200]	training's binary_logloss: 0.308903	valid_1's binary_logloss: 0.310251
[300]	training's binary_logloss: 0.299568	valid_1's binary_logloss: 0.301818
[400]	training's binary_logloss: 0.292112	valid_1's binary_logloss: 0.295168
[500]	training's binary_logloss: 0.285697	valid_1's binary_logloss: 0.289461
[600]	training's binary_logloss: 0.280419	valid_1's binary_logloss: 0.284861
[700]	training's binary_logloss: 0.275493	valid_1's binary_logloss: 0.280575
[800]	training's binary_logloss: 0.270956	valid_1's binary_logloss: 0.276647
[900]	training's binary_logloss: 0.266695	valid_1's binary_logloss: 0.272971
[1000]	training's binary_logloss: 0.26248	valid_1's binary_logloss: 0.26939
Fold 1 Optimal Threshold: 0.1000
Fold 1 F1 Score: 0.46452

开始第2折交叉验证
[100]	training's binary_logloss: 0.320635	valid_1's binary_logloss: 0.321543
[200]	training's binary_logloss: 0.308399	valid_1's binary_logl

In [46]:
# Write each fold's trained booster to its own text file, numbered by its position in `models`.
for idx, booster in enumerate(models):
    model_path = f'/Users/abcd15455/xf-ai-user-growth/models/model_{idx}.txt'
    booster.save_model(model_path)