In [5]:
import gc
import os
import random
import warnings

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import *
from tqdm import tqdm
# Silences every warning for the rest of the kernel session. This also hides
# library deprecation notices, so API removals only surface as hard errors.
warnings.filterwarnings('ignore')

In [6]:
# Load the pre-built feature table (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# when it comes from a trusted source.
df = pd.read_pickle("原封不动特征.pkl")

In [7]:
# ---------------------------------------------------------------------------
# Feature list
# ---------------------------------------------------------------------------
# Columns that are never fed to the model (the id, the date column `a3`, the
# label `y`, and a hand-picked set of excluded features).
drop_fea = [
    'id', 'a3', 'y', 'core_cust_id', 'prod_code', 'f22', 'l5',
    'm1', 'm2', 'm3', 'm4', 'm5', 'm8', 'i4', 'j5', 'k10', 'k1', 'k2',
    'k3',
    'rank',
    'uid_cnt_in_p2_1m_ago',
]
feature = [x for x in df.columns if x not in drop_fea]
print(len(feature))
print("--------------------------------------------数据读取完毕，部分特征删除-------------------")

# ---------------------------------------------------------------------------
# Train / validation / test split
# ---------------------------------------------------------------------------
# NOTE(review): `a3` is compared against date strings, which presumably means
# it holds ISO-formatted 'YYYY-MM-DD' strings -- confirm the dtype.
before_test_month = df['a3'] < '2021-10-01'
# Negatives are de-duplicated on the feature columns; positives are all kept.
df_0 = df[before_test_month & (df['y'] == 0)].drop_duplicates(feature)
df_1 = df[before_test_month & (df['y'] == 1)]
df_ = pd.concat([df_0, df_1]).sample(frac=1, random_state=2).reset_index(drop=True)

# Compute each month mask once and reuse it for X and y so both always refer
# to the same rows; y gets the same fresh 0..n-1 index as X.
train_rows = df_["a3"] == '2021-07-01'
valid_rows = df_["a3"] == '2021-08-01'
X_train = df_.loc[train_rows, feature].reset_index(drop=True)
y_train = df_.loc[train_rows, "y"].reset_index(drop=True)
X_valid = df_.loc[valid_rows, feature].reset_index(drop=True)
y_valid = df_.loc[valid_rows, "y"].reset_index(drop=True)
X_test = df.loc[df["a3"] == '2021-10-01', feature].reset_index(drop=True)

del df_, df_0, df_1, before_test_month, train_rows, valid_rows; gc.collect()
print(len(X_train), len(y_train), len(X_valid), len(y_valid))

# ---------------------------------------------------------------------------
# Training configuration
# ---------------------------------------------------------------------------
EARLY_STOPPING_ROUNDS = 100   # stop once validation AUC has not improved for this many rounds
LOG_EVERY_N_ROUNDS = 100      # print the validation metric every N boosting rounds
MIN_SPLIT_IMPORTANCE = 100    # keep features whose importance in the first model exceeds this


def fit_with_early_stopping(model, X_tr, y_tr, X_va, y_va):
    """Fit `model` on the training data with early stopping on the validation data.

    Uses LightGBM callbacks instead of the `early_stopping_rounds=` / `verbose=`
    keyword arguments of `fit`, which were removed in LightGBM 4.0. The
    callbacks require LightGBM >= 3.3. Fresh callback objects are created per
    call so no state is shared between fits.

    Returns the fitted model (the same object that was passed in).
    """
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        callbacks=[
            early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS),
            log_evaluation(period=LOG_EVERY_N_ROUNDS),
        ],
    )
    return model


# ---------------------------------------------------------------------------
# Stage 1: train on all candidate features
# ---------------------------------------------------------------------------
print("--------------------开始LightGBM模型初步训练-----------------")
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
# only takes effect when `subsample_freq` (bagging_freq) is > 0. It is left at
# its default here, so no row subsampling actually happens. Left unchanged on
# purpose to keep the trained model identical.
clf = LGBMClassifier(num_leaves=128,
                     n_estimators=10000,
                     learning_rate=0.01,
                     subsample=0.8,
                     feature_fraction=0.5,
                     metric='auc',
                     lambda_l1=0.1,
                     lambda_l2=0.1,
                     min_child_weight=40,
                     random_seed=2019,
                     verbose=-1)
fit_with_early_stopping(clf, X_train, y_train, X_valid, y_valid)
print("----------------------------------------模型初步训练结束--------------------")

# ---------------------------------------------------------------------------
# Stage 2: keep only the important features and refit the same estimator
# ---------------------------------------------------------------------------
print("--------------------将特征值剔除后进行LightGBM模型二次训练-----------------")
feat_import = dict(zip(clf.feature_name_, clf.feature_importances_))
high_import_feat = [name for name, score in feat_import.items() if score > MIN_SPLIT_IMPORTANCE]
print("特征的长度为",len(high_import_feat))
if not high_import_feat:
    # Refitting on zero columns would fail with a far less helpful error.
    raise ValueError(
        "No feature has importance > {}; lower MIN_SPLIT_IMPORTANCE.".format(MIN_SPLIT_IMPORTANCE)
    )
# The X_* names are narrowed in place because later code (the test-set
# prediction) expects them to hold exactly the selected columns.
X_train = X_train[high_import_feat]
X_valid = X_valid[high_import_feat]
X_test = X_test[high_import_feat]
fit_with_early_stopping(clf, X_train, y_train, X_valid, y_valid)
print("--------------------------二次模型训练完成--------------------------")
# Positive-class probability on the validation month; X_valid already holds
# only the selected columns, so no further column selection is needed.
oof_prob = clf.predict_proba(X_valid)[:, 1]
print("----------------------------------------开始寻找最佳F2及其阈值-----------------------")
gc.collect()

def find_best_threshold(y_valid, oof_prob, thresholds=None):
    """Pick the probability cut-off that maximises the F2 score.

    Parameters
    ----------
    y_valid : array-like of shape (n,)
        Ground-truth binary labels; entries equal to 1 are the positive class.
        Values are taken positionally (a pandas index is ignored).
    oof_prob : array-like of shape (n,)
        Predicted positive-class scores for the same rows.
    thresholds : iterable of float, optional
        Candidate cut-offs, evaluated in order. Defaults to the original grid
        0.050, 0.051, ..., 0.199.

    Returns
    -------
    (best_th, best_f2, best_recall, best_precision)
        The first threshold reaching the highest F2, together with the F2,
        recall and precision obtained at that threshold. If no candidate
        reaches F2 > 0, the first candidate is returned with all three scores
        equal to 0 (the previous implementation raised UnboundLocalError).

    Raises
    ------
    ValueError
        If `thresholds` is empty or the two inputs differ in shape.
    """
    if thresholds is None:
        thresholds = [i / 1000 for i in range(50, 200)]
    else:
        thresholds = list(thresholds)
    if not thresholds:
        raise ValueError("thresholds must contain at least one candidate")

    # Convert once, outside the loop; the inputs are never modified.
    is_positive = np.asarray(y_valid) == 1
    scores = np.asarray(oof_prob, dtype=float)
    if is_positive.shape != scores.shape:
        raise ValueError(
            "y_valid and oof_prob must have the same shape, got {} and {}".format(
                is_positive.shape, scores.shape
            )
        )
    n_positive = int(np.count_nonzero(is_positive))

    best_th = thresholds[0]
    best_f2 = 0
    best_recall = 0
    best_precision = 0
    for th in tqdm(thresholds):
        predicted = scores >= th
        true_pos = int(np.count_nonzero(predicted & is_positive))
        n_predicted = int(np.count_nonzero(predicted))

        # An undefined ratio (no positives / nothing predicted) is scored as 0.
        recall = true_pos / n_positive if n_positive else 0.0
        precision = true_pos / n_predicted if n_predicted else 0.0

        # F-beta with beta = 2: (1 + 4) * P * R / (4 * P + R).
        denom = 4 * precision + recall
        f2 = 5 * recall * precision / denom if denom else 0.0

        # Strict '>' keeps the first threshold among ties.
        if f2 > best_f2:
            best_th = th
            best_f2 = f2
            best_recall = recall
            best_precision = precision
    return best_th, best_f2, best_recall, best_precision
# Tune the decision threshold on the validation-month predictions.
best_th, best_f2, best_recall, best_precision = find_best_threshold(y_valid, oof_prob)
print(
    "最佳阈值是:{}".format(best_th),
    "最佳F2的分数为:{}".format(best_f2),
    "最佳recall的分数为:{}".format(best_recall),
    "最佳pricision的分数为:{}".format(best_precision),
)

gc.collect()
# Positive-class probability for every test row (second predict_proba column).
y_pre = clf.predict_proba(X_test)[:, 1]

96
--------------------------------------------数据读取完毕，部分特征删除-------------------
291630 291630 261148 261148
--------------------开始LightGBM模型初步训练-----------------
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.907733
[200]	valid_0's auc: 0.913207
[300]	valid_0's auc: 0.916072
[400]	valid_0's auc: 0.918177
[500]	valid_0's auc: 0.919662
[600]	valid_0's auc: 0.920553
[700]	valid_0's auc: 0.921132
[800]	valid_0's auc: 0.921508
[900]	valid_0's auc: 0.921751
[1000]	valid_0's auc: 0.922007
[1100]	valid_0's auc: 0.922178
[1200]	valid_0's auc: 0.922222
[1300]	valid_0's auc: 0.922266
Early stopping, best iteration is:
[1281]	valid_0's auc: 0.922271
----------------------------------------模型初步训练结束--------------------
--------------------将特征值剔除后进行LightGBM模型二次训练-----------------
特征的长度为 69
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.906847
[200]	valid_0's auc: 0.913106
[300]	valid_0's auc: 0.916428
[400]	valid_0's auc: 0.9

100%|████████████████████████████████████████████████████████████████████████████████| 150/150 [00:46<00:00,  3.26it/s]


最佳阈值是:0.104 最佳F2的分数为:0.4800731613400077 最佳recall的分数为:0.6643133075795924 最佳pricision的分数为:0.22759218692953634


In [8]:
# Build the submission frame from exactly the rows X_test was built from
# (a3 == '2021-10-01'), so that `y_pre` lines up with `id` by construction.
# Selecting by `y.isna()` instead relied on both filters happening to pick the
# same rows. `.copy()` makes `res` an independent frame, so adding a column
# does not write into a slice of `df`.
res = df.loc[df["a3"] == '2021-10-01', ["id"]].copy()
if len(res) != len(y_pre):
    raise ValueError(
        "Expected one prediction per test row, got {} rows and {} predictions".format(
            len(res), len(y_pre)
        )
    )
# Binarise in a single pass. Labels stay floats (1.0 / 0.0) so the CSV content
# is the same as what the two in-place overwrites used to produce.
res['y'] = np.where(np.asarray(y_pre) >= best_th, 1.0, 0.0)
res.to_csv('submission_0115_2019.csv',index = False)
print("结果输出完毕")

结果输出完毕


In [5]:
# Importance of every feature of the currently fitted model, keyed by name.
res_dict = dict(zip(clf.feature_name_, clf.feature_importances_))
# One-column table indexed by feature name. The column keeps the legacy label
# "a2" because the display cell below sorts by that name; it holds the
# importance of each feature, not anything specific to feature `a2`.
# The previous version broadcast the dict into an n x n frame and sliced column
# "a2" out of its transpose, which raised KeyError whenever `a2` was not one of
# the model's features.
res_df = pd.DataFrame({"a2": clf.feature_importances_}, index=clf.feature_name_)
# Feature names ordered from least to most important.
ready_list = res_df.sort_values(by="a2").index.tolist()

In [20]:
# Display the importance table in ascending order; column "a2" holds the
# importance value of each feature (one row per feature).
res_df.sort_values(by="a2")

Unnamed: 0,a2
pid_cnt_grp_uid_in_p3_1m_ago,159
j6,189
f15_mean_1m_ago,202
f10_mean_1m_ago,217
apply_amt_mean_grp_uid_in_p2_1m_ago,248
...,...
borrow_sum_1m_ago,7377
uid_cnt_in_click_action_all_1m_ago,7747
apply_amt_mean_grp_uid_in_p1_all_2m_ago,8892
apply_amt_sum_grp_uid_in_p1_all_2m_ago,9518
