In [1]:
import pandas as pd 
import numpy as np

# Run one garbage-collection pass and make sure the automatic collector is switched on
# before the large training frame is loaded.
import gc
gc.collect()
gc.enable()

In [2]:
# Load the prepared feature table used by every later cell.
# NOTE(review): unpickling executes arbitrary code, so only load files from a trusted source.
train_df3 = pd.read_pickle('./data/feature_select_fill2_v1_15W.pkl') 
# Alternative input that was tried earlier:
# train_df3 = pd.read_pickle('./data/raw_merge_top100featuretools50_v1.pkl')



## bayes调参 模型融合

随机抽样+参数扰动训练子模型

In [3]:
from hyperopt import hp, tpe, Trials, STATUS_OK, Trials, anneal
from functools import partial
from hyperopt.fmin import fmin
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,roc_curve,auc, cohen_kappa_score
from sklearn.metrics import mean_squared_error

def model_metrics(model, x, y):
    """Score a fitted model on ``(x, y)`` using Cohen's kappa.

    Args:
        model: any object exposing ``predict(x)``.
        x: feature matrix passed straight to ``model.predict``.
        y: true labels that the predictions are compared against.

    Returns:
        The value of ``cohen_kappa_score(y, model.predict(x))``.
    """
    return cohen_kappa_score(y, model.predict(x))


def bayes_fmin(train_x, test_x, train_y, test_y, eval_iters=50, gap_d=0):
    """
    Search LightGBM hyper-parameters with hyperopt and return the best set found.

    Every trial fits an ``lgb.LGBMClassifier`` on ``(train_x, train_y)`` and scores
    it with ``model_metrics`` (Cohen's kappa) on both the training and test split.
    The minimised loss is ``-(test - gap_d * (train - test))``: a higher test kappa
    is better, and ``gap_d`` optionally penalises the train/test gap.

    Args:
        train_x, train_y: data each trial model is fitted on.
        test_x, test_y: hold-out data used to score each trial.
        eval_iters: number of hyperopt evaluations (``max_evals``).
        gap_d: weight of the train-minus-test kappa penalty; 0 disables it.

    Returns:
        dict of keyword arguments ready for ``lgb.LGBMClassifier(**params)``:
        the best sampled values (choice indices decoded, integer-valued
        parameters cast to ``int``) merged with the fixed base parameters.
    """
    # Option lists are defined once so that the search space and the decoding of
    # fmin's index-valued result (return_argmin=True) can never drift apart.
    boosting_choices = ['gbdt', 'dart', 'goss']
    class_weight_choices = ['balanced', None]

    # Fixed parameters shared by every trial and by the returned parameter set.
    # NOTE(review): 'multi:softprob' is an XGBoost-style objective name; confirm
    # how the installed LightGBM version treats it.
    base_params = {
            "n_jobs":-1, 
            "objective":'multi:softprob', 
            "random_state":None,
            "silent":True,
            "verbose":-1
            }

    def lgb_factory(params):
        """
        Objective function for hyperopt: fit one model and return its loss.
        """
        fit_params = {
            "boosting":params["boosting"],
            # class_weight is part of the search space, so it has to reach the
            # model; otherwise the optimiser samples a dimension that never
            # influences the loss and the "best" value is arbitrary.
            "class_weight":params["class_weight"],
            'max_depth':int(params['max_depth']),
            'n_estimators':int(params['n_estimators']),
            "learning_rate":params["learning_rate"],
            "num_leaves": int(params["num_leaves"]),
            "lambda_l1":params["lambda_l1"],
            "lambda_l2":params["lambda_l2"],
            'subsample_for_bin':int(params['subsample_for_bin']),            
            'bagging_fraction':params['bagging_fraction'],
            "feature_fraction":params["feature_fraction"],
            "min_data_in_leaf":int(params["min_data_in_leaf"]),            
            'min_child_weight': params['min_child_weight'],
            "min_split_gain":params["min_split_gain"]
            }
        fit_params.update(base_params)
        # Train the trial model.
        model=lgb.LGBMClassifier(**fit_params)
        model.fit(train_x, train_y)
        
        # Minimise (-kappa) on the test split while optionally keeping the
        # train/test gap small.
        train_metric = model_metrics(model, train_x, train_y)
        test_metric = model_metrics(model, test_x, test_y)
        loss = -(test_metric-gap_d*(train_metric-test_metric))
        return {"loss": loss, "status":STATUS_OK}

    # Search space.
    # NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
    # effect when bagging_freq > 0; bagging_freq is not set here, so that
    # dimension may currently be a no-op - confirm before relying on it.
    space = {
        'max_depth': hp.quniform('max_depth', 2, 12, 1),
        'n_estimators': hp.quniform('n_estimators', 2, 3000, 2), 
        'boosting': hp.choice('boosting', boosting_choices),
        "class_weight": hp.choice('class_weight', class_weight_choices),
        'num_leaves': hp.quniform('num_leaves', 2, 3000, 2), 
        'learning_rate': hp.uniform('learning_rate', 1e-4, 9e-1),
        'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),
        'feature_fraction': hp.uniform('feature_fraction', 0.1, 1),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.1, 1), 
        'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', 0, 6, 1),
        'lambda_l1': hp.uniform('lambda_l1', 0, 1),
        'lambda_l2': hp.uniform('lambda_l2', 0, 1),
        'min_child_weight': hp.loguniform('min_child_weight', -16, 5), 
        'min_split_gain': hp.uniform('min_split_gain', 0, 1)
            }
    
    best_params = fmin(lgb_factory, space, algo=anneal.suggest, max_evals=eval_iters, trials=Trials(), return_argmin=True)
    
    # Turn the raw argmin into constructor arguments: add the fixed parameters,
    # map hp.choice indices back to their options, and cast the quantised
    # (float-valued) samples to int.
    best_params.update(base_params)
    best_params["class_weight"] = class_weight_choices[int(best_params["class_weight"])]
    best_params["boosting"] = boosting_choices[int(best_params["boosting"])]
    for int_key in ("min_data_in_leaf", "max_depth", "subsample_for_bin", "num_leaves", "n_estimators"):
        best_params[int_key] = int(best_params[int_key])
    
    return best_params

In [4]:
import pickle
import lightgbm as lgb
from hyperopt import hp, tpe, Trials, STATUS_OK, Trials, anneal
from functools import partial
from hyperopt.fmin import fmin
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,roc_curve,auc, cohen_kappa_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from lightgbm import plot_importance
import warnings
warnings.filterwarnings("ignore")

def to_category_or_num(df, category_list):
    """Cast every column of ``df`` in place: categorical or numeric.

    Columns named in ``category_list`` become pandas ``category`` dtype; every
    other column is converted with ``pd.to_numeric`` and values that cannot be
    parsed turn into NaN. Nothing is returned - ``df`` itself is modified.
    """
    for column in list(df.columns):
        values = df[column]
        if column not in category_list:
            df[column] = pd.to_numeric(values, errors='coerce')
            continue
        df[column] = values.astype('category')



def base_model_predict(lgbmodel,train_df, probs_df):
    """Stacking step 1: write one base model's predictions into ``probs_df``.

    Two columns are added to (or overwritten in) ``probs_df``, named after the
    current value of the global ``model_no``:

    * ``str(model_no)``           - the predicted class
    * ``str(model_no) + '_prob'`` - predicted class plus the highest class
      probability of that row

    Args:
        lgbmodel: fitted classifier exposing ``predict`` and ``predict_proba``.
        train_df: frame to score; ``cust_q``, ``label`` and ``cust_no`` are
            dropped before prediction.
        probs_df: frame that receives the new columns (modified in place).
            Values are assigned positionally, so it is presumably expected to
            have the same row order as ``train_df`` - verify against the caller.

    Note:
        Reads the module-level globals ``category_list`` and ``model_no``; both
        must be defined before this function is called.
    """
    probs_x = train_df.drop(['cust_q','label','cust_no'],axis=1)
    to_category_or_num(probs_x, category_list)

    # Run each inference pass exactly once; the class prediction is reused for
    # both output columns instead of being recomputed.
    predicted_class = lgbmodel.predict(probs_x)
    top_probability = lgbmodel.predict_proba(probs_x).max(axis=1)

    column = str(model_no)
    probs_df[column] = predicted_class
    probs_df[(column+'_prob')] = predicted_class + top_probability
    

    
def data_train_bayes2(df, category_list,test_size, iters):
    """
    Prepare the data, tune a LightGBM classifier and return the fitted model.

    Steps: drop the id/label columns, cast features with ``to_category_or_num``,
    split into train/test, tune with ``bayes_fmin``, refit with the best
    parameters, print kappa on both splits and show/print feature importances.

    Args:
        df: labelled rows containing ``cust_q``, ``label``, ``cust_no`` plus features.
        category_list: names of the columns to treat as categorical.
        test_size: forwarded to ``train_test_split``.
        iters: number of hyperopt evaluations forwarded to ``bayes_fmin``.

    Returns:
        The ``lgb.LGBMClassifier`` fitted on the training split.

    Sampling strategies the caller can apply to ``df`` beforehand (kept from the
    original notes):
    # type 1: proportional sample of the training set
    # df = train_df[train_df.label.notnull()].sample(frac=0.632)
    # type 2: class-balanced sample, 0.8 of the smallest class
    # df = train_df.groupby('label', group_keys=False).apply(pd.DataFrame.sample, n=18000)
    # type 3: hold out 0.2 of the rows for the upper-level model
    # regmodel_df = train_df[train_df.label.notnull()].sample(frac=0.8,random_state=0)
    # balanced sample for the sub-models, 0.6 of the smallest class
    # df = regmodel_df.groupby('label', group_keys=False).apply(pd.DataFrame.sample, n=9608)
    # type 4: unbalanced sample, 0.8 of all labelled rows
    # df = train_df[train_df.label.notnull()].sample(frac=0.8,random_state=0)
    # type 5: training sets without overlap
    # df = train_df[train_df.label.notnull()]
    """
    # Target and feature matrix
    target = df.label
    features = df.drop(['cust_q','label','cust_no'],axis=1)
    to_category_or_num(features, category_list)

    # Train / test split (unseeded, so every call draws a different split)
    train_x, test_x, train_y, test_y = train_test_split(features, target, test_size=test_size)

    # Hyper-parameter search
    best_params = bayes_fmin(train_x, test_x, train_y, test_y, iters)
    print(best_params)

    # Refit with the selected parameters and report kappa on both splits
    lgbmodel = lgb.LGBMClassifier(**best_params)
    lgbmodel.fit(train_x, train_y)

    rule = '-'*30
    for split_name, split_x, split_y in (('train', train_x, train_y), ('test', test_x, test_y)):
        print(rule, split_name)
        print(model_metrics(lgbmodel, split_x, split_y))

    # Feature importance: plot the top 20 by split count, print the top 50 by gain
    plot_importance(lgbmodel, max_num_features=20,figsize=(5,5),importance_type='split')
    plt.show()

    booster = lgbmodel.booster_
    importance_columns = {
        'feature': booster.feature_name(),
        'gain': booster.feature_importance('gain'),
        'split': booster.feature_importance('split')
    }
    feature_importance = pd.DataFrame(importance_columns).sort_values('gain',ascending=False)
    print(feature_importance[0:50])

    return lgbmodel

In [5]:
# Variant 1: train the base models and record each one's predicted class and class+probability score


# Frame that collects the base-model outputs next to the id/label columns
probs_df = train_df3.loc[:,['cust_q','label','cust_no']]
# Columns cast to pandas 'category'; also read as a global by base_model_predict
category_list = ['I1','I3','I5','I8','I10','I13','I14','lastq_I1','lastq_I3','lastq_I5','lastq_I8','lastq_I10','lastq_I13','lastq_I14']

# Sampling strategy 4: unbalanced sample of 0.8 of the labelled rows
# (original note: each sub-model leaves about 0.4 of the rows as "uncertain" samples)
# Training pool shared by all base models
base_train_df = train_df3[train_df3.label.notnull()].sample(frac=0.8,random_state=0)

models_num = 5
# model_no must stay a module-level name: base_model_predict reads it to name its output columns
for model_no in range(models_num):
    lgbmodel = data_train_bayes2(base_train_df, category_list, test_size=0.3, iters=50)
    base_model_predict(lgbmodel,train_df3, probs_df)
    
    # save base_model
    print(model_no)
    with open('./data/stack_lgb'+str(model_no)+'.v1pkl','wb') as fileobj:
        pickle.dump(lgbmodel,fileobj)
    
    
# Persist the base-model predictions
probs_df.to_pickle('./data/basemodel_ft_50w_1209.v1pkl')  
# Mark the rows that were available to the base models
# NOTE(review): this marker is set after the pickle above is written, so the saved file
# does not contain the "regmodel" column - confirm whether that is intended.
probs_df.loc[base_train_df.index,"regmodel"] = 1

# # Variant 2: K-fold base models and their predicted probabilities (disabled)
# from sklearn.model_selection import KFold

# folds = KFold(n_splits=5, shuffle=True, random_state=0)
# for train_index, test_index in folds.split(train_df3):
#     train_df, test_df = train_df3.iloc[train_index], train_df3.iloc[test_index]
#     lgbmodel = data_train_bayes2(test_df, category_list)
#     # savemodel
#     print(model_no)

boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt                                 
feature_fraction is set=0.6098972330824208, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.6098972330824208
min_data_in_leaf is set=3, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=3                     
lambda_l1 is set=0.6632095022882218, reg_alpha=0.0 will be ignored. Current value: lambda_l1=0.6632095022882218        
bagging_fraction is set=0.9293194023588456, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9293194023588456
lambda_l2 is set=0.00978229183122814, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.00978229183122814     
boosting is set=dart, boosting_type=gbdt will be ignored. Current value: boosting=dart                                 
feature_fraction is set=0.8047586134357417, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8047586134357417
min_data

feature_fraction is set=0.8058764202404394, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8058764202404394
min_data_in_leaf is set=51, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=51                   
lambda_l1 is set=0.8832240344405364, reg_alpha=0.0 will be ignored. Current value: lambda_l1=0.8832240344405364        
bagging_fraction is set=0.9487218421158847, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9487218421158847
lambda_l2 is set=0.071131847687931, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.071131847687931         
boosting is set=goss, boosting_type=gbdt will be ignored. Current value: boosting=goss                                 
feature_fraction is set=0.6976275973502735, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.6976275973502735
min_data_in_leaf is set=51, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=51                   
lambda_l

KeyboardInterrupt: 

In [20]:
# Write the current in-memory base-model predictions to disk by hand.
probs_df.to_pickle('./data/basemodel_ft_50w_1208.v1pkl')  

In [58]:
# import pickle
# import lightgbm as lgb
# import matplotlib.pyplot as plt
# from lightgbm import plot_importance
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# import warnings
# warnings.filterwarnings("ignore")


# def stack_models_bayes(df, iters=200, model_type='lgb'):
#     """
#     stack 模型融合及训练

#     """
    
#     # 训练集预处理&df.regmodel.isnull()
#     x = df[df.label.notnull()].drop(['cust_q','label','cust_no'],axis=1)
#     y = df[df.label.notnull()].label
#     # 划分训练集，测试集
#     train_x, test_x, train_y, test_y = train_test_split(x, y,test_size=0.2, random_state=0)
#     if model_type == 'lgb':
#         # LGB参数优化
#         best_params = bayes_fmin(train_x, test_x, train_y, test_y, iters, 0.5)
#         print(best_params)    
#         # 评估
#         model=lgb.LGBMClassifier(**best_params)
#         model.fit(train_x, train_y)
#         # 重要特征
#         plot_importance(model, max_num_features=100,figsize=(10,30),importance_type='gain')
#         plt.show()    
#         feature_importance = pd.DataFrame({
#             'feature': lgbmodel.booster_.feature_name(),
#             'gain': lgbmodel.booster_.feature_importance('gain'),
#             'split': lgbmodel.booster_.feature_importance('split')
#         }).sort_values('gain',ascending=False)
#     elif model_type == 'lr':
        
#         model = LogisticRegression(penalty='l2', dual=False, tol=0.01, C=0.1, fit_intercept=True, intercept_scaling=1,
#                                    class_weight='balanced', max_iter=500, multi_class='ovr', random_state=0,n_jobs=1,
#                                    solver='newton-cg', verbose=5, warm_start=True,l1_ratio=None)
#         model.fit(train_x, train_y)
    
#     print('-'*30, 'train')
#     print(model_metrics(model, train_x, train_y))

#     print('-'*30, 'test')
#     print(model_metrics(model, test_x, test_y))
    
#     # 融合的数据生成竞赛结果 &df.regmodel.isnull()
#     result_df = df.loc[df.label.isnull()]

#     test_df_x = result_df.drop(['cust_q','label', 'cust_no'],axis=1)
#     result_df['label'] = model.predict(test_df_x)


#     # 保存为 固定格式结果
#     result_df['label'] = result_df['label'].astype('int')
#     result_df[['cust_no','label']].to_csv('./data/%sstack_1209.csv'%model_type,index=False)
#     print(result_df[['cust_no','label']].head())
    
#     return model


# # 模型融合step2: 融合最终模型，预测，并保存test结果

# probs_df = pd.read_pickle('./data/basemodel_ft_50w_1208.v1pkl')  
# # 微调-1概率值
# for row in probs_df.loc[:,probs_df.columns.str.contains('_prob')].columns:
#     probs_df.loc[probs_df[row]< 0, row] =  -(probs_df.loc[probs_df[row]< 0, row])  - 2
# print(row)
    
# # 模型融合
# model = stack_models_bayes(probs_df,iters=20,model_type='lr')
# print(model.n_iter_)

In [32]:
# Majority-vote fusion: for every unlabelled customer take the most frequent class
# predicted by the base models and write it out as the submission file.

test_df = probs_df.set_index('cust_no')
# probs_df = train_df3.loc[:,[,'cust_no']]
# Only the base-model class predictions take part in the vote. They are stored in
# columns named str(model_no) ("0", "1", ...). Selecting "everything that is not a
# *_prob column" would also let cust_q, label and regmodel vote.
vote_columns = [col for col in test_df.columns if str(col).isdigit()]
votes = test_df.loc[test_df.label.isnull(), vote_columns]
# mode(axis=1) lists each row's most frequent value(s) in sorted order; column 0 is the
# winner and, on a tie, the smallest of the tied classes.
result_df = pd.DataFrame(votes.mode(axis=1)[0].astype('int'))
result_df.columns =['label']
result_df.to_csv('./data/result_df-aa.csv',header=True,index=True)