本文[参考](https://mp.weixin.qq.com/s?__biz=MzI5ODQxMTk5MQ==&mid=2247485727&idx=2&sn=411ac0329bdae3b5475e49d9af11b67f&chksm=eca77ba7dbd0f2b1fe5bd209f153f3797fc24093cebdeed0f292fd6c090bb36d9ef40991caf2&mpshare=1&scene=23&srcid=0227CpGIrhXsICJifSej0A3v#rd)-杰少，值得借鉴的地方：
* MAE+MSE 的整合处理模式
* MAE 和 MSE 结果的线性融合

In [7]:
## 数据工具包
import numpy as np
np.random.seed(42)
import pandas as pd
from tqdm import tqdm,tqdm_notebook 


## 字符串处理工具包
import string
import re
# import gensim
from collections import Counter
import pickle
# from nltk.corpus import stopwords


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
# from keras.preprocessing import text, sequence 

import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
import lightgbm as lgb
from functools import partial

import os 
import gc
import joblib
from scipy import stats 
from scipy.sparse import vstack  
import time
import datetime
import multiprocessing as mp
import seaborn as sns 
tqdm.pandas() 
%matplotlib inline

In [8]:
train = pd.read_csv('./data/train_dataset.csv')
test = pd.read_csv('./data/test_dataset.csv')

In [9]:
# 简单特征工程
def _simple_features(df_):
    df = df_.copy() 
    df['次数'] = df['当月网购类应用使用次数'] +  df['当月物流快递类应用使用次数'] +  df['当月金融理财类应用使用总次数'] + df['当月视频播放类应用使用次数'] + df['当月飞机类应用使用次数'] + df['当月火车类应用使用次数'] + df['当月旅游资讯类应用使用次数']  + 1

    for col in ['当月金融理财类应用使用总次数','当月旅游资讯类应用使用次数']: # 这两个比较积极向上一点
        df[col + '百分比'] = df[col].values / df['次数'].values 

    df['当月通话人均话费'] = df['用户账单当月总费用（元）'].values / (df['当月通话交往圈人数'].values + 1)
    df['上个月费用'] = df['用户当月账户余额（元）'].values + df['用户账单当月总费用（元）'].values
    df['用户上网年龄'] = df['用户年龄'] - df['用户网龄（月）']
    df['用户上网年龄百分比'] = df['用户网龄（月）'] / (df['用户年龄'] + 1)
    df['近似总消费'] = df['用户近6个月平均消费值（元）'] / 6 * df['用户网龄（月）']
    return df

# run
train_fea = _simple_features(train)
test_fea  = _simple_features(test)

In [10]:
fea_cols = [col for col in train_fea.columns if train_fea[col].dtypes!='object' and train_fea[col].dtypes != '<M8[ns]' and col!='用户编码' and col!='信用分']   
len(fea_cols)

36

In [11]:
# 线下验证函数
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import mean_squared_error

def _get_values_lgbregresser_models(df_fea, df_label,  feature_names):
    kf = KFold(n_splits=5,shuffle=False)#,random_state=1)

    models  = [] 
    models_1 = []
    models_2 = []

    importances = pd.DataFrame() 
    lgb_params = {'num_leaves': 31,
         'min_data_in_leaf': 32, 
         'objective':'mse',
         'max_depth': -1,
         'learning_rate': 0.005,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
#          'n_estimators': 10000,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "nthread": 50,
         "verbosity": -1}

    lgb_params1 = {'num_leaves': 31,
         'min_data_in_leaf': 32, 
         'objective':'mae',
         'max_depth': -1,
         'learning_rate': 0.005,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
#          'n_estimators': 10000,
         "bagging_seed": 11,
         "lambda_l1": 0.1,
         "nthread": 50,
         "verbosity": -1}

    min_val = np.min(df_label)
    print(min_val)
    for fold_, (trn_, val_) in enumerate(kf.split(df_fea)): 
        trn_x, trn_y= df_fea[trn_,:], df_label[trn_]#, df_label1[trn_] 
        val_x, val_y = df_fea[val_,:], df_label[val_]#, df_label1[val_] 
        tmp = pd.DataFrame()

        model = lgb.LGBMRegressor(**lgb_params1)
        model.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y), (val_x, val_y)], eval_metric ='mae',verbose=50,early_stopping_rounds=250)     
        tmp['target'] = val_y
        tmp['pred1'] = model.predict(val_x)
        models.append(model)

        model1 = lgb.LGBMRegressor(**lgb_params)
        model1.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y), (val_x, val_y)], eval_metric ='mae',verbose=50,early_stopping_rounds=250)     
        tmp['pred2'] = model1.predict(val_x)
        models_1.append(model1)

        tmp = tmp.sort_values('pred1')
        tmp['ranks'] = list(range(tmp.shape[0]))
        tmp['preds'] = tmp['pred1'].values
        tmp.loc[tmp.ranks<2000,'preds']  = tmp.loc[tmp.ranks< 2000,'pred2'].values *0.4 + tmp.loc[tmp.ranks< 2000,'pred1'].values * 0.6
        tmp.loc[tmp.ranks>8000,'preds']  = tmp.loc[tmp.ranks> 8000,'pred2'].values *0.4 + tmp.loc[tmp.ranks> 8000,'pred1'].values * 0.6
         
        print('*' * 100)
        print('MAE Model',     1 / (1 + (mean_absolute_error(y_true= tmp['target'] , y_pred= tmp['pred1'] ))))
        print('MSE Model',     1 / (1 + (mean_absolute_error(y_true= tmp['target'] , y_pred= tmp['pred2'] ))))
        print('Merge Model12', 1 / (1 + (mean_absolute_error(y_true= tmp['target'] , y_pred= tmp['preds'] )))) 

        imp_df = pd.DataFrame()
        imp_df['feature'] = feature_names
        imp_df['gain'] = model.feature_importances_
        imp_df['fold'] = fold_ + 1
        importances = pd.concat([importances, imp_df], axis=0)
        gc.collect() 

    return models,models_1,importances 

In [12]:
# 模型训练
models_mae, models_mse, importances = _get_values_lgbregresser_models(train_fea[fea_cols].values, train_fea['信用分'].values, feature_names=fea_cols)

422
Training until validation scores don't improve for 250 rounds.
[50]	valid_0's l1: 27.6643	valid_1's l1: 27.89
[100]	valid_0's l1: 23.9312	valid_1's l1: 24.1528
Did not meet early stopping. Best iteration is:
[100]	valid_0's l1: 23.9312	valid_1's l1: 24.1528
Training until validation scores don't improve for 250 rounds.
[50]	valid_0's l1: 28.1296	valid_1's l1: 28.3201
[100]	valid_0's l1: 24.1561	valid_1's l1: 24.3807
Did not meet early stopping. Best iteration is:
[100]	valid_0's l1: 24.1561	valid_1's l1: 24.3807
****************************************************************************************************
('MAE Model', 0.03975702918993141)
('MSE Model', 0.03940002012682151)
('Merge Model12', 0.03998108581012781)
Training until validation scores don't improve for 250 rounds.
[50]	valid_0's l1: 27.6766	valid_1's l1: 27.9675
[100]	valid_0's l1: 23.9536	valid_1's l1: 24.2647
Did not meet early stopping. Best iteration is:
[100]	valid_0's l1: 23.9536	valid_1's l1: 24.2647
Training

In [13]:
# MAE 提交
pred_mae = 0
for i,model in enumerate(models_mae): 
    pred_mae += model.predict(test_fea[fea_cols]) * 0.2
test_fea['pred_mae'] = pred_mae

# MSE 提交
pred_mse = 0
for i,model in enumerate(models_mse): 
    pred_mse += model.predict(test_fea[fea_cols]) * 0.2
test_fea['pred_mse'] = pred_mse

submit_mae = pd.DataFrame()
submit_mae['id']    = test_fea['用户编码'].values
submit_mae['score'] = test_fea['pred_mae'].values 
submit_mae['score'] = submit_mae['score'].astype(int)
submit_mae[['id','score']].to_csv('baseline_mae.csv',index = None)
submit_mae['score'].describe()

count    50000.000000
mean       623.632800
std         13.473317
min        596.000000
25%        614.000000
50%        627.000000
75%        634.000000
max        641.000000
Name: score, dtype: float64

In [14]:
# MAE+MSE 提交 简单的线性融合
test_fea = test_fea.sort_values('pred_mae')
test_fea['ranks'] = list(range(test_fea.shape[0]))
test_fea['score'] = test_fea['pred_mae'].values
test_fea.loc[test_fea.ranks<10000,'score']  = test_fea.loc[test_fea.ranks< 10000,'pred_mse'].values *0.4 + test_fea.loc[test_fea.ranks< 10000,'pred_mae'].values * 0.6
test_fea.loc[test_fea.ranks>40000,'score']  = test_fea.loc[test_fea.ranks> 40000,'pred_mse'].values *0.4 + test_fea.loc[test_fea.ranks> 40000,'pred_mae'].values * 0.6

submit_mae_mse = pd.DataFrame()
submit_mae_mse['id']    = test_fea['用户编码'].values
submit_mae_mse['score'] = test_fea['score'].values 
submit_mae_mse['score'] = submit_mae_mse['score'].astype(int)
submit_mae_mse[['id','score']].to_csv('baseline_mae_mse.csv',index = None)
submit_mae_mse['score'].describe()

count    50000.000000
mean       622.623660
std         13.979832
min        588.000000
25%        614.000000
50%        627.000000
75%        634.000000
max        640.000000
Name: score, dtype: float64