In [124]:
# import packages
import pandas as pd
import seaborn as sns
import numpy as np
# from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
# show more columns with trian_df.describe()
pd.set_option('display.max_columns', 50)
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
import time
import sys
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
# plt.plot([-1,2,-5,3])
# plt.title(u'中文',fontproperties=myfont)
# plt.show()

In [125]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

In [126]:
# load data
data_dir = '../dataset/DataFountain2019-消费者人群画像/'
train_df = pd.read_csv(data_dir + 'train_dataset.csv')
test_df = pd.read_csv(data_dir + 'test_dataset.csv')

## #1 Data preprocessing

In [127]:
def base_process(data_df):
    transform_value_feats = ['用户年龄', '用户网龄（月）', '当月通话交往圈人数', '近三个月月均商场出现次数',
                            '当月网购类应用使用次数', '当月物流快递类应用使用次数', '当月金融理财类应用使用总次数', 
                             '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数', 
                             '当月旅游资讯类应用使用次数']
    bill_feats = ['缴费用户最近一次缴费金额（元）', '用户近6个月平均消费值（元）','用户账单当月总费用（元）', 
                   '用户当月账户余额（元）']
    log_feats = ['当月网购类应用使用次数', '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数']
    
    # 处理极小或极大的离散点
    for col in transform_value_feats + bill_feats:
        up_limit = np.percentile(data_df[col].values, 99.9) # 99.9%分位数
        low_limit = np.percentile(data_df[col].values, 0.1) # 0.1%分位数
        data_df.loc[data_df[col] > up_limit, col] = up_limit
        data_df.loc[data_df[col] < low_limit, col] = low_limit
    
    # 解决正太分布左偏的情况，取对数
    for col in bill_feats + log_feats:
        data_df[col] = data_df[col].map(lambda x : np.log1p(x))
    
    return data_df

# run
train_df = base_process(train_df)
test_df = base_process(test_df)

## #2 Feature Engineering


In [129]:
# my methods to create new features
def create_features(data_df):
    # 异常值处理
    ## 对年龄异常值取
    data_df.loc[data_df['用户年龄'] == 0, '用户年龄'] = data_df['用户年龄'].mode() # 线下测试，众数比平均数好
    ## 用户话费敏感度处理
    data_df.loc[data_df['用户话费敏感度'] == 0, '用户话费敏感度'] = data_df['用户话费敏感度'].mode()
    

    # 用户费用相关特征
    ## 不同的充值路径
    data_df['不同充值途径'] = 0
    data_df.loc[(data_df['缴费用户最近一次缴费金额（元）'] % 10 == 0) & 
                      (data_df['缴费用户最近一次缴费金额（元）'] != 0), '不同充值途径'] = 1
    ## 费用稳定性
    data_df['当前费用稳定性'] = data_df['用户账单当月总费用（元）'] / (data_df['用户近6个月平均消费值（元）'] + 1)
   
    # 构造 ratio 比例特征
    ## '缴费用户最近一次缴费金额（元）'/'用户当月账户余额（元）'
    data_df['充值_余额_比例'] = data_df['缴费用户最近一次缴费金额（元）'] / (data_df['用户当月账户余额（元）'] + 1)
    ## 用户账单当月总费用/当月账户余额
    data_df['月费_余额_比例'] = data_df['用户账单当月总费用（元）'] / (data_df['用户当月账户余额（元）'] + 1)
    # '用户账单当月总费用（元）'/ '缴费用户最近一次缴费金额（元）'
    data_df['月费_缴费_比例'] = data_df['用户账单当月总费用（元）'] / (data_df['缴费用户最近一次缴费金额（元）'] + 1)
    ## '用户近6个月平均消费值（元）'/ '缴费用户最近一次缴费金额（元）'
    data_df['均费_缴费_比例'] = data_df['用户近6个月平均消费值（元）'] / (data_df['缴费用户最近一次缴费金额（元）'] + 1)
    ## '用户近6个月平均消费值（元）' / 
    data_df['均费_月费_比例'] = data_df['用户近6个月平均消费值（元）'] / (data_df['用户账单当月总费用（元）'] + 1)
    
    ## 用户上网年龄
    data_df['用户上网年龄'] = data_df['用户年龄'] - data_df['用户网龄（月）']
#     ## '用户网龄（月）'/'用户年龄', '用户年龄'/ '用户网龄（月）'不是很好算出来，毕竟是个大数
#     data_df['网龄_年龄_比例'] = data_df['用户网龄（月）'] / (data_df['用户年龄'] + 1)
    
    
    # 构造加减特征
    data_df['缴费金额是否能覆盖当月账单'] = data_df['缴费用户最近一次缴费金额（元）'] - data_df['用户账单当月总费用（元）']
    data_df['最近一次缴费是否超过平均消费额'] = data_df['缴费用户最近一次缴费金额（元）'] - data_df['用户近6个月平均消费值（元）']
    data_df['当月账单是否超过平均消费额'] = data_df['用户账单当月总费用（元）'] - data_df['用户近6个月平均消费值（元）']
    
    # 对 bool 特征进行简单构造
    data_df['是否去过高档商场'] = data_df['当月是否到过福州山姆会员店'] + data_df['当月是否逛过福州仓山万达']
    ## 检查后发现结果为2的比较稀少，于是将1、2都归到1中
    data_df['是否去过高档商场'] = data_df['是否去过高档商场'].map(lambda x : 1 if x >= 1 else 0)
    
    
    data_df['是否_商场_电影'] = data_df['是否去过高档商场'] * data_df['当月是否看电影']
    data_df['是否_商场_旅游'] = data_df['是否去过高档商场'] * data_df['当月是否景点游览']
    data_df['是否_商场_体育馆'] = data_df['是否去过高档商场'] * data_df['当月是否体育场馆消费']
    data_df['是否_电影_体育馆'] = data_df['当月是否看电影'] * data_df['当月是否体育场馆消费']
    data_df['是否_电影_旅游'] = data_df['当月是否看电影'] * data_df['当月是否景点游览']
    data_df['是否_旅游_体育馆'] = data_df['当月是否景点游览'] * data_df['当月是否体育场馆消费']
    
    data_df['是否_商场_旅游_体育馆'] = data_df['是否去过高档商场'] * data_df['当月是否景点游览'] * data_df['当月是否体育场馆消费']
    data_df['是否_商场_电影_体育馆'] = data_df['是否去过高档商场'] * data_df['当月是否看电影'] * data_df['当月是否体育场馆消费']
    data_df['是否_商场_电影_旅游'] = data_df['是否去过高档商场'] * data_df['当月是否看电影'] * data_df['当月是否景点游览']
    data_df['是否_体育馆_电影_旅游'] = data_df['当月是否体育场馆消费'] * data_df['当月是否看电影'] * data_df['当月是否景点游览']
    
    data_df['是否_商场_体育馆_电影_旅游'] = data_df['是否去过高档商场'] * data_df['当月是否体育场馆消费'] * \
                                        data_df['当月是否看电影'] * data_df['当月是否景点游览']
    
#     # 杰少特征参考
#     data_df['次数'] = data_df['当月网购类应用使用次数'] +  data_df['当月物流快递类应用使用次数'] +  data_df['当月金融理财类应用使用总次数'] + \
#                 data_df['当月视频播放类应用使用次数'] + data_df['当月飞机类应用使用次数'] + data_df['当月火车类应用使用次数'] + \
#                 data_df['当月旅游资讯类应用使用次数']  + 1

#     for col in ['当月金融理财类应用使用总次数', '当月旅游资讯类应用使用次数']: # 这两个比较积极向上一点
#         data_df[col + '百分比'] = data_df[col] / data_df['次数'] 

#     data_df['当月通话人均话费'] = data_df['用户账单当月总费用（元）'] / (data_df['当月通话交往圈人数'] + 1)

#     data_df['上个月费用'] = data_df['用户当月账户余额（元）'] + data_df['用户账单当月总费用（元）']

#     data_df['近似总消费'] = data_df['用户近6个月平均消费值（元）'] * data_df['用户网龄（月）'] / 12
    
    
    return data_df

# run
train_df = create_features(train_df)
test_df = create_features(test_df)

## #3 Single Model Training

In [130]:
# drop useless features
# print train_df.columns, len(train_df.columns)
drop_cols = ['用户编码', '是否黑名单客户']

X = train_df.drop(drop_cols + ['信用分'], axis=1)
X_submit = test_df.drop(drop_cols, axis=1)

In [131]:
# # Dimension Reduction 降维
# from sklearn.decomposition import PCA
# pca = PCA(n_components=600)
# pca.fit(X)
# X = pca.fit_transform(X)
# pca.fit(X_submit)
# X_submit = pca.fit_transform(X_submit)
# print X.shape, X_submit.shape

In [132]:
# k-cv
N_FOLDS = 5
y = train_df['信用分']
kfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=2019)
kf = kfold.split(X, y)

In [133]:
# LightGBM: GBDT
params = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'metric': 'mae',
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'num_leaves': 31,
    'verbose': -1,
    'max_depth': 5,
    'lambda_l1': 0,
    'lambda_l2': 2.5,
    'nthread': 4
}

def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:100].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.show()
    
# 0--------------------------------------1



# process the k-cv
cv_pred = np.zeros(test_df.shape[0])
valid_best_l2_all = 0

feature_importance_df = pd.DataFrame()
count = 0
for i, (train_idx, test_idx) in enumerate(kf):
    print('fold: ',i, ' training')
    X_train, X_test, y_train, y_test = X.iloc[train_idx, :], X.iloc[test_idx, :], y.iloc[train_idx], y.iloc[test_idx]
#     X_train, X_test, y_train, y_test = X[train_idx, :], X[test_idx, :], y[train_idx], y[test_idx]
    data_train = lgb.Dataset(X_train, y_train)
    data_test = lgb.Dataset(X_test, y_test)
    lgb_model = lgb.train(params, data_train, num_boost_round=10000, valid_sets=data_test, 
                          verbose_eval=-1, early_stopping_rounds=50)
    cv_pred += lgb_model.predict(X_submit, num_iteration=lgb_model.best_iteration)
    valid_best_l2_all += lgb_model.best_score['valid_0']['l1']
    
#     fold_importance_df = pd.DataFrame()
#     fold_importance_df["feature"] = list(unicode(X_train.columns))
#     fold_importance_df["importance"] = lgb_model.feature_importance(importance_type='gain', iteration=lgb_model.best_iteration)
#     fold_importance_df["fold"] = count + 1
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    count += 1
    
cv_pred /= N_FOLDS
valid_best_l2_all /= N_FOLDS
print('cv score for valid is: ', 1 / (1 + valid_best_l2_all))

# show the importance of features
# display_importances(feature_importance_df)

('fold: ', 0, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2198]	valid_0's l1: 14.7833
('fold: ', 1, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2360]	valid_0's l1: 14.6713
('fold: ', 2, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2225]	valid_0's l1: 14.7905
('fold: ', 3, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2617]	valid_0's l1: 14.4921
('fold: ', 4, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2521]	valid_0's l1: 14.7948
('cv score for valid is: ', 0.06366834321147859)


In [134]:
# 0.06364782116275812
# 0.063714735700162436 ———— 去掉'年龄_网龄_比例'特征后
# 0.063735568491715147 ———— 众数填充为零的年龄（平均数填充结果：0.063640145348041702，较差）
# 0.063666002670369704 ———— 构造加减特征
# 0.063684663314497278 ———— 构造'是否去过高档商场'
# 0.063714215150177056 ———— 用众数替代‘用户话费敏感度’中的 0 值（平均数填充结果：0.063683775050515118，较差）
# 0.0637013629544181 ———— 加上了base_process里面对左偏分布的处理

0.06368864913531594

0.06371758445225165

0.06371758445225165

In [76]:
# # XGBoost
# import xgboost as xgb
# xgb_params={'eta': 0.005, 'max_depth': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 
#           'objective': 'reg:linear', 'eval_metric': 'mae', 'silent': True, 'nthread': 8}
# from sklearn.model_selection import KFold
# cv_pred_allxgb=0
# en_amount=3
# oof_xgb1=np.zeros(len(train_data))
# prediction_xgb1=np.zeros(len(test_data))
# for seed in range(en_amount):
#     NFOLDS=5
#     train_label=train_data['信用分']
#     kfold=KFold(n_splits=NFOLDS, shuffle=True, random_state=seed+2019)
#     kf=kfold.split(train_data,train_label)
    
#     train_data_use = train_data.drop(['用户编码','信用分'], axis=1)
#     test_data_use = test_data.drop(['用户编码'], axis=1)
    
#     cv_pred = np.zeros(test_data.shape[0])
#     valid_best_l2_all = 0
    
#     feature_importance_df = pd.DataFrame()
#     count = 0
    
#     for i, (train_fold, validate) in enumerate(kf):
#         print('fold: ',i, ' training')
#         X_train, X_validate, label_train, label_validate = train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], train_label[train_fold], train_label[validate]
#         dtrain = xgb.DMatrix(X_train, label_train)
#         dvalid = xgb.DMatrix(X_validate, label_validate)
#         watchlist = [(dtrain, 'train'), (dvalid, 'valid_data')]
#         bst = xgb.train(dtrain=dtrain, num_boost_round=10000, evals=watchlist, early_stopping_rounds=100, verbose_eval=300, params=xgb_params)
#         cv_pred += bst.predict(xgb.DMatrix(test_data_use), ntree_limit=bst.best_ntree_limit)
#         oof_xgb1[validate]=bst.predict(xgb.DMatrix(X_validate),ntree_limit=bst.best_ntree_limit)
#         prediction_xgb1+=bst.predict(xgb.DMatrix(test_data_use),ntree_limit=bst.best_ntree_limit)/kfold.n_splits
#         count += 1
        
#     cv_pred /= NFOLDS
#     cv_pred_allxgb+=cv_pred
# cv_pred_allxgb /= en_amount

In [77]:
# # Logistical Regression 逻辑斯特回归
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import mean_absolute_error

# # process the k-cv
# cv_pred = np.zeros(test_df.shape[0])
# valid_best_l2_all = 0

# feature_importance_df = pd.DataFrame()
# count = 0
# for i, (train_idx, test_idx) in enumerate(kf):
#     print('fold: ',i, ' training')
#     X_train, X_test, y_train, y_test = X.iloc[train_idx, :], X.iloc[test_idx, :], y.iloc[train_idx], y.iloc[test_idx]
# #     X_train, X_test, y_train, y_test = X[train_idx, :], X[test_idx, :], y[train_idx], y[test_idx]
#     clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train, y_train)
#     valid_best_l2_all += mean_absolute_error(clf.predict(X_test), y_test)
#     cv_pred += clf.predict(X_submit)

# cv_pred /= N_FOLDS
# print('cv score for valid is: ', 1/(1+valid_best_l2_all))

## #4 Single model submit

In [44]:
submit_df = test_df[['用户编码']]
submit_df['score'] = cv_pred
submit_df.columns = ['id', 'score']

# 信用分都是整数
submit_df['score'] = submit_df['score'].apply(lambda x: int(np.round(x)))
submit_df.to_csv('./submission/baseline_2019-03-10T08:47:26_0.06363332257662933.csv', index=False)

In [39]:
submit_df.head(10)

Unnamed: 0,id,score
0,a4651f98c82948b186bdcdc8108381b4,534
1,aeb10247db4e4d67b2550bbc42ff9827,441
2,5af23a1e0e77410abb25e9a7eee510aa,524
3,43c64379d3c24a15b8478851b22049e4,525
4,f1687f3b8a6f4910bd0b13eb634056e2,513
5,52795d470db4478584f6c92f66af0294,516
6,0d758e1b10cc4f618dda9f87fc948068,523
7,b05585f9635245f282bf2cffd3c5773c,489
8,acde81ca14eb429a983bec773e16098c,453
9,1f78ee310e9d48449b8d5a1fd1537286,446


### —————————————————————————————————————————————————————————————

## #5 Model ensembling Training

In [157]:
# drop useless features
drop_cols = ['用户编码', '是否黑名单客户']

X = train_df.drop(drop_cols + ['信用分'], axis=1)
X_submit = test_df.drop(drop_cols, axis=1)

In [159]:
# LightGBM params
params = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'metric': 'mae',
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'num_leaves': 31,
    'verbose': -1,
    'max_depth': 5,
    'lambda_l2': 5, 'lambda_l1': 0
}

# process the k-cv
cv_pred_all = 0
en_amount = 3
for seed in range(en_amount):
    # k-cv
    N_FOLDS = 5
    y = train_df['信用分']
    kfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=2019)
    kf = kfold.split(X, y)

    # process the k-cv
    cv_pred = np.zeros(test_df.shape[0])
    valid_best_l2_all = 0

    feature_importance_df = pd.DataFrame()
    count = 0
    for i, (train_idx, test_idx) in enumerate(kf):
        print('fold: ',i, ' training')
        X_train, X_test, y_train, y_test = X.iloc[train_idx, :], X.iloc[test_idx, :], y.iloc[train_idx], y.iloc[test_idx]
#         X_train, X_test, y_train, y_test = X[train_idx, :], X[test_idx, :], y[train_idx], y[test_idx]
        data_train = lgb.Dataset(X_train, y_train)
        data_test = lgb.Dataset(X_test, y_test)
        lgb_model = lgb.train(params, data_train, num_boost_round=10000, valid_sets=data_test, 
                              verbose_eval=-1, early_stopping_rounds=50)
        cv_pred += lgb_model.predict(X_submit, num_iteration=lgb_model.best_iteration)
        valid_best_l2_all += lgb_model.best_score['valid_0']['l1']

#         fold_importance_df = pd.DataFrame()
#         fold_importance_df["feature"] = list(X_train.columns)
#         fold_importance_df["importance"] = lgb_model.feature_importance(importance_type='gain', iteration=lgb_model.best_iteration)
#         fold_importance_df["fold"] = count + 1
#         feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        count += 1

    cv_pred /= N_FOLDS
    valid_best_l2_all /= N_FOLDS
    print('cv score for valid is: ', 1/(1+valid_best_l2_all))
    
    cv_pred_all += cv_pred

# avg cv_pred_all
cv_pred_all = cv_pred_all / en_amount

('fold: ', 0, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2324]	valid_0's l1: 14.7854
('fold: ', 1, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2317]	valid_0's l1: 14.6964
('fold: ', 2, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[3255]	valid_0's l1: 14.7701
('fold: ', 3, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[3321]	valid_0's l1: 14.477
('fold: ', 4, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2141]	valid_0's l1: 14.8283
('cv score for valid is: ', 0.06364782116275812)
('fold: ', 0, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2324]	valid_0's l1: 14.7854
('fold: ', 1, ' training')
Trainin

In [160]:
#para
params2 = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': 'mae',
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'num_leaves': 31,
    'verbose': -1,
    'max_depth': 5,
    'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8,
    'seed': 89
}

# process the k-cv
cv_pred_all2 = 0
en_amount = 3
for seed in range(en_amount):
    # k-cv
    N_FOLDS = 5
    y = train_df['信用分']
    kfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=2019)
    kf = kfold.split(X, y)

    # process the k-cv
    cv_pred = np.zeros(test_df.shape[0])
    valid_best_l2_all = 0

    feature_importance_df = pd.DataFrame()
    count = 0
    for i, (train_idx, test_idx) in enumerate(kf):
        print('fold: ',i, ' training')
        X_train, X_test, y_train, y_test = X.iloc[train_idx, :], X.iloc[test_idx, :], y.iloc[train_idx], y.iloc[test_idx]
#         X_train, X_test, y_train, y_test = X[train_idx, :], X[test_idx, :], y[train_idx], y[test_idx]
        data_train = lgb.Dataset(X_train, y_train)
        data_test = lgb.Dataset(X_test, y_test)
        lgb_model = lgb.train(params2, data_train, num_boost_round=10000, valid_sets=data_test, 
                              verbose_eval=-1, early_stopping_rounds=50)
        cv_pred += lgb_model.predict(X_submit, num_iteration=lgb_model.best_iteration)
        valid_best_l2_all += lgb_model.best_score['valid_0']['l1']

#         fold_importance_df = pd.DataFrame()
#         fold_importance_df["feature"] = list(X_train.columns)
#         fold_importance_df["importance"] = lgb_model.feature_importance(importance_type='gain', iteration=lgb_model.best_iteration)
#         fold_importance_df["fold"] = count + 1
#         feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        count += 1

    cv_pred /= N_FOLDS
    valid_best_l2_all /= N_FOLDS
    print('cv score for valid is: ', 1/(1+valid_best_l2_all))
    
    cv_pred_all2 += cv_pred

# avg cv_pred_all
cv_pred_all2 = cv_pred_all2 / en_amount

('fold: ', 0, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2019]	valid_0's l1: 14.8025
('fold: ', 1, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2143]	valid_0's l1: 14.6985
('fold: ', 2, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2732]	valid_0's l1: 14.8106
('fold: ', 3, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2004]	valid_0's l1: 14.5162
('fold: ', 4, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2499]	valid_0's l1: 14.7874
('cv score for valid is: ', 0.063600933726241329)
('fold: ', 0, ' training')
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2019]	valid_0's l1: 14.8025
('fold: ', 1, ' training')
Train

## #6 Model Ensembling Submission

In [162]:
submit_df = test_df[['用户编码']]
submit_df['score'] = (cv_pred_all + cv_pred_all2) / 2
submit_df.columns = ['id', 'score']
submit_df['score1'] = cv_pred_all
submit_df['score2'] = cv_pred_all2

# int
submit_df['score'] = submit_df['score'].apply(lambda x: int(np.round(x)))
submit_df[['id','score']].to_csv('./submission/model_ensemble_baseline_2019-03-07T09:40:35_0.063600933726241343.csv', index=False)
