In [3]:
import pickle,os
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import trange
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [4]:
# Load the raw inputs: stage-2 training features, stage-2 test features and the training labels.
train = pd.read_csv('./data/train_stage2_update_20200320.csv')
test = pd.read_csv('./data/test_stage2_update_20200320.csv')
# Only the `Label` column of the label file is kept (as a Series).
# NOTE(review): labels are paired with `train` by row position later on — confirm both files share row order.
train_y = pd.read_csv('./data/train_label.csv')['Label']

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# EDA showed that some training rows have feature distributions that do not match the test set.
# Those rows get a training weight of 0.01 so they influence the model far less:
#   - this improves training while avoiding over-fitting to the training set;
#   - the rows are not dropped, so the model can still learn a little from them,
#     which helps generalisation.
# The weight column is created as float (1.0) so that assigning 0.01 below does not
# rely on a silent int -> float upcast.
train['weight'] = 1.0
test['weight'] = 1.0

# A negative value in any of these balance-sheet columns marks a row as "shifted".
# This must run before the negative values themselves are replaced by missing values.
for shifted_col in ['流动资产合计_年末数', '其他应收款_年初数', '流动资产合计_年初数', '其他应收款_年末数']:
    train.loc[train[shifted_col] < 0, 'weight'] = 0.01

In [6]:
# Replace outliers found during EDA with missing values so that abnormal records do not
# mislead the model (reduces over-fitting).
# Upper caps applied to the training set.
train.loc[train['注册资本'] > 10000, '注册资本'] = None
train.loc[train['其他应收款_年末数'] > 400000, '其他应收款_年末数'] = None
# Negative balances in the training set are treated as missing.
for col in ['流动资产合计_年末数', '其他应收款_年初数', '流动资产合计_年初数', '其他应收款_年末数']:
    train.loc[train[col] < 0, col] = None

# Upper caps applied to the test set (tax columns).
for col, cap in [('企业所得税', 7000), ('城建税', 300), ('增值税', 5000), ('教育费', 140)]:
    test.loc[test[col] > cap, col] = None

# Stack train on top of test so that features are engineered on both at once.
# The original row labels are kept (no index reset), exactly as `DataFrame.append` did.
data = pd.concat([train, test])

In [7]:
# Names of the raw features to use, chosen by weighing feature importance
# against the amount of missing data in each column.
feat_imp = [
    '企业所得税', '城建税', '增值税', '印花税', '教育费',
    '年度参保总额', '货币资金_年末数', '行业代码', '最新参保人数', '注册资本',
    '货币资金_年初数', '投资总额', '行业门类', '企业类型', '其他应收款_年末数',
    '登记注册类型代码', '登记机关', '资本变更前', '流动资产合计_年初数', '其他应收款_年初数',
    '流动资产合计_年末数', '管辖机关', '固定资产合计_年初数', '未分配利润_年末数', '非流动资产合计_年初数',
]

In [8]:
# Tax features.
# Replace the business-scope text with the number of comma-separated entries it contains.
# NOTE: this overwrites the column in place, so the cell cannot be re-run on the same `data`
# (the values are integers afterwards and no longer have string methods).
data["经营范围"] = data["经营范围"].map(lambda scope: len(scope.split(",")))
# Flag rows whose registered capital is at least the total investment.
data["是否全资"] = data['注册资本'] >= data['投资总额']
# Total tax paid = row-wise sum of the five tax columns.
tax_cols = ['增值税', '企业所得税', '印花税', '城建税', "教育费"]
data["企业缴税"] = data[tax_cols].sum(axis=1)
# Share of each individual tax in the total tax paid.
for tax in ['增值税', '企业所得税', '印花税', '教育费', '城建税']:
    data[f"{tax}/企业缴税"] = data[tax] / data["企业缴税"]

In [9]:
# Business features: ratios involving total tax paid, then year-start -> year-end changes.
data['年度参保总额/企业缴税'] = data['年度参保总额'] / data["企业缴税"]
data['企业缴税/经营范围'] = data["企业缴税"] / data['经营范围']
data['投资总额/企业缴税'] = data['投资总额'] / data["企业缴税"]

# For every item below: `<item>变化` = `<item>_年末数` - `<item>_年初数`.
balance_items = [
    '货币资金', '流动资产合计', '其他应收款', '固定资产合计', '未分配利润', '非流动资产合计',
    '短期借款', '应交税费', '负债合计', '其他应付款', '负债和所有者权益总计', '所有者权益合计',
    '其他流动负债', '流动负债合计', '存货', '资产总计', '预收款项',
]
for item in balance_items:
    data[item + '变化'] = data[item + '_年末数'] - data[item + '_年初数']

In [10]:
# Count (frequency) features: each value is mapped to the number of rows of `data`
# (train and test together) that share it. Most useful when the value_counts
# distribution of a column shows some regularity.
for feature in feat_imp:
    value_freq = data[feature].value_counts()
    data[f'{feature}_catcount'] = data[feature].map(value_freq)

In [11]:
# Cross features: combine every pair of raw features with +, -, * and /, then keep only the
# combinations that a quick LightGBM fit considers important.
# Split the stacked frame back into train / test. `.copy()` makes both independent frames, so
# adding columns below does not trigger pandas' SettingWithCopyWarning.
n_train = train.shape[0]
train = data[:n_train].copy()
test = data[n_train:].copy()

# Operator symbol for each suffix used in the cross-feature column names (kept for reference).
d={'add':'+', 'sub':'-', 'mul':'*', 'div':'/'}
# Same suffixes mapped to the pandas operation; used instead of eval() on a built string.
cross_ops = {'add': pd.Series.add, 'sub': pd.Series.sub, 'mul': pd.Series.mul, 'div': pd.Series.div}
feat0 = feat_imp.copy()
base_feats = set(feat0)
feat_cross = []
for i in trange(len(feat_imp)):
    left = feat_imp[i]
    # Candidate matrix for this round: the raw features plus every cross of `left` with a
    # later feature. Column names follow the pattern "<left>|<right>|<op>".
    df_temp = train[feat_imp].copy()
    for right in feat_imp[i + 1:]:
        for op_name, op in cross_ops.items():
            df_temp['%s|%s|%s' % (left, right, op_name)] = op(train[left], train[right])
    model = LGBMClassifier(n_estimators=200, learning_rate=0.2, max_depth=7,
                           subsample=0.8, colsample_bytree=0.6, n_jobs=-1)
    model.fit(df_temp.values, train_y)
    qq = pd.Series(model.feature_importances_, index=df_temp.columns).sort_values()
    # Keep the cross features whose importance exceeds 10. The selection is computed once,
    # in a deterministic order (ascending importance).
    selected = [name for name in qq.index[qq > 10] if name not in base_feats]
    for col in selected:
        f0, f1, f2 = col.split('|')
        train[col] = df_temp[col]
        # Rebuild the same cross feature on the test set from its two parent columns.
        test[col] = cross_ops[f2](test[f0], test[f1])
    feat_cross.extend(selected)
print(len(feat_cross))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

641





In [12]:
# Feature selection based on column type, number of distinct values and share of missing data.
# `pd.concat` replaces `DataFrame.append`; the original row labels are kept.
data = pd.concat([train, test])
# Candidate features: every non-object column except the label, the ID and the sample weight.
# Column order is taken from the frame itself so the feature order is reproducible between
# kernel sessions (a set of strings has no stable iteration order).
excluded = set(data.select_dtypes(include=object)) | {'Label', 'ID', 'weight'}
feat = [c for c in data.columns if c not in excluded]
n_rows = data.shape[0]
# Drop columns with fewer than two distinct values or with more than 95% missing values.
remove_col = [c for c in feat
              if (data[c].nunique() < 2) or (data[c].isnull().sum() / n_rows > 0.95)]
print(len(remove_col))
dropped = set(remove_col)
feat0 = [c for c in feat if c not in dropped]
print(len(feat0))

data = data[feat0]

519
831


In [13]:
# Level-0 LightGBM model (num_leaves=15): out-of-fold predictions on train and fold-averaged
# predictions on test are stored as `lgb_prob0` for the later stacking step.
# `n_splits` / `shuffle` are passed by keyword rather than by position.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
n_folds = kf.get_n_splits()
prob = np.zeros(len(train))
test_prob = np.zeros(len(test))
test_data = test[feat0].values
valid_score = []
for idx, (train_index, valid_index) in enumerate(kf.split(train, train_y)):
    print(str(idx) + '_training')
    # kf.split yields positional indices, hence .iloc.
    train_data = train.iloc[train_index][feat0].values
    valid_data = train.iloc[valid_index][feat0].values
    model = LGBMClassifier(n_estimators=1000, learning_rate=0.01, num_leaves=15,
                           subsample=0.8, colsample_bytree=0.6, n_jobs=4)
    # NOTE(review): newer LightGBM releases replace the `early_stopping_rounds` / `verbose`
    # fit arguments with callbacks — revisit when upgrading the library.
    model.fit(train_data, train_y.iloc[train_index],
              eval_set=(valid_data, train_y.iloc[valid_index]), early_stopping_rounds=100, verbose=200)
    prob[valid_index] = model.predict_proba(valid_data)[:, 1]
    # Average the test predictions over the folds.
    test_prob += model.predict_proba(test_data)[:, 1] / n_folds
    valid_score.append(model.best_score_['valid_0']['binary_logloss'])
print('score:', np.mean(valid_score), valid_score)
train['lgb_prob0'] = prob
test['lgb_prob0'] = test_prob

0_training
Training until validation scores don't improve for 100 rounds
[200]	valid_0's binary_logloss: 0.13459
[400]	valid_0's binary_logloss: 0.12864
Early stopping, best iteration is:
[496]	valid_0's binary_logloss: 0.12823
1_training
Training until validation scores don't improve for 100 rounds
[200]	valid_0's binary_logloss: 0.14441
[400]	valid_0's binary_logloss: 0.140411
Early stopping, best iteration is:
[381]	valid_0's binary_logloss: 0.140331
2_training
Training until validation scores don't improve for 100 rounds
[200]	valid_0's binary_logloss: 0.151429
Early stopping, best iteration is:
[269]	valid_0's binary_logloss: 0.149779
3_training
Training until validation scores don't improve for 100 rounds
[200]	valid_0's binary_logloss: 0.139882
[400]	valid_0's binary_logloss: 0.134372
[600]	valid_0's binary_logloss: 0.133718
Early stopping, best iteration is:
[558]	valid_0's binary_logloss: 0.133571
4_training
Training until validation scores don't improve for 100 rounds
[200]	v

In [12]:
# Level-0 LightGBM model (num_leaves=31, seed=4396): out-of-fold predictions on train and
# fold-averaged predictions on test are stored as `lgb_prob1` for the later stacking step.
# `n_splits` / `shuffle` are passed by keyword rather than by position.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
n_folds = kf.get_n_splits()
prob = np.zeros(len(train))
test_prob = np.zeros(len(test))
test_data = test[feat0].values
valid_score = []
for idx, (train_index, valid_index) in enumerate(kf.split(train, train_y)):
    print(str(idx) + '_training')
    # kf.split yields positional indices, hence .iloc.
    train_data = train.iloc[train_index][feat0].values
    valid_data = train.iloc[valid_index][feat0].values
    model = LGBMClassifier(n_estimators=1000, learning_rate=0.01, num_leaves=31,
                           subsample=0.8, colsample_bytree=0.6, n_jobs=-1, seed=4396)
    # NOTE(review): newer LightGBM releases replace the `early_stopping_rounds` / `verbose`
    # fit arguments with callbacks — revisit when upgrading the library.
    model.fit(train_data, train_y.iloc[train_index],
              eval_set=(valid_data, train_y.iloc[valid_index]), early_stopping_rounds=50, verbose=200)
    prob[valid_index] = model.predict_proba(valid_data)[:, 1]
    # Average the test predictions over the folds.
    test_prob += model.predict_proba(test_data)[:, 1] / n_folds
    valid_score.append(model.best_score_['valid_0']['binary_logloss'])
print('score:', np.mean(valid_score), valid_score)
train['lgb_prob1'] = prob
test['lgb_prob1'] = test_prob

0_training
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's binary_logloss: 0.15334
Early stopping, best iteration is:
[269]	valid_0's binary_logloss: 0.152507
1_training
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's binary_logloss: 0.143252
Early stopping, best iteration is:
[329]	valid_0's binary_logloss: 0.139927
2_training
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's binary_logloss: 0.14483
Early stopping, best iteration is:
[334]	valid_0's binary_logloss: 0.141436
3_training
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's binary_logloss: 0.146651
Early stopping, best iteration is:
[307]	valid_0's binary_logloss: 0.143572
4_training
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's binary_logloss: 0.143128
[400]	valid_0's binary_logloss: 0.139877
Early stopping, best iteration is:
[368]	valid_0's binary_logloss: 0.139798
score: 0.14

In [13]:
# Level-0 LightGBM model (num_leaves=7, seed=2020): out-of-fold predictions on train and
# fold-averaged predictions on test are stored as `lgb_prob2` for the later stacking step.
# `n_splits` / `shuffle` are passed by keyword rather than by position.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
n_folds = kf.get_n_splits()
prob = np.zeros(len(train))
test_prob = np.zeros(len(test))
test_data = test[feat0].values
valid_score = []
for idx, (train_index, valid_index) in enumerate(kf.split(train, train_y)):
    print(str(idx) + '_training')
    # kf.split yields positional indices, hence .iloc.
    train_data = train.iloc[train_index][feat0].values
    valid_data = train.iloc[valid_index][feat0].values
    model = LGBMClassifier(n_estimators=1000, learning_rate=0.01, num_leaves=7,
                           subsample=0.8, colsample_bytree=0.6, n_jobs=4, seed=2020)
    # NOTE(review): newer LightGBM releases replace the `early_stopping_rounds` / `verbose`
    # fit arguments with callbacks — revisit when upgrading the library.
    model.fit(train_data, train_y.iloc[train_index],
              eval_set=(valid_data, train_y.iloc[valid_index]), early_stopping_rounds=50, verbose=200)
    prob[valid_index] = model.predict_proba(valid_data)[:, 1]
    # Average the test predictions over the folds.
    test_prob += model.predict_proba(test_data)[:, 1] / n_folds
    valid_score.append(model.best_score_['valid_0']['binary_logloss'])
print('score:', np.mean(valid_score), valid_score)
train['lgb_prob2'] = prob
test['lgb_prob2'] = test_prob

0_training
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's binary_logloss: 0.148673
[400]	valid_0's binary_logloss: 0.145208
Early stopping, best iteration is:
[382]	valid_0's binary_logloss: 0.145126
1_training
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's binary_logloss: 0.142152
[400]	valid_0's binary_logloss: 0.136975
Early stopping, best iteration is:
[407]	valid_0's binary_logloss: 0.136924
2_training
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's binary_logloss: 0.143864
[400]	valid_0's binary_logloss: 0.138256
[600]	valid_0's binary_logloss: 0.137435
Early stopping, best iteration is:
[640]	valid_0's binary_logloss: 0.137304
3_training
Training until validation scores don't improve for 50 rounds.
[200]	valid_0's binary_logloss: 0.141455
[400]	valid_0's binary_logloss: 0.136562
Early stopping, best iteration is:
[440]	valid_0's binary_logloss: 0.136397
4_training
Training until validatio

In [17]:
# Level-0 XGBoost model: out-of-fold predictions on train and fold-averaged predictions on
# test are stored as `xgb_prob` for the later stacking step.
import xgboost as xgb
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'gamma': 0.1,
    'max_depth': 8,
    'alpha': 0,
    'lambda': 0,
    'subsample': 0.7,
    'colsample_bytree': 0.5,
    'min_child_weight': 3,
    'silent': 1,
    'eta': 0.02,
    'nthread': 8,
    # NOTE(review): `missing` is normally a DMatrix argument rather than a booster
    # parameter — confirm whether this entry has any effect.
    'missing': 1,
    'seed': 2019,
}

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
xgb_prob = np.zeros((test.shape[0]))
prob = np.zeros(len(train))

## train and predict
fold_importances = []
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train_y)):
    print("fold {}".format(fold_ + 1))
    # folds.split yields positional indices, hence .iloc on both features and labels.
    trn_data = xgb.DMatrix(train.iloc[trn_idx][feat0], label=train_y.iloc[trn_idx])
    val_data = xgb.DMatrix(train.iloc[val_idx][feat0], label=train_y.iloc[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid')]

    clf = xgb.train(params, trn_data,  5000, watchlist, verbose_eval=200, early_stopping_rounds=50)
    # Predict with the trees up to the best iteration; test predictions are averaged over folds.
    xgb_prob += clf.predict(xgb.DMatrix(test[feat0]), ntree_limit=clf.best_ntree_limit) / folds.n_splits
    prob[val_idx] = clf.predict(val_data, ntree_limit=clf.best_ntree_limit)

    # Record this fold's feature importance. This sits inside the loop so that every fold is
    # kept, not only the last one.
    fscore = clf.get_fscore()
    fold_importances.append(pd.DataFrame({
        "Feature": list(fscore.keys()),
        "importance": list(fscore.values()),
        "fold": fold_ + 1,
    }))

# One frame with the importances of all folds (columns: Feature, importance, fold).
feature_importance_df = pd.concat(fold_importances, axis=0)

train['xgb_prob'] = prob
test['xgb_prob'] = xgb_prob

fold 1
[0]	train-logloss:0.67645	valid-logloss:0.67688
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.09196	valid-logloss:0.14794
Stopping. Best iteration:
[293]	train-logloss:0.07662	valid-logloss:0.14557

fold 2
[0]	train-logloss:0.67636	valid-logloss:0.67683
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.09293	valid-logloss:0.14697
Stopping. Best iteration:
[265]	train-logloss:0.08036	valid-logloss:0.14458

fold 3
[0]	train-logloss:0.67641	valid-logloss:0.67696
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.09143	valid-logloss:0.14907
Stopping. Best iteration:
[247]	train-logloss:0.08126	valid-logloss:0.14671

f

In [18]:
# Level-0 CatBoost model: out-of-fold predictions on train and fold-averaged predictions on
# test are stored as `cat_prob` for the later stacking step.
# `n_splits` / `shuffle` are passed by keyword rather than by position.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
n_folds = kf.get_n_splits()
prob = np.zeros(len(train))
cat_prob = np.zeros(len(test))
# De-duplicate the feature list while keeping its order (a set would shuffle the columns).
feat1 = list(dict.fromkeys(feat0))
test_data = test[feat1].values
valid_score = []

for idx, (train_index, valid_index) in enumerate(kf.split(train, train_y)):
    print(str(idx)+'_training')
    # kf.split yields positional indices, hence .iloc.
    train_data = train.iloc[train_index][feat1]
    valid_data = train.iloc[valid_index][feat1]
    model = CatBoostClassifier(iterations=1000, learning_rate=0.02, depth=7,thread_count=8)
    model.fit(train_data, train_y.iloc[train_index],
              eval_set=(valid_data, train_y.iloc[valid_index]), early_stopping_rounds=50,verbose=200)
    prob[valid_index] = model.predict_proba(valid_data)[:, 1]
    # Average the test predictions over the folds.
    cat_prob += model.predict_proba(test_data)[:,1] / n_folds
    valid_score.append(model.best_score_)
print('score:', valid_score)

train['cat_prob'] = prob
test['cat_prob'] = cat_prob

0_training
0:	learn: 0.6565801	test: 0.6563305	best: 0.6563305 (0)	total: 359ms	remaining: 5m 58s
200:	learn: 0.1216963	test: 0.1328158	best: 0.1328158 (200)	total: 32.6s	remaining: 2m 9s
400:	learn: 0.1066960	test: 0.1312358	best: 0.1312318 (398)	total: 1m 3s	remaining: 1m 35s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1311706221
bestIteration = 423

Shrink model to first 424 iterations.
1_training
0:	learn: 0.6579619	test: 0.6579634	best: 0.6579634 (0)	total: 167ms	remaining: 2m 46s
200:	learn: 0.1172104	test: 0.1430811	best: 0.1430811 (200)	total: 31.5s	remaining: 2m 5s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1429972266
bestIteration = 204

Shrink model to first 205 iterations.
2_training
0:	learn: 0.6542509	test: 0.6548254	best: 0.6548254 (0)	total: 176ms	remaining: 2m 55s
200:	learn: 0.1178167	test: 0.1511827	best: 0.1511150 (198)	total: 32.6s	remaining: 2m 9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1

In [16]:
# Final (level-1) model: train on the level-0 predictions above, the prediction exported by
# the other pipeline ("code2") and all selected features, to produce the final result.
# - LightGBM is used, and the pre-computed sample weights are passed to lgb.Dataset so that
#   training rows whose distribution differs from the test set have less influence
#   (reduces over-fitting).
# - Using predictions from differently configured LightGBM models as stacking features
#   worked better.
# - Weighted blending did not work well.
import lightgbm as lgb

train_code2 = pd.read_csv('./output/train_stacking_code2_1.csv')
test_code2 = pd.read_csv('./output/test_stacking_code2_1.csv')
# NOTE: re-running this cell merges the same columns again; run it once per session.
train = train.merge(train_code2, on='ID', how='left')
test = test.merge(test_code2, on='ID', how='left')

# Selected features plus the stacking columns, de-duplicated in a stable order.
feat = list(dict.fromkeys(feat0 + ['lgb_prob0', 'lgb_prob1', 'lgb_prob2', 'xgb_prob', 'cat_prob',
                                   'code2_lgb_prob1']))

prob = np.zeros(len(train))
test_prob = np.zeros(len(test))
# The test matrix must use the same columns, in the same order, as the training matrix
# (`feat`, not `feat0`), otherwise the model is scored on the wrong features.
test_data = test[feat].values
valid_score = []
kf = StratifiedKFold(n_splits=5, random_state=1024, shuffle=True)
n_folds = kf.get_n_splits()

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 7,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'bagging_seed':0,
    'bagging_freq': 1,
    'verbose': 1,
    'reg_alpha':3,
    'reg_lambda':1
}

for k, (train_index, test_index) in enumerate(kf.split(train, train_y)):
    print('train _K_ flod', k)

    # kf.split yields positional indices, hence .iloc on features, labels and weights.
    # Only the training part carries sample weights; the validation part is unweighted.
    lgb_train = lgb.Dataset(train.iloc[train_index][feat].values, train_y.iloc[train_index],
                            weight=train.iloc[train_index]['weight'].values)
    lgb_evals = lgb.Dataset(train.iloc[test_index][feat].values, train_y.iloc[test_index],
                            reference=lgb_train)

    model = lgb.train(params,
                    lgb_train,
                    num_boost_round=5000,
                    valid_sets=[lgb_train,lgb_evals],
                    valid_names=['train','valid'],
                    early_stopping_rounds=50,
                    verbose_eval=200,
                    )
    prob[test_index] = model.predict(train.iloc[test_index][feat].values, num_iteration=model.best_iteration)
    # Average the test predictions over the folds.
    test_prob += model.predict(test_data, num_iteration=model.best_iteration) / n_folds
    valid_score.append(model.best_score['valid']['binary_logloss'])
print('score:', np.mean(valid_score), valid_score)
train['lgb_prob_new'] = prob
test['lgb_prob_new'] = test_prob

train _K_ flod 0
Training until validation scores don't improve for 50 rounds.
[200]	train's binary_logloss: 0.137715	valid's binary_logloss: 0.137893
[400]	train's binary_logloss: 0.127203	valid's binary_logloss: 0.133149
Early stopping, best iteration is:
[489]	train's binary_logloss: 0.124369	valid's binary_logloss: 0.132863
train _K_ flod 1
Training until validation scores don't improve for 50 rounds.
[200]	train's binary_logloss: 0.135405	valid's binary_logloss: 0.14637
[400]	train's binary_logloss: 0.124658	valid's binary_logloss: 0.143034
Early stopping, best iteration is:
[373]	train's binary_logloss: 0.125619	valid's binary_logloss: 0.142966
train _K_ flod 2
Training until validation scores don't improve for 50 rounds.
[200]	train's binary_logloss: 0.13794	valid's binary_logloss: 0.138312
[400]	train's binary_logloss: 0.127299	valid's binary_logloss: 0.132453
Early stopping, best iteration is:
[462]	train's binary_logloss: 0.125207	valid's binary_logloss: 0.132148
train _K_ fl

In [24]:
# Build the submission: re-read the raw test file, attach the final stacked prediction as
# `Label` (aligned on the row index) and write the ID / Label pair to disk.
test_df = pd.read_csv('./data/test_stage2_update_20200320.csv').assign(Label=test['lgb_prob_new'])
submission_cols = ['ID', 'Label']
test_df.loc[:, submission_cols].to_csv('./output/sub.csv', index=False)

  interactivity=interactivity, compiler=compiler, result=result)
