In [None]:
# model1 produces out-of-fold predictions that are saved as stacking features for model2 to learn from

In [1]:
import pickle,os
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import trange
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
# Load data: stage-2 train/test feature tables plus the training labels.
train = pd.read_csv('./data/train_stage2_update_20200320.csv')
train_y = pd.read_csv('./data/train_label.csv').Label
test = pd.read_csv('./data/test_stage2_update_20200320.csv')
# Attach the label to the training rows (assignment aligns on the index of the two freshly read frames).
train['label'] = train_y
# Stack train on top of test so feature engineering is applied to both at once.
# pd.concat is the drop-in replacement for DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0; row order and index are unchanged.
data = pd.concat([train, test])

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Tax features and preprocessing of object-typed columns.


def _drop_float_suffix(value):
    """Return str(value) without a trailing '.0' (e.g. 310000.0 -> '310000').

    The previous implementation used str.strip(".0"), which removes *any*
    leading/trailing '.' and '0' characters, so 310000.0 became '31' and
    collided with 31000.0.
    """
    text = str(value)
    return text[:-2] if text.endswith('.0') else text


# Business scope: number of comma-separated entries. The .str accessor keeps
# missing values as NaN instead of raising on them.
data["经营范围"] = data["经营范围"].str.count(",") + 1
# Postal code: render as text and remove only the float artefact '.0'.
data['邮政编码'] = data['邮政编码'].apply(_drop_float_suffix)
# Flag: registered capital is at least the total investment (False when either side is NaN).
data["是否全资"] = data['注册资本'] >= data['投资总额']
# Total tax paid = sum of the five tax columns (NaNs are skipped by DataFrame.sum).
data["企业缴税"] = data[['增值税', '企业所得税', '印花税', '城建税', "教育费"]].sum(axis=1)
# Share of each tax in the total. Rows whose total is 0 produce inf/NaN here.
for tax_col in ['增值税', '企业所得税', '印花税', '教育费', '城建税']:
    data[tax_col + "/企业缴税"] = data[tax_col] / data["企业缴税"]

In [4]:
# Business features: ratios against total tax paid, then year-start -> year-end changes.
total_tax = data["企业缴税"]
data['年度参保总额/企业缴税'] = data['年度参保总额'] / total_tax
data['企业缴税/经营范围'] = total_tax / data['经营范围']
data['投资总额/企业缴税'] = data['投资总额'] / total_tax

# Balance-sheet items that have both a '_年初数' (year-start) and a '_年末数' (year-end) column.
# For each one, '<item>变化' = year-end value minus year-start value.
balance_items = [
    '货币资金', '流动资产合计', '其他应收款', '固定资产合计', '未分配利润',
    '非流动资产合计', '短期借款', '应交税费', '负债合计', '其他应付款',
    '负债和所有者权益总计', '所有者权益合计', '其他流动负债', '流动负债合计',
    '存货', '资产总计', '预收款项',
]
for item in balance_items:
    data[item + '变化'] = data[item + '_年末数'] - data[item + '_年初数']

In [5]:
# Split the combined frame back into its train / test parts by position:
# the first n_train rows are the training rows, the remainder is the test set.
n_train = train.shape[0]
train = data.iloc[:n_train]
test = data.iloc[n_train:]

In [6]:
# Embed monetary features with word2vec so that "special" amounts cluster together.
# In this setting the raw features were chosen with their missing rate in mind;
# the overall gain is modest.
num_col = ['企业所得税','城建税','增值税','印花税','教育费','年度参保总额',
            '货币资金_年末数','注册资本']
for col in num_col:
    # One embedding table per feature, keyed by the raw feature value.
    df = pd.read_csv('./w2v/%s.csv' % col)
    rows_before = (len(train), len(test))
    train = train.merge(df, on=col, how='left')
    test = test.merge(df, on=col, how='left')
    # A duplicated key in the embedding table would multiply rows and silently
    # misalign the features with the labels, so fail loudly instead.
    assert (len(train), len(test)) == rows_before, \
        'w2v merge on %r changed the number of rows' % col
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0).
data = pd.concat([train, test])

In [7]:
# Feature selection: drop columns that must not be used (object dtype, label, ID, pred),
# then drop columns that are constant or more than 95% missing.
#
# The candidate list follows data.columns order rather than list(set(...)):
# iteration order of a set of strings changes between interpreter sessions,
# which made the column order of the training matrix non-reproducible.
excluded = set(data.select_dtypes(object).columns) | {'label', 'ID', 'pred'}
candidates = [col for col in data.columns if col not in excluded]
remove_col = []
for col in candidates:
    if (data[col].nunique() < 2) or (data[col].isnull().sum()/data.shape[0] > 0.95):
        remove_col.append(col)
print(len(remove_col))
dropped = set(remove_col)
feat0 = [col for col in candidates if col not in dropped]
print(len(feat0))

480
284


In [8]:
# Keep only the selected feature columns and switch from DataFrames to plain
# numpy arrays. Both right-hand sides are evaluated before either name is rebound.
train, test = train[feat0].values, test[feat0].values

In [9]:
# Re-read the raw train/test tables; the prediction columns are attached to
# these frames (train/test now hold numpy feature matrices).
train_df = pd.read_csv('./data/train_stage2_update_20200320.csv')
test_df = pd.read_csv('./data/test_stage2_update_20200320.csv')

In [10]:
# Binary classification with LightGBM under stratified K-fold CV.
# Produces out-of-fold predictions for the training rows and fold-averaged
# predictions for the test rows; both are used as stacking features by the main model.
n_splits = 5
# shuffle/random_state must be passed by keyword: scikit-learn >= 1.0 rejects
# the positional form StratifiedKFold(5, True, ...).
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)
prob = np.zeros(train.shape[0])        # out-of-fold P(label == 1) for each training row
test_prob = np.zeros(test.shape[0])    # running mean over folds of P(label == 1) for each test row
test_data = test
valid_score = []                       # best validation logloss of each fold
for idx, (train_index, valid_index) in enumerate(kf.split(train, train_y)):
    print(str(idx) + '_training')
    train_data = train[train_index]
    valid_data = train[valid_index]
    model = LGBMClassifier(n_estimators=1000, learning_rate=0.01, num_leaves=32,
                           n_jobs=4, seed=2020,
                           reg_alpha=0., reg_lambda=0.01, max_depth=-1, min_child_samples=50,
                           subsample=0.7, colsample_bytree=0.45, subsample_freq=5)
    # kf.split yields positional indices, so select the labels positionally too.
    # NOTE(review): early_stopping_rounds / verbose as fit() kwargs were dropped in
    # LightGBM 4.0 - switch to callbacks if the library is upgraded.
    model.fit(train_data, train_y.iloc[train_index],
              eval_set=(valid_data, train_y.iloc[valid_index]), early_stopping_rounds=50, verbose=-1)
    prob[valid_index] = model.predict_proba(valid_data)[:, 1]
    # Each fold contributes an equal share to the test prediction.
    test_prob += model.predict_proba(test_data)[:, 1] / n_splits
    valid_score.append(model.best_score_['valid_0']['binary_logloss'])
print('score:', np.mean(valid_score), valid_score)
train_df['code2_lgb_prob1'] = prob
test_df['code2_lgb_prob1'] = test_prob

0_training
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[350]	valid_0's binary_logloss: 0.138335
1_training
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[361]	valid_0's binary_logloss: 0.145866
2_training
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[234]	valid_0's binary_logloss: 0.156608
3_training
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[330]	valid_0's binary_logloss: 0.143011
4_training
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[348]	valid_0's binary_logloss: 0.142063
score: 0.14517659595878013 [0.13833474513304564, 0.14586580616775105, 0.1566084703111654, 0.1430113528262853, 0.14206260535565327]


In [13]:
# Save the predictions (ID + probability) as stacking features for the next model.
# Create the output directory first so to_csv does not fail on a fresh checkout.
os.makedirs('./output', exist_ok=True)
train_df[['ID', 'code2_lgb_prob1']].to_csv('./output/train_stacking_code2.csv', index=False)
test_df[['ID', 'code2_lgb_prob1']].to_csv('./output/test_stacking_code2.csv', index=False)