In [49]:
import gc, os, random, pickle
import pandas as pd
import numpy as np
import scipy
from scipy import stats

# 导入模型和sklearn
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import roc_auc_score, accuracy_score,auc, log_loss
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from tqdm import tqdm 

pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',50)
    
'''seaborn and matplotlib'''
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

'''plotly'''
import plotly.offline as py
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go

'''按markdown格式展现数据'''
init_notebook_mode(connected = True)
from IPython.display import Markdown

def bold(string):
    display(Markdown(string))


In [50]:
train = pd.read_csv("../data/train.csv")
train_label = pd.read_csv("../data/train_label.csv")
test = pd.read_csv("../data/test.csv")
print(train.shape, train_label.shape, test.shape)


Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.



(14366, 178) (14366, 2) (9578, 178)


In [51]:
bold("### train head 3#")
display(train.head(1))
bold("### train label 3#")
display(train_label.head(1))
bold("### test head 3#")
display(test.head(1))

### train head 3#

Unnamed: 0,ID,企业类型,经营期限至,登记机关,企业状态,邮政编码,投资总额,注册资本,核准日期,行业代码,注销时间,经营期限自,成立日期,行业门类,企业类别,管辖机关,经营范围,增值税,企业所得税,印花税,教育费,城建税,递延收益_年初数,长期负债合计_年初数,长期借款_年初数,长期应付款_年初数,短期借款_年初数,递延所得税负债_年初数,非流动负债合计_年初数,负债合计_年初数,负债和所有者权益总计_年初数,交易性金融负债_年初数,流动负债合计_年初数,其他非流动负债_年初数,其他负债（或长期负债）_年初数,其他流动负债_年初数,其他应付款_年初数,其他应交款_年初数,实收资本（股本）_年初数,实收资本（或股本）净额_年初数,所有者权益合计_年初数,未分配利润_年初数,小企业_应付利润_年初数,应付福利费_年初数,应付股利_年初数,应付利息_年初数,应付票据_年初数,应付职工薪酬_年初数,应付债券_年初数,应付账款_年初数,预计负债_年初数,应交税费_年初数,一年内到期的非流动负债_年初数,预提费用_年初数,预收款项_年初数,盈余公积_年初数,资本公积_年初数,专项应付款_年初数,递延收益_年末数,长期负债合计_年末数,长期借款_年末数,长期应付款_年末数,短期借款_年末数,递延所得税负债_年末数,非流动负债合计_年末数,负债合计_年末数,负债和所有者权益总计_年末数,交易性金融负债_年末数,流动负债合计_年末数,其他非流动负债_年末数,其他负债（或长期负债）_年末数,其他流动负债_年末数,其他应付款_年末数,其他应交款_年末数,实收资本（股本）_年末数,实收资本（或股本）净额_年末数,所有者权益合计_年末数,未分配利润_年末数,小企业_应付利润_年末数,应付福利费_年末数,应付股利_年末数,应付利息_年末数,应付票据_年末数,应付职工薪酬_年末数,应付债券_年末数,应付账款_年末数,预计负债_年末数,应交税费_年末数,一年内到期的非流动负债_年末数,预提费用_年末数,预收款项_年末数,盈余公积_年末数,资本公积_年末数,专项应付款_年末数,货币资金_年初数,存货_年初数,存货中的原材料_年初数,存货中的周转材料_年初数,长期待摊费用_年初数,长期股权投资_年初数,长期投资合计_年初数,长期应收款_年初数,持有至到期投资_年初数,短期投资_年初数,待摊费用_年初数,递延所得税资产_年初数,非流动资产合计_年初数,工程物资_年初数,固定资产合计_年初数,固定资产净额_年初数,固定资产净值_年初数,固定资产清理_年初数,固定资产原价_年初数,交易性金融资产_年初数,库存商品(产成品)_年初数,开发支出_年初数,可供出售金融资产_年初数,流动资产合计_年初数,其他非流动资产_年初数,其他流动资产_年初数,其他应收款_年初数,生产性生物资产_年初数,商誉_年初数,投资性房地产_年初数,无形资产_年初数,无形资产及其他资产合计_年初数,应收账款_年初数,一年内到期的非流动资产_年初数,油气资产_年初数,应收补贴款_年初数,应收股利_年初数,应收利息_年初数,应收票据_年初数,预付款项_年初数,资产总计_年初数,在建工程_年初数,货币资金_年末数,存货_年末数,存货中的原材料_年末数,存货中的周转材料_年末数,长期待摊费用_年末数,长期股权投资_年末数,长期投资合计_年末数,长期应收款_年末数,持有至到期投资_年末数,短期投资_年末数,待摊费用_年末数,递延所得税资产_年末数,非流动资产合计_年末数,工程物资_年末数,固定资产合计_年末数,固定资产净额_年末数,固定资产净值_年末数,固定资产清理_年末数,固定资产原价_年末数,交易性金融资产_年末数,库存商品(产成品)_年末数,开发支出_年末数,可供出售金融资产_年末数,流动资产合计_年末数,其他非流动资产_年末数,其他流动资产_年末数,其他应收款_年末数,生产性生物资产_年末数,商誉_年末数,投资性房地产_年末数,无形资产_年末数,无形资产及其他资产合计_年末数,应收账款_年末数,一年内到期的非流动资产_年末数,油气资产_年末数,应收补贴款_年末数,应收股利_年末数,应收利息_年末数,应收票据_年末数,预付款项_年末数,资产总计_年末数,在建工程_年末数
0,1,9.0,,1.0,0.0,266300,19.11,19.11,00:00.0,5154.0,,00:00.0,00:00.0,1.0,0.0,1.0,"[1412, 1024, 21375, 15023, 18526, 18789, 17022...",106.03965,0.331552,1.247218,3.181189,7.422776,0.0,0.0,25.110484,0.0,0.0,0.0,25.110484,797.386637,844.162546,0.0,772.276153,0.0,0.0,0.0,106.078436,0.0,50.220972,0.0,46.77591,-3.445063,0.0,0.0,0.0,0.0,125.552431,0.0,0.0,520.839979,0.0,19.805307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,585.013852,632.032182,0.0,585.013852,0.0,0.0,0.0,83.123866,0.0,50.220972,0.0,47.01833,-3.202643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,489.934244,0.0,11.955743,0.0,0.0,0.0,0.0,0.0,0.0,24.579246,462.290647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,135.081011,0.0,135.081011,0.0,0.0,0.0,368.329907,0.0,0.0,0.0,0.0,709.081535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,215.371727,0.0,0.0,0.0,0.0,0.0,2.647788,4.192126,844.162546,0.0,31.416284,229.147518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.128884,0.0,0.0,99.813781,0.0,99.813781,0.0,0.0,0.0,368.329907,0.0,0.0,0.0,0.0,532.218401,0.0,0.0,20.088389,0.0,0.0,0.0,0.0,0.0,142.310005,0.0,0.0,0.0,0.0,0.0,28.122902,1.004419,632.032182,0.0


### train label 3#

Unnamed: 0,ID,Label
0,1,0


### test head 3#

Unnamed: 0,ID,企业类型,经营期限至,登记机关,企业状态,邮政编码,投资总额,注册资本,核准日期,行业代码,注销时间,经营期限自,成立日期,行业门类,企业类别,管辖机关,经营范围,增值税,企业所得税,印花税,教育费,城建税,递延收益_年初数,长期负债合计_年初数,长期借款_年初数,长期应付款_年初数,短期借款_年初数,递延所得税负债_年初数,非流动负债合计_年初数,负债合计_年初数,负债和所有者权益总计_年初数,交易性金融负债_年初数,流动负债合计_年初数,其他非流动负债_年初数,其他负债（或长期负债）_年初数,其他流动负债_年初数,其他应付款_年初数,其他应交款_年初数,实收资本（股本）_年初数,实收资本（或股本）净额_年初数,所有者权益合计_年初数,未分配利润_年初数,小企业_应付利润_年初数,应付福利费_年初数,应付股利_年初数,应付利息_年初数,应付票据_年初数,应付职工薪酬_年初数,应付债券_年初数,应付账款_年初数,预计负债_年初数,应交税费_年初数,一年内到期的非流动负债_年初数,预提费用_年初数,预收款项_年初数,盈余公积_年初数,资本公积_年初数,专项应付款_年初数,递延收益_年末数,长期负债合计_年末数,长期借款_年末数,长期应付款_年末数,短期借款_年末数,递延所得税负债_年末数,非流动负债合计_年末数,负债合计_年末数,负债和所有者权益总计_年末数,交易性金融负债_年末数,流动负债合计_年末数,其他非流动负债_年末数,其他负债（或长期负债）_年末数,其他流动负债_年末数,其他应付款_年末数,其他应交款_年末数,实收资本（股本）_年末数,实收资本（或股本）净额_年末数,所有者权益合计_年末数,未分配利润_年末数,小企业_应付利润_年末数,应付福利费_年末数,应付股利_年末数,应付利息_年末数,应付票据_年末数,应付职工薪酬_年末数,应付债券_年末数,应付账款_年末数,预计负债_年末数,应交税费_年末数,一年内到期的非流动负债_年末数,预提费用_年末数,预收款项_年末数,盈余公积_年末数,资本公积_年末数,专项应付款_年末数,货币资金_年初数,存货_年初数,存货中的原材料_年初数,存货中的周转材料_年初数,长期待摊费用_年初数,长期股权投资_年初数,长期投资合计_年初数,长期应收款_年初数,持有至到期投资_年初数,短期投资_年初数,待摊费用_年初数,递延所得税资产_年初数,非流动资产合计_年初数,工程物资_年初数,固定资产合计_年初数,固定资产净额_年初数,固定资产净值_年初数,固定资产清理_年初数,固定资产原价_年初数,交易性金融资产_年初数,库存商品(产成品)_年初数,开发支出_年初数,可供出售金融资产_年初数,流动资产合计_年初数,其他非流动资产_年初数,其他流动资产_年初数,其他应收款_年初数,生产性生物资产_年初数,商誉_年初数,投资性房地产_年初数,无形资产_年初数,无形资产及其他资产合计_年初数,应收账款_年初数,一年内到期的非流动资产_年初数,油气资产_年初数,应收补贴款_年初数,应收股利_年初数,应收利息_年初数,应收票据_年初数,预付款项_年初数,资产总计_年初数,在建工程_年初数,货币资金_年末数,存货_年末数,存货中的原材料_年末数,存货中的周转材料_年末数,长期待摊费用_年末数,长期股权投资_年末数,长期投资合计_年末数,长期应收款_年末数,持有至到期投资_年末数,短期投资_年末数,待摊费用_年末数,递延所得税资产_年末数,非流动资产合计_年末数,工程物资_年末数,固定资产合计_年末数,固定资产净额_年末数,固定资产净值_年末数,固定资产清理_年末数,固定资产原价_年末数,交易性金融资产_年末数,库存商品(产成品)_年末数,开发支出_年末数,可供出售金融资产_年末数,流动资产合计_年末数,其他非流动资产_年末数,其他流动资产_年末数,其他应收款_年末数,生产性生物资产_年末数,商誉_年末数,投资性房地产_年末数,无形资产_年末数,无形资产及其他资产合计_年末数,应收账款_年末数,一年内到期的非流动资产_年末数,油气资产_年末数,应收补贴款_年末数,应收股利_年末数,应收利息_年末数,应收票据_年末数,预付款项_年末数,资产总计_年末数,在建工程_年末数
0,0,4.0,52:39.0,8.0,0.0,266100,,19.135111,00:00.0,5179.0,,52:39.0,52:39.0,1.0,1.0,7.0,"[4986, 5923, 7503, 20665, 405, 580, 2223, 1751...",34.793229,14.086406,2.849362,1.043797,2.435526,0.0,0.0,0.0,0.0,35.154681,0.0,0.0,968.952923,1407.261536,0.0,968.952923,0.0,0.0,0.0,14.150109,0.0,25.110486,0.0,438.308613,413.198127,0.0,0.0,0.0,0.0,0.0,7.251006,0.0,905.494548,0.0,6.902579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.154681,0.0,0.0,727.602328,1172.398183,0.0,727.602328,0.0,0.0,0.0,9.687125,0.0,25.110486,0.0,444.795855,419.685369,0.0,0.0,0.0,0.0,0.0,7.27636,0.0,675.088316,0.0,0.395847,0.0,0.0,0.0,0.0,0.0,0.0,208.640271,256.101135,0.0,0.0,31.803881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,73.748348,0.0,41.944467,0.0,0.0,0.0,56.17239,0.0,0.0,0.0,0.0,1333.513189,0.0,124.067888,45.391309,0.0,0.0,0.0,0.0,0.0,695.730609,0.0,0.0,0.0,0.0,0.0,0.0,3.581977,1407.261536,0.0,312.148533,147.01401,0.0,0.0,15.35714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,75.899483,0.0,60.542343,0.0,0.0,0.0,88.136449,0.0,0.0,0.0,0.0,1096.498699,0.0,130.911487,25.491233,0.0,0.0,0.0,0.0,0.0,476.56117,0.0,0.0,0.0,0.0,0.0,0.0,4.372267,1172.398183,0.0


In [61]:
train_label['Label'].value_counts() # 正负样本极度不均衡， 需要看下样本差异

0    13425
1      941
Name: Label, dtype: int64

In [63]:
train = train.merge(train_label, on='ID',how='left')
train.to_csv('train2.csv',index=False)

In [52]:
train.dtypes

ID                   int64
企业类型               float64
经营期限至               object
登记机关               float64
企业状态               float64
邮政编码                object
投资总额               float64
注册资本               float64
核准日期                object
行业代码               float64
注销时间                object
经营期限自               object
成立日期                object
行业门类               float64
企业类别               float64
管辖机关               float64
经营范围                object
增值税                float64
企业所得税              float64
印花税                float64
教育费                float64
城建税                float64
递延收益_年初数           float64
长期负债合计_年初数         float64
长期借款_年初数           float64
长期应付款_年初数          float64
短期借款_年初数           float64
递延所得税负债_年初数        float64
非流动负债合计_年初数        float64
负债合计_年初数           float64
负债和所有者权益总计_年初数     float64
交易性金融负债_年初数        float64
流动负债合计_年初数         float64
其他非流动负债_年初数        float64
其他负债（或长期负债）_年初数    float64
其他流动负债_年初数         float64
其他应付款_年初数          float64
其

第1类特征

企业类型               float64  ==> 类别特征，分布不均
登记机关               float64  ==> 类别特征，分布不均
企业状态               float64  ==> 类别特征，分布不均
行业门类               float64  ==> 类别特征，分布不均
企业类别               float64  ==> 类别特征，分布不均
管辖机关               float64  ==> 类别特征，分布不均
邮政编码                object  ==> 无用
行业代码               float64  ==> 无用
经营范围                object  ==> 数组 需要做做一下转换，比如转换成数组的长度

经营期限至               object
核准日期                object
注销时间                object
经营期限自               object
成立日期                object

投资总额               float64  ==> 数据及其稀疏， 尝试丢弃
注册资本               float64
增值税                float64
企业所得税              float64
印花税                float64
教育费                float64
城建税                float64

可以将第1类特征分成几种类型
1、类别特征：
企业类型 
行业门类      
企业类别  
登记机关 
企业状态
行业代码  
管辖机关

2、时间类特征

3、税收类特征

In [53]:

print(train['企业类型'].value_counts(), "\n\n")
# 大部分都是0 1、2， 其余的类别很少 ==>可以将特征做个转换，  高频部分 + other 一个类别， 比如<1000 的都当成other

print(train['行业门类'].value_counts(), "\n\n")
# 行业门类：18个值， 但从12开始，都是低频
print(train['企业类别'].value_counts(), "\n\n")
# 4个值，3 的频率最低
print(train['登记机关'].value_counts(), "\n\n")
# 13个值， 10 开始频率较低

print(train['企业状态'].value_counts(), "\n\n")
# 4个值，3的状态只有20个

print(train['行业代码'].value_counts(), "\n\n")
# 较分散，无明确意义 ==> 可丢弃
print(train['管辖机关'].value_counts(), "\n\n")
# 13个值，10开始都很低


0.0     5570
1.0     2664
2.0     1605
3.0      464
5.0      410
4.0      400
6.0      248
7.0      219
8.0      206
9.0      139
10.0     136
11.0     120
12.0     110
13.0     101
15.0      88
14.0      84
16.0      82
17.0      75
18.0      72
19.0      68
20.0      67
21.0      47
24.0      40
23.0      40
27.0      38
22.0      35
25.0      32
29.0      25
28.0      25
26.0      24
31.0      23
30.0      21
33.0      21
37.0      21
35.0      20
36.0      18
34.0      17
32.0      16
38.0      16
40.0      15
39.0      14
41.0      13
46.0       9
43.0       9
42.0       9
48.0       8
44.0       8
45.0       7
52.0       7
50.0       7
54.0       6
53.0       6
59.0       6
49.0       6
47.0       6
56.0       5
61.0       4
57.0       4
58.0       4
55.0       4
65.0       3
63.0       3
62.0       3
66.0       3
64.0       3
79.0       2
81.0       2
60.0       2
80.0       2
73.0       2
68.0       2
51.0       1
83.0       1
74.0       1
75.0       1
76.0       1
70.0       1

Name: 行业代码, dtype: int64 


0.0     3921
1.0     1904
2.0     1471
3.0     1397
4.0     1120
5.0      870
6.0      833
7.0      640
8.0      581
9.0      393
10.0     251
11.0     168
12.0      48
Name: 管辖机关, dtype: int64 




In [60]:
 # 插件时间类的特征
train[['经营期限自', '经营期限至', '成立日期', '核准日期', '注销时间']].head(100)

# 可以将注销时间，抽成注销了次数的特征

Unnamed: 0,经营期限自,经营期限至,成立日期,核准日期,注销时间
0,00:00.0,,00:00.0,00:00.0,
1,31:40.0,,31:40.0,00:00.0,
2,13:31.0,,13:31.0,13:31.0,
3,42:05.0,42:05.0,42:05.0,00:00.0,
4,55:27.0,,55:27.0,00:00.0,
5,00:00.0,,00:00.0,00:00.0,
6,00:00.0,,00:00.0,51:08.0,51:08.0
7,00:00.0,,00:00.0,00:00.0,
8,42:06.0,,42:06.0,42:06.0,54:59.0
9,17:35.0,,17:35.0,00:00.0,


0.0     3526
1.0     2736
2.0     2553
3.0     1869
4.0      577
5.0      537
6.0      330
7.0      322
8.0      298
9.0      290
10.0     174
11.0     162
12.0      83
13.0      58
15.0      33
14.0      27
16.0      15
17.0       4
18.0       2
Name: 行业门类, dtype: int64 

 0.0    8642
1.0    4053
2.0     821
3.0      82
Name: 企业类别, dtype: int64

