In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
import json
from sklearn.model_selection import train_test_split
from collections import Counter
from itertools import combinations
import copy

In [2]:
train = pd.read_csv('../data/train.csv')
label = pd.read_csv('../data/train_label.csv')
test = pd.read_csv('../data/test.csv')
submission = pd.read_csv('../data/submission.csv')
train_data_label = pd.merge(train,label,on='ID')

c2c = ['长期负债合计_年初数','长期负债合计_年末数','其他负债（或长期负债）_年初数','其他负债（或长期负债）_年末数','其他应交款_年初数',
    '其他应交款_年末数','应付福利费_年初数','应付福利费_年末数','预提费用_年初数','预提费用_年末数','待摊费用_年初数','待摊费用_年末数',
    '应收补贴款_年初数','应收补贴款_年末数','长期投资合计_年初数','长期投资合计_年末数','固定资产净额_年初数','固定资产净额_年末数',
    '固定资产净值_年初数','固定资产净值_年末数','无形资产及其他资产合计_年初数','无形资产及其他资产合计_年末数']

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
columns = list(set(train_data_label.columns.to_list()) - set(c2c))
train_data_label = train_data_label[columns]

columns = list(set(test.columns.to_list()) - set(c2c))
test = test[columns]

In [4]:
train_data_label.head()

Unnamed: 0,其他非流动负债_年初数,其他应收款_年初数,投资总额,资产总计_年初数,经营期限至,企业状态,商誉_年末数,一年内到期的非流动资产_年初数,实收资本（或股本）净额_年初数,商誉_年初数,...,应收账款_年末数,长期借款_年初数,邮政编码,长期借款_年末数,小企业_应付利润_年末数,其他流动负债_年初数,非流动资产合计_年末数,油气资产_年初数,应收票据_年初数,工程物资_年初数
0,0.0,0.0,19.11,844.162546,,0.0,0.0,0.0,0.0,0.0,...,142.310005,25.110484,266300,0.0,0.0,0.0,99.813781,0.0,2.647788,0.0
1,,,29.154194,,,0.0,,,,,...,,,266000,,,,,,,
2,,,,,,0.0,,,,,...,,,266000,,,,,,,
3,,,,,42:05.0,0.0,,,,,...,,,266000,,,,,,,
4,0.0,22.072334,,397.527613,,0.0,0.0,0.0,0.0,0.0,...,156.90103,0.0,266400,0.0,0.0,0.0,3.103381,0.0,0.0,0.0


In [5]:
def deal_time(x):
    if str(x) == 'nan':
        return x
    
    x = str(x)
    parts = x.split(':')
    return int(parts[0]) + float(parts[1])/100

In [6]:
def cal_diff(x, y):
    if str(x) == 'nan':
        return x
    elif str(y) == 'nan':
        return y
    else:
        return y-x

In [7]:
def deal_manage_range(x):
    g = json.loads(x)
    return len(g)

In [8]:
def deal_cat(x):
    if str(x) == 'nan':
        return -1
    return int(x)

In [9]:
def get_built_tax_rate(x, y, z):  #x:cj y:zz z:xf
    if str(x) == 'nan':
        return x
    elif str(y) == 'nan':
        return y
    elif str(z) == 'nan':
        return z
    elif y+z==0:
        return 0
    else:
        return x/(y+z)

In [10]:
def get_xf_tax(x, y): #消费税 x:jy  y:zz
    if str(x) == 'nan':
        return x
    elif str(y) == 'nan':
        return y
    return x/0.03 - y

In [11]:
def get_tax(x, y): #y->tax
    if str(x) == 'nan':
        return x
    elif str(y) == 'nan':
        return y
    elif y == 0:
        return 0
    return x/y

In [12]:
def divide(x, y):
    if str(x) == 'nan':
        return x
    elif str(y) == 'nan':
        return y
    elif y == 0:
        return 0
    return x/y

In [13]:
def isnll(x):
    if str(x) == 'nan':
        return 0
    return 1

In [14]:
all_ranges = []
jyfw = list(train_data_label['经营范围'])+list(test['经营范围'])
for item in jyfw:
    all_ranges += json.loads(item)
range_dict = dict(Counter(all_ranges))
# bins = [4,8,16,64,128,256,600,3000]
def deal_manage_range_bin(x, lower, upper=100000):
    count = 0
    items = json.loads(x)
    for item in items:
        counts = range_dict.get(item)
        if counts and counts<=upper and counts>lower:
            count += 1
    return count

In [15]:
industry_code_arr = []
industry_code_list = list(train['行业代码'].fillna(-1))+list(test['行业代码'].fillna(-1))
for code in industry_code_list:
    industry_code_arr.append(code)
    
industry_code_map = dict(Counter(industry_code_arr))
def deal_industry_code(x):
    if industry_code_map.get(x) == 4644:
        return 1
    elif industry_code_map.get(x) == 1993:
        return 2
    elif industry_code_map.get(x) == 1272:
        return 0
    elif industry_code_map.get(x) in range(259,1272):
        return 3
    elif industry_code_map.get(x) in range(100,259):
        return 4
    elif industry_code_map.get(x) in range(50,100):
        return 5
    elif industry_code_map.get(x) in range(0,50):
        return 6
    else:
        return 7

In [16]:
def deal_industry_class(x):
    if x>=12:
        return 12
    return x

In [17]:
#企业类型

industry_type_arr = []
industry_type_list = list(train['企业类型'].fillna(-1))+list(test['企业类型'].fillna(-1))
for type_ in industry_type_list:
    industry_type_arr.append(type_)
    
industry_type_map = dict(Counter(industry_type_arr))
def deal_industry_type(x):
    if industry_type_map.get(x)==9311:
        return 1
    elif industry_type_map.get(x)==4406:
        return 2
    elif industry_type_map.get(x)==2750:
        return 3
    elif industry_type_map.get(x)==1262:
        return 4
    elif industry_type_map.get(x) in range(200,1262):
        return 5
    elif industry_type_map.get(x) in range(30, 200):
        return 6
    elif industry_type_map.get(x)<30:
        return 7
    else:
        return 8

In [18]:
def enus2classfify(x):
    if str(x) == 'nan':
        return -1
    else:
        return 0

In [19]:
def gen_feat(raw_table):
    table = copy.copy(raw_table)
    table['经营期限至'] = table['经营期限至'].apply(deal_time)
    table['经营期限自'] = table['经营期限自'].apply(deal_time)
    table['成立日期'] = table['成立日期'].apply(deal_time)
    table['核准日期'] = table['核准日期'].apply(deal_time)
    table['注销时间'] = table['注销时间'].apply(deal_time)

    table['经营期限'] = table.apply(lambda row: cal_diff(row['经营期限自'], row['经营期限至']), axis=1)

    table['消费税'] = table.apply(lambda row: get_xf_tax(row['教育费'], row['增值税']), axis=1)
    table['城建税率'] = table.apply(lambda row: get_built_tax_rate(row['城建税'], row['增值税'], row['消费税']), axis=1)
    table['应纳税额'] = table.apply(lambda row: get_tax(row['企业所得税'], row['城建税率']), axis=1)

    table['经营范围1'] = table['经营范围'].apply(deal_manage_range_bin, args=(0,4,))
    table['经营范围2'] = table['经营范围'].apply(deal_manage_range_bin, args=(4,8,))
    table['经营范围3'] = table['经营范围'].apply(deal_manage_range_bin, args=(8,16,))
    table['经营范围4'] = table['经营范围'].apply(deal_manage_range_bin, args=(16,64,))
    table['经营范围5'] = table['经营范围'].apply(deal_manage_range_bin, args=(64,128,))
    table['经营范围6'] = table['经营范围'].apply(deal_manage_range_bin, args=(128,200,))
    table['经营范围7'] = table['经营范围'].apply(deal_manage_range_bin, args=(200,1000,))
    table['经营范围8'] = table['经营范围'].apply(deal_manage_range_bin, args=(1000,5000,))
    table['经营范围9'] = table['经营范围'].apply(deal_manage_range_bin, args=(5000,))
    
    table['isnull'] = table['在建工程_年末数'].apply(isnll)

    table['经营范围'] = table['经营范围'].apply(deal_manage_range)

    cat_feat = ['行业代码','行业门类','管辖机关', '企业类别','企业类型','登记机关']
    for item in cat_feat:
        table[item] = table[item].apply(deal_cat)

    table['行业代码plus'] = table['行业代码'].apply(deal_industry_code)

    table['行业门类plus'] = table['行业门类'].apply(deal_industry_class)

    table['企业类型plus'] = table['企业类型'].apply(deal_industry_type)

    

    columns = table.columns.tolist()
    columns = list(set(columns)- set(c2c))
    for item in columns:
        parts = item.split("_")
        if len(parts) == 2:
            if parts[1] == '年初数':
                table[parts[0] + '_diff'] = table.apply(lambda row: cal_diff(row[item], row[parts[0]+"_年末数"]), axis=1)

    table['货币资金b/e'] = table.apply(lambda row: divide(row['货币资金_年初数'], row['货币资金_年末数']), axis=1)
    table['货币资金d/e'] = table.apply(lambda row: divide(row['货币资金_年末数'], row['货币资金_diff']), axis=1)
    table['货币资金d/b'] = table.apply(lambda row: divide(row['货币资金_年初数'], row['货币资金_diff']), axis=1)

    table['城建税/增值税'] = table.apply(lambda row: divide(row['城建税'], row['增值税']), axis=1)
    table['城建税/消费税'] = table.apply(lambda row: divide(row['城建税'], row['消费税']), axis=1)


    cat_feats=cat_feat+['行业代码plus','行业门类plus','企业类型plus','isnull']

    feats = table.columns
    return table, feats, cat_feats



In [20]:

drop_feat = ['ID','邮政编码','Label']

In [21]:
train_data, feats, cat_feats = gen_feat(train_data_label)
# feat = [item for item in feats if item not in drop_feat and "年初数" not in item and "年末数" not in item]


feat = [item for item in feats if item not in drop_feat]
# cat_feat = ['管辖机关', '企业类别','行业门类','行业代码','企业状态','邮政编码','登记机关','企业类型']
X = train_data[feat]
y = train_data['Label'].values
x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.3,random_state=42)

In [22]:
test_data, _, _ = gen_feat(test)

In [23]:
def make_classifier():
    clf = CatBoostClassifier(
                               loss_function='Logloss',
                               eval_metric="AUC",
                               task_type="CPU",
                               learning_rate=0.01,
                               iterations=20000,
                               od_type="Iter",
        
#                                depth=8,
                               early_stopping_rounds=500,
    #                            l2_leaf_reg=1,
    #                            border_count=96,
                               random_seed=2019
                              )
        
    return clf



In [24]:
cat_feats = list(set(feat)&set(cat_feats))
cat_clf = make_classifier()
cat_clf.fit(x_train, y_train, eval_set=(x_test, y_test),
            use_best_model=True, verbose=500,cat_features=cat_feats)

0:	test: 0.8725157	best: 0.8725157 (0)	total: 95ms	remaining: 31m 39s
500:	test: 0.9231092	best: 0.9231121 (497)	total: 13.9s	remaining: 9m
1000:	test: 0.9249675	best: 0.9251541 (925)	total: 28.1s	remaining: 8m 52s
1500:	test: 0.9269094	best: 0.9269307 (1495)	total: 42.2s	remaining: 8m 40s
2000:	test: 0.9274162	best: 0.9274684 (1841)	total: 1m 18s	remaining: 11m 48s
2500:	test: 0.9279945	best: 0.9280390 (2491)	total: 2m 1s	remaining: 14m 13s
3000:	test: 0.9285583	best: 0.9286308 (2990)	total: 2m 32s	remaining: 14m 24s
3500:	test: 0.9287827	best: 0.9288417 (3450)	total: 2m 46s	remaining: 13m 2s
4000:	test: 0.9288745	best: 0.9291589 (3739)	total: 2m 59s	remaining: 11m 56s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.9291588695
bestIteration = 3739

Shrink model to first 3740 iterations.


<catboost.core.CatBoostClassifier at 0x12e4af0f0>

In [25]:
pd.set_option('max_rows', None)
feature_importance_df = pd.DataFrame()
feature_importance_df["Feature"] = feat
feature_importance_df["importance"] = [item.round(2) for item in cat_clf.feature_importances_]
feature_importance_df.sort_values('importance')

Unnamed: 0,Feature,importance
106,一年内到期的非流动负债_年初数,0.0
38,其他非流动负债_年初数,0.0
45,其他非流动负债_年末数,0.0
52,应付股利_年初数,0.0
62,专项应付款_年末数,0.0
72,递延所得税负债_年初数,0.0
12,开发支出_年末数,0.0
94,应收利息_年末数,0.0
96,递延收益_年末数,0.0
14,一年内到期的非流动负债_年末数,0.0


In [26]:
prob_oof = cat_clf.predict_proba(x_test)
prob_oof = [item[1] for item in prob_oof]
score = roc_auc_score(y_test, prob_oof)
print(score)

prob_oof = cat_clf.predict_proba(x_train)
prob_oof = [item[1] for item in prob_oof]
score = roc_auc_score(y_train, prob_oof)
print(score)

0.9291588694664056
0.9816065166728378


In [24]:
def feat_filter(feature_importance_df, cat_feats, thred = 0.01):
    arr = []
    while((feature_importance_df['importance']<thred).any()):
        filted_feat = list(feature_importance_df[feature_importance_df['importance']>thred]['Feature'])
        X = train_data[filted_feat]
        y = train_data['Label'].values
        x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.3,random_state=42)

        cat_clf = make_classifier()
        cat_feats = list(set(cat_feats)&set(filted_feat))
        cat_clf.fit(x_train, y_train, eval_set=(x_test, y_test),
        use_best_model=True, verbose=0,cat_features=cat_feats)

        prob_oof = cat_clf.predict_proba(x_test)
        prob_oof = [item[1] for item in prob_oof]
        test_score = roc_auc_score(y_test, prob_oof)


        prob_oof = cat_clf.predict_proba(x_train)
        prob_oof = [item[1] for item in prob_oof]
        train_score = roc_auc_score(y_train, prob_oof)

        print('round:')
        print('train_auc:',train_score)
        print('test_auc:',test_score)
        print('-----------------------------------')
        pd.set_option('max_rows', None)
        feature_importance_df = pd.DataFrame()
        feature_importance_df["Feature"] = filted_feat
        feature_importance_df["importance"] = [item.round(2) for item in cat_clf.feature_importances_]
        
        arr.append(filted_feat)
    return arr

In [25]:
arr = feat_filter(feature_importance_df, cat_feats, 0.001)

round:
train_auc: 0.9787511162415873
test_auc: 0.9298445395420808
-----------------------------------
round:
train_auc: 0.9816021606081067
test_auc: 0.9298667827180195
-----------------------------------
round:
train_auc: 0.9731477390468308
test_auc: 0.9284678803703972
-----------------------------------
round:
train_auc: 0.9838793712331708
test_auc: 0.9299596238001983
-----------------------------------
round:
train_auc: 0.9746699503097473
test_auc: 0.9289296680447766
-----------------------------------
round:
train_auc: 0.9829510627242206
test_auc: 0.9301844732961001
-----------------------------------
round:
train_auc: 0.9699702231860879
test_auc: 0.9287130388530258
-----------------------------------
round:
train_auc: 0.9761290764209328
test_auc: 0.9302894030608545
-----------------------------------
round:
train_auc: 0.9807270583183723
test_auc: 0.9299721960300766
-----------------------------------
round:
train_auc: 0.9811772887230819
test_auc: 0.9315422741229662
----------------

In [28]:
filted_feats = arr[4]
X = train_data[filted_feats]
y = train_data['Label'].values
x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.3,random_state=42)

cat_clf = make_classifier()
cat_feats = list(set(cat_feats)&set(filted_feats))
cat_clf.fit(x_train, y_train, eval_set=(x_test, y_test),
use_best_model=True, verbose=500,cat_features=cat_feats)

prob_oof = cat_clf.predict_proba(x_test)
prob_oof = [item[1] for item in prob_oof]
test_score = roc_auc_score(y_test, prob_oof)


prob_oof = cat_clf.predict_proba(x_train)
prob_oof = [item[1] for item in prob_oof]
train_score = roc_auc_score(y_train, prob_oof)

0:	test: 0.9028742	best: 0.9028742 (0)	total: 22.7ms	remaining: 7m 33s
500:	test: 0.9295341	best: 0.9298987 (484)	total: 4.56s	remaining: 2m 57s
1000:	test: 0.9226726	best: 0.9303068 (536)	total: 8.95s	remaining: 2m 49s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.9303068108
bestIteration = 536

Shrink model to first 537 iterations.


In [None]:
round:
train_auc: 0.9778491774816346
test_auc: 0.9313706148303956
-----------------------------------
round:
train_auc: 0.978490452439552
test_auc: 0.9326674886970817
-----------------------------------
round:
train_auc: 0.984600144372431
test_auc: 0.9330228959647977
-----------------------------------
round:
train_auc: 0.9840691711964553
test_auc: 0.9325673944053577
-----------------------------------
round:
train_auc: 0.9832174827546509
test_auc: 0.93267522545393
-----------------------------------
round:
train_auc: 0.9836698134048558
test_auc: 0.933713885060806
-----------------------------------
round:
train_auc: 0.9773896126525011
test_auc: 0.9308242063779889
-----------------------------------

In [None]:
filted_feat = list(feature_importance_df[feature_importance_df['importance']>0.01]['Feature'])
X = train_data[filted_feat]
y = train_data['Label'].values
x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.3,random_state=42)

In [None]:
from tpot import TPOTClassifier
tpot = TPOTClassifier(generations= 50, population_size= 20, verbosity= 2, n_jobs=-1, scoring='roc_auc')
tpot.fit(x_train, y_train)
print(tpot.score(x_test, y_test))
tpot.export( 'tpot_boston_pipeline.py')

Imputing missing values in feature set


HBox(children=(IntProgress(value=0, description='Optimization Progress', max=1020, style=ProgressStyle(descrip…

Generation 1 - Current best internal CV score: 0.921478711563054
Generation 2 - Current best internal CV score: 0.921478711563054
Generation 3 - Current best internal CV score: 0.921478711563054
Generation 4 - Current best internal CV score: 0.921478711563054
Generation 5 - Current best internal CV score: 0.921478711563054
Generation 6 - Current best internal CV score: 0.9231246064758286
Generation 7 - Current best internal CV score: 0.9231246064758286
Generation 8 - Current best internal CV score: 0.9231246064758286
Generation 9 - Current best internal CV score: 0.9231246064758286
Generation 10 - Current best internal CV score: 0.9231246064758286
Generation 11 - Current best internal CV score: 0.9231246064758286
Generation 12 - Current best internal CV score: 0.9231246064758286
Generation 13 - Current best internal CV score: 0.9231246064758286
Generation 14 - Current best internal CV score: 0.9231246064758286
Generation 15 - Current best internal CV score: 0.9231246064758286
Generatio

In [36]:
result = tpot.predict_proba(x_test)

Imputing missing values in feature set


In [37]:
pred_result = [item[1] for item in result]

In [38]:

score = roc_auc_score(y_test, pred_result)
score

0.900550276830831

In [34]:
submission = pd.DataFrame()
submission['ID'] = test_data['ID']
submission['Label'] = pred_result
submission

Unnamed: 0,ID,Label
0,0,0.051254
1,2,0.065557
2,4,0.008691
3,5,0.01702
4,6,0.010179
5,7,0.01952
6,8,0.015058
7,11,0.03685
8,12,0.033882
9,13,0.063485


In [40]:
cat_clf = make_classifier()
cat_clf.fit(x_train, y_train, eval_set=(x_test, y_test),
            use_best_model=True, verbose=500,cat_features=cat_feats)

0:	test: 0.8829226	best: 0.8829226 (0)	total: 33.1ms	remaining: 11m 1s
500:	test: 0.9253766	best: 0.9253766 (500)	total: 14.1s	remaining: 9m 9s
1000:	test: 0.9272160	best: 0.9273330 (998)	total: 28.8s	remaining: 9m 6s
1500:	test: 0.9290037	best: 0.9290037 (1500)	total: 43.4s	remaining: 8m 55s
2000:	test: 0.9307135	best: 0.9307135 (2000)	total: 57.6s	remaining: 8m 38s
2500:	test: 0.9310906	best: 0.9311709 (2333)	total: 1m 11s	remaining: 8m 23s
3000:	test: 0.9316322	best: 0.9316322 (3000)	total: 1m 26s	remaining: 8m 11s
3500:	test: 0.9321409	best: 0.9321428 (3454)	total: 1m 41s	remaining: 7m 58s
4000:	test: 0.9324717	best: 0.9326022 (3890)	total: 1m 56s	remaining: 7m 45s
4500:	test: 0.9325500	best: 0.9327453 (4168)	total: 2m 10s	remaining: 7m 30s
5000:	test: 0.9329784	best: 0.9330229 (4919)	total: 2m 25s	remaining: 7m 17s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.933022896
bestIteration = 4919

Shrink model to first 4920 iterations.


<catboost.core.CatBoostClassifier at 0x1a2a72dda0>

In [41]:
prob_oof = cat_clf.predict_proba(x_test)
prob_oof = [item[1] for item in prob_oof]
score = roc_auc_score(y_test, prob_oof)
score

0.9330228959647977

In [42]:
prob_oof = cat_clf.predict_proba(x_train)
prob_oof = [item[1] for item in prob_oof]
score = roc_auc_score(y_train, prob_oof)
score

0.984600144372431

In [29]:
pd.set_option('max_rows', None)
feature_importance_df = pd.DataFrame()
feature_importance_df["Feature"] = filted_feats
feature_importance_df["importance"] = [item.round(2) for item in cat_clf.feature_importances_]
feature_importance_df.sort_values('importance')

Unnamed: 0,Feature,importance
10,在建工程_年初数,0.86
4,其他流动负债_年初数,0.91
9,应收票据_年初数,1.09
8,应收股利_年初数,1.63
14,长期股权投资_diff,1.66
2,负债合计_年初数,2.65
11,流动资产合计_年末数,2.89
15,货币资金_diff,3.15
16,固定资产原价_diff,3.32
6,流动资产合计_年初数,3.4


In [22]:
0.9299755808611977
dropit 0.9304726674886971
ascat 0.9312033074635525

SyntaxError: invalid syntax (<ipython-input-22-5e90baa79963>, line 2)

In [34]:
test['经营期限至'] = test['经营期限至'].apply(deal_time)
test['经营期限自'] = test['经营期限自'].apply(deal_time)
test['成立日期'] = test['成立日期'].apply(deal_time)
test['核准日期'] = test['核准日期'].apply(deal_time)
test['注销时间'] = test['注销时间'].apply(deal_time)

test['经营期限'] = test.apply(lambda row: cal_diff(row['经营期限自'], row['经营期限至']), axis=1)
test['changed'] = test.apply(lambda row: is_changed(row['核准日期'], row['成立日期']), axis=1)

test['消费税'] = test.apply(lambda row: get_xf_tax(row['教育费'], row['增值税']), axis=1)
test['城建税率'] = test.apply(lambda row: get_built_tax_rate(row['城建税'], row['增值税'], row['消费税']), axis=1)
test['应纳税额'] = test.apply(lambda row: get_tax(row['企业所得税'], row['城建税率']), axis=1)

test['经营范围1'] = test['经营范围'].apply(deal_manage_range_bin, args=(0,4,))
test['经营范围2'] = test['经营范围'].apply(deal_manage_range_bin, args=(4,8,))
test['经营范围3'] = test['经营范围'].apply(deal_manage_range_bin, args=(8,16,))
test['经营范围4'] = test['经营范围'].apply(deal_manage_range_bin, args=(16,64,))
test['经营范围5'] = test['经营范围'].apply(deal_manage_range_bin, args=(64,128,))
test['经营范围6'] = test['经营范围'].apply(deal_manage_range_bin, args=(128,256,))
test['经营范围7'] = test['经营范围'].apply(deal_manage_range_bin, args=(256,600,))
test['经营范围8'] = test['经营范围'].apply(deal_manage_range_bin, args=(600,3000,))
test['经营范围9'] = test['经营范围'].apply(deal_manage_range_bin, args=(3000,))

test['经营范围'] = test['经营范围'].apply(deal_manage_range)

cat_feat = ['行业代码','行业门类','管辖机关', '企业类别','企业类型','登记机关']
for item in cat_feat:
    test[item] = test[item].apply(deal_cat)
    
test['行业代码plus'] = test['行业代码'].apply(deal_industry_code)

test['行业门类plus'] = test['行业门类'].apply(deal_industry_class)

test['企业类型plus'] = test['企业类型'].apply(deal_industry_type)

c2c = ['长期负债合计_年初数','长期负债合计_年末数','其他负债（或长期负债）_年初数','其他负债（或长期负债）_年末数','其他应交款_年初数',
'其他应交款_年末数','应付福利费_年初数','应付福利费_年末数','预提费用_年初数','预提费用_年末数','待摊费用_年初数','待摊费用_年末数',
'应收补贴款_年初数','应收补贴款_年末数','长期投资合计_年初数','长期投资合计_年末数','固定资产净额_年初数','固定资产净额_年末数',
'固定资产净值_年初数','固定资产净值_年末数','无形资产及其他资产合计_年初数','无形资产及其他资产合计_年末数']

columns = test.columns.tolist()
columns = list(set(columns)- set(c2c))
for item in columns:
    parts = item.split("_")
    if len(parts) == 2:
        if parts[1] == '年初数':
            test[parts[0] + '_diff'] = test.apply(lambda row: cal_diff(row[item], row[parts[0]+"_年末数"]), axis=1)

for item in c2c:
    test[item] = test[item].apply(enus2classfify)
    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-19-29272e92c9bb> in <module>()
      9 test['应纳税额'] = test.apply(lambda row: get_tax(row['企业所得税'], row['城建税率']), axis=1)
     10 
---> 11 test['经营范围1'] = test['经营范围'].apply(deal_manage_range_bin, args=(0,4,))
     12 test['经营范围2'] = test['经营范围'].apply(deal_manage_range_bin, args=(4,8,))
     13 test['经营范围3'] = test['经营范围'].apply(deal_manage_range_bin, args=(8,16,))

~/anaconda3/lib/python3.6/site-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
   4043             else:
   4044                 values = self.astype(object).values
-> 4045                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   4046 
   4047         if len(mapped) and isinstance(mapped[0], Series):

pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()

~/anaconda3/lib/python3.6/site-packages/pandas/core/series.py in f(x)
   4029 
   4030             def f(x):
-> 4031                 return func(x, *args, **kwds)
   4032 
   4033         else:

<ipython-input-11-31168e103d35> in deal_manage_range_bin(x, lower, upper)
     10     for item in items:
     11         counts = range_dict.get(item)
---> 12         if counts<=upper and counts>lower:
     13             count += 1
     14     return count

TypeError: '<=' not supported between instances of 'NoneType' and 'int'

In [35]:
test_X = test[filted_feats]
result = cat_clf.predict_proba(test_X)

In [36]:
result = [item[1] for item in result]
result

[0.0005293366824749417,
 0.03613565752374427,
 0.002585735991622814,
 0.0015161269188012595,
 0.00158624812760472,
 0.0009524545095219848,
 0.0007120038130775732,
 0.001694371717622933,
 0.0027621101787371665,
 0.0028556945996887372,
 0.0007548480550277118,
 0.0023127425066325276,
 0.003991420233112344,
 0.016831489190690357,
 0.0023484620950662886,
 0.004735428908741838,
 0.22994324113953105,
 0.003781314696630403,
 0.0011284567298747244,
 0.004094058917889714,
 0.0019297035234184935,
 0.0030714382512098868,
 0.003311055833514744,
 0.0016876384400340168,
 0.00217077012692186,
 0.3038554670639643,
 0.0008744449107122831,
 0.45193253370840397,
 0.002235035923269852,
 0.0028833160561546264,
 0.002043208636227297,
 0.0019740037435069476,
 0.35020893159097194,
 0.0007028879566704637,
 0.0032105315962323955,
 0.36311254579081664,
 0.003148020244095431,
 0.0006805564285467836,
 0.0036659071551098893,
 0.4805306516362622,
 0.17967918297772667,
 0.0008073686721325414,
 0.004947681512161565,
 0

In [37]:
submission = pd.DataFrame()
submission['ID'] = test['ID']
submission['Label'] = result
submission

Unnamed: 0,ID,Label
0,0,0.000529
1,2,0.036136
2,4,0.002586
3,5,0.001516
4,6,0.001586
5,7,0.000952
6,8,0.000712
7,11,0.001694
8,12,0.002762
9,13,0.002856


In [38]:
submission.to_csv('submission.csv', index=False, encoding='UTF8')

In [25]:
feat = ['在建工程_年初数','其他流动负债_年初数','应收票据_年初数','应收股利_年初数','长期股权投资_diff',
'负债合计_年初数','流动资产合计_年末数','货币资金_diff','固定资产原价_diff','流动资产合计_年初数',
'流动负债合计_年初数','其他应收款_年初数','教育费','货币资金_年初数','增值税','应纳税额','城建税率','城建税/消费税']

In [27]:
from sklearn.model_selection import KFold
import numpy as np
def cv(X, y, feat, cat, test=None):
    if test is not None:
        test_X = test[feat]
        preds = []

    NFOLDS = 5
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)
    score = 0
    for fold, (trn_idx, test_idx) in enumerate(folds.split(X, y)):
        clf = make_classifier()
        clf.fit(X.iloc[trn_idx], y[trn_idx], eval_set=(X.iloc[test_idx], y[test_idx]),
                              use_best_model=True, verbose=500)

        result = clf.predict_proba(X.iloc[test_idx])
        result = [item[1] for item in result]
        score += roc_auc_score(y[test_idx], result)

        if test is not None:
            result = clf.predict_proba(test_X)
            result = [item[1] for item in result]
            preds.append(result)
    print(score/NFOLDS)
    if test is not None:
        return preds
    

In [28]:
ar = cv(X,y,feat,cat_feats,test_data)

0:	test: 0.8731975	best: 0.8731975 (0)	total: 29.6ms	remaining: 9m 52s
500:	test: 0.9287493	best: 0.9289949 (453)	total: 8.46s	remaining: 5m 29s
1000:	test: 0.9295697	best: 0.9296565 (982)	total: 17s	remaining: 5m 22s
1500:	test: 0.9309077	best: 0.9310454 (1496)	total: 25.2s	remaining: 5m 10s
2000:	test: 0.9310073	best: 0.9312063 (1725)	total: 33.4s	remaining: 5m
2500:	test: 0.9317525	best: 0.9317567 (2498)	total: 41.7s	remaining: 4m 51s
3000:	test: 0.9323241	best: 0.9323834 (2974)	total: 50.5s	remaining: 4m 45s
3500:	test: 0.9327423	best: 0.9327423 (3500)	total: 1m	remaining: 4m 44s
4000:	test: 0.9335045	best: 0.9335151 (3997)	total: 1m 9s	remaining: 4m 37s
4500:	test: 0.9340062	best: 0.9340486 (4488)	total: 1m 18s	remaining: 4m 29s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.9340485894
bestIteration = 4488

Shrink model to first 4489 iterations.
0:	test: 0.8810204	best: 0.8810204 (0)	total: 34.6ms	remaining: 11m 31s
500:	test: 0.9110385	best: 0.9115248 (433)	

In [29]:
pred_result = np.zeros(len(test_data))

In [32]:
pred_result += ar[3]

In [33]:
pred_result/=3

In [34]:
submission = pd.DataFrame()
submission['ID'] = test_data['ID']
submission['Label'] = pred_result
submission

Unnamed: 0,ID,Label
0,0,0.001614
1,2,0.021409
2,4,0.002798
3,5,0.002876
4,6,0.002454
5,7,0.001858
6,8,0.00068
7,11,0.001897
8,12,0.004487
9,13,0.004966


In [35]:
submission.to_csv('submission.csv', index=False, encoding='UTF8')