In [1]:
import json

import lightgbm as lgb
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read the four CSV inputs in a fixed order: features, labels, test set, submission file.
train, label, test, submission = (
    pd.read_csv(path)
    for path in (
        '../data/train.csv',
        '../data/train_label.csv',
        '../data/test.csv',
        '../data/submission.csv',
    )
)
# Attach the label to each feature row via the shared ID column.
train_data = train.merge(label, on='ID')

# Columns excluded from the model inputs: the join key, the postal code and the target.
drop_feat = ['ID', '邮政编码', 'Label']
feat = [column for column in train_data.columns if column not in drop_feat]

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Preview the first rows of the merged feature + label frame.
train_data.head()

Unnamed: 0,ID,企业类型,经营期限至,登记机关,企业状态,邮政编码,投资总额,注册资本,核准日期,行业代码,...,一年内到期的非流动资产_年末数,油气资产_年末数,应收补贴款_年末数,应收股利_年末数,应收利息_年末数,应收票据_年末数,预付款项_年末数,资产总计_年末数,在建工程_年末数,Label
0,1,9.0,,1.0,0.0,266300,19.11,19.11,00:00.0,5154.0,...,0.0,0.0,0.0,0.0,0.0,28.122902,1.004419,632.032182,0.0,0
1,3,11.0,,5.0,0.0,266000,29.154194,19.210446,00:00.0,7214.0,...,,,,,,,,,,0
2,9,0.0,,3.0,0.0,266000,,19.120045,13:31.0,2110.0,...,,,,,,,,,,0
3,10,0.0,42:05.0,9.0,0.0,266000,,19.160223,00:00.0,5163.0,...,,,,,,,,,,0
4,14,0.0,,2.0,0.0,266400,,19.210446,00:00.0,5439.0,...,0.0,0.0,0.0,0.0,0.0,10.044194,0.508236,319.205293,0.0,0


In [4]:
def deal_time(x):
    """Convert a colon-separated time-like string into a single number.

    The text before the first colon becomes the integer part and the text
    between the first and second colon is divided by 100 and added to it,
    e.g. ``'13:31.0'`` -> ``13.31``. Anything after a second colon is ignored.

    Args:
        x: Raw cell value. May be a string such as ``'42:05.0'``, a missing
            value (``None`` / NaN), or a number.

    Returns:
        The numeric encoding for colon-separated input; ``x`` unchanged when it
        is missing; ``float(x)`` when the value contains no colon (so applying
        this function twice to the same column is safe).

    Raises:
        ValueError: If the value is neither missing, colon-separated numeric
            text, nor parseable as a float.
    """
    # Missing values are passed through untouched.
    if x is None or str(x) == 'nan':
        return x

    text = str(x)
    if ':' not in text:
        # No colon means the value is already numeric (for example when the
        # conversion cell is re-run), so there is nothing to split.
        return float(text)

    parts = text.split(':')
    return int(parts[0]) + float(parts[1]) / 100

In [7]:
def cal_diff(x, y):
    """Return ``y - x`` as a float, propagating a missing operand.

    The operands are checked in order (``x`` first, then ``y``); the first one
    whose string form is ``'nan'`` is returned as-is. Otherwise both are cast
    to ``float`` and the difference ``y - x`` is returned.
    """
    for operand in (x, y):
        if str(operand) == 'nan':
            return operand
    return float(y) - float(x)

In [8]:
def deal_manage_range(x):
    """Return the number of top-level entries in a JSON-encoded value.

    Args:
        x: JSON text (presumably an array of entries; verify against the data),
            or a missing value (``None`` / NaN).

    Returns:
        ``len()`` of the decoded JSON value, or ``x`` unchanged when it is
        missing. The pass-through keeps this helper consistent with
        ``deal_time`` and ``cal_diff`` and avoids the ``TypeError`` that
        ``json.loads`` raises for a float NaN.
    """
    if x is None or str(x) == 'nan':
        return x
    return len(json.loads(x))

In [9]:
# Encode the colon-separated time-like columns as numbers.
for time_col in ['经营期限至', '经营期限自', '成立日期', '核准日期', '注销时间']:
    train_data[time_col] = train_data[time_col].apply(deal_time)
# Replace the JSON text in this column with its number of entries.
train_data['经营范围'] = train_data['经营范围'].apply(deal_manage_range)

# For every "<name>_年初数" column that has a matching "<name>_年末数" column, add
# "<name>_diff" = year-end minus year-start. Iterate over a snapshot of the
# column list because new columns are added inside the loop.
# NOTE(review): `feat` is built from the columns before this cell runs, so the
# new *_diff columns are not model inputs unless `feat` is recomputed.
for item in list(train_data.columns):
    parts = item.split("_")
    if len(parts) == 2 and parts[1] == '年初数':
        end_col = parts[0] + "_年末数"
        if end_col not in train_data.columns:
            # No counterpart to subtract from; skip this column.
            continue
        # Column-wise subtraction; NaN in either operand yields NaN, which is
        # the same result cal_diff gives for a single pair of values.
        train_data[parts[0] + '_diff'] = (
            train_data[end_col].astype(float) - train_data[item].astype(float)
        )

ValueError: invalid literal for int() with base 10: 'nan'

In [8]:
# Look at the first rows of train_data again to check the current column values.
train_data.head()

Unnamed: 0,ID,企业类型,经营期限至,登记机关,企业状态,邮政编码,投资总额,注册资本,核准日期,行业代码,...,一年内到期的非流动资产_年末数,油气资产_年末数,应收补贴款_年末数,应收股利_年末数,应收利息_年末数,应收票据_年末数,预付款项_年末数,资产总计_年末数,在建工程_年末数,Label
0,1,9.0,,1.0,0.0,266300,19.11,19.11,0.0,5154.0,...,0.0,0.0,0.0,0.0,0.0,28.122902,1.004419,632.032182,0.0,0
1,3,11.0,,5.0,0.0,266000,29.154194,19.210446,0.0,7214.0,...,,,,,,,,,,0
2,9,0.0,,3.0,0.0,266000,,19.120045,13.31,2110.0,...,,,,,,,,,,0
3,10,0.0,42.05,9.0,0.0,266000,,19.160223,0.0,5163.0,...,,,,,,,,,,0
4,14,0.0,,2.0,0.0,266400,,19.210446,0.0,5439.0,...,0.0,0.0,0.0,0.0,0.0,10.044194,0.508236,319.205293,0.0,0


In [8]:
# Model inputs are the columns listed in `feat`; the target is the 'Label' column.
X = train_data[feat]
y = train_data['Label'].values

# Hold out 30% of the rows for validation, with a fixed seed so the split is repeatable.
split_options = dict(test_size=0.3, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(X, y, **split_options)

In [9]:
# LightGBM training parameters for the binary classifier.
params = {'num_leaves': 60,  # strong effect on final quality: larger fits better, too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary',  # objective function
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "is_unbalance": True,
          "boosting": "gbdt",
          "feature_fraction": 0.9,  # fraction of features sampled
          "bagging_freq": 1,
          "bagging_fraction": 0.8,
          "bagging_seed": 11,
          "lambda_l1": 0.1,  # L1 regularization
          # 'lambda_l2': 0.001,     # L2 regularization
          "verbosity": -1,
          "nthread": -1,  # thread count; original note: -1 means all threads (TODO confirm against LightGBM docs)
          # Ordered list instead of a set: set iteration order of strings is not
          # stable across interpreter runs, which made the metric order passed
          # to LightGBM nondeterministic.
          'metric': ['binary_logloss', 'auc'],  # evaluation metrics
          "random_state": 2019,  # random seed, keeps results consistent between runs
          # 'device': 'gpu'  # if the GPU build of lightgbm is installed, this can speed up training
          }



In [10]:
# Wrap the training and hold-out splits in LightGBM Dataset objects.
trn_data = lgb.Dataset(data=x_train, label=y_train)
val_data = lgb.Dataset(data=x_test, label=y_test)

In [11]:
# Upper bound on boosting iterations; early stopping may end training sooner.
num_round = 1000
# NOTE(review): newer LightGBM releases replace `verbose_eval` and
# `early_stopping_rounds` with callbacks — confirm against the installed
# version before upgrading.
clf = lgb.train(
    params=params,
    train_set=trn_data,
    num_boost_round=num_round,
    valid_sets=[trn_data, val_data],
    verbose_eval=20,           # log the metrics every 20 iterations
    early_stopping_rounds=60,  # stop once validation scores stall for 60 rounds
)

Training until validation scores don't improve for 60 rounds.
[20]	training's binary_logloss: 0.164369	training's auc: 0.967385	valid_1's binary_logloss: 0.172732	valid_1's auc: 0.915557
[40]	training's binary_logloss: 0.164452	training's auc: 0.970559	valid_1's binary_logloss: 0.187182	valid_1's auc: 0.917751
[60]	training's binary_logloss: 0.172369	training's auc: 0.972993	valid_1's binary_logloss: 0.206312	valid_1's auc: 0.918147
Early stopping, best iteration is:
[16]	training's binary_logloss: 0.167266	training's auc: 0.966385	valid_1's binary_logloss: 0.171934	valid_1's auc: 0.915354


In [12]:
# Remove the row limit when displaying frames. The full option key is used
# because the abbreviated 'max_rows' pattern can match more than one pandas
# option and then raises OptionError.
pd.set_option('display.max_rows', None)

# One row per model input: its name and the importance reported by the booster.
feature_importance_df = pd.DataFrame({
    "Feature": feat,
    "importance": clf.feature_importance(),
})
feature_importance_df

Unnamed: 0,Feature,importance
0,企业类型,21
1,经营期限至,37
2,登记机关,29
3,企业状态,1
4,投资总额,12
5,注册资本,38
6,核准日期,34
7,行业代码,62
8,注销时间,7
9,经营期限自,48


In [13]:
# Predict on the hold-out split using the best iteration found during training,
# then report ROC AUC against the true labels. `roc_auc_score` is imported with
# the other imports at the top of the notebook.
prob_oof = clf.predict(x_test, num_iteration=clf.best_iteration)
score = roc_auc_score(y_test, prob_oof)
score

0.9153535939653297