In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read the four competition tables; paths are relative to the notebook's folder.
train, label, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/train_label.csv',
        '../data/test.csv',
        '../data/submission.csv',
    )
)

# Attach the target to every training row through the shared 'ID' key.
train_data = train.merge(label, on='ID')

# Columns that are never used as model inputs (includes the key and the target).
drop_feat = [
    'ID',
    '经营期限至',
    '经营期限自',
    '成立日期',
    '经营范围',
    '邮政编码',
    '核准日期',
    '注销时间',
    'Label',
]
# Everything else, in the original column order, is a feature.
feat = [column for column in train_data.columns if column not in drop_feat]

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Feature matrix and target vector for the hold-out experiment.
X = train_data.loc[:, feat]
y = train_data['Label'].values

# 70 % / 30 % split with a fixed seed so the hold-out set is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [29]:
# LightGBM training parameters shared by the hold-out run and the K-fold run.
params = {
    'num_leaves': 60,  # large impact on the final result; larger fits better, too large overfits
    'min_data_in_leaf': 30,
    'objective': 'binary',  # objective function
    'max_depth': -1,
    'learning_rate': 0.03,
    'min_sum_hessian_in_leaf': 6,
    'is_unbalance': True,
    'boosting': 'gbdt',
    'feature_fraction': 0.9,  # fraction of features sampled
    'bagging_freq': 1,
    'bagging_fraction': 0.8,
    'bagging_seed': 11,
    'lambda_l1': 0.1,  # L1 regularization
    # 'lambda_l2': 0.001,  # L2 regularization
    'verbosity': -1,
    'nthread': -1,  # number of threads; -1 means all threads, more threads run faster
    # Evaluation metrics. A list (not a set) keeps the metric order identical
    # from one kernel to the next; set iteration order of strings depends on
    # hash randomisation.
    'metric': ['binary_logloss', 'auc'],
    'random_state': 2019,  # random seed, keeps results consistent between runs
    # 'device': 'gpu'  # speeds up computation if the GPU build of lightgbm is installed
}



In [30]:
# Wrap both halves of the hold-out split in LightGBM Dataset containers.
trn_data = lgb.Dataset(data=x_train, label=y_train)
val_data = lgb.Dataset(data=x_test, label=y_test)

In [31]:
# Upper bound on the number of boosting rounds.
num_round = 1000

# Train on the hold-out split: metrics are printed every 20 rounds and
# training stops once validation scores fail to improve for 60 rounds.
clf = lgb.train(
    params=params,
    train_set=trn_data,
    num_boost_round=num_round,
    valid_sets=[trn_data, val_data],
    verbose_eval=20,
    early_stopping_rounds=60,
)

Training until validation scores don't improve for 60 rounds.
[20]	training's binary_logloss: 0.165998	training's auc: 0.965019	valid_1's binary_logloss: 0.173425	valid_1's auc: 0.919586
[40]	training's binary_logloss: 0.168648	training's auc: 0.967365	valid_1's binary_logloss: 0.189735	valid_1's auc: 0.920359
[60]	training's binary_logloss: 0.178294	training's auc: 0.96921	valid_1's binary_logloss: 0.209601	valid_1's auc: 0.920204
Early stopping, best iteration is:
[16]	training's binary_logloss: 0.168524	training's auc: 0.963886	valid_1's binary_logloss: 0.172473	valid_1's auc: 0.918992


In [32]:
# Show every row when a DataFrame is displayed. The fully qualified option
# name is used because the short pattern 'max_rows' matches more than one
# option on pandas versions that also define 'styler.render.max_rows'.
pd.set_option('display.max_rows', None)

# One row per input feature, paired with the importance reported by the
# trained LightGBM booster.
feature_importance_df = pd.DataFrame({
    "Feature": feat,
    "importance": clf.feature_importance(),
})

In [33]:
# Display the LightGBM feature-importance table assembled in the previous cell.
feature_importance_df

Unnamed: 0,Feature,importance
0,企业类型,34
1,登记机关,39
2,企业状态,5
3,投资总额,20
4,注册资本,73
5,行业代码,115
6,行业门类,30
7,企业类别,2
8,管辖机关,30
9,增值税,76


In [34]:
# Score the hold-out rows using the booster's best early-stopping iteration.
prob_oof = clf.predict(
    data=x_test,
    num_iteration=clf.best_iteration,
)

In [35]:
# threshold = 0.4
# prob_oof = [1 if item > threshold else 0 for item in prob_oof ]

In [36]:
from sklearn.metrics import roc_auc_score

# ROC-AUC of the LightGBM predictions on the hold-out split.
score = roc_auc_score(y_true=y_test, y_score=prob_oof)
score

0.9189918038732137

In [18]:
from collections import Counter

# prob_oof holds continuous scores, so counting it directly would produce
# roughly one bucket per sample. Convert the scores to hard 0/1 labels first;
# values that are already 0/1 pass through this step unchanged.
threshold = 0.5
pred_labels = [1 if p > threshold else 0 for p in prob_oof]

# Class balance of the true hold-out labels vs. the predicted labels.
print(Counter(y_test))
print(Counter(pred_labels))

Counter({0: 4055, 1: 255})
Counter({0: 4310})


In [None]:
# 5-fold cross-validation with LightGBM.
# Uses X / y / feat from the hold-out split cell (the LightGBM feature set), so
# run it before the CatBoost section re-defines those names.
# Produces:
#   prob_oof              - out-of-fold score for every training row
#   test_pred_prob        - test-set score averaged over the folds
#   feature_importance_df - per-fold feature importances, stacked
#   result                - 0/1 test labels obtained by thresholding
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
prob_oof = np.zeros((X.shape[0], ))
test_pred_prob = np.zeros((test.shape[0], ))


## train and predict
fold_importances = []
# Split on X (the merged training frame) so the fold indices always match the
# rows of X and y.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X)):
    print("fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(X.iloc[trn_idx], label=y[trn_idx])
    val_data = lgb.Dataset(X.iloc[val_idx], label=y[val_idx])

    clf = lgb.train(params,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=20,
                    early_stopping_rounds=60)
    # Out-of-fold scores for the rows this fold did not train on.
    prob_oof[val_idx] = clf.predict(X.iloc[val_idx], num_iteration=clf.best_iteration)

    fold_importances.append(pd.DataFrame({
        "Feature": feat,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

    # Each fold contributes an equal share of the averaged test prediction.
    test_pred_prob += clf.predict(test[feat], num_iteration=clf.best_iteration) / folds.n_splits

# Concatenate once after the loop instead of growing a frame inside it.
feature_importance_df = pd.concat(fold_importances, axis=0)

# Hard 0/1 labels for every test row (one entry per row of `test`).
threshold = 0.5
result = (test_pred_prob > threshold).astype(int)

In [37]:
### CatBoost model


In [3]:
from catboost import CatBoostClassifier

In [4]:
def make_classifier():
    """Return a new, unfitted CatBoostClassifier with this notebook's settings.

    The model optimises Logloss, reports AUC as its evaluation metric, runs on
    CPU with a learning rate of 0.01 for at most 20000 iterations, and uses the
    "Iter" overfitting detector with early_stopping_rounds=500. The random
    seed is fixed at 2019.
    """
    # Disabled overrides kept for reference: depth=8, l2_leaf_reg=1, border_count=96
    settings = dict(
        loss_function='Logloss',
        eval_metric="AUC",
        task_type="CPU",
        learning_rate=0.01,
        iterations=20000,
        od_type="Iter",
        early_stopping_rounds=500,
        random_seed=2019,
    )
    return CatBoostClassifier(**settings)

In [5]:
# x_train,x_test,y_train,y_test
drop_feat = ['ID', '经营期限至', '经营期限自','成立日期','经营范围','核准日期','注销时间','Label']
feat = [item for item in train_data.columns if item not in drop_feat]
cat_features=['邮政编码']
train_data[cat_features] = list(str(item) for item in train_data[cat_features])
X = train_data[feat]
y = train_data['Label'].values
x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.3,random_state=42)


cat_clf = make_classifier()
cat_clf.fit(x_train, y_train, eval_set=(x_test, y_test),
            use_best_model=True, verbose=500, cat_features=cat_features)
    

0:	test: 0.8481405	best: 0.8481405 (0)	total: 76.8ms	remaining: 25m 35s
500:	test: 0.9152603	best: 0.9152603 (500)	total: 7.03s	remaining: 4m 33s
1000:	test: 0.9180242	best: 0.9180242 (1000)	total: 14.5s	remaining: 4m 35s
1500:	test: 0.9198835	best: 0.9201349 (1433)	total: 21.9s	remaining: 4m 29s
2000:	test: 0.9195208	best: 0.9202611 (1647)	total: 28.8s	remaining: 4m 18s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.9202611155
bestIteration = 1647

Shrink model to first 1648 iterations.


<catboost.core.CatBoostClassifier at 0x10a735470>

In [6]:
from sklearn.metrics import roc_auc_score

# Keep only the second column of predict_proba (the score for class 1).
prob_oof = [row[1] for row in cat_clf.predict_proba(x_test)]

# ROC-AUC of the CatBoost predictions on the hold-out split.
score = roc_auc_score(y_true=y_test, y_score=prob_oof)
score

0.920261115543628

In [9]:
# Show every row when a DataFrame is displayed. The fully qualified option
# name is used because the short pattern 'max_rows' matches more than one
# option on pandas versions that also define 'styler.render.max_rows'.
pd.set_option('display.max_rows', None)

# One row per feature the fitted CatBoost model saw, with its importance.
feature_importance_df = pd.DataFrame({
    "Feature": cat_clf.feature_names_,
    "importance": cat_clf.feature_importances_,
})

feature_importance_df

Unnamed: 0,Feature,importance
0,企业类型,1.755587
1,登记机关,1.895786
2,企业状态,0.729484
3,邮政编码,0.0
4,投资总额,1.324455
5,注册资本,2.578459
6,行业代码,2.799386
7,行业门类,1.588462
8,企业类别,0.995954
9,管辖机关,2.306347
