In [1]:
%config ZMQInteractiveShell.ast_node_interactivity='all'
%matplotlib inline
import warnings;warnings.filterwarnings('ignore')
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone
from sklearn.preprocessing import Imputer, LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.externals import joblib

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier



In [16]:
target = '标签'
uid = '申请编号'

def calc_auc(y_test, y_proba):
    auc = round(metrics.roc_auc_score(y_test, y_proba), 3)
    return auc

def ks_score(y_test, y_proba):
    scale = 4
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_proba, pos_label=1)
    KS = round(max(list(tpr-fpr)), scale)
    return KS

def ks_score_lgb(y_test, y_proba):
    scale = 4
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_proba, pos_label=1)
    KS = round(max(list(tpr-fpr)), scale)
    return 'KS', KS, True

In [3]:
############## Load

X = pd.read_csv('./tmp/1_X.csv', header=0, index_col=0)
X.shape
X.head()

y = pd.read_csv('./tmp/1_y.csv', header=0, index_col=0)
y.shape
y.head()

(140000, 329)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,319,320,321,322,323,324,325,326,327,328
0,0.0,460190.889355,17463.042019,419951.511045,7.0,-10107.0,-342.0,-5421.0,-3292.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,0.0,424370.659603,15585.046388,384597.975692,7.0,-13980.0,-1110.0,-3387.0,-826.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
2,0.0,469330.587153,18837.558252,419951.511045,1.0,-13331.0,-2246.0,-3870.0,-171.0,1.0,...,1.0,0.1,7.0,0.7,1.0,0.1,1.0,0.1,0.0,0.0
3,0.0,464188.35314,16754.587069,409850.500944,7.0,-16540.0,-1864.9312,-970.0,-2916.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,434196.988738,16165.0092,394698.985793,7.0,-17919.0,-11037.0,-9350.0,-3588.0,1.0,...,1.0,0.2,0.0,0.0,2.0,0.4,1.0,0.2,0.0,0.0


(140000, 1)

Unnamed: 0,0
0,1.0
1,0.0
2,0.0
3,0.0
4,0.0


In [31]:
########## Grid Search

scale_pos_weight = 119/21

param_general = {
#     'n_iter' : 100,
#     'cv' : 5, 
    'n_iter' : 2,
    'cv' : 2, 
    'scoring' : 'roc_auc', 
    'n_jobs' : -1, 
    'random_state' : 123, 
    'verbose' : 1}

# RF
param_dist_rf = {
    # Shape
#     'n_estimators' : range(100, 1000, 100),
    'n_estimators' : range(5, 10),
    'max_depth' : range(3, 10),
    'min_samples_split' : range(50, 100, 10),
    'min_samples_leaf' : range(50, 100, 10),
    # Sample
    'class_weight' : ['balanced', None],
    'max_features' : ['sqrt', 'log2'],
    # Objective
    'criterion' : ['gini', 'entropy']
}

# XGB
param_dist_xgb = {
    # Shape
#     'n_estimatores' : range(100, 1000, 100),
    'n_estimators' : range(5, 10),
    'max_depth' : range(3, 10),
    'min_child_weight' : range(1, 9, 1), # 最小叶子节点样本权重和
    # Sample
    'scale_pos_weight' : [scale_pos_weight, 1],
    'subsample' : np.linspace(0.5, 0.9, 5),
    'colsample_bytree' : np.linspace(0.5, 0.9, 5),
    'colsample_bylevel' : np.linspace(0.5, 0.9, 5),
    'subsample' : np.linspace(0.5, 0.9, 5),
    # Algo
    'eta' : np.linspace(0.01, 0.2, 20), # Learning_rate
    'alpha' : np.linspace(0, 1, 10),
    'lambda' : range(0, 50, 5),
    'early_stopping_rounds' : range(10, 20, 5)
}

# LGB
param_dist_lgb = {
    # Shape
#     'num_boost_round' : range(100, 1000, 100),
    'num_boost_round' : range(50, 100, 10),
    'num_leaves' : range(2**3, 2**10, 100),
    'min_data_in_leaf' : range(50, 100, 10),
    'min_child_weight' : range(1, 9, 1), # 最小叶子节点样本权重和
    # Sample
    'is_unbalance' : [True, False],
    'bagging_freq': range(2, 10), # >0 enable bagging_fraction
    'bagging_fraction': np.linspace(0.5, 0.9, 5),
    'feature_fraction': np.linspace(0.5, 0.9, 5),
    'subsample' : np.linspace(0.5, 0.9, 5),
    # Algo
    'learning_rate':np.linspace(0.01, 0.2, 20),    
    'lambda_l1': np.linspace(0, 1, 10),
    'lambda_l2': range(0, 50, 5),
    'cat_smooth': range(1, 40, 5)
#     'early_stopping_rounds' : range(10, 20, 5)
}

##########

# RF
param_fixed_rf = {
    'n_jobs' : -1,
    'oob_score' : True,
    'random_state':123,
    'verbose':0
}

# XGB
param_fixed_xgb = {
    'n_jobs' : -1,
    'eval_metric': 'auc',
    'seed' : 123,
    'silent' : 1,
    'verbose_eval':0
}

# LGB
param_fixed_lgb = {
    'n_jobs' : -1,
    'metric' : 'auc',
    'bagging_seed':123,
    'feature_fraction_seed':123,
    'verbose_eval' : 0
}

In [41]:
model_params

array({'n_estimators': 9, 'min_samples_split': 50, 'min_samples_leaf': 80, 'max_features': 'log2', 'max_depth': 9, 'criterion': 'entropy', 'class_weight': 'balanced'},
      dtype=object)

In [47]:
model_params
param_fixed_rf
{**model_params, **param_fixed_rf}

{'n_estimators': 9,
 'min_samples_split': 50,
 'min_samples_leaf': 80,
 'max_features': 'log2',
 'max_depth': 9,
 'criterion': 'entropy',
 'class_weight': 'balanced'}

{'n_jobs': -1, 'oob_score': True, 'random_state': 123, 'verbose': 0}

{'n_estimators': 9,
 'min_samples_split': 50,
 'min_samples_leaf': 80,
 'max_features': 'log2',
 'max_depth': 9,
 'criterion': 'entropy',
 'class_weight': 'balanced',
 'n_jobs': -1,
 'oob_score': True,
 'random_state': 123,
 'verbose': 0}

In [52]:
############## RF

''' Baseline '''
baseline = RandomForestClassifier(**param_fixed_rf)
baseline.fit(X, y)
pred_baseline = baseline.predict_proba(X)
ks_score(y, pred_baseline[:,1])

''' Best '''
grid = RandomizedSearchCV(RandomForestClassifier(**param_fixed_rf), param_dist_rf, **param_general)
grid.fit(X, y)
grid.best_score_
best_params = grid.best_params_
np.save('./model/base_rf.npy', best_params)

''' Test Clone Model '''
model1 = grid.best_estimator_
model1.fit(X, y)
ks_score(y, model1.predict_proba(X)[:,1])

''' Test Save Params '''
best_params_load = np.load('./model/base_rf.npy', allow_pickle=True).item()
model2_params = {**best_params_load, **param_fixed_rf}
model2 = RandomForestClassifier(**model2_params)
model2.fit(X, y)
ks_score(y, model2.predict_proba(X)[:,1])

' Baseline '

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=True, random_state=123, verbose=0,
                       warm_start=False)

0.9848

' Best '

Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    2.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    2.6s finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=-1, 

0.6763524677871149

' Test Clone Model '

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='entropy', max_depth=9, max_features='log2',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=80,
                       min_samples_split=50, min_weight_fraction_leaf=0.0,
                       n_estimators=9, n_jobs=-1, oob_score=True,
                       random_state=123, verbose=0, warm_start=False)

0.3181

' Test Save Params '

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='entropy', max_depth=9, max_features='log2',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=80,
                       min_samples_split=50, min_weight_fraction_leaf=0.0,
                       n_estimators=9, n_jobs=-1, oob_score=True,
                       random_state=123, verbose=0, warm_start=False)

0.3181

In [54]:
############## XGB

''' Baseline '''
baseline = XGBClassifier(**param_fixed_xgb)
baseline.fit(X, y)
pred_baseline = baseline.predict_proba(X)
ks_score(y, pred_baseline[:,1])

''' Best '''
grid = RandomizedSearchCV(XGBClassifier(**param_fixed_xgb), param_dist_xgb, **param_general)
grid.fit(X, y)
grid.best_score_
best_params = grid.best_params_
np.save('./model/base_xgb.npy', best_params)

''' Test Clone Model '''
model1 = grid.best_estimator_
model1.fit(X, y)
ks_score(y, model1.predict_proba(X)[:,1])

''' Test Save Params '''
best_params_load = np.load('./model/base_xgb.npy', allow_pickle=True).item()
model2_params = {**best_params_load, **param_fixed_xgb}
model2 = XGBClassifier(**model2_params)
model2.fit(X, y)
ks_score(y, model2.predict_proba(X)[:,1])

' Baseline '

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',
              gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=123, silent=1,
              subsample=1, verbose_eval=0, verbosity=1)

0.3423

' Best '

Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.5s finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           eval_metric='auc', gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=-1, nthread=None,
                                           objective='binary:logistic',
                                           random_...
                                        'eta': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 

0.6841948163265307

' Test Clone Model '

XGBClassifier(alpha=0.5555555555555556, base_score=0.5, booster='gbtree',
              colsample_bylevel=0.7, colsample_bynode=1, colsample_bytree=0.8,
              early_stopping_rounds=15, eta=0.04, eval_metric='auc', gamma=0,
              lambda=20, learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=2, missing=None, n_estimators=7, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=123, silent=1,
              subsample=0.8, verbose_eval=0, verbosity=1)

0.3466

' Test Save Params '

XGBClassifier(alpha=0.5555555555555556, base_score=0.5, booster='gbtree',
              colsample_bylevel=0.7, colsample_bynode=1, colsample_bytree=0.8,
              early_stopping_rounds=15, eta=0.04, eval_metric='auc', gamma=0,
              lambda=20, learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=2, missing=None, n_estimators=7, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=123, silent=1,
              subsample=0.8, verbose_eval=0, verbosity=1)

0.3466

In [53]:
############## LGB

''' Baseline '''
baseline = LGBMClassifier(**param_fixed_lgb)
baseline.fit(X, y)
pred_baseline = baseline.predict_proba(X) #, num_iteration=baseline.best_iteration_)
ks_score(y, pred_baseline[:,1])

''' Best '''
grid = RandomizedSearchCV(LGBMClassifier(**param_fixed_lgb), param_dist_lgb, **param_general)
grid.fit(X, y)
grid.best_score_
best_params = grid.best_params_
np.save('./model/base_lgb.npy', best_params)

''' Test Clone Model '''
model1 = grid.best_estimator_
model1.fit(X, y)
ks_score(y, model1.predict_proba(X)[:,1])

''' Test Save Params '''
best_params_load = np.load('./model/base_lgb.npy', allow_pickle=True).item()
model2_params = {**best_params_load, **param_fixed_lgb}
model2 = LGBMClassifier(**model2_params)
model2.fit(X, y)
ks_score(y, model2.predict_proba(X)[:,1])

' Baseline '

LGBMClassifier(bagging_seed=123, boosting_type='gbdt', class_weight=None,
               colsample_bytree=1.0, feature_fraction_seed=123,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               metric='auc', min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
               objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0, verbose_eval=0)

0.4297

' Best '

Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.8s finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
                   estimator=LGBMClassifier(bagging_seed=123,
                                            boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            feature_fraction_seed=123,
                                            importance_type='split',
                                            learning_rate=0.1, max_depth=-1,
                                            metric='auc', min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimators=100, n_jobs=-1,
                                            num_leav...
                                        'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.1

0.7140608083233293

' Test Clone Model '

LGBMClassifier(bagging_fraction=0.5, bagging_freq=3, bagging_seed=123,
               boosting_type='gbdt', cat_smooth=16, class_weight=None,
               colsample_bytree=1.0, feature_fraction=0.6,
               feature_fraction_seed=123, importance_type='split',
               is_unbalance=False, lambda_l1=0.1111111111111111, lambda_l2=35,
               learning_rate=0.03, max_depth=-1, metric='auc',
               min_child_samples=20, min_child_weight=5, min_data_in_leaf=60,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1,
               num_boost_round=90, num_leaves=1008, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.9, ...)

0.468

' Test Save Params '

LGBMClassifier(bagging_fraction=0.5, bagging_freq=3, bagging_seed=123,
               boosting_type='gbdt', cat_smooth=16, class_weight=None,
               colsample_bytree=1.0, feature_fraction=0.6,
               feature_fraction_seed=123, importance_type='split',
               is_unbalance=False, lambda_l1=0.1111111111111111, lambda_l2=35,
               learning_rate=0.03, max_depth=-1, metric='auc',
               min_child_samples=20, min_child_weight=5, min_data_in_leaf=60,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1,
               num_boost_round=90, num_leaves=1008, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.9, ...)

0.468

In [25]:
# Test

rf = joblib.load('./model/base_rf.m')
lgb = lgb.Booster(model_file='./model/base_lgb.txt')

# for m in rf, xgb, lgb:
for m in rf, lgb:
    print(f'{m.__class__}')
    pred = rf1.predict_proba(X)

<class 'sklearn.ensemble.forest.RandomForestClassifier'>


NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.