In [171]:
%config ZMQInteractiveShell.ast_node_interactivity='all'
%matplotlib inline
import warnings;warnings.filterwarnings('ignore')
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from sklearn import metrics
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone
from sklearn.preprocessing import Imputer, LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.externals import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [172]:
target = '标签'
uid = '申请编号'

def get_time():
    now = datetime.datetime.now().strftime("%m-%d %H:%M")
    print(now)

def calc_auc(y_test, y_proba):
    auc = round(metrics.roc_auc_score(y_test, y_proba), 3)
    return auc

def ks_score(y_test, y_proba):
    scale = 4
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_proba, pos_label=1)
    KS = round(max(list(tpr-fpr)), scale)
    return KS

In [185]:
################  Load Features

''' Load '''
X = pd.read_csv('./tmp/train_d1234.csv', header=0, index_col=0)
X.shape

y = pd.read_csv('./data/train_label.csv', header=0, index_col=0)
y.shape

''' Merge '''
# Merge
xy = pd.merge(X, y, on=uid, how='inner')
xy.drop(uid, axis=1, inplace=True)
xy.shape

''' Split '''
# X, y
X = xy.copy()
y = X.pop(target)
X.shape
y.shape

' Load '

(140000, 405)

(140000, 1)

' Merge '

(140000, 405)

' Split '

(140000, 404)

(140000,)

In [174]:
########## Grid Search

scale_pos_weight = 119/21
cv = 5

param_general = {
    'n_iter' : 50,
    'cv' : cv, 
    'scoring' : 'roc_auc', 
    'n_jobs' : -1, 
    'random_state' : 123, 
    'verbose' : 1}

# RF
param_dist_rf = {
    # Shape
    'n_estimators' : range(50, 500, 50),
#     'n_estimators' : range(5, 10),
    'max_depth' : range(3, 10),
    'min_samples_split' : range(50, 100, 10),
    'min_samples_leaf' : range(50, 100, 10),
    # Sample
    'class_weight' : ['balanced', None],
    'max_features' : ['sqrt', 'log2'],
    # Objective
    'criterion' : ['gini', 'entropy']
}

# XGB
param_dist_xgb = {
    # Shape
    'n_estimators' : range(50, 500, 50),
#     'n_estimators' : range(5, 10),
    'max_depth' : range(3, 10),
    'min_child_weight' : range(1, 9, 1), # 最小叶子节点样本权重和
    # Sample
    'scale_pos_weight' : [scale_pos_weight, 1],
    'subsample' : np.linspace(0.5, 0.9, 5),
    'colsample_bytree' : np.linspace(0.5, 0.9, 5),
    'colsample_bylevel' : np.linspace(0.5, 0.9, 5),
    # Algo
    'eta' : np.linspace(0.01, 0.2, 20), # Learning_rate
    'alpha' : np.linspace(0, 1, 10),
    'lambda' : range(0, 50, 5),
    'early_stopping_rounds' : range(10, 20, 5)
}

# XGBClassifier(alpha=0.7777777777777777, base_score=0.5, booster='gbtree',
#               colsample_bylevel=0.7, colsample_bynode=1, colsample_bytree=0.6,
#               early_stopping_rounds=10, eta=0.03, eval_metric='auc', gamma=0,
#               lambda=0, learning_rate=0.1, max_delta_step=0, max_depth=7,
#               min_child_weight=7, missing=None, n_estimatores=200,
#               n_estimators=100, n_jobs=1, nthread=None,
#               objective='binary:logistic', random_state=0, reg_alpha=0,
#               reg_lambda=1, scale_pos_weight=1, seed=123, silent=1,
#               subsample=0.8, verbose_eval=0, verbosity=1)

# LGB
param_dist_lgb = {
    # Shape
    'num_boost_round' : range(50, 500, 50),
#     'num_boost_round' : range(50, 100, 10),
    'num_leaves' : range(2**3, 2**10, 100),
    'min_data_in_leaf' : range(50, 100, 10),
    'min_child_weight' : range(1, 9, 1), # 最小叶子节点样本权重和
    # Sample
    'is_unbalance' : [True, False],
    'bagging_freq': range(2, 10), # >0 enable bagging_fraction
    'bagging_fraction': np.linspace(0.5, 0.9, 5),
    'feature_fraction': np.linspace(0.5, 0.9, 5),
    'subsample' : np.linspace(0.5, 0.9, 5),
    # Algo
    'learning_rate':np.linspace(0.01, 0.2, 20),    
    'lambda_l1': np.linspace(0, 1, 10),
    'lambda_l2': range(0, 50, 5),
    'cat_smooth': range(1, 40, 5)
#     'early_stopping_rounds' : range(10, 20, 5)
}

param_dist_lr = {
    # Shape
    'max_iter' : range(50, 500, 50),
    # Sample
    'class_weight' : [scale_pos_weight, 1],
    # Algo
    'solver' : ['sag', 'lbfgs', 'newton-cg'],
    'C': [0.001, 0.01, 0.1, 1] # 1/λ
}

##########

# RF
param_fixed_rf = {
    'n_jobs' : -1,
    'oob_score' : True,
    'random_state':123,
    'verbose':0
}

# XGB
param_fixed_xgb = {
    'n_jobs' : -1,
    'eval_metric': 'auc',
    'seed' : 123,
    'silent' : 1,
    'verbose_eval':0
}

# LGB
param_fixed_lgb = {
    'n_jobs' : -1,
    'metric' : 'auc',
    'random_state' : 123,
    'bagging_seed':123,
    'feature_fraction_seed':123,
    'verbose_eval' : 0
}

# LR
param_fixed_lr = {
    'n_jobs' : -1,
    'random_state' : 123,
    'verbose' : 0     
}

In [32]:
############## RF

get_time()
''' Baseline '''
baseline = RandomForestClassifier(**param_fixed_rf)
baseline.fit(X, y)
pred_baseline = baseline.predict_proba(X)
ks_score(y, pred_baseline[:,1])

get_time()
''' Best '''
grid = RandomizedSearchCV(RandomForestClassifier(**param_fixed_rf), param_dist_rf, **param_general)
grid.fit(X, y)
grid.best_score_
best_params = grid.best_params_
np.save('./model/base_rf.npy', best_params)
get_time()

# ''' Test Clone Model '''
# model1 = grid.best_estimator_
# model1.fit(X, y)
# ks_score(y, model1.predict_proba(X)[:,1])
# 
# ''' Test Save Params '''
# best_params_load = np.load('./model/base_rf.npy', allow_pickle=True).item()
# model2_params = {**best_params_load, **param_fixed_rf}
# model2 = RandomForestClassifier(**model2_params)
# model2.fit(X, y)
# ks_score(y, model2.predict_proba(X)[:,1])

10-20 09:51


' Baseline '

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=True, random_state=123, verbose=0,
                       warm_start=False)

0.9852

10-20 09:51


' Best '

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 51.0min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 68.8min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=-1, 

0.7002853321328532

10-20 11:01


In [34]:
############## XGB

get_time()
''' Baseline '''
baseline = XGBClassifier(**param_fixed_xgb)
baseline.fit(X, y)
pred_baseline = baseline.predict_proba(X)
ks_score(y, pred_baseline[:,1])

get_time()
''' Best '''
grid = RandomizedSearchCV(XGBClassifier(**param_fixed_xgb), param_dist_xgb, **param_general)
grid.fit(X, y)
grid.best_score_
best_params = grid.best_params_
np.save('./model/base_xgb.npy', best_params)
get_time()
get_time()

# ''' Test Clone Model '''
# model1 = grid.best_estimator_
# model1.fit(X, y)
# ks_score(y, model1.predict_proba(X)[:,1])

# ''' Test Save Params '''
# best_params_load = np.load('./model/base_xgb.npy', allow_pickle=True).item()
# model2_params = {**best_params_load, **param_fixed_xgb}
# model2 = XGBClassifier(**model2_params)
# model2.fit(X, y)
# ks_score(y, model2.predict_proba(X)[:,1])

10-20 12:13


' Baseline '

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',
              gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=123, silent=1,
              subsample=1, verbose_eval=0, verbosity=1)

0.3422

10-20 12:14


' Best '

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 28.8min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 160.9min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 220.0min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           eval_metric='auc', gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=-1, nthread=None,
                                           objective='binary:logistic',
                                           random_...
                                        'eta': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 

0.7251946138455383

10-20 15:55
10-20 15:55


In [184]:
X[X.columns[X.columns.str.startswith('外部评分')]].sum(axis=0)

外部评分_x    66440.027278
外部评分_y    66440.027278
dtype: float64

In [33]:
############## LGB

get_time()
''' Baseline '''
baseline = LGBMClassifier(**param_fixed_lgb)
baseline.fit(X, y)
pred_baseline = baseline.predict_proba(X) #, num_iteration=baseline.best_iteration_)
ks_score(y, pred_baseline[:,1])

get_time()
''' Best '''
grid = RandomizedSearchCV(LGBMClassifier(**param_fixed_lgb), param_dist_lgb, **param_general)
grid.fit(X, y)
grid.best_score_
best_params = grid.best_params_
np.save('./model/base_lgb.npy', best_params)
get_time()

# ''' Test Clone Model '''
# model1 = grid.best_estimator_
# model1.fit(X, y)
# ks_score(y, model1.predict_proba(X)[:,1])

# ''' Test Save Params '''
# best_params_load = np.load('./model/base_lgb.npy', allow_pickle=True).item()
# model2_params = {**best_params_load, **param_fixed_lgb}
# model2 = LGBMClassifier(**model2_params)
# model2.fit(X, y)
# ks_score(y, model2.predict_proba(X)[:,1])

10-20 11:03


' Baseline '

LGBMClassifier(bagging_seed=123, boosting_type='gbdt', class_weight=None,
               colsample_bytree=1.0, feature_fraction_seed=123,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               metric='auc', min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
               objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0, verbose_eval=0)

0.4274

10-20 11:03


' Best '

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 52.2min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 69.1min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=LGBMClassifier(bagging_seed=123,
                                            boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            feature_fraction_seed=123,
                                            importance_type='split',
                                            learning_rate=0.1, max_depth=-1,
                                            metric='auc', min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimators=100, n_jobs=-1,
                                            num_leav...
                                        'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.1

0.7294587094837935

10-20 12:13


In [170]:
############## LR

X_meta = pd.read_csv('./tmp/meta_X.csv', header=0, index_col=0)
X_meta.shape
y.shape

''' Baseline '''
baseline = LogisticRegression(**param_fixed_lr)
baseline.fit(X_meta, y)
pred_baseline = baseline.predict_proba(X_meta)
ks_score(y, pred_baseline[:,1])

''' Best '''
grid = RandomizedSearchCV(LogisticRegression(**param_fixed_lr), param_dist_lr, **param_general)
grid.fit(X_meta, y)
grid.best_score_
best_params = grid.best_params_
np.save('./model/base_lr.npy', best_params)

''' Test Clone Model '''
model1 = grid.best_estimator_
model1.fit(X_meta, y)
ks_score(y, model1.predict_proba(X_meta)[:,1])

''' Test Save Params '''
best_params_load = np.load('./model/base_lr.npy', allow_pickle=True).item()
model2_params = {**best_params_load, **param_fixed_lr}
model2 = LogisticRegression(**model2_params)
model2.fit(X_meta, y)
ks_score(y, model2.predict_proba(X_meta)[:,1])

(140000, 3)

(140000,)

' Baseline '

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=123, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

0.3503

' Best '

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   20.9s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='warn', n_jobs=-1,
                                                penalty='l2', random_state=123,
                                                solver='warn', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='warn', n_iter=50, n_jobs=-1,
                   param_distributions={'C': [0.001, 0.01, 0.1, 1],
                                        'class_weight': [5.666666666666667, 1],
                                        'max_iter': range(50, 500, 50),
                                        'solver': ['sag',

0.7358445518207283

' Test Clone Model '

LogisticRegression(C=1, class_weight=1, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=50,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=123, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

0.3504

' Test Save Params '

LogisticRegression(C=1, class_weight=1, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=50,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=123, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

0.3504

In [186]:
################ Important Features

######## RF

best_params_load = np.load('./model/base_rf.npy', allow_pickle=True).item()
model_params = {**best_params_load, **param_fixed_rf}
RF = RandomForestClassifier(**model_params)

# Train
RF.fit(X, y)

# Importance
f = pd.DataFrame(X.columns, columns=['feature'])
score = pd.DataFrame(RF.feature_importances_, columns=['rf'])
fscore_rf = pd.concat([f, score], axis=1).sort_values(by='rf', ascending=False).reset_index(drop=True)
fscore_rf.head()

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='entropy', max_depth=9, max_features='sqrt',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=50,
                       min_samples_split=50, min_weight_fraction_leaf=0.0,
                       n_estimators=400, n_jobs=-1, oob_score=True,
                       random_state=123, verbose=0, warm_start=False)

Unnamed: 0,feature,rf
0,外部评分,0.069753
1,出生日期距申请日期天数,0.045533
2,工作日期距申请日期天数,0.033629
3,教育程度,0.026104
4,信用额度sum比,0.021362


In [187]:
############# XGB importance

# XGB
best_params_load = np.load('./model/base_xgb.npy', allow_pickle=True).item()
model_params = {**best_params_load, **param_fixed_xgb}
XGB = XGBClassifier(**model_params)

# Train
XGB.fit(X, y)

# Importance
f = pd.DataFrame(X.columns, columns=['feature'])
score = pd.DataFrame(XGB.feature_importances_, columns=['xgb'])
fscore_xgb = pd.concat([f, score], axis=1).sort_values(by='xgb', ascending=False).reset_index(drop=True)
fscore_xgb.head()

XGBClassifier(alpha=0.7777777777777777, base_score=0.5, booster='gbtree',
              colsample_bylevel=0.7, colsample_bynode=1, colsample_bytree=0.6,
              early_stopping_rounds=10, eta=0.03, eval_metric='auc', gamma=0,
              lambda=0, learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=7, missing=None, n_estimatores=200,
              n_estimators=100, n_jobs=-1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=123, silent=1,
              subsample=0.8, verbose_eval=0, verbosity=1)

Unnamed: 0,feature,xgb
0,教育程度,0.018646
1,客户类型_0sum,0.014576
2,收入类型,0.01374
3,外部评分,0.012082
4,客户居住地评分2,0.011659


In [188]:
############# LGB importance

# LGB
best_params_load = np.load('./model/base_lgb.npy', allow_pickle=True).item()
model_params = {**best_params_load, **param_fixed_lgb}
LGB = LGBMClassifier(**model_params)

# Train
LGB.fit(X, y)

# Importance
f = pd.DataFrame(X.columns, columns=['feature'])
score = pd.DataFrame(LGB.feature_importances_, columns=['lgb'])
fscore_lgb = pd.concat([f, score], axis=1).sort_values(by='lgb', ascending=False).reset_index(drop=True)
fscore_lgb.head()

LGBMClassifier(bagging_fraction=0.9, bagging_freq=4, bagging_seed=123,
               boosting_type='gbdt', cat_smooth=16, class_weight=None,
               colsample_bytree=1.0, feature_fraction=0.9,
               feature_fraction_seed=123, importance_type='split',
               is_unbalance=True, lambda_l1=0.8888888888888888, lambda_l2=25,
               learning_rate=0.02, max_depth=-1, metric='auc',
               min_child_samples=20, min_child_weight=2, min_data_in_leaf=90,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1,
               num_boost_round=400, num_leaves=308, objective=None,
               random_state=123, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.8, ...)

Unnamed: 0,feature,lgb
0,贷款年金,3160
1,出生日期距申请日期天数,3073
2,身份认证日期距申请日期天数,3006
3,信用额度,2918
4,工作日期距申请日期天数,2840


In [189]:
############# correlations

correlations = xy.corr()

# Save
correlations.apply(abs).to_csv('./ana/0_correlations_abs.csv')
correlations.to_csv('./ana/0_correlations.csv')

# Abs
correlations_target_abs = correlations.loc[correlations.index != target, target].apply(abs).sort_values(ascending=False)

f = pd.DataFrame(correlations_target_abs.index, columns=['feature'])
score_corr = pd.DataFrame(correlations_target_abs.values, columns=['corr'])
fscore_corr = pd.concat([f, score_corr], axis=1).sort_values(by='corr', ascending=False).reset_index(drop=True)
fscore_corr.head()


# # >=0.01
# features_top = {}
# features_top['corr'] = correlations_target_abs[correlations_target_abs>=0.01]
# features_top['corr'].count()

Unnamed: 0,feature,corr
0,外部评分,0.13148
1,出生日期距申请日期天数,0.103142
2,客户居住地评分2,0.078824
3,客户居住地评分1,0.076496
4,最近一次换手机号码距申请日天数,0.074596


In [190]:
#### Merge important

fscore = pd.merge(fscore_corr, fscore_rf, on='feature')
fscore = pd.merge(fscore, fscore_xgb, on='feature')
fscore = pd.merge(fscore, fscore_lgb, on='feature')
# Add rank
frank = fscore.rank(numeric_only=True, method='min', ascending=False)
fscore = pd.merge(fscore, frank, left_index=True, right_index=True, suffixes=['', '_rank'])

fscore.shape
fscore.head()
fscore.to_csv('./model/f_score.csv')

(404, 9)

Unnamed: 0,feature,corr,rf,xgb,lgb,corr_rank,rf_rank,xgb_rank,lgb_rank
0,外部评分,0.13148,0.069753,0.012082,2253,1.0,1.0,4.0,8.0
1,出生日期距申请日期天数,0.103142,0.045533,0.007693,3073,2.0,2.0,11.0,2.0
2,客户居住地评分2,0.078824,0.009034,0.011659,326,3.0,32.0,5.0,106.0
3,客户居住地评分1,0.076496,0.007921,0.008773,208,4.0,37.0,8.0,133.0
4,最近一次换手机号码距申请日天数,0.074596,0.019166,0.005011,2442,5.0,11.0,34.0,7.0


In [209]:
#### Intersection

n = len(fscore)
m = round(n * 0.6)
m

top_f = {}
inter_cnt = {}
for k in 'corr', 'rf', 'xgb', 'lgb':
    top_f[k] = set(fscore.loc[fscore[f'{k}_rank']<=m, 'feature'])
    inter_cnt[k] = round(len(top_f['corr'].intersection(top_f[k]))/m, 2)
    
inter_cnt['rf_xgb'] = round(len(top_f['rf'].intersection(top_f['xgb']))/m, 2)
inter_cnt['rf_lgb'] = round(len(top_f['rf'].intersection(top_f['lgb']))/m, 2)
inter_cnt['xgb_lgb'] = round(len(top_f['xgb'].intersection(top_f['lgb']))/m, 2)

''' Corr '''
pd.DataFrame(inter_cnt, index=['Intersection'])

''' Describe '''
fscore[['corr', 'rf', 'xgb', 'lgb']].describe()

''' Importance ''' 
for k in 'corr', 'rf', 'xgb', 'lgb':
    k
    fscore.loc[(fscore[f'{k}_rank']>(m-3)) & (fscore[f'{k}_rank']<(m+3)), k].values

242

' Corr '

Unnamed: 0,corr,rf,xgb,lgb,rf_xgb,rf_lgb,xgb_lgb
Intersection,1.0,0.82,0.79,0.79,0.83,0.92,0.86


' Describe '

Unnamed: 0,corr,rf,xgb,lgb
count,402.0,404.0,404.0,404.0
mean,0.01865047,0.002475,0.002475,303.960396
std,0.0188935,0.00598,0.002375,538.189261
min,1.216878e-15,0.0,0.0,0.0
25%,0.003952374,2e-06,0.0,1.0
50%,0.01133761,0.00049,0.002663,67.5
75%,0.02872666,0.001745,0.003452,352.5
max,0.1314799,0.069753,0.018646,3160.0


' Importance '

'corr'

array([0.00811653, 0.008091  , 0.0078265 , 0.00772824, 0.00771769])

'rf'

array([0.00024913, 0.00024818, 0.00020295, 0.00019876, 0.00022517])

'xgb'

array([0.00233615, 0.00232134, 0.00234626, 0.00233459, 0.00229456],
      dtype=float32)

'lgb'

array([27, 29, 30, 31])

In [211]:
#### Save Top Features

m

feats = {}
for k in 'rf', 'xgb', 'lgb':
    feats[k] = fscore.loc[fscore[f'{k}_rank']<=m, 'feature'].to_list()

np.save('./model/base_features.npy', feats)

# Check
inter_cnt = {}
inter_cnt['rf_xgb'] = round(len(set(feats['rf']).intersection(set(feats['xgb'])))/m, 2)
inter_cnt['rf_lgb'] = round(len(set(feats['rf']).intersection(set(feats['lgb'])))/m, 2)
inter_cnt['xgb_lgb'] = round(len(set(feats['xgb']).intersection(set(feats['lgb'])))/m, 2)
pd.DataFrame(inter_cnt, index=['Intersection'])

# f = np.load('./model/base_features.npy', allow_pickle=True).item()

242

Unnamed: 0,rf_xgb,rf_lgb,xgb_lgb
Intersection,0.83,0.92,0.86


In [212]:
### Check Corr

corr_top = 200

top = {}
top['corr'] = set(fscore.loc[fscore['corr_rank']<=corr_top, 'feature'].to_list())
top['all'] = set(feats['rf'] + feats['xgb'] + feats['lgb'])
len(top['corr'])
len(top['all'])
''' Include? '''

''' Diff '''
diff = top['corr'].difference(top['all'])
len(diff)
diff

200

284

' Include? '

' Diff '

7

{'商品类别_17mean',
 '商品类别_17sum',
 '行业_1sum',
 '行业_3sum',
 '贷款用途_14sum',
 '贷款用途_20sum',
 '贷款用途_21sum'}