In [1]:
%config ZMQInteractiveShell.ast_node_interactivity='all'
%matplotlib inline
import warnings;warnings.filterwarnings('ignore')
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats as spstats
from sklearn import metrics
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone
from sklearn.preprocessing import Imputer, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.externals import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier



In [2]:
target = '标签'
uid = '申请编号'

def get_time():
    now = datetime.datetime.now().strftime("%m-%d %H:%M")
    print(now)

def calc_auc(y_test, y_proba):
    auc = round(metrics.roc_auc_score(y_test, y_proba), 3)
    return auc

def ks_score(y_test, y_proba):
    scale = 4
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_proba, pos_label=1)
    KS = round(max(list(tpr-fpr)), scale)
    return KS

In [25]:
########## Grid Search

scale_pos_weight = 119/21
cv = 5

param_general = {
    'n_iter' : 50,
    'cv' : cv, 
    'scoring' : 'roc_auc', 
    'n_jobs' : -1, 
    'random_state' : 123, 
    'verbose' : 1}

# RF
param_dist_rf = {
    # Shape
    'n_estimators' : range(50, 500, 50),
#     'n_estimators' : range(5, 10),
    'max_depth' : range(3, 10),
    'min_samples_split' : range(50, 100, 10),
    'min_samples_leaf' : range(50, 100, 10),
    # Sample
    'class_weight' : ['balanced', None],
    'max_features' : ['sqrt', 'log2'],
    # Objective
    'criterion' : ['gini', 'entropy']
}

# XGB
param_dist_xgb = {
    # Shape
    'n_estimators' : range(50, 500, 50),
#     'n_estimators' : range(5, 10),
    'max_depth' : range(3, 10),
    'min_child_weight' : range(1, 9, 1), # 最小叶子节点样本权重和
    # Sample
    'scale_pos_weight' : [scale_pos_weight, 1],
    'subsample' : np.linspace(0.5, 0.9, 5),
    'colsample_bytree' : np.linspace(0.5, 0.9, 5),
    'colsample_bylevel' : np.linspace(0.5, 0.9, 5),
    # Algo
    'eta' : np.linspace(0.01, 0.2, 20), # Learning_rate
    'alpha' : np.linspace(0, 1, 10),
    'lambda' : range(0, 50, 5),
    'early_stopping_rounds' : range(10, 20, 5)
}

# LGB
param_dist_lgb = {
    # Shape
    'num_boost_round' : range(50, 500, 50),
#     'num_boost_round' : range(50, 100, 10),
    'num_leaves' : range(2**3, 2**10, 100),
    'min_data_in_leaf' : range(50, 100, 10),
    'min_child_weight' : range(1, 9, 1), # 最小叶子节点样本权重和
    # Sample
    'is_unbalance' : [True, False],
    'bagging_freq': range(2, 10), # >0 enable bagging_fraction
    'bagging_fraction': np.linspace(0.5, 0.9, 5),
    'feature_fraction': np.linspace(0.5, 0.9, 5),
    'subsample' : np.linspace(0.5, 0.9, 5),
    # Algo
    'learning_rate':np.linspace(0.01, 0.2, 20),    
    'lambda_l1': np.linspace(0, 1, 10),
    'lambda_l2': range(0, 50, 5),
    'cat_smooth': range(1, 40, 5)
#     'early_stopping_rounds' : range(10, 20, 5)
}

param_dist_lr = {
    # Shape
    'max_iter' : range(50, 500, 50),
    # Sample
    'class_weight' : [scale_pos_weight, 1],
    # Algo
    'solver' : ['sag', 'lbfgs', 'newton-cg'],
    'C': [0.001, 0.01, 0.1, 1, 10] # 1/λ
}

##########

# RF
param_fixed_rf = {
    'n_jobs' : -1,
    'oob_score' : True,
    'random_state':123,
    'verbose':0
}

# XGB
param_fixed_xgb = {
    'n_jobs' : -1,
    'eval_metric': 'auc',
    'seed' : 123,
    'silent' : 1,
    'verbose_eval':0
}

# LGB
param_fixed_lgb = {
    'n_jobs' : -1,
    'metric' : 'auc',
    'random_state' : 123,
    'bagging_seed':123,
    'feature_fraction_seed':123,
    'verbose_eval' : 0
}

# LR
param_fixed_lr = {
    'n_jobs' : -1,
    'random_state' : 123,
    'verbose' : 0     
}

In [4]:
################  Load Features

''' *** With nona *** '''

''' Load '''
Xid = pd.read_csv('./tmp/train_d1234_nona.csv', header=0, index_col=0)
Xid.shape

yid = pd.read_csv('./data/train_label.csv', header=0, index_col=0)
yid.shape

''' Merge '''
xy = pd.merge(Xid, yid, on=uid, how='inner')
xy.drop(uid, axis=1, inplace=True)
xy.shape

''' Split '''
# X, y
X = xy.copy()
y = X.pop(target)
X.shape
y.shape

''' *** With na *** '''

''' Load '''
Xid1 = pd.read_csv('./tmp/train_d1234_na.csv', header=0, index_col=0)
Xid1.shape

''' Merge '''
xy1 = pd.merge(Xid1, yid, on=uid, how='inner')
xy1.drop(uid, axis=1, inplace=True)
xy1.shape

''' Split '''
# X, y
X1 = xy1.copy()
y1 = X1.pop(target)
X1.shape
y1.shape

' *** With nona *** '

' Load '

(140000, 812)

(140000, 1)

' Merge '

(140000, 812)

' Split '

(140000, 811)

(140000,)

' *** With na *** '

' Load '

(140000, 812)

' Merge '

(140000, 812)

' Split '

(140000, 811)

(140000,)

In [6]:
################ Important Features ################

######## Base Models ########

### RF ###

best_params_load = np.load('./model/base_rf.npy', allow_pickle=True).item()
model_params = {**best_params_load, **param_fixed_rf}
RF = RandomForestClassifier(**model_params)

# Train
RF.fit(X, y)

# Importance
f = pd.DataFrame(X.columns, columns=['feature'])
score = pd.DataFrame(RF.feature_importances_, columns=['rf'])
fscore_rf = pd.concat([f, score], axis=1).sort_values(by='rf', ascending=False).reset_index(drop=True)
fscore_rf.head()

### XGB ###

# XGB
best_params_load = np.load('./model/base_xgb.npy', allow_pickle=True).item()
model_params = {**best_params_load, **param_fixed_xgb}
XGB = XGBClassifier(**model_params)

# Train
XGB.fit(X, y)

# Importance
f = pd.DataFrame(X.columns, columns=['feature'])
score = pd.DataFrame(XGB.feature_importances_, columns=['xgb'])
fscore_xgb = pd.concat([f, score], axis=1).sort_values(by='xgb', ascending=False).reset_index(drop=True)
fscore_xgb.head()

### LGB ###

# LGB
best_params_load = np.load('./model/base_lgb.npy', allow_pickle=True).item()
model_params = {**best_params_load, **param_fixed_lgb}
LGB = LGBMClassifier(**model_params)

# Train
LGB.fit(X, y)

# Importance
f = pd.DataFrame(X.columns, columns=['feature'])
score = pd.DataFrame(LGB.feature_importances_, columns=['lgb'])
fscore_lgb = pd.concat([f, score], axis=1).sort_values(by='lgb', ascending=False).reset_index(drop=True)
fscore_lgb.head()

### LGB with Na ###

best_params_load = np.load('./model/base_lgb.npy', allow_pickle=True).item()
model_params = {**best_params_load, **param_fixed_lgb}
LGB = LGBMClassifier(**model_params)

# Train
LGB.fit(X1, y1)

# Importance
f = pd.DataFrame(X.columns, columns=['feature'])
score = pd.DataFrame(LGB.feature_importances_, columns=['lgbna'])
fscore_lgb_na = pd.concat([f, score], axis=1).sort_values(by='lgbna', ascending=False).reset_index(drop=True)
fscore_lgb_na.head()

######## correlations ########

correlations = xy.corr()

# Save
correlations.apply(abs).to_csv('./tmp/0_correlations_abs.csv')
correlations.to_csv('./tmp/0_correlations.csv')

# Abs
correlations_target_abs = correlations.loc[correlations.index != target, target].apply(abs).sort_values(ascending=False)

f = pd.DataFrame(correlations_target_abs.index, columns=['feature'])
score_corr = pd.DataFrame(correlations_target_abs.values, columns=['corr'])
fscore_corr = pd.concat([f, score_corr], axis=1).sort_values(by='corr', ascending=False).reset_index(drop=True)
fscore_corr.fillna(0, inplace=True)
fscore_corr.head()

######## Merge ########

fscore = pd.merge(fscore_corr, fscore_rf, on='feature')
fscore = pd.merge(fscore, fscore_xgb, on='feature')
fscore = pd.merge(fscore, fscore_lgb, on='feature')
fscore = pd.merge(fscore, fscore_lgb_na, on='feature')

# Add rank
frank = fscore.rank(numeric_only=True, method='min', ascending=False)
# fscore.fillna(0, inplace=True)
fscore = pd.merge(fscore, frank, left_index=True, right_index=True, suffixes=['', '_rank'])
fscore['rank'] = fscore['corr_rank'] + fscore['rf_rank'] + fscore['xgb_rank'] + fscore['lgb_rank'] + fscore['lgbna_rank']
fscore.sort_values(by='rank', inplace=True)

fscore.shape
fscore.head()
fscore.to_csv('./model/f_score.csv')

''' Describe '''
fscore[['corr', 'rf', 'xgb', 'lgb', 'lgbna', 'rank']].describe()


LGBMClassifier(bagging_fraction=0.9, bagging_freq=4, bagging_seed=123,
               boosting_type='gbdt', cat_smooth=16, class_weight=None,
               colsample_bytree=1.0, feature_fraction=0.9,
               feature_fraction_seed=123, importance_type='split',
               is_unbalance=True, lambda_l1=0.8888888888888888, lambda_l2=25,
               learning_rate=0.02, max_depth=-1, metric='auc',
               min_child_samples=20, min_child_weight=2, min_data_in_leaf=90,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1,
               num_boost_round=400, num_leaves=308, objective=None,
               random_state=123, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.8, ...)

Unnamed: 0,feature,lgbna
0,信用额度,3291
1,贷款年金,3097
2,身份认证日期距申请日期天数,2762
3,最近一次换手机号码距申请日天数,2367
4,外部评分,2352


Unnamed: 0,feature,corr
0,外部评分,0.131385
1,出生日期距申请日期天数,0.102948
2,拟申请拒绝率_d,0.102523
3,信用额度sum比_d,0.095559
4,贷款年金sum比_d,0.095343


(811, 12)

Unnamed: 0,feature,corr,rf,xgb,lgb,lgbna,corr_rank,rf_rank,xgb_rank,lgb_rank,lgbna_rank,rank
0,外部评分,0.131385,0.052359,0.006587,2058,2352,1.0,1.0,9.0,7.0,5.0,23.0
1,出生日期距申请日期天数,0.102948,0.026522,0.004676,2063,2095,2.0,3.0,18.0,6.0,7.0,36.0
6,工作日期距申请日期天数,0.087621,0.036302,0.01128,1482,1480,7.0,2.0,3.0,15.0,15.0,42.0
37,最近一次换手机号码距申请日天数,0.074325,0.017826,0.002661,2119,2367,38.0,4.0,68.0,5.0,4.0,119.0
58,身份认证日期距申请日期天数,0.067446,0.008831,0.002541,2729,2762,59.0,20.0,80.0,3.0,3.0,165.0


' Describe '

Unnamed: 0,corr,rf,xgb,lgb,lgbna,rank
count,811.0,811.0,811.0,811.0,811.0,811.0
mean,0.02241,0.001233,0.001233,151.410604,151.418002,1825.810111
std,0.023247,0.003241,0.001465,364.580104,366.322191,798.553042
min,0.0,0.0,0.0,0.0,0.0,23.0
25%,0.003873,0.0,0.0,0.0,0.0,1199.0
50%,0.012549,0.000146,0.00131,13.0,13.0,1776.0
75%,0.034977,0.000987,0.001862,109.0,106.0,2645.0
max,0.131385,0.052359,0.012934,3366.0,3291.0,2906.0


In [39]:
######## Intersection by Score ########

# By score
# th_imp = {'corr':0.01, 'rf':0.001, 'xgb':0, 'lgb':0, 'lgbna':0} # 453, 198
# th_imp = {'corr':0.03, 'rf':0.001, 'xgb':0, 'lgb':0, 'lgbna':0} # 453, 198
# th_imp = {'corr':0.05, 'rf':0.001, 'xgb':0, 'lgb':0, 'lgbna':0} # 121, 198
# th_imp = {'corr':0.06, 'rf':0.001, 'xgb':0, 'lgb':0, 'lgbna':0} # 76, 198
th_imp = {'corr':0.07, 'rf':0.001, 'xgb':0, 'lgb':0, 'lgbna':0} # 54, 198
# th_imp = {'corr':0.075, 'rf':0.001, 'xgb':0, 'lgb':0, 'lgbna':0} # 45, 198

top_f = {}
cnt_f = {}
inter_cnt = {}
for k in 'corr', 'rf', 'xgb', 'lgb', 'lgbna':
    # top
    t = fscore.loc[fscore[f'{k}']>=th_imp[k], 'feature']
    top_f[k] = set(t) # set
    # len
    cnt_f[f'cnt_{k}'] = len(t)
#     # intersection with Corr
#     inter_cnt[f'corr_{k}'] = round(len(top_f['corr'].intersection(top_f[k]))/len(top_f['corr']), 2)

# Intersection
inter_cnt['corr/all'] = round(len(top_f['rf'].intersection(top_f['lgb']))/len(top_f['lgb']), 2)
inter_cnt['rf/all'] = round(len(top_f['rf'].intersection(top_f['lgb']))/len(top_f['lgb']), 2)
inter_cnt['rf/corr'] = round(len(top_f['rf'].intersection(top_f['corr']))/len(top_f['corr']), 2)

# inter_cnt['rf_xgb'] = round(len(top_f['rf'].intersection(top_f['xgb']))/len(top_f['rf']), 2)
# inter_cnt['rf_lgb'] = round(len(top_f['rf'].intersection(top_f['lgb']))/len(top_f['rf']), 2)
# inter_cnt['lgb_rf'] = round(len(top_f['rf'].intersection(top_f['lgb']))/len(top_f['lgb']), 2)
# inter_cnt['rf_lgbna'] = round(len(top_f['rf'].intersection(top_f['lgbna']))/len(top_f['rf']), 2)
# inter_cnt['lgbna_rf'] = round(len(top_f['rf'].intersection(top_f['lgbna']))/len(top_f['lgbna']), 2)
# inter_cnt['xgb_lgb'] = round(len(top_f['xgb'].intersection(top_f['lgb']))/len(top_f['xgb']), 2)
# inter_cnt['lgb_xgb'] = round(len(top_f['xgb'].intersection(top_f['lgb']))/len(top_f['lgb']), 2)
# inter_cnt['lgb_lgbna'] = round(len(top_f['lgb'].intersection(top_f['lgbna']))/len(top_f['lgb']), 2)

''' Corr '''
pd.DataFrame(cnt_f, index=['Count'])
pd.DataFrame(inter_cnt, index=['Intersection'])

# ''' all VS corr '''
# top = {}
# # top['all'] = top_f['rf'].union(top_f['xgb']).union(top_f['lgb'])
# diff = top_f['corr'].difference(top['rf'])
# len(diff)
# # diff

### Save
for k, v in top_f.items():
    print(f'{k}:{len(v)}')
np.save('./model/base_features.npy', top_f)


# # + diff
# ''' top_f_final ''' 
# top_f_final = {} # 入模特征列表
# for k in 'rf', 'xgb', 'lgb':
#     top_f_final[k] = top_f[k].union(diff) # 保存特征列表
#     len(top_f_final[k])

# ### Save
# np.save('./model/base_features.npy', top_f_final)

' Corr '

Unnamed: 0,cnt_corr,cnt_rf,cnt_xgb,cnt_lgb,cnt_lgbna
Count,54,198,811,811,811


Unnamed: 0,corr/all,rf/all,rf/corr
Intersection,0.24,0.24,0.91


corr:54
rf:198
xgb:811
lgb:811
lgbna:811


In [None]:
# ####### Polynomial

# m = 50 # 交集
# n = 10 # 合集

# top_f_inters = set(fscore.loc[(fscore['corr_rank'] <= m) & 
#                           (fscore['rf_rank'] <= m) & 
#                           (fscore['xgb_rank'] <= m) & 
#                           (fscore['lgb_rank'] <= m), 'feature'])
# len(top_f_inters)
# top_f_union = set(fscore.loc[(fscore['corr_rank'] <= n) | 
#                           (fscore['rf_rank'] <= n) | 
#                           (fscore['xgb_rank'] <= n) | 
#                           (fscore['lgb_rank'] <= n), 'feature'])
# len(top_f_union)
# top_poly = top_f_inters.union(top_f_union)
# len(top_poly)
# top_poly

# np.save('./tmp/0_feats_poly.npy', top_poly)

In [None]:
# ############## RF

# ''' Baseline '''
# baseline = RandomForestClassifier(**param_fixed_rf)
# baseline.fit(X, y)
# pred_baseline = baseline.predict_proba(X)
# ks_score(y, pred_baseline[:,1])

# ''' Best '''
# grid = RandomizedSearchCV(RandomForestClassifier(**param_fixed_rf), param_dist_rf, **param_general)
# grid.fit(X, y)
# grid.best_score_
# best_params = grid.best_params_
# np.save('./model/base_rf.npy', best_params)

# # ''' Test Clone Model '''
# # model1 = grid.best_estimator_
# # model1.fit(X, y)
# # ks_score(y, model1.predict_proba(X)[:,1])
# # 
# # ''' Test Save Params '''
# # best_params_load = np.load('./model/base_rf.npy', allow_pickle=True).item()
# # model2_params = {**best_params_load, **param_fixed_rf}
# # model2 = RandomForestClassifier(**model2_params)
# # model2.fit(X, y)
# # ks_score(y, model2.predict_proba(X)[:,1])

In [None]:
# ############## XGB

# ''' Baseline '''
# baseline = XGBClassifier(**param_fixed_xgb)
# baseline.fit(X, y)
# pred_baseline = baseline.predict_proba(X)
# ks_score(y, pred_baseline[:,1])

# ''' Best '''
# grid = RandomizedSearchCV(XGBClassifier(**param_fixed_xgb), param_dist_xgb, **param_general)
# grid.fit(X, y)
# grid.best_score_
# best_params = grid.best_params_
# np.save('./model/base_xgb.npy', best_params)

# # ''' Test Clone Model '''
# # model1 = grid.best_estimator_
# # model1.fit(X, y)
# # ks_score(y, model1.predict_proba(X)[:,1])

# # ''' Test Save Params '''
# # best_params_load = np.load('./model/base_xgb.npy', allow_pickle=True).item()
# # model2_params = {**best_params_load, **param_fixed_xgb}
# # model2 = XGBClassifier(**model2_params)
# # model2.fit(X, y)
# # ks_score(y, model2.predict_proba(X)[:,1])

In [None]:
# ############## LGB

# ''' Baseline '''
# baseline = LGBMClassifier(**param_fixed_lgb)
# baseline.fit(X, y)
# pred_baseline = baseline.predict_proba(X) #, num_iteration=baseline.best_iteration_)
# ks_score(y, pred_baseline[:,1])

# ''' Best '''
# grid = RandomizedSearchCV(LGBMClassifier(**param_fixed_lgb), param_dist_lgb, **param_general)
# grid.fit(X, y)
# grid.best_score_
# best_params = grid.best_params_
# np.save('./model/base_lgb.npy', best_params)

# # ''' Test Clone Model '''
# # model1 = grid.best_estimator_
# # model1.fit(X, y)
# # ks_score(y, model1.predict_proba(X)[:,1])

# # ''' Test Save Params '''
# # best_params_load = np.load('./model/base_lgb.npy', allow_pickle=True).item()
# # model2_params = {**best_params_load, **param_fixed_lgb}
# # model2 = LGBMClassifier(**model2_params)
# # model2.fit(X, y)
# # ks_score(y, model2.predict_proba(X)[:,1])

In [26]:
############## LR

''' Baseline '''
baseline = LogisticRegression(**param_fixed_lr)
baseline.fit(X, y)
pred_baseline = baseline.predict_proba(X)
ks_score(y, pred_baseline[:,1])

''' Best '''
grid = RandomizedSearchCV(LogisticRegression(**param_fixed_lr), param_dist_lr, **param_general)
grid.fit(X, y)
grid.best_score_
best_params = grid.best_params_
np.save('./model/base_lr.npy', best_params)

''' Test Clone Model '''
model1 = grid.best_estimator_
model1.fit(X, y)
ks_score(y, model1.predict_proba(X)[:,1])

''' Test Save Params '''
best_params_load = np.load('./model/base_lr.npy', allow_pickle=True).item()
model2_params = {**best_params_load, **param_fixed_lr}
model2 = LogisticRegression(**model2_params)
model2.fit(X_meta, y)
ks_score(y, model2.predict_proba(X_meta)[:,1])

' Baseline '

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=123, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

0.238

' Best '

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 

In [None]:
# ############## LR Meta

# X_meta = pd.read_csv('./tmp/meta_X.csv', header=0, index_col=0).values
# poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
# X_meta = poly.fit_transform(X_meta)
# X_meta.shape
# y_meta = y.values
s
# ''' Baseline '''
# baseline = LogisticRegression(**param_fixed_lr)
# baseline.fit(X_meta, y)
# pred_baseline = baseline.predict_proba(X_meta)
# ks_score(y, pred_baseline[:,1])

# ''' Best '''
# grid = RandomizedSearchCV(LogisticRegression(**param_fixed_lr), param_dist_lr, **param_general)
# grid.fit(X_meta, y)
# grid.best_score_
# best_params = grid.best_params_
# np.save('./model/base_lr_meta.npy', best_params)

# ''' Test Clone Model '''
# model1 = grid.best_estimator_
# model1.fit(X_meta, y)
# ks_score(y, model1.predict_proba(X_meta)[:,1])

# ''' Test Save Params '''
# best_params_load = np.load('./model/base_lr_meta.npy', allow_pickle=True).item()
# model2_params = {**best_params_load, **param_fixed_lr}
# model2 = LogisticRegression(**model2_params)
# model2.fit(X_meta, y)
# ks_score(y, model2.predict_proba(X_meta)[:,1])

In [None]:
# ######## Test Meta K-fold

# X_meta = pd.read_csv('./tmp/meta_X.csv', header=0, index_col=0).values
# poly = PolynomialFeatures(3, interaction_only=True)
# X_meta = poly.fit_transform(X_meta)[:,1:]
# X_meta.shape

# y_meta = y.values

# # LR
# best_params_load = np.load('./model/base_lr.npy', allow_pickle=True).item()
# model_params = {**best_params_load, **param_fixed_lr}
# LR = LogisticRegression(**model_params)

# # Tune
#         ks = []
#         meta_model = LR
        
#         kfold = KFold(n_splits=5, shuffle=True, random_state=123)
#         j = 0
#         meta_models_ = []
#         for train_index, valid_index in kfold.split(X_meta, y_meta):
#             instance = clone(meta_model)
#             meta_models_.append(instance)
#             instance.fit(X_meta[train_index],  y_meta[train_index])
#             y_pred = instance.predict_proba(X_meta[valid_index])[:,1]
#             ks.append(ks_score(y_meta[valid_index], y_pred))
#             print(ks)
#             j += 1
#         pd.DataFrame(ks)