In [1]:
%config ZMQInteractiveShell.ast_node_interactivity='all'
%matplotlib inline
import warnings;warnings.filterwarnings('ignore')
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats as spstats
from sklearn import metrics
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone
from sklearn.preprocessing import Imputer, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.externals import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier



In [2]:
# Column names shared by the cells below.
# '标签' ("label") is the column popped off as the prediction target;
# '申请编号' ("application number") is the key the feature and label files are merged on.
target, uid = '标签', '申请编号'

def get_time():
    """Print the current local time formatted as ``MM-DD HH:MM``.

    Serves as a lightweight progress stamp around slow cells. Returns ``None``.
    """
    # Imported locally: the notebook's import cell never brings in `datetime`,
    # so on a fresh kernel the original body raised NameError.
    import datetime

    now = datetime.datetime.now().strftime("%m-%d %H:%M")
    print(now)

def calc_auc(y_test, y_proba):
    """Return the ROC AUC of the scores `y_proba` against the labels `y_test`.

    The value is rounded to 3 decimals.
    """
    return round(metrics.roc_auc_score(y_test, y_proba), 3)

def ks_score(y_test, y_proba, scale=4):
    """Return the Kolmogorov-Smirnov statistic of a binary scorer.

    The statistic is the largest gap between the true-positive rate and the
    false-positive rate along the ROC curve, with label ``1`` treated as the
    positive class.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_proba : array-like
        Scores for the positive class, one per entry of `y_test`.
    scale : int, default 4
        Number of decimals kept in the returned value.

    Returns
    -------
    float
        ``max(tpr - fpr)`` rounded to `scale` decimals.
    """
    # The thresholds returned by roc_curve are not needed for the statistic.
    fpr, tpr, _ = metrics.roc_curve(y_test, y_proba, pos_label=1)
    return round((tpr - fpr).max(), scale)

In [3]:
########## Grid Search

# Weight offered for the positive class in the search spaces below.
# NOTE(review): 119/21 is presumably the negative:positive ratio of the
# training labels — confirm against the label file.
scale_pos_weight = 119 / 21
# Number of cross-validation folds.
cv = 5

# Keyword arguments intended for RandomizedSearchCV, shared by every model family.
param_general = dict(
    n_iter=50,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=123,
    verbose=1,
)

# RF
param_dist_rf = dict(
    # Tree shape
    n_estimators=range(50, 500, 50),  # use range(5, 10) instead for a quick run
    max_depth=range(3, 10),
    min_samples_split=range(50, 100, 10),
    min_samples_leaf=range(50, 100, 10),
    # Sampling
    class_weight=['balanced', None],
    max_features=['sqrt', 'log2'],
    # Split objective
    criterion=['gini', 'entropy'],
)

# XGB
# Search space for XGBClassifier.
param_dist_xgb = {
    # Tree shape
    'n_estimators' : range(50, 500, 50),
#     'n_estimators' : range(5, 10),
    'max_depth' : range(3, 10),
    'min_child_weight' : range(1, 9, 1), # minimum sum of instance weights in a leaf
    # Sampling
    'scale_pos_weight' : [scale_pos_weight, 1],
    'subsample' : np.linspace(0.5, 0.9, 5),
    'colsample_bytree' : np.linspace(0.5, 0.9, 5),
    'colsample_bylevel' : np.linspace(0.5, 0.9, 5),
    # Algorithm (native XGBoost parameter names)
    'eta' : np.linspace(0.01, 0.2, 20), # learning rate
    'alpha' : np.linspace(0, 1, 10),
    'lambda' : range(0, 50, 5),
    # 'early_stopping_rounds' is deliberately not searched: no eval_set is
    # passed to fit(), so the value is either ignored or, on XGBoost releases
    # where it is a constructor parameter, makes every fit fail for lack of a
    # validation set.
}

# LGB
# Search space for LGBMClassifier (native LightGBM parameter names).
param_dist_lgb = {
    # Tree shape
    'num_boost_round' : range(50, 500, 50),
#     'num_boost_round' : range(50, 100, 10),
    'num_leaves' : range(2**3, 2**10, 100),
    'min_data_in_leaf' : range(50, 100, 10),
    'min_child_weight' : range(1, 9, 1), # minimum sum of instance weights in a leaf
    # Sampling
    'is_unbalance' : [True, False],
    'bagging_freq': range(2, 10), # >0 enables bagging_fraction
    'bagging_fraction': np.linspace(0.5, 0.9, 5),
    'feature_fraction': np.linspace(0.5, 0.9, 5),
    # 'subsample' is not searched: it is an alias of 'bagging_fraction', and
    # LightGBM keeps the main name and ignores the alias when both are given,
    # so sampling it only wasted a search dimension.
    # Algorithm
    'learning_rate':np.linspace(0.01, 0.2, 20),
    'lambda_l1': np.linspace(0, 1, 10),
    'lambda_l2': range(0, 50, 5),
    'cat_smooth': range(1, 40, 5)
#     'early_stopping_rounds' : range(10, 20, 5)
}

# Search space for LogisticRegression.
param_dist_lr = {
    # Optimiser budget
    'max_iter' : range(50, 500, 50),
    # Sampling
    # scikit-learn accepts only a {label: weight} dict, 'balanced' or None for
    # class_weight; a bare number is ignored by old releases and rejected by
    # new ones. The dict up-weights the positive class (label 1).
    # NOTE(review): assumes the labels are 0/1 — confirm against the label file.
    'class_weight' : [{0: 1, 1: scale_pos_weight}, None],
    # Algorithm
    'solver' : ['sag', 'lbfgs', 'newton-cg'],
    'C': [0.001, 0.01, 0.1] # inverse regularisation strength (1/λ)
}

##########

# RF
param_fixed_rf = dict(
    n_jobs=-1,
    oob_score=True,
    random_state=123,
    verbose=0,
)

# XGB
# NOTE(review): 'verbose_eval' is normally a training-call argument rather
# than an estimator parameter — confirm it has any effect here.
param_fixed_xgb = dict(
    n_jobs=-1,
    eval_metric='auc',
    seed=123,
    silent=1,
    verbose_eval=0,
)

# LGB
param_fixed_lgb = dict(
    n_jobs=-1,
    metric='auc',
    random_state=123,
    bagging_seed=123,
    feature_fraction_seed=123,
    verbose_eval=0,
)

# LR
param_fixed_lr = dict(
    n_jobs=-1,
    random_state=123,
    verbose=0,
)

In [4]:
################  Load Features
# Two versions of the same training matrix are read and joined to the labels.
# NOTE(review): judging by the file names, "nona" presumably has missing values
# filled in and "na" keeps them — confirm against the feature-building step.

' *** With nona *** '

' Load '
Xid = pd.read_csv('./tmp/train_d1234_nona.csv', index_col=0, header=0)
Xid.shape

yid = pd.read_csv('./data/train_label.csv', index_col=0, header=0)
yid.shape

' Merge '
# Inner join on the application id; the id is dropped once rows are aligned.
xy = Xid.merge(yid, how='inner', on=uid).drop(columns=uid)
xy.shape

' Split '
# Features / label
X = xy.drop(columns=target)
y = xy[target].copy()
X.shape
y.shape

' *** With na *** '

' Load '
Xid1 = pd.read_csv('./tmp/train_d1234_na.csv', index_col=0, header=0)
Xid1.shape

' Merge '
xy1 = Xid1.merge(yid, how='inner', on=uid).drop(columns=uid)
xy1.shape

' Split '
# Features / label
X1 = xy1.drop(columns=target)
y1 = xy1[target].copy()
X1.shape
y1.shape

' *** With nona *** '

' Load '

(140000, 812)

(140000, 1)

' Merge '

(140000, 812)

' Split '

(140000, 811)

(140000,)

' *** With na *** '

' Load '

(140000, 812)

' Merge '

(140000, 812)

' Split '

(140000, 811)

(140000,)

In [5]:
### LGB with Na ###

# LGB: tuned hyper-parameters saved by the search, overridden by the fixed settings.
# NOTE(review): allow_pickle=True unpickles the file content; only load files
# produced by this project.
best_params_load = np.load('./model/base_lgb.npy', allow_pickle=True).item()
model_params = {**best_params_load, **param_fixed_lgb}
LGB = LGBMClassifier(**model_params)

# Train on the "with na" matrix
LGB.fit(X1, y1)

# Importance
# Feature names come from X1, the frame the model was fitted on; taking them
# from X would mislabel the importances if the two frames ever differ in
# column order or content.
f = pd.DataFrame(X1.columns, columns=['feature'])
score = pd.DataFrame(LGB.feature_importances_, columns=['lgb'])
fscore_lgb = pd.concat([f, score], axis=1).sort_values(by='lgb', ascending=False).reset_index(drop=True)
fscore_lgb.head()

LGBMClassifier(bagging_fraction=0.9, bagging_freq=4, bagging_seed=123,
               boosting_type='gbdt', cat_smooth=16, class_weight=None,
               colsample_bytree=1.0, feature_fraction=0.9,
               feature_fraction_seed=123, importance_type='split',
               is_unbalance=True, lambda_l1=0.8888888888888888, lambda_l2=25,
               learning_rate=0.02, max_depth=-1, metric='auc',
               min_child_samples=20, min_child_weight=2, min_data_in_leaf=90,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1,
               num_boost_round=400, num_leaves=308, objective=None,
               random_state=123, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.8, ...)

Unnamed: 0,feature,lgb
0,信用额度,3291
1,贷款年金,3097
2,身份认证日期距申请日期天数,2762
3,最近一次换手机号码距申请日天数,2367
4,外部评分,2352


In [6]:
################ Important Features ################

######## Base Models ########

def _load_model_params(path, fixed_params):
    """Return the tuned hyper-parameters stored at `path` overlaid with `fixed_params`.

    `path` must point at a ``.npy`` file holding a pickled dict (as written by
    ``np.save(path, some_dict)``). Keys present in both dicts take the value
    from `fixed_params`.
    """
    # NOTE(review): allow_pickle=True unpickles the file content; only load
    # files produced by this project.
    tuned_params = np.load(path, allow_pickle=True).item()
    return {**tuned_params, **fixed_params}


def _importance_table(model, feature_names, score_name):
    """Return a (feature, `score_name`) frame sorted by descending importance.

    `model` must already be fitted and expose ``feature_importances_`` with one
    entry per name in `feature_names`.
    """
    table = pd.DataFrame({'feature': list(feature_names),
                          score_name: model.feature_importances_})
    return table.sort_values(by=score_name, ascending=False).reset_index(drop=True)


### RF ###

RF = RandomForestClassifier(**_load_model_params('./model/base_rf.npy', param_fixed_rf))

# Train
RF.fit(X, y)

# Importance
fscore_rf = _importance_table(RF, X.columns, 'rf')
fscore_rf.head()

### XGB ###

XGB = XGBClassifier(**_load_model_params('./model/base_xgb.npy', param_fixed_xgb))

# Train
XGB.fit(X, y)

# Importance
fscore_xgb = _importance_table(XGB, X.columns, 'xgb')
fscore_xgb.head()

### LGB ###

LGB = LGBMClassifier(**_load_model_params('./model/base_lgb.npy', param_fixed_lgb))

# Train
LGB.fit(X, y)

# Importance
fscore_lgb = _importance_table(LGB, X.columns, 'lgb')
fscore_lgb.head()

######## correlations ########

# Pairwise correlation matrix over every column of xy (features and target).
correlations = xy.corr()

# Save the absolute and the signed matrix
correlations.abs().to_csv('./ana/0_correlations_abs.csv')
correlations.to_csv('./ana/0_correlations.csv')

# Absolute correlation of each feature with the target, strongest first
correlations_target_abs = (
    correlations[target]
    .drop(target)
    .abs()
    .sort_values(ascending=False)
)

f = pd.DataFrame({'feature': correlations_target_abs.index})
score_corr = pd.DataFrame({'corr': correlations_target_abs.values})
fscore_corr = (
    f.join(score_corr)
     .sort_values(by='corr', ascending=False)
     .reset_index(drop=True)
)
fscore_corr.head()

######## Merge ########

# One row per feature with the four scores side by side; the inner joins keep
# only features present in every table.
fscore = (
    fscore_corr
    .merge(fscore_rf, on='feature')
    .merge(fscore_xgb, on='feature')
    .merge(fscore_lgb, on='feature')
)

# Add rank
# NOTE(review): ranks are computed before missing scores are zero-filled, so a
# feature with a NaN score keeps a NaN rank and a NaN combined 'rank' —
# confirm this ordering is intended.
frank = fscore.rank(numeric_only=True, method='min', ascending=False)
fscore = fscore.fillna(0)
fscore = fscore.merge(frank, left_index=True, right_index=True, suffixes=['', '_rank'])
# skipna=False keeps NaN propagation identical to adding the four columns.
fscore['rank'] = fscore[['corr_rank', 'rf_rank', 'xgb_rank', 'lgb_rank']].sum(axis=1, skipna=False)
fscore = fscore.sort_values(by='rank')

fscore.shape
fscore.head()
fscore.to_csv('./model/f_score.csv')

' Describe '
fscore[['corr', 'rf', 'xgb', 'lgb', 'rank']].describe()


RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='entropy', max_depth=9, max_features='sqrt',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=50,
                       min_samples_split=50, min_weight_fraction_leaf=0.0,
                       n_estimators=400, n_jobs=-1, oob_score=True,
                       random_state=123, verbose=0, warm_start=False)

Unnamed: 0,feature,rf
0,外部评分,0.052359
1,工作日期距申请日期天数,0.036302
2,出生日期距申请日期天数,0.026522
3,最近一次换手机号码距申请日天数,0.017826
4,教育程度_1,0.015429


In [None]:
######## Intersection by Score ########

def _overlap_ratio(base, other):
    """Return the share of `base` that also appears in `other`, rounded to 2 decimals.

    Returns NaN when `base` is empty, because the share is undefined then
    (the plain division raised ZeroDivisionError).
    """
    if not base:
        return float('nan')
    return round(len(base & other) / len(base), 2)


# By score: minimum score a feature needs under each criterion to be kept.
# th_imp = {'corr':0.01, 'rf':0.002, 'xgb':0.003, 'lgb':5} # 404
# th_imp = {'corr':0.01, 'rf':0.0005, 'xgb':0.0015, 'lgb':5} # 801, 518	318	397
th_imp = {'corr':0.01, 'rf':0.002, 'xgb':0, 'lgb':0} # 518	318	397
# th_imp = {'corr':0, 'rf':0, 'xgb':0, 'lgb':0} 

top_f = {}
cnt_f = {}
inter_cnt = {}
for k in 'corr', 'rf', 'xgb', 'lgb':
    # Features whose score under criterion k reaches the threshold
    selected = fscore.loc[fscore[k] >= th_imp[k], 'feature']
    top_f[k] = set(selected)
    cnt_f[f'cnt_{k}'] = len(selected)
    # Share of the correlation-selected features also selected by k
    inter_cnt[f'corr_{k}'] = _overlap_ratio(top_f['corr'], top_f[k])

# Pairwise overlap between the model-based selections; key 'a_b' is the share
# of a's features that b selected as well.
for a, b in [('rf', 'xgb'), ('rf', 'lgb'), ('xgb', 'lgb')]:
    inter_cnt[f'{a}_{b}'] = _overlap_ratio(top_f[a], top_f[b])
    inter_cnt[f'{b}_{a}'] = _overlap_ratio(top_f[b], top_f[a])

''' Corr '''
pd.DataFrame(cnt_f, index=['Count'])
pd.DataFrame(inter_cnt, index=['Intersection'])

''' all VS corr '''
top = {}
# top['all'] = top_f['rf'].union(top_f['xgb']).union(top_f['lgb'])
top['all'] = top_f['rf'].union(top_f['xgb'])
# Features selected by correlation only
diff = top_f['corr'].difference(top['all'])
len(diff)
# diff

### Save
# The dict of sets is stored as a pickled object array; reading it back needs
# np.load(..., allow_pickle=True).item().
np.save('./model/base_features.npy', top_f)

# # + diff
# ''' top_f_final ''' 
# top_f_final = {} # 入模特征列表
# for k in 'rf', 'xgb', 'lgb':
#     top_f_final[k] = top_f[k].union(diff) # 保存特征列表
#     len(top_f_final[k])

# ### Save
# np.save('./model/base_features.npy', top_f_final)

In [None]:
####### Polynomial
# Candidate features for polynomial expansion: those ranked in the top m by
# every criterion, plus those ranked in the top n by at least one.

m = 50 # rank cut-off for the intersection
n = 10 # rank cut-off for the union

rank_cols = ['corr_rank', 'rf_rank', 'xgb_rank', 'lgb_rank']

top_f_inters = set(fscore.loc[fscore[rank_cols].le(m).all(axis=1), 'feature'])
len(top_f_inters)
top_f_union = set(fscore.loc[fscore[rank_cols].le(n).any(axis=1), 'feature'])
len(top_f_union)
top_poly = top_f_inters | top_f_union
len(top_poly)
top_poly

np.save('./tmp/0_feats_poly.npy', top_poly)

In [32]:
# ############## RF

# get_time()
# ''' Baseline '''
# baseline = RandomForestClassifier(**param_fixed_rf)
# baseline.fit(X, y)
# pred_baseline = baseline.predict_proba(X)
# ks_score(y, pred_baseline[:,1])

# get_time()
# ''' Best '''
# grid = RandomizedSearchCV(RandomForestClassifier(**param_fixed_rf), param_dist_rf, **param_general)
# grid.fit(X, y)
# grid.best_score_
# best_params = grid.best_params_
# np.save('./model/base_rf.npy', best_params)
# get_time()

# # ''' Test Clone Model '''
# # model1 = grid.best_estimator_
# # model1.fit(X, y)
# # ks_score(y, model1.predict_proba(X)[:,1])
# # 
# # ''' Test Save Params '''
# # best_params_load = np.load('./model/base_rf.npy', allow_pickle=True).item()
# # model2_params = {**best_params_load, **param_fixed_rf}
# # model2 = RandomForestClassifier(**model2_params)
# # model2.fit(X, y)
# # ks_score(y, model2.predict_proba(X)[:,1])

10-20 09:51


' Baseline '

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=True, random_state=123, verbose=0,
                       warm_start=False)

0.9852

10-20 09:51


' Best '

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 51.0min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 68.8min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=-1, 

0.7002853321328532

10-20 11:01


In [34]:
# ############## XGB

# get_time()
# ''' Baseline '''
# baseline = XGBClassifier(**param_fixed_xgb)
# baseline.fit(X, y)
# pred_baseline = baseline.predict_proba(X)
# ks_score(y, pred_baseline[:,1])

# get_time()
# ''' Best '''
# grid = RandomizedSearchCV(XGBClassifier(**param_fixed_xgb), param_dist_xgb, **param_general)
# grid.fit(X, y)
# grid.best_score_
# best_params = grid.best_params_
# np.save('./model/base_xgb.npy', best_params)
# get_time()
# get_time()

# # ''' Test Clone Model '''
# # model1 = grid.best_estimator_
# # model1.fit(X, y)
# # ks_score(y, model1.predict_proba(X)[:,1])

# # ''' Test Save Params '''
# # best_params_load = np.load('./model/base_xgb.npy', allow_pickle=True).item()
# # model2_params = {**best_params_load, **param_fixed_xgb}
# # model2 = XGBClassifier(**model2_params)
# # model2.fit(X, y)
# # ks_score(y, model2.predict_proba(X)[:,1])

10-20 12:13


' Baseline '

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',
              gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=123, silent=1,
              subsample=1, verbose_eval=0, verbosity=1)

0.3422

10-20 12:14


' Best '

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 28.8min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 160.9min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 220.0min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           eval_metric='auc', gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=-1, nthread=None,
                                           objective='binary:logistic',
                                           random_...
                                        'eta': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 

0.7251946138455383

10-20 15:55
10-20 15:55


In [33]:
# ############## LGB

# get_time()
# ''' Baseline '''
# baseline = LGBMClassifier(**param_fixed_lgb)
# baseline.fit(X, y)
# pred_baseline = baseline.predict_proba(X) #, num_iteration=baseline.best_iteration_)
# ks_score(y, pred_baseline[:,1])

# get_time()
# ''' Best '''
# grid = RandomizedSearchCV(LGBMClassifier(**param_fixed_lgb), param_dist_lgb, **param_general)
# grid.fit(X, y)
# grid.best_score_
# best_params = grid.best_params_
# np.save('./model/base_lgb.npy', best_params)
# get_time()

# # ''' Test Clone Model '''
# # model1 = grid.best_estimator_
# # model1.fit(X, y)
# # ks_score(y, model1.predict_proba(X)[:,1])

# # ''' Test Save Params '''
# # best_params_load = np.load('./model/base_lgb.npy', allow_pickle=True).item()
# # model2_params = {**best_params_load, **param_fixed_lgb}
# # model2 = LGBMClassifier(**model2_params)
# # model2.fit(X, y)
# # ks_score(y, model2.predict_proba(X)[:,1])

10-20 11:03


' Baseline '

LGBMClassifier(bagging_seed=123, boosting_type='gbdt', class_weight=None,
               colsample_bytree=1.0, feature_fraction_seed=123,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               metric='auc', min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
               objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0, verbose_eval=0)

0.4274

10-20 11:03


' Best '

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 52.2min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 69.1min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=LGBMClassifier(bagging_seed=123,
                                            boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            feature_fraction_seed=123,
                                            importance_type='split',
                                            learning_rate=0.1, max_depth=-1,
                                            metric='auc', min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimators=100, n_jobs=-1,
                                            num_leav...
                                        'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.1

0.7294587094837935

10-20 12:13


In [349]:
# ######## Tune Meta

# X_meta = pd.read_csv('./tmp/meta_X.csv', header=0, index_col=0).values
# poly = PolynomialFeatures(3, interaction_only=True)
# X_meta = poly.fit_transform(X_meta)[:,1:]
# X_meta.shape

# y_meta = y.values

# # LR
# best_params_load = np.load('./model/base_lr.npy', allow_pickle=True).item()
# model_params = {**best_params_load, **param_fixed_lr}
# LR = LogisticRegression(**model_params)

# # Tune
#         ks = []
#         meta_model = LR
        
#         kfold = KFold(n_splits=5, shuffle=True, random_state=123)
#         j = 0
#         meta_models_ = []
#         for train_index, valid_index in kfold.split(X_meta, y_meta):
#             instance = clone(meta_model)
#             meta_models_.append(instance)
#             instance.fit(X_meta[train_index],  y_meta[train_index])
#             y_pred = instance.predict_proba(X_meta[valid_index])[:,1]
#             ks.append(ks_score(y_meta[valid_index], y_pred))
#             print(ks)
#             j += 1
#         pd.DataFrame(ks)

(140000, 7)

In [351]:
# ############## LR

# X_meta = pd.read_csv('./tmp/meta_X.csv', header=0, index_col=0).values
# poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
# X_meta = poly.fit_transform(X_meta)
# X_meta.shape
# y_meta = y.values

# ''' Baseline '''
# baseline = LogisticRegression(**param_fixed_lr)
# baseline.fit(X_meta, y)
# pred_baseline = baseline.predict_proba(X_meta)
# ks_score(y, pred_baseline[:,1])

# ''' Best '''
# grid = RandomizedSearchCV(LogisticRegression(**param_fixed_lr), param_dist_lr, **param_general)
# grid.fit(X_meta, y)
# grid.best_score_
# best_params = grid.best_params_
# np.save('./model/base_lr.npy', best_params)

# ''' Test Clone Model '''
# model1 = grid.best_estimator_
# model1.fit(X_meta, y)
# ks_score(y, model1.predict_proba(X_meta)[:,1])

# ''' Test Save Params '''
# best_params_load = np.load('./model/base_lr.npy', allow_pickle=True).item()
# model2_params = {**best_params_load, **param_fixed_lr}
# model2 = LogisticRegression(**model2_params)
# model2.fit(X_meta, y)
# ks_score(y, model2.predict_proba(X_meta)[:,1])

(140000, 6)

' Baseline '

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=123, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

0.3507

' Best '

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   20.8s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='warn', n_jobs=-1,
                                                penalty='l2', random_state=123,
                                                solver='warn', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='warn', n_iter=50, n_jobs=-1,
                   param_distributions={'C': [0.001, 0.01, 0.1],
                                        'class_weight': [5.666666666666667, 1],
                                        'max_iter': range(50, 500, 50),
                                        'solver': ['sag', 'l

0.735806162464986

' Test Clone Model '

LogisticRegression(C=0.1, class_weight=1, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

0.3503

' Test Save Params '

LogisticRegression(C=0.1, class_weight=1, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

0.3503