In [1]:
import time
# Wall-clock start timestamp (seconds since the epoch) taken when the notebook
# starts running. NOTE(review): no cell visible in this notebook reads `tic` again.
tic = time.time()

In [2]:
import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so library deprecation warnings
# (pandas / LightGBM / scikit-learn) are hidden as well.
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np
import random
import lightgbm as lgb
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
import pickle

In [None]:
def seed_everything(seed=0):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Integer seed applied to every generator (default 0).
    """
    # `os` is not imported anywhere else in the notebook; importing it here
    # keeps this cell runnable on a fresh kernel (it previously raised NameError).
    import os

    random.seed(seed)
    # NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so this
    # assignment presumably only influences child processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)


seed_everything(42)

In [None]:
# Raw competition inputs, read from paths relative to the notebook's working directory.
# train_label is later joined onto train_profile by the user-id column; presumably it
# carries the target column used for training (TODO confirm against the CSV header).
train_label = pd.read_csv('../problems/problem_1/train/train_label.csv')
train_profile = pd.read_csv('../problems/problem_1/train/train_profile.csv')
# Profiles of the users to be scored (the "B" test file).
test_profile = pd.read_csv('../problems/problem_1/B/test_profile_B.csv')

In [None]:
## 0.129980
# Base frames: every training profile row gets its label columns attached through a
# left join on the user-id column; the test frame starts as an independent copy of
# the test profiles so later merges do not touch `test_profile`.
train = pd.merge(train_profile, train_label, how='left', on='用户标识')
test = test_profile.copy()

In [None]:
# Pre-computed credit_time_* feature tables, one HDF5 file each, stored under key 'data'.
# The trailing number on every line is the author's own note -- presumably the
# validation score reached after adding that feature (TODO confirm).
credit_time_min = pd.read_hdf('./feats_v6/credit_time_min.h5', key='data')  # 0.251811
credit_time_max = pd.read_hdf('./feats_v6/credit_time_max.h5', key='data')  # 0.350979
credit_time_std = pd.read_hdf('./feats_v6/credit_time_std.h5', key='data')  # 0.351767
# Tagged "BUG" by the author; this table is loaded but is not listed in all_feat_dfs.
credit_time_skew = pd.read_hdf('./feats_v6/credit_time_skew.h5', key='data')  # 0.359765 BUG
credit_time_mm2 = pd.read_hdf('./feats_v6/credit_time_mm2.h5', key='data')  # 0.359878
credit_time_gap = pd.read_hdf('./feats_v6/credit_time_gap.h5', key='data') # 0.359967

In [None]:
# Pre-computed bankstat_* feature tables (HDF5, key 'data').
# Trailing numbers are the author's score notes (presumably validation scores; TODO confirm).
bankstat_cnt = pd.read_hdf('./feats_v6/bankstat_cnt.h5', key='data') # 0.369053
bankstat_time = pd.read_hdf('./feats_v6/bankstat_time.h5', key='data') # 0.374102
bankstat_time2 = pd.read_hdf('./feats_v6/bankstat_time2.h5', key='data') # 0.374510
bankstat_amt2 = pd.read_hdf('./feats_v6/bankstat_amt2.h5', key='data') # 0.378059

In [None]:
# Pre-computed behavior_* feature tables (HDF5, key 'data').
# Trailing numbers are the author's score notes; the meaning of the values in
# parentheses is not established anywhere in this notebook.
behavior_cnt = pd.read_hdf('./feats_v6/behavior_cnt.h5', key='data') # 0.393364
behavior_cnt2 = pd.read_hdf('./feats_v6/behavior_cnt2.h5', key='data') # 0.401803
behavior_cnt3 = pd.read_hdf('./feats_v6/behavior_cnt3.h5', key='data') # 0.450277
behavior_cnt4 = pd.read_hdf('./feats_v6/behavior_cnt4.h5', key='data') # 0.463653 (all) 0.463548 (0.9)
behavior_time = pd.read_hdf('./feats_v6/behavior_time.h5', key='data') # 0.466040 (300) 0.466140 (1000)

In [None]:
# Pre-computed creditbill_* feature tables (HDF5, key 'data'), grouped by the
# quantity they aggregate. Trailing numbers are the author's score notes.
# Previous-period bill amount
creditbill_amt1_sum = pd.read_hdf('./feats_v6/creditbill_amt1_sum.h5', key='data') # 0.466829
# Previous-period repayment amount
creditbill_amt2_mean = pd.read_hdf('./feats_v6/creditbill_amt2_mean.h5', key='data') # 0.471863
creditbill_amt2_min = pd.read_hdf('./feats_v6/creditbill_amt2_min.h5', key='data') # 0.475831
# Current-period bill balance
creditbill_amt3_max = pd.read_hdf('./feats_v6/creditbill_amt3_max.h5', key='data') # 0.475847
creditbill_amt3_std = pd.read_hdf('./feats_v6/creditbill_amt3_std.h5', key='data') # 0.476084
# Credit card limit
creditbill_amt4_mean = pd.read_hdf('./feats_v6/creditbill_amt4_mean.h5', key='data') # 0.478530
creditbill_amt4_max = pd.read_hdf('./feats_v6/creditbill_amt4_max.h5', key='data') # 0.479430
creditbill_amt4_skew = pd.read_hdf('./feats_v6/creditbill_amt4_skew.h5', key='data') # 0.480112
# Previous-period bill amount minus previous-period repayment amount
creditbill_amt5_mean = pd.read_hdf('./feats_v6/creditbill_amt5_mean.h5', key='data') # 0.480179
creditbill_amt5_min = pd.read_hdf('./feats_v6/creditbill_amt5_min.h5', key='data') # 0.481522
creditbill_amt5_max = pd.read_hdf('./feats_v6/creditbill_amt5_max.h5', key='data') # 0.485155
creditbill_amt5_std = pd.read_hdf('./feats_v6/creditbill_amt5_std.h5', key='data') # 0.486459
creditbill_amt5_skew = pd.read_hdf('./feats_v6/creditbill_amt5_skew.h5', key='data') # 0.486882
# Previous-period bill amount minus credit card limit
creditbill_amt7_max = pd.read_hdf('./feats_v6/creditbill_amt7_max.h5', key='data') # 0.488431
creditbill_amt7_std = pd.read_hdf('./feats_v6/creditbill_amt7_std.h5', key='data') # 0.488563
creditbill_amt7_sum = pd.read_hdf('./feats_v6/creditbill_amt7_sum.h5', key='data')# 0.489102
creditbill_amt7_skew = pd.read_hdf('./feats_v6/creditbill_amt7_skew.h5', key='data') # 0.491199
# Current-period bill balance minus credit card limit
creditbill_amt10_min = pd.read_hdf('./feats_v6/creditbill_amt10_min.h5', key='data') # 0.492479
creditbill_amt10_mean = pd.read_hdf('./feats_v6/creditbill_amt10_mean.h5', key='data') # 0.492547 (300) # 0.493882 (1000)

In [None]:
# Last feature table in the list: credit_time_diff (HDF5, key 'data').
# Trailing numbers are the author's score notes.
credit_time_diff = pd.read_hdf('./feats_v6/credit_time_diff.h5', key='data') # 0.493540 (300) 0.496487 (1000)

In [None]:
# Feature tables that get joined onto train/test, in load order.
# credit_time_skew is loaded above but is not part of this list
# (its load line carries the author's "BUG" tag).
all_feat_dfs = [credit_time_min, credit_time_max, credit_time_std, credit_time_mm2, credit_time_gap, 
                bankstat_cnt, bankstat_time, bankstat_time2, bankstat_amt2,
                behavior_cnt, behavior_cnt2, behavior_cnt3, behavior_cnt4, behavior_time,
                creditbill_amt1_sum, 
                creditbill_amt2_mean, creditbill_amt2_min, 
                creditbill_amt3_max, creditbill_amt3_std,
                creditbill_amt4_mean, creditbill_amt4_max, creditbill_amt4_skew,
                creditbill_amt5_mean, creditbill_amt5_min, creditbill_amt5_max, creditbill_amt5_std, creditbill_amt5_skew,
                creditbill_amt7_max, creditbill_amt7_std, creditbill_amt7_sum, creditbill_amt7_skew,
                creditbill_amt10_min, creditbill_amt10_mean, 
                credit_time_diff] 

In [None]:
# Left-join each feature table onto both splits by the user-id column, so users
# without a row in a table keep their record and get NaN for that table's columns.
# Re-running this cell merges the same tables a second time.
for feat_df in all_feat_dfs:
    train = train.merge(feat_df, how='left', on='用户标识')
    test = test.merge(feat_df, how='left', on='用户标识')

In [None]:
import gc
# Force a full garbage-collection pass after the merges; as the cell's last
# expression, the value returned by gc.collect() is what the notebook displays.
gc.collect()

In [None]:
# Column names to exclude from the model inputs (besides the user-id column); currently none.
drop_feat = []

In [None]:
# Model inputs: every column of the merged test frame except the user id and
# anything listed in drop_feat.
used_feat = [col for col in test.columns if col not in ('用户标识', *drop_feat)]
print(len(used_feat))
print(used_feat)

In [None]:
# Feature matrices for both splits, restricted to the selected columns and
# re-indexed 0..n-1 so the positional fold indices line up with the row labels.
train_x, test_x = (frame[used_feat].reset_index(drop=True) for frame in (train, test))
# Training target: the label column, re-indexed the same way.
train_y = train['标签'].reset_index(drop=True)

In [None]:
def ks(labels, preds):
    """Kolmogorov-Smirnov statistic as a custom evaluation metric.

    Args:
        labels: True binary labels, passed to ``roc_curve`` as ``y_true``.
        preds: Predicted scores, passed to ``roc_curve`` as ``y_score``.

    Returns:
        A 3-tuple ``('ks', value, True)`` where ``value`` is the largest gap
        between the true-positive rate and the false-positive rate along the
        ROC curve. The trailing ``True`` is presumably the "higher is better"
        flag expected by the consumer of this metric.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true=labels, y_score=preds)
    separation = true_pos_rate - false_pos_rate
    ks_value = max(separation)
    return 'ks', ks_value, True

In [None]:
# Collector for the per-fold models; the training cell appends one model per fold.
lgbs = []
# False: train each fold and pickle it under ./models.
# True: skip training and load the previously pickled fold models instead.
has_saved = False

In [None]:
# 5-fold stratified cross-validation. For every fold: train a LightGBM classifier
# (or reload a pickled one when has_saved is True), record its feature importances
# and best validation KS, and add its test-set class probabilities to `preds`.
# `preds` therefore holds the SUM over folds; it is divided by the fold count when
# the submission is built.
preds = np.zeros((test_x.shape[0], 2))
scores = []

# One row per feature; a 'score<k>' column is added for each fold below.
imp = pd.DataFrame()
imp['feat'] = used_feat

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for index, (tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y)):
    print('*' * 30)
    # Positional split of the (re-indexed) training data into this fold's train / validation parts.
    X_train, y_train, X_valid, y_valid = train_x.iloc[tr_idx], train_y.iloc[tr_idx], train_x.iloc[va_idx], train_y.iloc[va_idx]
    
    eval_set = [(X_valid, y_valid)]
    if not has_saved: 
        # metrics='None' presumably switches off LightGBM's built-in metrics so that
        # only the custom `ks` metric drives early stopping -- TODO confirm.
        lgb_model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=64, reg_alpha=0, reg_lambda=1.9, max_bin=64, 
                                    max_depth=-1, n_estimators=10000, objective='binary', metrics='None', 
                                    bagging_fraction=0.8, is_unbalance=False, bagging_freq=5, min_child_samples=80, 
                                    feature_fraction=0.8, learning_rate=0.01, random_state=42, n_jobs=8,
                                    )
#         lgb_model.set_params(**params)
        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() arguments depend on
        # the installed LightGBM version -- confirm they are still accepted.
        lgb_model.fit(X_train, y_train, eval_set=eval_set, eval_metric=ks ,verbose=300, early_stopping_rounds=1000)
        # NOTE(review): the ./models directory must already exist; open() does not create it.
        with open('./models/fold%d_lgb.mdl' % index, 'wb') as file:
            pickle.dump(lgb_model, file)
    else:
        # NOTE(review): pickle.load can execute arbitrary code from the file; only
        # load model files that this notebook produced itself.
        with open('./models/fold%d_lgb.mdl' % index, 'rb') as file:
            lgb_model = pickle.load(file)
    
    imp['score%d' % (index+1)] = lgb_model.feature_importances_
    
    # 'ks' is the metric name returned by ks(); 'valid_0' is presumably the default
    # name LightGBM gives to the first eval_set entry.
    score = lgb_model.best_score_['valid_0']['ks']
    scores.append(score)
    print('fold %d round %d : score: %.6f | mean score %.6f' % (index+1, lgb_model.best_iteration_, score,np.mean(scores))) 
    preds += lgb_model.predict_proba(test_x)  
    
    # `lgbs` is created in an earlier cell and is not reset here, so re-running
    # this cell alone keeps appending to the same list.
    lgbs.append(lgb_model)


In [None]:
# Per-fold validation KS values on the first line, their mean on the second.
print(scores, np.mean(scores), sep='\n')

In [None]:
# Total importance per feature: add up every per-fold column ('score1', 'score2', ...).
# Selecting the columns by pattern instead of naming score1..score5 keeps this cell
# correct when the number of CV folds changes; the anchored regex leaves out 'feat'
# and the 'score' total itself, so the cell can be re-run safely.
imp['score'] = imp.filter(regex=r'^score\d+$').sum(axis=1)
# Show features ranked from most to least important (displayed as the cell output).
imp.sort_values(by='score', ascending=False)

In [None]:
# Build the submission: one row per test user with the fold-averaged probability
# of the positive class (column 1 of the accumulated predict_proba output).
# `preds` holds the sum over folds, so divide by the splitter's actual fold count
# rather than a hard-coded 5, which would go stale if n_splits is changed.
n_folds = kfold.get_n_splits()
result = pd.DataFrame()
# NOTE(review): ids come from test_profile while preds follows the merged `test`
# frame; this assumes the feature merges did not add or reorder rows.
result['客户号'] = test_profile['用户标识']
result['违约概率'] = preds[:, 1] / n_folds
print(len(result))
display(result.head())
# Written without index or header; the ./out directory must already exist.
result.to_csv('./out/upload_B501785.csv', index=False, header=False)