In [None]:
import numpy as np 
import pandas as pd 
def reduce_mem_usage(df):
    """Downcast each column of ``df`` to the smallest dtype that holds its values.

    * Signed / unsigned integer columns are narrowed to the smallest integer
      width of the same signedness whose range covers the column's min and max.
    * Float columns are narrowed to float16 or float32 when their range
      allows, otherwise stored as float64.
    * Object columns (and pandas string-dtype columns) become ``category``.
    * Every other dtype (bool, datetime, timedelta, category, nullable
      extension types) is left untouched, so calling the function again on
      its own output is safe.

    The frame is modified in place and also returned. Memory usage before and
    after the conversion is printed in MB.

    Note: float16 has roughly three significant decimal digits, so a float
    column whose *range* fits can still lose precision when narrowed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    bytes_per_mb = 1024 ** 2
    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_candidates = (np.float16, np.float32)

    # memory_usage() reports bytes; convert so the "MB" label is accurate.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy int / uint / float columns are range-checked.
        # Anything else either cannot be compared against iinfo/finfo limits
        # or would be corrupted by a float cast (e.g. bool -> float16).
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # An all-NaN or infinite column fails every comparison below and
            # therefore falls back to float64.
            new_type = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    new_type = candidate
                    break
            df[col] = df[col].astype(new_type)
        else:
            candidates = signed_candidates if col_type.kind == 'i' else unsigned_candidates
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.ensemble._hist_gradient_boosting import binning
## Compute information value (IV) using equal-frequency binning
def get_bins(df_train, feature, NUM_BINS=10, target='TARGET'):
    """Bin one feature and compute its information value (IV) against a binary label.

    The feature is discretised with scikit-learn's histogram-gradient-boosting
    bin mapper, then for each bin the share of all positive rows
    (``bad_rate``) and of all negative rows (``good_rate``) is computed and
    combined into the per-bin IV term
    ``(bad_rate - good_rate) * log(bad_rate / good_rate)``.

    ``df_train`` is not modified; the binned values live only in the
    returned table.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain ``feature`` and the ``target`` column. The feature must
        be numeric, since it is passed straight to the bin mapper.
    feature : str
        Name of the column to bin.
    NUM_BINS : int, default 10
        Forwarded to the bin mapper as ``n_bins``.
    target : str, default 'TARGET'
        Name of the label column; rows equal to 1 are counted as "bad" and
        rows equal to 0 as "good".

    Returns
    -------
    (iv, df_bins)
        ``iv`` is the sum of the per-bin IV terms. ``df_bins`` has one row
        per bin with columns ``[feature, 'bin_count', 'bad_count',
        'bad_rate', 'good_rate', 'IV']``.
    """
    # NOTE(review): _BinMapper is a private scikit-learn class and may change
    # between releases.
    est_model = binning._BinMapper(n_bins=NUM_BINS)
    # The mapper expects a 2-D array; flatten the result back to one bin
    # index per row.
    binned = est_model.fit_transform(
        df_train[feature].values.reshape(-1, 1)
    ).reshape(-1,)

    labels = df_train[target].values
    # The small constant keeps bad_rate finite when there are no positive rows.
    len_bad = (labels == 1).sum() + 1e-5
    len_good = (labels == 0).sum()

    # Group on a scratch frame so the caller's feature column is left intact.
    per_bin = pd.DataFrame({'bin': binned, 'label': labels}).groupby('bin')['label']
    bin_counts = per_bin.count()
    df_bins = pd.DataFrame({
        feature: bin_counts.index.values,
        'bin_count': bin_counts.values,
        # Tiny offset so a bin without positives does not yield log(0).
        'bad_count': per_bin.sum().values + 1e-10,
    })

    df_bins['bad_rate'] = df_bins['bad_count'] / len_bad
    df_bins['good_rate'] = (df_bins['bin_count'] - df_bins['bad_count']) / len_good
    # The added constants guard the ratio against zero numerators/denominators.
    df_bins['IV'] = (df_bins['bad_rate'] - df_bins['good_rate']) * np.log(
        (df_bins['bad_rate'] + 1e-10) / (df_bins['good_rate'] + 1e-5)
    )
    # skipna=False keeps a NaN term visible in the total instead of hiding it.
    return df_bins['IV'].sum(skipna=False), df_bins

# Rank every candidate feature in feats1 by its information value (IV).
IV = []
IV_df = []
columns = []
# NOTE(review): this adds a MessageId column to data_train1 as a side effect.
data_train1['MessageId']=train['MessageId']
train_iv = data_train1.loc[:, feats1+['Label','MessageId']].fillna(0)
# get_bins looks for the label in a column named 'TARGET', while this frame
# stores it under 'Label'. Expose it under that name too, without clobbering
# a 'TARGET' column that might already be present.
if 'TARGET' not in train_iv.columns:
    train_iv['TARGET'] = train_iv['Label']
# Feature ideas being evaluated:
#   - count of records whose month and day are inconsistent
#   - number of transactions by the same account within the same minute
#   - sender/receiver transaction frequency within the same minute
#   - number of BeneficiaryAccounts per TransactionReference
#   - number of senders per account
#   - sender's bank differs from the OrderingAccount's bank
for feature in feats1:
    iv, iv_df = get_bins(train_iv, feature, NUM_BINS=10)
    IV.append(iv)
    columns.append(feature)
    IV_df.append(iv_df)

# Feature -> IV, sorted from most to least informative.
IV_dict = dict(zip(columns, IV))
IV_dict = dict(sorted(IV_dict.items(), key=lambda kv: kv[1], reverse=True))
IV_dict

In [None]:
## LightGBM: training and plotting
import lightgbm as lgb
def custom_auprc(preds, train_data):
    """Custom LightGBM evaluation metric: average precision (area under the PR curve).

    Parameters
    ----------
    preds : array-like
        Model scores for the rows of ``train_data``.
    train_data : lightgbm.Dataset
        Dataset being evaluated; its labels are read with ``get_label()``.

    Returns
    -------
    tuple
        ``('auprc', value, True)`` -- metric name, score, and a flag telling
        LightGBM that higher values are better.
    """
    # Imported locally so the metric does not depend on a module-level
    # `metrics` alias being defined in some other cell.
    from sklearn.metrics import average_precision_score

    labels = train_data.get_label()
    # Average precision depends only on how preds rank the rows, so no
    # sigmoid is needed even if raw scores are passed in.
    return 'auprc', average_precision_score(labels, preds), True
# LightGBM training parameters for the binary classifier.
params = {
    'learning_rate': 0.02,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # 'average_precision' is LightGBM's name for the built-in
    # average-precision metric; 'average_precision_score' is the
    # scikit-learn function name and is not a LightGBM metric.
    'metric': ['auc', 'average_precision', 'binary_logloss'],
    'num_leaves': 22,
    'verbose': -1,
    'seed': 16,
    'n_jobs': -1,
    # Regularisation
    'min_data_in_leaf': 40,
    'lambda_l2': 10.5,
    'lambda_l1': 1,
    # Row / column subsampling
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'max_bin': 255,
}

# NOTE(review): an earlier cell indexes `train['MessageId']`; from here on
# `train` is an lgb.Dataset, so re-running that cell after this one will
# fail. Consider a distinct name if nothing else relies on this binding.
train = lgb.Dataset(data_train1.loc[:, feats],
                    data_train1.loc[:, LABEL])

val = lgb.Dataset(data_test1.loc[:, feats],
                  data_test1.loc[:, LABEL])

# Filled by the record_evaluation callback: results[<set name>][<metric>] is
# the list of per-iteration values.
results = {}
clf = lgb.train(
    params,
    train,
    num_boost_round=50,
    valid_sets=[train, val],
    valid_names=['train', 'test'],
    feval=[custom_auprc],
    # Logging and result recording go through callbacks, like early stopping;
    # the `verbose_eval` / `evals_result` keyword arguments were removed in
    # LightGBM 4.
    callbacks=[
        lgb.early_stopping(10),
        lgb.log_evaluation(10),
        lgb.record_evaluation(results),
    ],
)

import matplotlib.pyplot as plt

# One row per boosting iteration; every metric appears twice, suffixed with
# the dataset it was evaluated on.
history = pd.DataFrame(results['train']).join(
    pd.DataFrame(results['test']),
    lsuffix='_train', rsuffix='_test',
)

# Learning curves
fig = history[['binary_logloss_train', 'binary_logloss_test']].plot(
    kind='line', ylim=0, title="binary_logloss").get_figure()
# fig.savefig(f'curves/loss.png')
fig = history[['auprc_train', 'auprc_test']].plot(
    kind='line', title="auprc").get_figure()
# fig.savefig(f'curves/auprc.png')

# Feature importance (by total gain)
ax = lgb.plot_importance(clf, max_num_features=100, importance_type='gain', figsize=(6, 20))
# plt.show() accepts no positional arguments; the figure size is already set
# through `figsize` above.
plt.show()

## Plot metrics computed from predictions at each boosting step
def get_metrics(model, df_test, df_train, max_iter=80):
    """Score ``model`` on train and test data after each of the first ``max_iter`` rounds.

    For every ``i`` in ``1..max_iter`` the positive-class probability is
    predicted with ``num_iteration=i`` and both average precision and log
    loss are computed from those probabilities.

    Parameters
    ----------
    model : estimator
        Must expose ``predict_proba(X, num_iteration=...)`` returning one
        column per class -- presumably a fitted LightGBM scikit-learn
        wrapper; confirm against the caller.
    df_test, df_train : pandas.DataFrame
        Frames containing the columns listed in the module-level ``feats``
        and a ``'Label'`` column.
    max_iter : int, default 80
        Number of iteration counts to evaluate.

    Returns
    -------
    pandas.DataFrame
        ``max_iter`` rows with columns ``average_precision_train``,
        ``logloss_train``, ``average_precision_test``, ``logloss_test``.
    """
    from sklearn.metrics import average_precision_score, log_loss

    def _curves(frame):
        """Return (average-precision list, log-loss list) for one data frame."""
        # Feature matrix and labels do not change between iterations.
        features = frame.loc[:, feats]
        labels = frame['Label'].values
        ap_scores = []
        losses = []
        for n_rounds in range(1, max_iter + 1):
            proba = model.predict_proba(features, num_iteration=n_rounds)[:, 1]
            ap_scores.append(average_precision_score(labels, proba))
            # Log loss must be computed from probabilities; hard class
            # predictions would be clipped to ~0/1 and inflate the loss.
            losses.append(log_loss(labels, proba))
        return ap_scores, losses

    auprc_test, loss_test = _curves(df_test)
    auprc_train, loss_train = _curves(df_train)

    history = pd.DataFrame({
        'average_precision_train': auprc_train,
        'logloss_train': loss_train,
        'average_precision_test': auprc_test,
        'logloss_test': loss_test
    })
    return history
# NOTE(review): `model` is not defined in the cells shown here (the booster
# trained above is bound to `clf`). get_metrics calls predict_proba on it, so
# it is presumably a fitted scikit-learn-style classifier from another cell --
# confirm. This also rebinds `history`, replacing the frame built from the
# training results.
history = get_metrics(
    model=model,
    df_test=data_test1,
    df_train=data_train1,
)