In [1]:
import pandas as pd
from tqdm import tqdm
import warnings
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

warnings.filterwarnings('ignore')
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [2]:
# 目标编码
def kfold_mean(df_train, df_test, target, target_mean_list):
    folds = StratifiedKFold(n_splits=5)

    mean_of_target = df_train[target].mean()

    for fold_, (trn_idx, val_idx) in tqdm(
            enumerate(folds.split(df_train, y=df_train['label']))):
        tr_x = df_train.iloc[trn_idx, :]
        vl_x = df_train.iloc[val_idx, :]

        for col in target_mean_list:
            df_train.loc[vl_x.index, f'{col}_target_enc'] = vl_x[col].map(
                tr_x.groupby(col)[target].mean())

    for col in target_mean_list:
        df_train[f'{col}_target_enc'].fillna(mean_of_target, inplace=True)

        df_test[f'{col}_target_enc'] = df_test[col].map(
            df_train.groupby(col)[f'{col}_target_enc'].mean())

        df_test[f'{col}_target_enc'].fillna(mean_of_target, inplace=True)
    return pd.concat([df_train, df_test], ignore_index=True)

feature_list =  ['HYZK', 'ZHIYE', 'ZHICHEN', 'ZHIWU', 'XUELI', 'DWJJLX', 'DWSSHY', 'GRZHZT'
                   ]
data = pd.concat([train, test], ignore_index=True)
data = kfold_mean(data[~data['label'].isna()], data[data['label'].isna()],
                  'label',
                  feature_list)

5it [00:01,  4.87it/s]


In [3]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,CSNY,DKFFE,DKLL,DKYE,DWJJLX,DWSSHY,DWYJCE,GRJCJS,GRYJCE,GRZHDNGJYE,GRZHSNJZYE,GRZHYE,GRZHZT,HYZK,XINGBIE,XUELI,ZHICHEN,ZHIWU,ZHIYE,id,label,HYZK_target_enc,ZHIYE_target_enc,ZHICHEN_target_enc,ZHIWU_target_enc,XUELI_target_enc,DWJJLX_target_enc,DWSSHY_target_enc,GRZHZT_target_enc
0,1038672000,175237,2.708,154112.935,150,12,312.0,1737.0,312.0,837.0,801.31,3223.515,1,90,1,99,999,0,90,train_0,0.0,0.068926,0.068813,0.068651,0.068915,0.068915,0.024447,0.061192,0.064884
1,504892800,300237,2.979,298252.945,110,0,795.84,4894.0,795.84,1065.2,53213.22,18055.195,1,90,2,99,999,0,90,train_1,0.0,0.068926,0.068813,0.068651,0.068915,0.068915,0.093762,0.016744,0.064884
2,736185600,150237,2.708,147339.13,150,9,1444.2,10297.0,1444.2,7230.02,13963.14,27426.6,1,90,1,99,999,0,90,train_2,0.0,0.068926,0.068813,0.068651,0.068915,0.068915,0.024447,0.04409,0.064884
3,428515200,350237,2.708,300653.78,150,7,1417.14,10071.5,1417.14,2271.295,99701.265,111871.13,1,90,1,99,999,0,90,train_3,0.0,0.068926,0.068813,0.068651,0.068915,0.068915,0.024447,0.012766,0.064884
4,544204800,150237,2.708,145185.01,900,14,325.5,2007.0,325.5,35.78,11028.875,237.0,1,90,2,99,999,0,90,train_4,0.0,0.068926,0.068813,0.068651,0.068915,0.068915,0.112081,0.076842,0.064884


In [4]:
# 频数统计
cat_col = ['HYZK', 'ZHIYE', 'ZHICHEN', 'ZHIWU', 'XUELI', 'DWJJLX', 'DWSSHY', 'GRZHZT']
for col in cat_col:
    data[col + '_COUNT'] = data[col].map(data[col].value_counts())
    col_idx = data[col].value_counts()
    for idx in col_idx[col_idx < 10].index:
        data[col] = data[col].replace(idx, -1)  

In [5]:
# 偏离值特征
group_list = ['HYZK', 'ZHIYE', 'ZHICHEN', 'ZHIWU', 'XUELI', 'DWJJLX', 'DWSSHY', 'GRZHZT']
num_feature_list = ['GRYJCE', 'DKFFE', 'DKLL', 'DKYE', 'GRJCJS', 'GRZHSNJZYE', 'GRZHDNGJYE']                   
for group in group_list:
    for feature in num_feature_list:
        tmp = data.groupby(group)[feature].agg([sum, min, max, np.mean]).reset_index()
        tmp = pd.merge(data, tmp, on=group, how='left')
        data['{}-mean_gb_{}'.format(feature, group)] = data[feature] - tmp['mean']
        data['{}-min_gb_{}'.format(feature, group)] = data[feature] - tmp['min']
        data['{}-max_gb_{}'.format(feature, group)] = data[feature] - tmp['max']
        data['{}/sum_gb_{}'.format(feature, group)] = data[feature] / tmp['sum']  

In [6]:
X_train, X_test = data[~data['label'].isna()], data[data['label'].isna()]
y = X_train['label']
drop_features = ['label', 'id', 'CSNY']

X_train = X_train.drop(drop_features, axis=1)
X_test = X_test.drop(drop_features, axis=1)

cat_col = ['HYZK', 'ZHIYE', 'ZHICHEN', 'ZHIWU', 'XUELI', 'DWJJLX', 'DWSSHY', 'GRZHZT']
X_train[cat_col] = X_train[cat_col].astype('category')
X_test[cat_col] = X_test[cat_col].astype('category')

In [7]:
# 评价指标
def tpr_weight_funtion(y_true,y_predict):

    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true)
    d = d.sort_values(['prob'], ascending=[0])
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]
    NegAll = pd.Series(y).value_counts()[0]
    pCumsum = d['y'].cumsum()
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll
    nCumsumPer = nCumsum / NegAll
    TR1 = pCumsumPer[abs(nCumsumPer-0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer-0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer-0.01).idxmin()]
    return 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3

In [8]:
# 训练&预测
KF = StratifiedKFold(n_splits=5, random_state=2020)
params = {
    'verbose':-1, 
    'objective':'binary',
    'metric':'auc',
    'num_iterations': 10000, 
}


oof_lgb = np.zeros(len(X_train))
predictions_lgb = np.zeros((len(X_test)))

for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y.values)):
    print("fold n°{}".format(fold_))
    print('trn_idx:',trn_idx)
    print('val_idx:',val_idx)
    trn_data = lgb.Dataset(X_train.iloc[trn_idx],label=y.iloc[trn_idx])    
    val_data = lgb.Dataset(X_train.iloc[val_idx],label=y.iloc[val_idx])
    num_round = 10000
    clf = lgb.train(
        params,
                    trn_data,
                    num_round,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=500,
                    early_stopping_rounds=200,  
        categorical_feature=cat_col
    )
        
    oof_lgb[val_idx] = clf.predict(X_train.iloc[val_idx], num_iteration=clf.best_iteration)
    predictions_lgb[:] += clf.predict(X_test, num_iteration=clf.best_iteration) 
print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
print("TPR weight: {}".format(tpr_weight_funtion(y, oof_lgb)))

fold n°0
trn_idx: [ 7984  7985  7986 ... 39997 39998 39999]
val_idx: [   0    1    2 ... 8282 8288 8306]
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[188]	training's auc: 0.998578	valid_1's auc: 0.937945
fold n°1
trn_idx: [    0     1     2 ... 39997 39998 39999]
val_idx: [ 7984  7985  7986 ... 15999 16011 16014]
Training until validation scores don't improve for 200 rounds
[500]	training's auc: 1	valid_1's auc: 0.931402
Early stopping, best iteration is:
[337]	training's auc: 0.999957	valid_1's auc: 0.933264
fold n°2
trn_idx: [    0     1     2 ... 39997 39998 39999]
val_idx: [16000 16001 16002 ... 24455 24458 24460]
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[53]	training's auc: 0.979346	valid_1's auc: 0.917012
fold n°3
trn_idx: [    0     1     2 ... 39997 39998 39999]
val_idx: [23964 23965 23966 ... 32014 32019 32053]
Training until validation scores don't improve for 200 roun

In [1]:
submit = pd.read_csv('../data/submit.csv')
submit['label'] = predictions_lgb / 5
submit.to_csv('0111-01.csv', index=False)

NameError: name 'pd' is not defined