In [3]:
import pandas as pd
import numpy as np
import os 
import random 
import gc
from sklearn.metrics import *
from tqdm import tqdm 
import warnings 
warnings.filterwarnings('ignore')

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 100)

In [5]:
data_path_1 = 'A榜数据/主表数据/'
data_path_2 = 'A榜数据/其他数据表/'

In [6]:
def seed_everything(seed=2022):
    seed = int(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(seed=1998)

train = pd.read_csv(data_path_1 + 'x_train.csv')
test = pd.read_csv(data_path_1 + 'x_test.csv')
train_label = pd.read_csv(data_path_1 + 'y_train.csv')
train = train.merge(train_label, on='id', how='left')

test.rename(columns={'c2':'a2','c3':'a3'}, inplace=True)
df = pd.concat([train, test]).reset_index(drop=True)
print(df.shape)
df['rank'] = [i for i in range(df.shape[0])]

(2953495, 6)


## 客户信息表

In [7]:
d = pd.read_csv(data_path_2 + 'd.csv')
#缺失的客户等级按照5来填充（LGB是否考虑不填充）
d.fillna(5, inplace=True)
age = d['d3'].value_counts()

In [8]:
def age_level(array, level, dd_data):
    
    less_num = array
    less_num_array = []
    for i in range(len(less_num)):
        less_num_array.append(str(i))

    for j in range(len(less_num)):
        less_num_array[j] = dd_data[dd_data['d3'] == less_num[j]]

    less_data = pd.concat([less_num_array[0], less_num_array[1]] ,ignore_index=True)

    for i in range(2,len(less_num_array),1):
        less_data = pd.concat([less_data, less_num_array[i]], ignore_index=True)

    less_data['age_level'] = level
    return less_data

In [9]:
def split_level(number, split_num, age):
    num = split_num / number
    
    num_array = [str(i) for i in range(number)]
    new_array = [str(i) for i in range(number)]

    for i in range(number):
        num_array[i] = age[age.values < (i+1) * num ].index

    sum = 0 
    for i in range(number):
        if i == 0:
            new_array[i] = list(set(num_array[i]))
        else:
            new_array[i] = list(set(num_array[i]) - set(num_array[i-1]))
        sum = sum + len(new_array[i])
    return new_array

In [10]:
# age划分为4个阶段

split_num = 10000
number = 4  # 修改该位置，调整年龄的范围
num = split_num / number

new_array_data = [str(i) for i in range(number)]
new_array = split_level(number, split_num, age)

dd_data = d.copy()
for i in range(len(new_array)):
    try:
        new_array_data[i] = age_level(new_array[i], i, dd_data)
    except:
        continue

d_data_4 = pd.concat([new_array_data[0], new_array_data[1]])

for i in range(2, number, 1):
    try:
        d_data_4 = pd.concat([d_data_4, new_array_data[i]])
    except:
        continue
d_data_4.rename(columns={'age_level': 'age_level_4'}, inplace=True)


#### age划分为10个阶段
split_num = 10000
number = 10  # 修改该位置，调整年龄的范围
num = split_num / number

new_array_data = [str(i) for i in range(number)]
new_array = split_level(number, split_num, age)

#dd_data = d_data.copy()
for i in range(len(new_array)):
    try:
        new_array_data[i] = age_level(new_array[i], i, dd_data)
    except:
        continue

d_data_10 = pd.concat([new_array_data[0], new_array_data[1]])

for i in range(2, number, 1):
    try:
        d_data_10 = pd.concat([d_data_10, new_array_data[i]])
    except:
        continue
d_data_10.rename(columns={'age_level': 'age_level_10'}, inplace=True)

In [11]:
from functools import reduce
d_list = [d_data_4, d_data_10]
d_data = reduce(lambda left,right: pd.merge(left,right,on=['core_cust_id', 'd1', 'd2', 'd3']), d_list)

In [12]:
df = df.merge(d_data, on='core_cust_id', how='left')

In [13]:
# 客户风险表
e = pd.read_csv(data_path_2 + 'e.csv')
e['date'] = e['e2'].apply(lambda x: str(x)[:6])

dfs = []
dict_ = {'2021-07-01':'202106', '2021-08-01':'202107', '2021-09-01':'202108', '2021-10-01':'202109'}

for month in sorted(df['a3'].unique()):
    print(month)
    tmp_df = df[df['a3'] == month]
    
    stat_1 = e[e['date'] == dict_[month]].groupby('core_cust_id')['e1'].count().reset_index()
    stat_1.columns = ['core_cust_id','risk_count']
    stat_1['risk_level_mean']   = e[e['date'] == dict_[month]].groupby('core_cust_id')['e1'].agg('mean').values
    
    tmp_df = tmp_df.merge(stat_1, on='core_cust_id', how='left')
    dfs.append(tmp_df)

df = pd.concat(dfs).reset_index(drop=True)

2021-07-01
2021-08-01
2021-09-01
2021-10-01


In [14]:
df.head(2)

Unnamed: 0,id,core_cust_id,prod_code,a2,a3,y,rank,d1,d2,d3,age_level_4,age_level_10,risk_count,risk_level_mean
0,9b716f007df84378944ac6a9854838b0,6e2105d9fe,DECD21062102,2,2021-07-01,0.0,5,1,4.0,32,3,8,,
1,458c3ff67ee14b12a13dcd21df8a0db0,6e2105d9fe,DECD21062107,2,2021-07-01,0.0,7,1,4.0,32,3,8,,


## 产品表信息

In [15]:
product = pd.read_pickle('product.pkl')

#产品的再次分类
product['prod_4'] = product['prod_code'].apply(lambda x:x[:4])
product['prod_3'] = product['prod_code'].apply(lambda x:x[:3])
product['prod_1'] = product['prod_code'].apply(lambda x:x[:1])

#收益风险比
product['收益风险比'] = product['预期收益率'] / product['风险等级']

#持有天数和收益率关系
product['天数收益比'] = product['预期收益率'] / product['持有天数']

In [16]:
com_pid = set(product['prod_code'].unique()).intersection(set(df['prod_code'].unique()))  # 求交集

In [17]:
tmp = product[product['prod_code'].isin(com_pid)].describe().T

In [18]:
useful_cols = list(tmp[(tmp['std'] != 0) & (tmp['std'] != np.nan)].index) # 判断标准差
useful_cols = [c for c in useful_cols if c not in ['数据日期']] 
print(useful_cols)

if len(useful_cols) > 0:
    df = df.merge(product[['prod_code']+useful_cols], on='prod_code', how='left')

['计价类型', '周期类型', '模式', '风险等级', '是否允许变更分红方式', '持有天数', '管理方式', '业务模式', '收益特点', '期限', '投资模式', '预期收益率', '展示等级', 'prod_class', '收益风险比', '天数收益比']


In [19]:
df.head(2)

Unnamed: 0,id,core_cust_id,prod_code,a2,a3,y,rank,d1,d2,d3,age_level_4,age_level_10,risk_count,risk_level_mean,计价类型,周期类型,模式,风险等级,是否允许变更分红方式,持有天数,管理方式,业务模式,收益特点,期限,投资模式,预期收益率,展示等级,prod_class,收益风险比,天数收益比
0,9b716f007df84378944ac6a9854838b0,6e2105d9fe,DECD21062102,2,2021-07-01,0.0,5,1,4.0,32,3,8,,,0.0,0.0,2.0,,0.0,0.0,,,,,2.0,0.02315,5.0,2,,inf
1,458c3ff67ee14b12a13dcd21df8a0db0,6e2105d9fe,DECD21062107,2,2021-07-01,0.0,7,1,4.0,32,3,8,,,0.0,0.0,2.0,,0.0,0.0,,,,,2.0,0.043625,5.0,2,,inf


## 产品交易流水

In [20]:
p_record = pd.read_pickle('prod_record.pkl')
#日期的年月
p_record['date'] = p_record['trade_date'].apply(lambda x: str(x)[:6])
#申请金额apply_amont
p_record.rename(columns={'申请金额': 'apply_amt'}, inplace=True)
#交易状态 + 资金状态 + 业务代码 + 渠道标识 排序组合
p_record['deal_bus'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str')
p_record['deal_bus_channel'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str') + p_record['渠道标识'].astype('str')
p_record['deal_bus_fund'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str') + p_record['资金状态'].astype('str')
p_record['deal_bus_c_f'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str') + p_record['渠道标识'].astype('str') + p_record['资金状态'].astype('str')
#p_record['deal_bus'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str')
#整个月的平均值与最大值或最小值相比，如果等，那就是不买，如果不等，那就购买

In [21]:
p_record.rename(columns={'trade_date':'a3'},inplace = True)
p_record.rename(columns={'prod_class':'a2'},inplace = True)

In [22]:
p_record.columns

Index(['流水号', '业务代码', '渠道标识', 'core_cust_id', 'prod_code', '净值', 'apply_amt',
       '资金状态', '交易状态', 'a3', 'a2', 'date', 'deal_bus', 'deal_bus_channel',
       'deal_bus_fund', 'deal_bus_c_f'],
      dtype='object')

In [23]:
df.shape

(2953495, 30)

In [24]:
df.head(2)

Unnamed: 0,id,core_cust_id,prod_code,a2,a3,y,rank,d1,d2,d3,age_level_4,age_level_10,risk_count,risk_level_mean,计价类型,周期类型,模式,风险等级,是否允许变更分红方式,持有天数,管理方式,业务模式,收益特点,期限,投资模式,预期收益率,展示等级,prod_class,收益风险比,天数收益比
0,9b716f007df84378944ac6a9854838b0,6e2105d9fe,DECD21062102,2,2021-07-01,0.0,5,1,4.0,32,3,8,,,0.0,0.0,2.0,,0.0,0.0,,,,,2.0,0.02315,5.0,2,,inf
1,458c3ff67ee14b12a13dcd21df8a0db0,6e2105d9fe,DECD21062107,2,2021-07-01,0.0,7,1,4.0,32,3,8,,,0.0,0.0,2.0,,0.0,0.0,,,,,2.0,0.043625,5.0,2,,inf


In [25]:
df = df.merge(p_record, on= ['core_cust_id','prod_code','a3','a2'], how='left')

In [26]:
df.shape

(2953495, 42)

In [27]:
df.head(2)

Unnamed: 0,id,core_cust_id,prod_code,a2,a3,y,rank,d1,d2,d3,age_level_4,age_level_10,risk_count,risk_level_mean,计价类型,周期类型,模式,风险等级,是否允许变更分红方式,持有天数,管理方式,业务模式,收益特点,期限,投资模式,预期收益率,展示等级,prod_class,收益风险比,天数收益比,流水号,业务代码,渠道标识,净值,apply_amt,资金状态,交易状态,date,deal_bus,deal_bus_channel,deal_bus_fund,deal_bus_c_f
0,9b716f007df84378944ac6a9854838b0,6e2105d9fe,DECD21062102,2,2021-07-01,0.0,5,1,4.0,32,3,8,,,0.0,0.0,2.0,,0.0,0.0,,,,,2.0,0.02315,5.0,2,,inf,,,,,,,,,,,,
1,458c3ff67ee14b12a13dcd21df8a0db0,6e2105d9fe,DECD21062107,2,2021-07-01,0.0,7,1,4.0,32,3,8,,,0.0,0.0,2.0,,0.0,0.0,,,,,2.0,0.043625,5.0,2,,inf,,,,,,,,,,,,


In [28]:
def diff_max_min(x):
    return x.max() - x.min()

In [29]:
p_record.columns

Index(['流水号', '业务代码', '渠道标识', 'core_cust_id', 'prod_code', '净值', 'apply_amt',
       '资金状态', '交易状态', 'a3', 'a2', 'date', 'deal_bus', 'deal_bus_channel',
       'deal_bus_fund', 'deal_bus_c_f'],
      dtype='object')

In [30]:
dict_ = {'2021-07-01':'202106', '2021-08-01':'202107', '2021-09-01':'202108', '2021-10-01':'202109'}
dict_1 = {'2021-07-01':['202105', '202106'], '2021-08-01':['202106', '202107'], 
          '2021-09-01':['202107', '202108'], '2021-10-01':['202108', '202109']}
#申请金额统计量特征
for k, data in enumerate([p_record]):
    print('p == ', k+1)
    dfs = []
    
    for month in sorted(df['a3'].unique()):
        print(month)
        tmp_df = df[df['a3'] == month]
        ####################################################
        #按照用户聚类，查看对应产品的数量、不同产品数量、平均不同产品数、申请金额的统计量
        #小于（目标月-1）前面所有月份
        stat_1 = data[data['date'] < dict_ [month]].groupby('core_cust_id')['prod_code'].count().reset_index()
        stat_1.columns = ['core_cust_id',f'uid_count{k+1}_bbnow']
        #print(stat_1.head(2))
        stat_1[f'pid_nunique{k+1}_bbnow']     = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
        stat_1[f'pid_mean_count{k+1}_bbnow']  = stat_1[f'uid_count{k+1}_bbnow'] / stat_1[f'pid_nunique{k+1}_bbnow']
        stat_1[f'apply_mean{k+1}_bbnow']      = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
        stat_1[f'apply_max{k+1}_bbnow']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values    # 我们加的
        stat_1[f'apply_min{k+1}_bbnow']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values    # 我们加的
        stat_1[f'apply_std{k+1}_bbnow']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values    # 我们加的
        stat_1[f'apply_median{k+1}_bbnow']    = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values # 我们加的
        stat_1[f'apply_sum{k+1}_bbnow']       = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
        stat_1[f'apply_max_min{k+1}_bbnow']   = data[data['date'] < dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
        stat_1[f'flow{k+1}_bbnow']            = data[data['date'] < dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
        
        #目标月前一个月
        stat_2 = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
        stat_2.columns = ['core_cust_id',f'uid_count{k+1}']
        stat_2[f'pid_nunique{k+1}']     = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
        stat_2[f'pid_mean_count{k+1}']  = stat_2[f'uid_count{k+1}'] / stat_2[f'pid_nunique{k+1}']
        stat_2[f'apply_mean{k+1}']      = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
        stat_2[f'apply_std{k+1}']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values
        stat_2[f'apply_max{k+1}']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('max').values        # 我们加的
        stat_2[f'apply_min{k+1}']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('min').values        # 我们加的
        stat_2[f'apply_median{k+1}']    = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values     # 我们加的
        stat_2[f'apply_sum{k+1}']       = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
        stat_2[f'apply_max_min{k+1}']   = data[data['date'] == dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values # 我们加的
        stat_2[f'flow{k+1}']            = data[data['date'] == dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
        #目标月前面所有月
#         stat_3 = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
#         stat_3.columns = ['core_cust_id',f'uid_count{k+1}_bnow']
#         stat_3[f'pid_nunique{k+1}_bnow']     = data[data['date'] <= dict_[month]].groupby(['core_cust_id'])['prod_code'].agg('nunique').values
#         stat_3[f'pid_mean_count{k+1}_bnow']  = stat_3[f'uid_count{k+1}_bnow'] / stat_3[f'pid_nunique{k+1}_bnow']
#         stat_3[f'apply_mean{k+1}_bnow']      = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('mean').values
#         stat_3[f'apply_std{k+1}_bnow']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('std').values
#         stat_3[f'apply_median{k+1}_bnow']    = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('median').values
#         stat_3[f'apply_sum{k+1}_bnow']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg('sum').values
#         stat_3[f'apply_max_min{k+1}_bnow']   = data[data['date'] <= dict_[month]].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
#         stat_3[f'flow{k+1}_bnow']       = data[data['date'] <= dict_[month]].groupby('core_cust_id')['流水号'].agg('count').values
        #目标月前两个月
        stat_4 = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['prod_code'].count().reset_index()
        stat_4.columns = ['core_cust_id',f'uid_count{k+1}_b2']
        stat_4[f'pid_nunique{k+1}_b2']     = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['prod_code'].agg('nunique').values
        stat_4[f'pid_mean_count{k+1}_b2']  = stat_4[f'uid_count{k+1}_b2'] / stat_4[f'pid_nunique{k+1}_b2']
        stat_4[f'apply_mean{k+1}_b2']      = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('mean').values
        stat_4[f'apply_std{k+1}_b2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('std').values
        stat_4[f'apply_max{k+1}_b2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('max').values
        stat_4[f'apply_min{k+1}_b2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('min').values
        stat_4[f'apply_median{k+1}_b2']    = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('median').values
        stat_4[f'apply_sum{k+1}_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg('sum').values
        stat_4[f'apply_max_min{k+1}_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['apply_amt'].agg(diff_max_min).values
        stat_4[f'flow{k+1}_2']       = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')['流水号'].agg('count').values
        ################################################################
        # 交易流水
        #小于（目标月-1）前面所有月份
        stat_5 = data[data['date'] < dict_ [month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
        stat_5.columns = ['core_cust_id', 'prod_code', f'pflow_count{k+1}_bbnow']
        #目标月前一个月
        stat_6 = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
        stat_6.columns = ['core_cust_id', 'prod_code', f'pflow_count{k+1}_all']
        #目标月前面所有月
#         stat_7 = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
#         stat_7.columns = ['core_cust_id', 'prod_code', f'pflow_count{k+1}']
        #目标月前两个月
        stat_8 = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
        stat_8.columns = ['core_cust_id', 'prod_code', f'pflow_count{k+1}']
        #####################################################################
        # 交易状态 + 资金状态 + 业务代码 + 渠道标识 排序组合
        
        # 整个月的平均值与最大值或最小值相比，如果等，那就是不买，如果不等，那就购买
        
        tmp_df = tmp_df.merge(stat_1, on= 'core_cust_id', how='left')
        tmp_df = tmp_df.merge(stat_2, on= 'core_cust_id', how='left')
#         tmp_df = tmp_df.merge(stat_3, on='core_cust_id', how='left')
        tmp_df = tmp_df.merge(stat_4, on= 'core_cust_id', how='left')
        tmp_df = tmp_df.merge(stat_5, on= ['core_cust_id','prod_code'], how='left') # 
        tmp_df = tmp_df.merge(stat_6, on= ['core_cust_id','prod_code'], how='left')
#         tmp_df = tmp_df.merge(stat_7, on='core_cust_id', how='left')
        tmp_df = tmp_df.merge(stat_8, on= ['core_cust_id','prod_code'], how='left')
        dfs.append(tmp_df)

    df = pd.concat(dfs).reset_index(drop=True)

p ==  1
2021-07-01
2021-08-01
2021-09-01
2021-10-01


In [31]:
df.columns

Index(['id', 'core_cust_id', 'prod_code', 'a2', 'a3', 'y', 'rank', 'd1', 'd2',
       'd3', 'age_level_4', 'age_level_10', 'risk_count', 'risk_level_mean',
       '计价类型', '周期类型', '模式', '风险等级', '是否允许变更分红方式', '持有天数', '管理方式', '业务模式',
       '收益特点', '期限', '投资模式', '预期收益率', '展示等级', 'prod_class', '收益风险比', '天数收益比',
       '流水号', '业务代码', '渠道标识', '净值', 'apply_amt', '资金状态', '交易状态', 'date',
       'deal_bus', 'deal_bus_channel', 'deal_bus_fund', 'deal_bus_c_f',
       'uid_count1_bbnow', 'pid_nunique1_bbnow', 'pid_mean_count1_bbnow',
       'apply_mean1_bbnow', 'apply_std1_bbnow', 'apply_median1_bbnow',
       'apply_sum1_bbnow', 'apply_max_min1_bbnow', 'flow1_bbnow', 'uid_count1',
       'pid_nunique1', 'pid_mean_count1', 'apply_mean1', 'apply_std1',
       'apply_median1', 'apply_sum1', 'apply_max_min1', 'flow1',
       'uid_count1_b2', 'pid_nunique1_b2', 'pid_mean_count1_b2',
       'apply_mean1_b2', 'apply_std1_b2', 'apply_median1_b2', 'apply_sum1_2',
       'apply_max_min1_2', 'flow1_2', 'pfl

## APP点击行为表

In [33]:
r = pd.read_csv(data_path_2 + 'r.csv')
r['date'] = r['r5'].apply(lambda x: x[:7])

dict_ = {'2021-07-01':'2021-06', '2021-08-01':'2021-07', '2021-09-01':'2021-08', '2021-10-01':'2021-09'}
dfs = []

for month in sorted(df['a3'].unique()):
    print(month)
    tmp_df = df[df['a3'] == month]

    stat_1 = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat_1.columns = ['core_cust_id','uid_click_action_count_all']
    stat_1['pid_click_action_nunique_all'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['prod_code'].agg('nunique').values
    
    tmp_df = tmp_df.merge(stat_1, on='core_cust_id', how='left')

    dfs.append(tmp_df)

df = pd.concat(dfs).reset_index(drop=True)

2021-07-01
2021-08-01
2021-09-01
2021-10-01


## 账户交易流水表

In [34]:
s = pd.read_csv(data_path_2 + 's.csv')
s['date'] = s['s7'].apply(lambda x: x[:7])
s['s4'] = s['s4'].apply(lambda x: str(x).replace(',','')).astype('float') 

dict_ = {'2021-07-01':'2021-06', '2021-08-01':'2021-07', '2021-09-01':'2021-08', '2021-10-01':'2021-09'}
dfs = []

for month in sorted(df['a3'].unique()):
    print(month)
    tmp_df = df[df['a3'] == month]
    
    stat_1 = s[s['date'] == dict_[month]].groupby('s3')['s1'].count().reset_index()
    stat_1.columns = ['core_cust_id','borrow_count']
    stat_1['borrow_mean']   = s[s['date'] == dict_[month]].groupby('s3')['s4'].agg('mean').values
    stat_1['borrow_sum']    = s[s['date'] == dict_[month]].groupby('s3')['s4'].agg('sum').values
    stat_1['borrow_max']    = s[s['date'] == dict_[month]].groupby('s3')['s4'].agg('max').values    # 我们加的
    stat_1['borrow_min']    = s[s['date'] == dict_[month]].groupby('s3')['s4'].agg('min').values    # 我们加的
    stat_1['borrow_std']    = s[s['date'] == dict_[month]].groupby('s3')['s4'].agg('std').values    # 我们加的
    stat_1['borrow_median'] = s[s['date'] == dict_[month]].groupby('s3')['s4'].agg('median').values # 我们加的
    
    stat_2 = s[s['date'] == dict_[month]].groupby('s6')['s1'].count().reset_index()
    stat_2.columns = ['core_cust_id','loan_count']
    stat_2['loan_mean']   = s[s['date'] == dict_[month]].groupby('s6')['s4'].agg('mean').values
    stat_2['loan_sum']    = s[s['date'] == dict_[month]].groupby('s6')['s4'].agg('sum').values
    stat_2['loan_max']    = s[s['date'] == dict_[month]].groupby('s6')['s4'].agg('max').values    # 我们加的
    stat_2['loan_min']    = s[s['date'] == dict_[month]].groupby('s6')['s4'].agg('min').values    # 我们加的
    stat_2['loan_std']    = s[s['date'] == dict_[month]].groupby('s6')['s4'].agg('std').values    # 我们加的
    stat_2['loan_median'] = s[s['date'] == dict_[month]].groupby('s6')['s4'].agg('median').values # 我们加的

    tmp_df = tmp_df.merge(stat_1, on='core_cust_id', how='left')
    tmp_df = tmp_df.merge(stat_2, on='core_cust_id', how='left')
    
    dfs.append(tmp_df)

df = pd.concat(dfs).reset_index(drop=True)

2021-07-01
2021-08-01
2021-09-01
2021-10-01


## 资产信息表

In [35]:
f = pd.read_csv(data_path_2 + 'f.csv')

f['date'] = f['f22'].apply(lambda x: str(x)[:6])
used_cols = [c for c in f.columns if c not in ['core_cust_id','f1','f22','date']]

for c in used_cols:
    f[c] = f[c].apply(lambda x: str(x).replace(',','')).astype('float')

dict_ = {'2021-07-01':'202106', '2021-08-01':'202107', '2021-09-01':'202108', '2021-10-01':'202109'}
dfs = []

for month in sorted(df['a3'].unique()):
    print(month)

    tmp_df = df[df['a3'] == month]
    
    stat_1 = f[f['date'] == dict_[month]].groupby('core_cust_id')[used_cols].mean().reset_index()
    stat_1.columns = ['core_cust_id'] + [f'{c}_mean' for c in used_cols]
    
    stat_2 = f[f['date'] < dict_[month]].groupby('core_cust_id')[used_cols].mean().reset_index()
    stat_2.columns = ['core_cust_id'] + [f'{c}_mean_all' for c in used_cols]
 
    tmp_df = tmp_df.merge(stat_1, on='core_cust_id', how='left')
    tmp_df = tmp_df.merge(stat_2, on='core_cust_id', how='left')

    dfs.append(tmp_df)

df = pd.concat(dfs).reset_index(drop=True)

2021-07-01
2021-08-01
2021-09-01
2021-10-01


In [36]:
df = df.sort_values('rank').reset_index(drop=True)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2953495 entries, 0 to 2953494
Columns: 124 entries, id to f21_mean_all
dtypes: float64(107), int64(7), object(10)
memory usage: 2.7+ GB


In [38]:
df.select_dtypes(include='object').head(2)

Unnamed: 0,id,core_cust_id,prod_code,a3,流水号,date,deal_bus,deal_bus_channel,deal_bus_fund,deal_bus_c_f
0,4e3c3d57b83e425f8087b1d6d32a50f7,6e2105d9fe,90318011,2021-08-01,,,,,,
1,aa83c5fc05414c4d9727f0b32882f80e,6e2105d9fe,GRHLA20211530,2021-09-01,,,,,,


In [39]:
features = [c for c in df.columns if c != 'y']

In [42]:
drop_cols = [c for c in features if df[c].dtype != 'object' and df[c].std() == 0]

In [44]:
drop_fea = ['id', 'core_cust_id', 'a3', 'y', 'prod_code', 'rank']

feature= [x for x in df.columns if x not in drop_fea]
print(len(feature))
print(feature)

118
['a2', 'd1', 'd2', 'd3', 'age_level_4', 'age_level_10', 'risk_count', 'risk_level_mean', '计价类型', '周期类型', '模式', '风险等级', '是否允许变更分红方式', '持有天数', '管理方式', '业务模式', '收益特点', '期限', '投资模式', '预期收益率', '展示等级', 'prod_class', '收益风险比', '天数收益比', '流水号', '业务代码', '渠道标识', '净值', 'apply_amt', '资金状态', '交易状态', 'date', 'deal_bus', 'deal_bus_channel', 'deal_bus_fund', 'deal_bus_c_f', 'uid_count1_bbnow', 'pid_nunique1_bbnow', 'pid_mean_count1_bbnow', 'apply_mean1_bbnow', 'apply_std1_bbnow', 'apply_median1_bbnow', 'apply_sum1_bbnow', 'apply_max_min1_bbnow', 'flow1_bbnow', 'uid_count1', 'pid_nunique1', 'pid_mean_count1', 'apply_mean1', 'apply_std1', 'apply_median1', 'apply_sum1', 'apply_max_min1', 'flow1', 'uid_count1_b2', 'pid_nunique1_b2', 'pid_mean_count1_b2', 'apply_mean1_b2', 'apply_std1_b2', 'apply_median1_b2', 'apply_sum1_2', 'apply_max_min1_2', 'flow1_2', 'pflow_count1_bbnow', 'pflow_count1_all', 'pflow_count1', 'uid_click_action_count_all', 'pid_click_action_nunique_all', 'borrow_count', 'borrow_mean', 

In [45]:
df['流水号']            = df['流水号'].astype('category')
df['date']             = df['date'].astype('category')
df['deal_bus']         = df['deal_bus'].astype('category')
df['deal_bus_channel'] = df['deal_bus_channel'].astype('category')
df['deal_bus_fund']    = df['deal_bus_fund'].astype('category')
df['deal_bus_c_f']     = df['deal_bus_c_f'].astype('category')

In [46]:
object_list = ['流水号', 'date', 'deal_bus', 'deal_bus_channel', 'deal_bus_fund', 'deal_bus_c_f']

In [47]:
df_0 = df[(df['a3'] < '2021-10-01') & (df['y'] == 0)].drop_duplicates(feature)
df_1 = df[(df['a3'] < '2021-10-01') & (df['y'] == 1)]
df_ = pd.concat([df_0, df_1]).sample(frac=1, random_state=2).reset_index(drop=True)

drop_fea = ['id','core_cust_id','a3','y','prod_code','rank'] 

In [48]:
df_0.shape, df_1.shape, df_.shape

((1053681, 124), (25434, 124), (1079115, 124))

In [73]:
X_train = df_[df_["a3"] == '2021-07-01'][feature_list].reset_index(drop=True)
y_train = df_[df_["a3"] == '2021-07-01']["y"]
X_valid = df_[df_["a3"] == '2021-08-01'][feature_list].reset_index(drop=True)
y_valid = df_[df_["a3"] == '2021-08-01']["y"]
X_test = df[df["a3"] == '2021-10-01'][feature_list].reset_index(drop=True)

In [74]:
X_test.shape, X_train.shape,X_valid.shape

((567362, 74), (332946, 74), (262095, 74))

In [77]:
from lightgbm import LGBMClassifier

clf = LGBMClassifier(num_leaves=256,
                     n_estimators=20000,
                     learning_rate=0.005,
                     verbose = -1,
                     max_bin = 100,
                     max_depth = 10,
                     feature_fraction_seed = 66,
                     feature_fraction = 0.7,
                     bagging_seed = 66,
                     bagging_freq = 1,
                     bagging_fraction = 0.95,
                     metric='auc',
                     lambda_l1=0.1,
                     lambda_l2=0.1, 
                     min_child_weight=30,
                     n_jobs=20)
# clf = LGBMClassifier()

clf.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=100,verbose=200) #, categorical_feature = object_list

gc.collect()

[200]	valid_0's auc: 0.905289
[400]	valid_0's auc: 0.908704
[600]	valid_0's auc: 0.910311
[800]	valid_0's auc: 0.911353
[1000]	valid_0's auc: 0.912412
[1200]	valid_0's auc: 0.913056
[1400]	valid_0's auc: 0.913372
[1600]	valid_0's auc: 0.913606
[1800]	valid_0's auc: 0.913738
[2000]	valid_0's auc: 0.913864
[2200]	valid_0's auc: 0.913974
[2400]	valid_0's auc: 0.914061
[2600]	valid_0's auc: 0.914159
[2800]	valid_0's auc: 0.914183


406

In [78]:
oof_prob = clf.predict_proba(X_valid[feature_list])[:, 1]  

oof_prob = clf.predict_proba(X_valid[feature_list])[:, 1]  

def find_best_threshold(y_valid, oof_prob):
    best_f2 = 0
    recall = 0
    precision = 0 
    for th in tqdm([i/2000 for i in range(0, 200)]):
        oof_prob_copy = oof_prob.copy()
        oof_prob_copy[oof_prob_copy >= th] = 1
        oof_prob_copy[oof_prob_copy < th] = 0

        recall = recall_score(y_valid, oof_prob_copy)
        precision = precision_score(y_valid, oof_prob_copy)
        f2 = 5*recall*precision / (4*precision+recall)
        if f2 > best_f2:
            best_th = th
            best_f2 = f2
#         gc.collect()
    recall = recall
    precision = precision
        
    return best_th, best_f2, recall, precision
    
best_th, best_f2, recall, precision = find_best_threshold(y_valid, oof_prob)
print("分界值", best_th)
print("F2评价分数", best_f2)
print("recall召回率", recall)
print("precision精确度", precision)

100%|██████████| 200/200 [00:41<00:00,  4.82it/s]

分界值 0.0995
F2评价分数 0.4514360070638536
recall召回率 0.646996136938857
precision精确度 0.2043589851474734





In [54]:
oof_prob = clf.predict_proba(X_valid[feature])[:, 1]  

oof_prob = clf.predict_proba(X_valid[feature])[:, 1]  

def find_best_threshold(y_valid, oof_prob):
    best_f2 = 0
    recall = 0
    precision = 0 
    for th in tqdm([i/2000 for i in range(0, 200)]):
        oof_prob_copy = oof_prob.copy()
        oof_prob_copy[oof_prob_copy >= th] = 1
        oof_prob_copy[oof_prob_copy < th] = 0

        recall = recall_score(y_valid, oof_prob_copy)
        precision = precision_score(y_valid, oof_prob_copy)
        f2 = 5*recall*precision / (4*precision+recall)
        if f2 > best_f2:
            best_th = th
            best_f2 = f2
#         gc.collect()
    recall = recall
    precision = precision
        
    return best_th, best_f2, recall, precision
    
best_th, best_f2, recall, precision = find_best_threshold(y_valid, oof_prob)
print("分界值", best_th)
print("F2评价分数", best_f2)
print("recall召回率", recall)
print("precision精确度", precision)

100%|██████████| 200/200 [00:41<00:00,  4.83it/s]

分界值 0.0995
F2评价分数 0.4515839976104692
recall召回率 0.6444651658452112
precision精确度 0.20553124601724798





In [79]:
lgb_predictors = [i for i in X_train.columns]
lgb_feat_imp = pd.Series(clf.feature_importances_, lgb_predictors).sort_values(ascending=False)

In [80]:
feature_list = lgb_feat_imp[lgb_feat_imp.values > 0].index.tolist()

In [81]:
lgb_feat_imp[:40]

预期收益率                           9751
持有天数                            8097
d3                              7716
pid_mean_count1_bbnow           7119
天数收益比                           7022
uid_click_action_count_all      6825
borrow_sum                      5472
apply_sum1_bbnow                5370
pid_click_action_nunique_all    5220
apply_mean1_bbnow               4921
loan_sum                        4858
borrow_median                   4582
apply_max_min1_bbnow            4194
apply_std1_bbnow                4046
uid_count1_bbnow                3922
pid_mean_count1                 3723
borrow_mean                     3668
borrow_std                      3556
d2                              3500
loan_mean                       3483
pid_nunique1_bbnow              3438
loan_median                     3274
收益风险比                           3248
uid_count1                      2954
apply_sum1                      2942
apply_median1_bbnow             2938
apply_std1                      2897
b

In [82]:
lgb_feat_imp[40:80]

pflow_count1_bbnow    1486
pid_nunique1          1435
d1                    1407
f3_mean               1386
f13_mean              1377
f19_mean              1093
flow1_bbnow           1058
loan_count            1030
期限                     990
flow1                  830
f11_mean               734
f4_mean                692
f14_mean               626
周期类型                   623
risk_level_mean        615
f20_mean               545
f9_mean                491
age_level_4            414
f21_mean               410
f16_mean               390
f15_mean               373
f6_mean                338
f10_mean               268
risk_count             257
f5_mean                256
风险等级                   224
a2                     100
投资模式                    84
模式                      72
管理方式                    43
业务模式                     5
prod_class               5
计价类型                     4
收益特点                     0
dtype: int32

In [84]:
X = pd.concat([X_train, X_valid]).reset_index(drop=True) # 
y = pd.concat([y_train, y_valid]).reset_index(drop=True) #

clf1 = LGBMClassifier(num_leaves=256,
                     n_estimators=int(clf.best_iteration_*1.2),
                     learning_rate=0.005,
                     verbose = -1,
                     max_bin = 100,
                     max_depth = 10,
                     feature_fraction_seed = 66,
                     feature_fraction = 0.7,
                     bagging_seed = 66,
                     bagging_freq = 1,
                     bagging_fraction = 0.95,
                     metric='auc',
                     lambda_l1=0.1,
                     lambda_l2=0.1, 
                     min_child_weight=30,
                     n_jobs=200)

clf1.fit(X, y)



KeyboardInterrupt: 

In [62]:
gc.collect
y_pre = clf1.predict_proba(X_test)[:, 1]  

res = test[['id']]
res['y'] = y_pre
res.loc[res['y'] >= best_th, 'y'] = 1
res.loc[res['y'] < best_th, 'y'] = 0
res.to_csv('submission_0118_v0.csv',index = False) 