In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

from datetime import date
import datetime as dt

# display for this notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
os.getcwd()

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load the raw competition data.
# data_path holds the raw CSVs; feature_path receives the engineered features.
data_path = '../../ml-datasets/O2O-Coupon-Usage-Forecast/'
feature_path = '../../ml-datasets/O2O-Coupon-Usage-Forecast/features/'

# keep_default_na=False keeps pandas from converting missing markers to NaN, so
# the rest of the notebook can test for missing values with the single literal
# 'null'. Per the original author's note it also keeps columns such as Date as
# strings, so no float suffix has to be handled.
dfoff = pd.read_csv(data_path + 'ccf_offline_stage1_train.csv', header=0, keep_default_na=False)
dfoff.columns = [column.lower() for column in dfoff.columns]

# Loading of the online training data is left disabled:
# dfon = pd.read_csv(data_path + 'ccf_online_stage1_train.csv', header=0, keep_default_na=False)

dftest = pd.read_csv(data_path + 'ccf_offline_stage1_test_revised.csv', header=0, keep_default_na=False)
dftest.columns = [column.lower() for column in dftest.columns]

dfoff.head()
dftest.head()

Unnamed: 0,user_id,merchant_id,coupon_id,discount_rate,distance,date_received,date
0,1439408,2632,,,0,,20160217.0
1,1439408,4663,11002.0,150:20,1,20160528.0,
2,1439408,2632,8591.0,20:1,0,20160217.0,
3,1439408,2632,1078.0,20:1,0,20160319.0,
4,1439408,2632,8591.0,20:1,0,20160613.0,


Unnamed: 0,user_id,merchant_id,coupon_id,discount_rate,distance,date_received
0,4129537,450,9983,30:5,1.0,20160712
1,6949378,1300,3429,30:5,,20160706
2,2166529,7113,6928,200:20,5.0,20160727
3,2166529,7113,1808,100:10,5.0,20160727
4,6172162,7605,6500,30:1,2.0,20160708


|  | 预测区间 | 特征区间 |
| :-: | :-: | :-: |
| 测试集 | Dataset3: 20160701-20160731 | feature3: 20160315-20160630 |
| 训练集 2 | Dataset2: 20160515-20160615 | feature2: 20160201-20160514 |
| 训练集 1 | Dataset1: 20160414-20160514 | feature1: 20160101-20160413 |

In [3]:
def _prediction_window(frame, start, end):
    """Return a copy of the rows whose coupon was received in [start, end].

    `start` and `end` are inclusive 'YYYYMMDD' strings and are compared as
    strings against the `date_received` column, so the missing marker 'null'
    (which sorts after every digit string) never falls inside a window.
    """
    received = (frame['date_received'] >= start) & (frame['date_received'] <= end)
    # .copy() detaches the result from `frame`, so later feature code can add
    # columns without chained-assignment warnings or touching the raw data.
    return frame[received].copy()


def _feature_window(frame, start, end):
    """Return a copy of the rows that belong to the feature window [start, end].

    A row belongs to the window when its purchase date lies in the window, or
    when it has no purchase (`date` == 'null') and its coupon was received in
    the window. Bounds are inclusive 'YYYYMMDD' strings.
    """
    bought_in_window = (frame['date'] >= start) & (frame['date'] <= end)
    received_unused = ((frame['date'] == 'null')
                       & (frame['date_received'] >= start)
                       & (frame['date_received'] <= end))
    return frame[bought_in_window | received_unused].copy()


# Training split 1
dataset1 = _prediction_window(dfoff, '20160414', '20160514')
feature1 = _feature_window(dfoff, '20160101', '20160413')

# Training split 2
dataset2 = _prediction_window(dfoff, '20160515', '20160615')
feature2 = _feature_window(dfoff, '20160201', '20160514')

# Test split: the prediction window is the provided test file. It is copied so
# that feature engineering never modifies dftest itself.
dataset3 = dftest.copy()
feature3 = _feature_window(dfoff, '20160315', '20160630')

In [4]:
# 1. 获取预测区间中有关优惠券的特征：利用了预测空间和特征空间

# Normalise every discount descriptor to a single discount rate
def calc_discount_rate(s):
    """Convert a raw discount descriptor into a discount rate.

    Parameters
    ----------
    s : object
        Value whose ``str()`` is either ``'x:y'`` (spend x, save y), a plain
        rate such as ``'0.95'``, or a missing marker (``'null'`` or ``''``).

    Returns
    -------
    float
        ``1 - y/x`` for an ``'x:y'`` descriptor, the parsed number for a plain
        rate, and NaN for a missing marker.

    Raises
    ------
    ValueError
        If a non-missing descriptor cannot be parsed as a number.
    """
    text = str(s)
    # 'null' is this notebook's missing marker; float('null') would raise, so
    # report a missing rate as NaN instead of crashing the whole apply().
    if text in ('null', ''):
        return float('nan')
    parts = text.split(':')
    if len(parts) == 1:
        return float(parts[0])
    return 1.0 - float(parts[1]) / float(parts[0])

# Spending threshold that must be reached before the discount applies
def get_discount_man(s):
    """Return the threshold of an 'x:y' descriptor as an int.

    The argument is converted with str(). When it contains no ':' (i.e. it is
    not a threshold-style coupon) the string 'null' is returned instead.
    """
    threshold, separator, _ = str(s).partition(':')
    if not separator:
        return 'null'
    return int(threshold)

# Amount taken off once the spending threshold is reached
def get_discount_jian(s):
    """Return the reduction of an 'x:y' descriptor as an int.

    The argument is converted with str(). When it contains no ':' the string
    'null' is returned instead.
    """
    pieces = str(s).split(':')
    return int(pieces[1]) if len(pieces) > 1 else 'null'

# Flag threshold-style ("spend x, save y") coupons as opposed to plain rates
def is_man_jian(s):
    """Return 1 when str(s) contains ':' (an 'x:y' descriptor), else 0."""
    return 1 if ':' in str(s) else 0

    
# Assemble all coupon-related features
def get_coupon_feats(dataset, feature):
    """Add coupon-level features to the prediction-window rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Prediction-window rows. Must provide `date_received` (whose string form
        starts with 'YYYYMMDD'), `discount_rate` (raw descriptor) and
        `coupon_id`.
    feature : pandas.DataFrame
        Feature-window rows. Only its `date` column is read; 'null' marks rows
        without a purchase.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of `dataset` plus `day_of_week` (1-7),
        `day_of_month`, `days_distance`, `discount_man`, `discount_jian`,
        `is_man_jian` and `coupon_count`; `discount_rate` is replaced by the
        numeric rate. `dataset` itself is left untouched.

    Raises
    ------
    ValueError
        If `feature` contains no purchase date, so `days_distance` is undefined.
    """
    # Work on a private copy. Overwriting discount_rate on the caller's frame
    # would make a second call see floats instead of 'x:y' strings, and every
    # threshold feature would then silently become 'null' / 0.
    dataset = dataset.copy()

    def _to_date(text):
        """Parse the leading 'YYYYMMDD' of `text` into a datetime.date."""
        return date(int(text[0:4]), int(text[4:6]), int(text[6:8]))

    received_text = dataset['date_received'].astype('str')
    received_on = received_text.apply(_to_date)

    # Weekday of the receipt, 1 (Monday) to 7 (Sunday)
    dataset['day_of_week'] = received_on.apply(lambda day: day.weekday() + 1)

    # Day of the month on which the coupon was received
    dataset['day_of_month'] = received_text.apply(lambda x: int(x[6:8]))

    # Days between the receipt and the latest purchase date seen in the feature
    # window. The latest date is parsed once instead of once per row.
    purchase_dates = feature.loc[feature['date'] != 'null', 'date']
    if purchase_dates.empty:
        raise ValueError('feature contains no purchase date; days_distance cannot be computed')
    last_purchase = _to_date(purchase_dates.max())
    dataset['days_distance'] = received_on.apply(lambda day: (day - last_purchase).days)

    # All discount features are derived from the raw descriptor before it is
    # replaced by the numeric rate.
    raw_discount = dataset['discount_rate']
    dataset['discount_man'] = raw_discount.apply(get_discount_man)    # spending threshold
    dataset['discount_jian'] = raw_discount.apply(get_discount_jian)  # amount taken off
    dataset['is_man_jian'] = raw_discount.apply(is_man_jian)          # threshold-style flag
    dataset['discount_rate'] = raw_discount.apply(calc_discount_rate)

    # Number of prediction-window rows per coupon
    coupon_count = dataset.groupby('coupon_id').size().rename('coupon_count').reset_index()
    dataset = pd.merge(dataset, coupon_count, on='coupon_id', how='left')

    return dataset


In [5]:
# 2. Merchant features: computed from the feature window only
def get_merchant_feats(feature):
    """Build one row of aggregate statistics per merchant.

    `feature` must provide merchant_id, coupon_id, distance, date_received and
    date, with the string 'null' marking missing values.

    The result has the columns merchant_id, total_sales, sales_use_coupon,
    total_coupon, merchant_{min,max,mean,median}_distance,
    merchant_coupon_transfer_rate and coupon_rate.
    """
    records = feature[['merchant_id','coupon_id','distance','date_received','date']].copy()
    purchased = records['date'] != 'null'
    has_coupon = records['coupon_id'] != 'null'

    def tally(mask, column):
        """Count the rows selected by `mask` per merchant into `column`."""
        picked = records[mask][['merchant_id']].copy()
        picked[column] = 1
        return picked.groupby('merchant_id').agg('sum').reset_index()

    pieces = [
        tally(purchased, 'total_sales'),                    # all purchases
        tally(purchased & has_coupon, 'sales_use_coupon'),  # purchases made with a coupon
        tally(has_coupon, 'total_coupon'),                  # coupons handed out
    ]

    # Distance statistics over coupon purchases. 'null' is routed through the
    # sentinel -1 so the column can be cast to int, then turned into NaN.
    redeemed = records[purchased & has_coupon][['merchant_id','distance']].copy()
    redeemed.replace('null', -1, inplace=True)
    redeemed.distance = redeemed.distance.astype('int')
    redeemed.replace(-1, np.nan, inplace=True)
    for stat in ('min', 'max', 'mean', 'median'):
        summary = redeemed.groupby('merchant_id').agg(stat).reset_index()
        pieces.append(summary.rename(columns={'distance': 'merchant_' + stat + '_distance'}))

    # Start from the distinct merchants and attach every statistic
    merchant_feature = records[['merchant_id']].drop_duplicates()
    for piece in pieces:
        merchant_feature = pd.merge(merchant_feature, piece, on='merchant_id', how='left')

    # Merchants without a coupon purchase count as 0
    coupon_sales = merchant_feature['sales_use_coupon'].replace(np.nan, 0)
    merchant_feature['sales_use_coupon'] = coupon_sales
    # Share of handed-out coupons that were redeemed
    merchant_feature['merchant_coupon_transfer_rate'] = coupon_sales.astype('float') / merchant_feature['total_coupon']
    # Share of purchases that used a coupon
    merchant_feature['coupon_rate'] = coupon_sales.astype('float') / merchant_feature['total_sales']
    # total_coupon is filled only after the ratio, so the ratio stays NaN
    # (rather than dividing by 0) for merchants without coupons
    merchant_feature['total_coupon'] = merchant_feature['total_coupon'].replace(np.nan, 0)

    return merchant_feature
    

In [6]:
# 3. 获取特征区间用户的相关特征：只在特征区间

# Days between receiving a coupon and using it
def get_user_date_datereceived_gap(s):
    """Return the day difference encoded in 'YYYYMMDD:YYYYMMDD'.

    The part before ':' is the purchase date and the part after it the receipt
    date; the result is purchase minus receipt, in days.
    """
    def parse(text):
        return date(int(text[0:4]), int(text[4:6]), int(text[6:8]))

    parts = s.split(':')
    used_on = parse(parts[0])
    received_on = parse(parts[1])
    return (used_on - received_on).days

# Assemble all user-level features
def get_user_feats(feature):
    """Build one row of aggregate statistics per user.

    `feature` must provide user_id, merchant_id, coupon_id, discount_rate,
    distance, date_received and date, with the string 'null' marking missing
    values.

    The result has the columns user_id, count_merchant,
    user_{min,max,mean,median}_distance, buy_use_coupon, buy_total,
    coupon_received, {avg,min,max}_user_date_datereceived_gap,
    buy_use_coupon_rate and user_coupon_transfer_rate.
    """
    user = feature[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']].copy()
    purchased = user['date'] != 'null'
    has_coupon = user['coupon_id'] != 'null'
    was_received = user['date_received'] != 'null'

    def per_user(frame, stat, old_name, new_name):
        """Aggregate `frame` per user with `stat` and rename the value column."""
        summary = frame.groupby('user_id').agg(stat).reset_index()
        return summary.rename(columns={old_name: new_name})

    def tally(mask, column):
        """Count the rows selected by `mask` per user into `column`."""
        picked = user[mask][['user_id']].copy()
        picked[column] = 1
        return picked.groupby('user_id').agg('sum').reset_index()

    # Number of distinct merchants the user bought from
    visited = user[purchased][['user_id','merchant_id']].copy()
    visited.drop_duplicates(inplace=True)
    visited['merchant_id'] = 1
    pieces = [per_user(visited, 'sum', 'merchant_id', 'count_merchant')]

    # Distance to the merchant over coupon purchases. 'null' is routed through
    # the sentinel -1 so the column can be cast to int, then turned into NaN.
    redeemed = user[purchased & has_coupon][['user_id','distance']].copy()
    redeemed.replace('null', -1, inplace=True)
    redeemed.distance = redeemed.distance.astype('int')
    redeemed.replace(-1, np.nan, inplace=True)
    for stat in ('min', 'max', 'mean', 'median'):
        pieces.append(per_user(redeemed, stat, 'distance', 'user_' + stat + '_distance'))

    pieces.append(tally(purchased & has_coupon, 'buy_use_coupon'))  # purchases made with a coupon
    pieces.append(tally(purchased, 'buy_total'))                    # all purchases
    pieces.append(tally(has_coupon, 'coupon_received'))             # coupons received

    # Days between receiving a coupon and using it: mean, min and max per user
    gaps = user[was_received & purchased][['user_id','date_received','date']].copy()
    gaps['user_date_datereceived_gap'] = (gaps.date + ':' + gaps.date_received).apply(get_user_date_datereceived_gap)
    gaps = gaps[['user_id','user_date_datereceived_gap']]
    for stat, prefix in (('mean', 'avg'), ('min', 'min'), ('max', 'max')):
        pieces.append(per_user(gaps, stat, 'user_date_datereceived_gap',
                               prefix + '_user_date_datereceived_gap'))

    # Start from the distinct users and attach every statistic
    user_feature = user[['user_id']].drop_duplicates()
    for piece in pieces:
        user_feature = pd.merge(user_feature, piece, on='user_id', how='left')

    # Users without any matching row count as 0
    user_feature['count_merchant'] = user_feature['count_merchant'].replace(np.nan,0)
    coupon_buys = user_feature['buy_use_coupon'].replace(np.nan,0)
    user_feature['buy_use_coupon'] = coupon_buys

    # Share of purchases that used a coupon, and share of received coupons that
    # were used. The denominators are filled with 0 only afterwards, so the
    # ratios stay NaN where a denominator is missing.
    user_feature['buy_use_coupon_rate'] = coupon_buys.astype('float') / user_feature['buy_total'].astype('float')
    user_feature['user_coupon_transfer_rate'] = coupon_buys.astype('float') / user_feature['coupon_received'].astype('float')
    user_feature['buy_total'] = user_feature['buy_total'].replace(np.nan,0)
    user_feature['coupon_received'] = user_feature['coupon_received'].replace(np.nan,0)

    return user_feature
    
    

In [7]:
# 4. User-merchant interaction features, computed from the feature window
def get_user_merchant_mutual_feats(feature):
    """Build one row of interaction statistics per (user_id, merchant_id) pair.

    `feature` must provide user_id, merchant_id, coupon_id, date_received and
    date, with the string 'null' marking missing values.

    The result has the columns user_id, merchant_id, user_merchant_buy_total,
    user_merchant_received, user_merchant_buy_use_coupon, user_merchant_any,
    user_merchant_buy_common and four ratio columns derived from them.
    """
    keys = ['user_id','merchant_id']
    purchased = feature['date'] != 'null'
    has_coupon = feature['coupon_id'] != 'null'
    was_received = feature['date_received'] != 'null'

    def pair_count(rows, column):
        """Count `rows` per (user, merchant) pair into `column`."""
        counted = rows[keys].copy()
        counted[column] = 1
        counted = counted.groupby(keys).agg('sum').reset_index()
        counted.drop_duplicates(inplace=True)
        return counted

    pieces = [
        # purchases of the user at the merchant
        pair_count(feature[purchased], 'user_merchant_buy_total'),
        # coupons the user received from the merchant
        pair_count(feature[has_coupon], 'user_merchant_received'),
        # purchases made with a coupon
        pair_count(feature[purchased & was_received], 'user_merchant_buy_use_coupon'),
        # all records of the pair, whatever their kind
        pair_count(feature, 'user_merchant_any'),
        # purchases made without a coupon
        pair_count(feature[purchased & (feature['coupon_id'] == 'null')], 'user_merchant_buy_common'),
    ]

    # Start from the distinct pairs and attach every count
    user_merchant = feature[keys].drop_duplicates()
    for piece in pieces:
        user_merchant = pd.merge(user_merchant, piece, on=keys, how='left')

    # Pairs without such purchases count as 0
    for column in ('user_merchant_buy_use_coupon', 'user_merchant_buy_common'):
        user_merchant[column] = user_merchant[column].replace(np.nan,0)

    # Conversion and purchase ratios
    coupon_buys = user_merchant['user_merchant_buy_use_coupon'].astype('float')
    all_buys = user_merchant['user_merchant_buy_total'].astype('float')
    user_merchant['user_merchant_coupon_transfer_rate'] = coupon_buys / user_merchant['user_merchant_received'].astype('float')
    user_merchant['user_merchant_coupon_buy_rate'] = coupon_buys / all_buys
    user_merchant['user_merchant_rate'] = all_buys / user_merchant['user_merchant_any'].astype('float')
    user_merchant['user_merchant_common_buy_rate'] = user_merchant['user_merchant_buy_common'].astype('float') / all_buys

    return user_merchant



In [8]:
# 5. 预测区域内的数据的特征挖掘，其实是有点儿 Leakage 的意味了

# Turn a date difference into a first/last-receipt flag
def is_firstlastone(x):
    """Map a difference to a flag: 1 when it is 0, 0 when it is positive, else -1.

    Anything that is neither 0 nor greater than 0 (negative numbers, NaN)
    yields -1.
    """
    if x == 0:
        return 1
    return 0 if x > 0 else -1

# Days since the closest earlier receipt in a list of receipt dates
def get_day_gap_before(s):
    """Return the smallest positive gap from the listed dates to the current one.

    `s` has the form 'YYYYMMDD-YYYYMMDD:YYYYMMDD:...': the current receipt date,
    a '-', then ':'-separated receipt dates. The result is the minimum of
    (current - listed) in days over the strictly positive differences, or -1
    when no listed date is earlier than the current one.
    """
    date_received, dates = s.split('-')
    current = dt.date(int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))
    differences = (
        (current - dt.date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days
        for d in dates.split(':')
    )
    earlier = [gap for gap in differences if gap > 0]
    return min(earlier) if earlier else -1

# Days until the closest later receipt in a list of receipt dates
def get_day_gap_after(s):
    """Return the smallest positive gap from the current date to the listed ones.

    `s` has the form 'YYYYMMDD-YYYYMMDD:YYYYMMDD:...': the current receipt date,
    a '-', then ':'-separated receipt dates. The result is the minimum of
    (listed - current) in days over the strictly positive differences, or -1
    when no listed date is later than the current one.
    """
    date_received, dates = s.split('-')
    current = dt.datetime(int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))
    differences = (
        (dt.datetime(int(d[0:4]), int(d[4:6]), int(d[6:8])) - current).days
        for d in dates.split(':')
    )
    later = [gap for gap in differences if gap > 0]
    return min(later) if later else -1
    
# Assemble the features mined from the prediction window itself
def get_other_feats(dataset):
    """Build receipt-pattern features from the prediction-window rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide `user_id`, `coupon_id` and `date_received`; the string
        form of `date_received` must be 'YYYYMMDD'.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (user_id, coupon_id, date_received) with, in
        order: this_month_user_receive_same_coupn_count,
        this_month_user_receive_all_coupon_count, date_received,
        this_month_user_receive_same_coupon_lastone,
        this_month_user_receive_same_coupon_firstone,
        this_day_receive_all_coupon_count,
        this_day_user_receive_same_coupon_count, day_gap_before, day_gap_after.

    Notes
    -----
    The two per-row frames are de-duplicated before the inner merges. Without
    that, a user who receives the same coupon k times on one day contributes
    k rows to each of them and k*k identical rows to the result, which then
    multiply again when the result is merged back onto the data set.
    """
    pair_keys = ['user_id', 'coupon_id']
    receipt_keys = ['user_id', 'coupon_id', 'date_received']

    # Coupons received by the user in the window
    t = dataset[['user_id']].copy()
    t['this_month_user_receive_all_coupon_count'] = 1
    t = t.groupby('user_id').agg('sum').reset_index()

    # Receipts of this particular coupon by the user in the window
    # (the misspelt column name is kept because it is part of the output schema)
    t1 = dataset[pair_keys].copy()
    t1['this_month_user_receive_same_coupn_count'] = 1
    t1 = t1.groupby(pair_keys).agg('sum').reset_index()

    # All receipt dates of a (user, coupon) pair, joined with ':'
    joined = dataset[receipt_keys].copy()
    joined['date_received'] = joined['date_received'].astype('str')
    joined = joined.groupby(pair_keys)['date_received'].agg(lambda x: ':'.join(x)).reset_index()

    # Earliest and latest receipt date, only for pairs received more than once
    t2 = joined.copy()
    t2['receive_number'] = t2['date_received'].apply(lambda s: len(s.split(':')))
    t2 = t2[t2['receive_number'] > 1].copy()
    t2['max_date_received'] = t2['date_received'].apply(lambda s: max([int(d) for d in s.split(':')]))
    t2['min_date_received'] = t2['date_received'].apply(lambda s: min([int(d) for d in s.split(':')]))
    # receive_number is dropped: it repeats this_month_user_receive_same_coupn_count
    t2 = t2[['user_id', 'coupon_id', 'max_date_received', 'min_date_received']]

    # Is this receipt the last / first one of the pair? Pairs received only once
    # have no min/max, so their differences are NaN and both flags become -1.
    t3 = pd.merge(dataset[receipt_keys], t2, on=pair_keys, how='left')
    received_as_int = t3['date_received'].astype(int)
    t3['this_month_user_receive_same_coupon_lastone'] = (
        t3['max_date_received'] - received_as_int).apply(is_firstlastone)
    t3['this_month_user_receive_same_coupon_firstone'] = (
        received_as_int - t3['min_date_received']).apply(is_firstlastone)
    t3 = t3[receipt_keys + ['this_month_user_receive_same_coupon_lastone',
                            'this_month_user_receive_same_coupon_firstone']]
    # Every column depends only on the receipt keys, so repeated receipts are
    # exact duplicate rows; keep one per key.
    t3 = t3.drop_duplicates()

    # Coupons received by the user on that day
    t4 = dataset[['user_id', 'date_received']].copy()
    t4['this_day_receive_all_coupon_count'] = 1
    t4 = t4.groupby(['user_id', 'date_received']).agg('sum').reset_index()

    # Receipts of this particular coupon by the user on that day
    t5 = dataset[receipt_keys].copy()
    t5['this_day_user_receive_same_coupon_count'] = 1
    t5 = t5.groupby(receipt_keys).agg('sum').reset_index()

    # Days to the closest earlier / later receipt of the same coupon
    t6 = joined.rename(columns={'date_received': 'dates'})
    t7 = pd.merge(dataset[receipt_keys], t6, on=pair_keys, how='left')
    t7['date_received_date'] = t7['date_received'].astype('str') + '-' + t7['dates']
    t7['day_gap_before'] = t7['date_received_date'].apply(get_day_gap_before)
    t7['day_gap_after'] = t7['date_received_date'].apply(get_day_gap_after)
    t7 = t7[receipt_keys + ['day_gap_before', 'day_gap_after']]
    # Same reasoning as for t3: one row per receipt key is enough.
    t7 = t7.drop_duplicates()

    # Combine everything
    other_feature = pd.merge(t1, t, on='user_id')
    other_feature = pd.merge(other_feature, t3, on=pair_keys)
    other_feature = pd.merge(other_feature, t4, on=['user_id', 'date_received'])
    other_feature = pd.merge(other_feature, t5, on=receipt_keys)
    other_feature = pd.merge(other_feature, t7, on=receipt_keys)

    return other_feature
    

In [9]:
# Derive the class label
def get_label(s):
    """Return the label encoded in 'purchase_date:receipt_date'.

    0 when the purchase date is 'null', 1 when the purchase happened at most
    15 days after the receipt, and -1 otherwise. Both dates are 'YYYYMMDD'.
    """
    def parse(text):
        return date(int(text[0:4]), int(text[4:6]), int(text[6:8]))

    fields = s.split(':')
    if fields[0] == 'null':
        return 0
    used_on = parse(fields[0])
    received_on = parse(fields[1])
    return 1 if (used_on - received_on).days <= 15 else -1
    
# Combine the five feature families into one modelling table
def DataProcess(dataset, feature, TrainFlag):
    """Build the final feature table for one prediction window.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Prediction-window rows (coupon receipts). It is not modified.
    feature : pandas.DataFrame
        Feature-window rows from which merchant, user and user-merchant
        statistics are computed.
    TrainFlag : bool
        When true, a `label` column is derived from `date` and `date_received`
        via get_label (1, 0 or -1) and both date columns are dropped. When
        false, no label is built and `date_received` is kept.

    Returns
    -------
    pandas.DataFrame
        The merged feature table with duplicate rows removed, the columns
        weekday1..weekday7 and is_weekend added, and every 'null' string
        replaced by NaN.
    """
    # Work on a private copy so that the feature builders can never modify the
    # caller's frame and a re-run always starts from the raw values.
    dataset = dataset.copy()

    # Compute each feature family separately
    other_feature = get_other_feats(dataset)
    merchant = get_merchant_feats(feature)
    user = get_user_feats(feature)
    user_merchant = get_user_merchant_mutual_feats(feature)
    coupon = get_coupon_feats(dataset, feature)

    # Join them onto the coupon-level rows
    dataset = pd.merge(coupon, merchant, on='merchant_id', how='left')
    dataset = pd.merge(dataset, user, on='user_id', how='left')
    dataset = pd.merge(dataset, user_merchant, on=['user_id', 'merchant_id'], how='left')
    dataset = pd.merge(dataset, other_feature, on=['user_id', 'coupon_id', 'date_received'], how='left')
    dataset.drop_duplicates(inplace=True)

    # Pairs that never appear in the feature window have no counts: use 0
    for column in ('user_merchant_buy_total', 'user_merchant_any', 'user_merchant_received'):
        dataset[column] = dataset[column].replace(np.nan, 0)
    dataset['is_weekend'] = dataset['day_of_week'].apply(lambda x: 1 if x in (6, 7) else 0)

    # One-hot encode the weekday. Declaring the seven categories up front makes
    # get_dummies always emit seven columns in weekday order, so weekdayN really
    # means weekday N even when some weekday is absent from the window, and all
    # processed data sets share the same columns.
    weekday_type = pd.CategoricalDtype(categories=list(range(1, 8)))
    weekday_dummies = pd.get_dummies(dataset['day_of_week'].astype(weekday_type))
    weekday_dummies.columns = ['weekday' + str(day) for day in range(1, 8)]
    dataset = pd.concat([dataset, weekday_dummies], axis=1)

    if TrainFlag:
        dataset['date'] = dataset['date'].fillna('null')
        # get_label expects 'purchase_date:receipt_date'
        dataset['label'] = dataset.date.astype('str') + ':' + dataset.date_received.astype('str')
        dataset.label = dataset.label.apply(get_label)
        dataset.drop(['merchant_id', 'day_of_week', 'date', 'date_received', 'coupon_count'], axis=1, inplace=True)
    else:
        dataset.drop(['merchant_id', 'day_of_week', 'coupon_count'], axis=1, inplace=True)

    # Turn the remaining 'null' markers into real missing values
    dataset = dataset.replace('null', np.nan)

    return dataset
    

In [10]:
# Build the three processed data sets and persist them as CSV.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(feature_path, exist_ok=True)

ProcessDataSet1 = DataProcess(dataset1, feature1, True)
ProcessDataSet1.to_csv(os.path.join(feature_path, 'ProcessDataSet1.csv'), index=False)
print('---------------ProcessDataSet1 done-------------------')
ProcessDataSet2 = DataProcess(dataset2, feature2, True)
ProcessDataSet2.to_csv(os.path.join(feature_path, 'ProcessDataSet2.csv'), index=False)
print('---------------ProcessDataSet2 done-------------------')
# The test window has no purchase dates, hence no label (TrainFlag=False)
ProcessDataSet3 = DataProcess(dataset3, feature3, False)
ProcessDataSet3.to_csv(os.path.join(feature_path, 'ProcessDataSet3.csv'), index=False)
print('---------------ProcessDataSet3 done-------------------')

---------------ProcessDataSet1 done-------------------
---------------ProcessDataSet2 done-------------------
---------------ProcessDataSet3 done-------------------


## References
1. [思路和代码](https://blog.csdn.net/weixin_42001089/article/details/85013073)