In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import datetime

In [None]:
'''
对Merchant_id + Coupon_id + User_id 进行展开，同时还是只是在 offline_data 的基础上进行特征工程

Merchant_id 进行维度展开：
    （历史）在该商户中使用消费券的人数（归一化）
    在该商户中进行过消费的人数 m_1
    （历史）商户的平均折扣率
    商户当前平均 Discount_rate m_2
    商户当前平均 Coupon_rate m_3
    店铺的平均距离 m_4
    当前商户的平均满额 m_5
    当前的 Discout_rate - 商家历史平均 Discout_rate m_6
    当前的 Coupon_rate - 商家历史平均 Coupon_rate m_7
    当前的 Distance - 商家历史平均 Distance m_8
    当前的 achieve - 商家历史平均 achieve m_9
    （历史）商户的平均分布距离
    
    
User_id 进行维度展开
   用户领取优惠券之前领取过的优惠券数量（要进行归一化）u_1
    //（历史）用户从领取优惠券到使用优惠券的时间
    //（历史）用户去过的商家的数量（归一化）
     //(历史) 用户领取的优惠券的种类数（归一化）
  用户领取优惠券之前的普通购买次数 u_2
     //用户参与限时低价活动数（归一化）
     用户参与过的满减的平均满减额度 u_3(归一化)
 * 该用户之前上一次领取对应优惠券的时间间隔天数 u_4
 * 该用户之前领取过多少个对应的优惠券 u_5
  
 玄学：用优惠券的时间是不是周末
    
'''


In [2]:
'''
offline_data

shape = (1754884, 7)
真正用户数量 = (539438, 6)

Coupon_id = null 表示无优惠券消费，此时 Discount_rate 和 Date_received 字段无意义
Discount_rate 有 2 种情况：[0, 1] - 折扣率， x:y - 满x减y 
Distance 最近商铺距离 [0, 10]
样本分为 3 类： 负样本                          无影响样本                       正样本
                Date=null & Coupon_id!=null     Date!=null & Coupon_id==null     Date!=null & Coupon_id != null
                
负样本 = (977900, 7)  (0,1)， 代表领取优惠券但没有使用优惠券
    领取了但没用的人数 = (497820, 7)， 占比 92%
正样本 = (497820, 7)    (1, 1)，表示用优惠券消费, 会有同一个用户多次使用优惠券消费
    有过使用消费券行为的人数  = (46395, 7)，占比 9%
无影响样本 = (701602, 7) (1, 0)， 代表正常消费
    有过正常消费的人数 (207619, 7)， 占比 38 %
所有都是有效数据，没有出现 (0, 0) 的情况

pos_rate = 0.09
neg_rate = 0.92
norm_rate = 0.41

'''

offline_data = pd.read_csv("data/offline.csv")
offline_data.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0,,20160217.0
1,1439408,4663,11002.0,150:20,1,20160528.0,
2,1439408,2632,8591.0,20:1,0,20160217.0,
3,1439408,2632,1078.0,20:1,0,20160319.0,
4,1439408,2632,8591.0,20:1,0,20160613.0,


In [3]:
'''
oneline_data

shape = (11429826, 7)
真正用户数量 = (762858, 6)

Coupon_id = null 表示无优惠券消费，此时 Discount_rate 和 Date_received 字段无意义, 'fixed' 代表低价促销
Discount_rate 有 2 种情况：[0, 1] - 折扣率， x:y - 满x减y 
Distance 最近商铺距离 [0, 10]
样本分为 3 类： 负样本                          无影响样本                       正样本
                Date=null & Coupon_id!=null     Date!=null & Coupon_id==null     Date!=null & Coupon_id != null
                

负样本 = (655898, 7)  (0,1)， 代表领取优惠券行为
    有领取过优惠券的人数 = (497820, 7)， 占比 28%
    数据与 Action = 2 的数据是一致的
正样本 = (216459, 7)    (1, 1)，表示用优惠券消费
    有使用过优惠券行为的人数 = (95655, 7)， 占比 13%
无影响样本 = (10557469, 7) (1, 0)， 代表正常消费
    有过正常消费的人数 = (726146, 7), 占比 95%
    
so true_负样本 = 所有人 - (有领取过优惠券的人 & 有使用过优惠券行为的人)
   true_正样本 = (有领取过优惠券的人 & 有使用过优惠券行为的人)
   
   
所有都是有效数据，没有出现 (0, 0) 的情况
低价促销数量 (131546, 7)

pos_rate = 0.01
neg_rate = 0.06
norm_rate = 0.93

'''

online_data = pd.read_csv("data/online.csv")
online_data.head()

Unnamed: 0,User_id,Merchant_id,Action,Coupon_id,Discount_rate,Date_received,Date
0,13740231,18907,2,100017492.0,500:50,20160513.0,
1,13740231,34805,1,,,,20160321.0
2,14336199,18907,0,,,,20160618.0
3,14336199,18907,0,,,,20160618.0
4,14336199,18907,0,,,,20160618.0


In [3]:
'''
载入测试数据
'''
test_data = pd.read_csv('data/test_revised.csv')

In [4]:
'''
重新提取离线数据
'''

offline_received_but_used = offline_data.loc[(offline_data['Date'] == 'null') & (offline_data['Coupon_id'] != 'null')]
offline_received_and_used = offline_data.loc[(offline_data['Date'] != 'null') & (offline_data['Coupon_id'] != 'null')]
print("offline_received_but_used size = ", offline_received_but_used.shape)
print("offline_received_and_used size = ", offline_received_and_used.shape)

# 转换时间格式
offline_received_and_used['Date_received'] = offline_received_and_used['Date_received'].apply(lambda row: datetime.datetime.strptime(row, '%Y%m%d'))
offline_received_but_used['Date_received'] = offline_received_but_used['Date_received'].apply(lambda row: datetime.datetime.strptime(row, '%Y%m%d'))

offline_received_and_used['Date'] = offline_received_and_used['Date'].apply(lambda row: datetime.datetime.strptime(row, '%Y%m%d'))
offline_received_and_used['Date_delta'] = (offline_received_and_used['Date'] - offline_received_and_used['Date_received'])
offline_received_and_used['Date_delta'] = offline_received_and_used['Date_delta'].apply(lambda row: row.days)

offline_positive_data = offline_received_and_used.loc[offline_received_and_used['Date_delta'] <= 15]
offline_negative_data = offline_received_and_used.loc[offline_received_and_used['Date_delta'] > 15]
offline_negative_data = pd.concat([offline_received_but_used, offline_negative_data])

print("offline_positive_data size = ", offline_positive_data.shape)
print("offline_negative_data", offline_negative_data.shape)

offline_received_but_used size =  (977900, 7)
offline_received_and_used size =  (75382, 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.p

offline_positive_data size =  (64395, 8)
offline_negative_data (988887, 8)


In [5]:
# 删除 positive 和 negative 的 Date 和 Date_delta 两列
temp_positive_data = offline_positive_data.drop(['Date', 'Date_delta'], axis=1)
temp_negative_data = offline_negative_data.drop(['Date', 'Date_delta'], axis=1)

# 加上对应的数据标签 （0-负样本， 1-正样本）
pos_label = np.ones(temp_positive_data.shape[0], dtype=np.int8)
neg_label = np.zeros(temp_negative_data.shape[0], dtype=np.int8)
temp_positive_data['label'] = pos_label
temp_negative_data['label'] = neg_label

In [6]:
'''
处理 Distance
'''

# 把 Distance 中的 null 替换为 11
temp_negative_data.loc[temp_negative_data['Distance'] == 'null', 'Distance'] = 11
temp_positive_data.loc[temp_positive_data['Distance'] == 'null', 'Distance'] = 11

In [7]:
'''
处理 Discout_rate
'''

# 处理 positive_data 的 Discout_rate
# 把 Discount_rate 切分
a = temp_positive_data['Discount_rate'].str.split(':', expand=True)
a.columns = ['achieve', 'reduce']
a['achieve'] = pd.to_numeric(a['achieve'])

# 填充 reduce == null，即源数据是折扣比例
a['reduce'].fillna(0, inplace=True)
a['reduce'] = pd.to_numeric(a['reduce'])

# 生成 满减比例
a['Coupon_rate'] = a['reduce'] / a['achieve']
a['Discount_rate'] = a['achieve']
a.loc[a['Coupon_rate'] != 0, 'Discount_rate'] = 0

a.drop(['reduce'], axis=1, inplace=True)
a.loc[a['Discount_rate'] != 0, 'achieve'] = 0

# 处理 negative_data 的 Discount_rate

# 把 Discount_rate 切分
b = temp_negative_data['Discount_rate'].str.split(':', expand=True)
b.columns = ['achieve', 'reduce']
b['achieve'] = pd.to_numeric(b['achieve'])

# 填充 reduce == null，即源数据是折扣比例
b['reduce'].fillna(0, inplace=True)
b['reduce'] = pd.to_numeric(b['reduce'])

# 生成 满减比例
b['Coupon_rate'] = b['reduce'] / b['achieve']
b['Discount_rate'] = b['achieve']
b.loc[b['Coupon_rate'] != 0, 'Discount_rate'] = 0

b.drop(['reduce'], axis=1, inplace=True)
b.loc[b['Discount_rate'] != 0, 'achieve'] = 0

# 合并两个DataFrame
temp_positive_data.drop(['Discount_rate'], axis=1, inplace=True)
temp_negative_data.drop(['Discount_rate'], axis=1, inplace=True)

temp_positive_data = pd.concat([temp_positive_data, a], axis=1)
temp_negative_data = pd.concat([temp_negative_data, b], axis=1)

temp_positive_data = temp_positive_data[['User_id', 'Merchant_id', 'Coupon_id', 'Distance', 'achieve', 'Coupon_rate', 'Discount_rate', 'Date_received', 'label']]
temp_negative_data = temp_negative_data[['User_id', 'Merchant_id', 'Coupon_id', 'Distance', 'achieve', 'Coupon_rate', 'Discount_rate', 'Date_received', 'label']]

In [8]:
'''
将正负样本合并到一起,并按照User_id进行排序
'''
temp_data = pd.concat([temp_positive_data, temp_negative_data], axis=0)
temp_data.sort_values(by='User_id', inplace=True)

In [11]:
'''
获取正负样本中出现过的用户的数据，在此基础之上进行数据统计
'''
user = list(set(temp_data['User_id'].tolist()))
user_data = offline_data.loc[offline_data['User_id'].isin(user)]
user_data.sort_values(by='User_id', inplace=True)

print(user_data.shape)
user_data.head()

(1696448, 7)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
679793,4,1469,2902,0.95,10,20160607,
679792,4,1433,8735,30:5,10,20160214,
678050,35,3381,9776,10:5,0,20160129,
678051,35,3381,1807,300:30,0,20160130,
678052,35,3381,11951,200:20,0,20160129,


In [12]:
'''
用户之前一共领取过的优惠券数量 u_1
'''
user_data.sort_values(by='User_id', inplace=True)
refine_data_merge = pd.merge(temp_data, temp_data, how='left', on=['User_id'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)

refine_data_merge = refine_data_merge[['User_id', 'Coupon_id_x',  'Date_received_x', 'Date_received_y']]
refine_data_merge = refine_data_merge.loc[(refine_data_merge['Date_received_x'] != refine_data_merge['Date_received_y'])]
refine_data_merge.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,User_id,Coupon_id_x,Date_received_x,Date_received_y
1,4,8735,2016-02-14,2016-06-07
2,4,2902,2016-06-07,2016-02-14
5,35,11951,2016-01-30,2016-01-29
6,35,11951,2016-01-30,2016-01-29
8,35,11951,2016-01-29,2016-01-30


In [13]:
refine_data_merge['Date_delta'] = refine_data_merge['Date_received_x'] - refine_data_merge['Date_received_y']
refine_data_merge['Date_delta'] = refine_data_merge['Date_delta'].apply(lambda row: row.days)
refine_data_merge = refine_data_merge.loc[refine_data_merge['Date_delta'] > 0]
refine_data_merge = refine_data_merge.groupby(['User_id', 'Date_received_x']).count()
refine_data_merge.reset_index(level=[0,1], inplace=True)
refine_data_merge = refine_data_merge[['User_id', 'Date_received_x', 'Date_delta']]
refine_data_merge.columns = ['User_id', 'Date_received', 'count']
refine_data_merge.head()

Unnamed: 0,User_id,Date_received,count
0,4,2016-06-07,1
1,35,2016-01-30,4
2,165,2016-03-22,1
3,165,2016-05-25,2
4,236,2016-02-11,2


In [14]:
# 统计同一个用户 Date_received 相同的 数目
dulp_received_date_data = temp_data.groupby(['User_id', 'Date_received']).count()
dulp_received_date_data.reset_index(level=[0,1], inplace=True)
dulp_received_date_data = dulp_received_date_data[['User_id', 'Date_received', 'Coupon_id']]
dulp_received_date_data.columns = ['User_id', 'Date_received', 'dulp_count']
dulp_received_date_data.head()

Unnamed: 0,User_id,Date_received,dulp_count
0,4,2016-02-14,1
1,4,2016-06-07,1
2,35,2016-01-29,2
3,35,2016-01-30,2
4,36,2016-01-25,2


In [15]:
refine_dulp_merge = pd.merge(refine_data_merge, dulp_received_date_data, how='left', on=['User_id', 'Date_received'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)
refine_dulp_merge.head()

Unnamed: 0,User_id,Date_received,count,dulp_count
0,4,2016-06-07,1,1
1,35,2016-01-30,4,2
2,165,2016-03-22,1,1
3,165,2016-05-25,2,1
4,236,2016-02-11,2,1


In [16]:
refine_dulp_merge['u_1'] = refine_dulp_merge['count'] / refine_dulp_merge['dulp_count']
refine_dulp_merge.drop(['count', 'dulp_count'], axis=1, inplace=True)
refine_dulp_merge.head()

Unnamed: 0,User_id,Date_received,u_1
0,4,2016-06-07,1.0
1,35,2016-01-30,2.0
2,165,2016-03-22,1.0
3,165,2016-05-25,2.0
4,236,2016-02-11,2.0


In [19]:
u1_data = pd.merge(temp_data, refine_dulp_merge, how='left', on=['User_id', 'Date_received'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)
u1_data.fillna(0, inplace=True)

print(u1_data.shape)
u1_data.head()

(1053282, 10)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received,label,u_1
0,4,1433,8735,10,30.0,0.166667,0.0,2016-02-14,0,0.0
1,4,1469,2902,10,0.0,0.0,0.95,2016-06-07,0,1.0
2,35,3381,11951,0,200.0,0.1,0.0,2016-01-29,0,0.0
3,35,3381,9776,0,10.0,0.5,0.0,2016-01-29,0,0.0
4,35,3381,11951,0,200.0,0.1,0.0,2016-01-30,0,2.0


In [84]:
'''
用户领取优惠券之前的普通购买次数 u_2
'''

normal = pd.merge(user_data, user_data, how='left', on=['User_id'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)
normal = normal.loc[(normal['Coupon_id_x'] != 'null') & (normal['Coupon_id_y'] == 'null') & (normal['Date_y'] != 'null')]
normal.head()

Unnamed: 0,User_id,Merchant_id_x,Coupon_id_x,Discount_rate_x,Distance_x,Date_received_x,Date_x,Merchant_id_y,Coupon_id_y,Discount_rate_y,Distance_y,Date_received_y,Date_y
70,165,4195,7571,30:5,0,20160525,,2934,,,0,,20160414
71,165,4195,7571,30:5,0,20160525,,2934,,,0,,20160328
73,165,4195,7571,30:5,0,20160525,,2934,,,0,,20160617
76,165,4195,7571,30:5,0,20160525,,4195,,,0,,20160420
77,165,4195,7571,30:5,0,20160525,,2934,,,0,,20160125


In [85]:
normal = normal[['User_id', 'Date_received_x', 'Date_y', 'Coupon_id_x']]
normal['Date_received_x'] = normal['Date_received_x'].apply(lambda row: datetime.datetime.strptime(row, '%Y%m%d'))
normal['Date_y'] = normal['Date_y'].apply(lambda row: datetime.datetime.strptime(row, '%Y%m%d'))
normal.head()

Unnamed: 0,User_id,Date_received_x,Date_y,Coupon_id_x
70,165,2016-05-25,2016-04-14,7571
71,165,2016-05-25,2016-03-28,7571
73,165,2016-05-25,2016-06-17,7571
76,165,2016-05-25,2016-04-20,7571
77,165,2016-05-25,2016-01-25,7571


In [86]:
normal['Date_delta'] = normal['Date_received_x'] - normal['Date_y']
normal.head()

Unnamed: 0,User_id,Date_received_x,Date_y,Coupon_id_x,Date_delta
70,165,2016-05-25,2016-04-14,7571,41 days
71,165,2016-05-25,2016-03-28,7571,58 days
73,165,2016-05-25,2016-06-17,7571,-23 days
76,165,2016-05-25,2016-04-20,7571,35 days
77,165,2016-05-25,2016-01-25,7571,121 days


In [87]:
normal['Date_delta'] = normal['Date_delta'].apply(lambda row: row.days)
normal = normal.loc[normal['Date_delta'] > 0]
normal.head()

Unnamed: 0,User_id,Date_received_x,Date_y,Coupon_id_x,Date_delta
70,165,2016-05-25,2016-04-14,7571,41
71,165,2016-05-25,2016-03-28,7571,58
76,165,2016-05-25,2016-04-20,7571,35
77,165,2016-05-25,2016-01-25,7571,121
78,165,2016-05-25,2016-05-10,7571,15


In [88]:
normal= normal.groupby(['User_id', 'Date_received_x']).count()
normal.reset_index(level=[0,1], inplace=True)
normal.head()

Unnamed: 0,User_id,Date_received_x,Date_y,Coupon_id_x,Date_delta
0,165,2016-01-25,1,1,1
1,165,2016-03-22,3,3,3
2,165,2016-05-25,11,11,11
3,215,2016-05-24,1,1,1
4,285,2016-05-01,1,1,1


In [89]:
normal = normal[['User_id', 'Date_received_x', 'Date_delta']]
normal.columns = ['User_id', 'Date_received', 'u_2']
normal.head()

Unnamed: 0,User_id,Date_received,u_2
0,165,2016-01-25,1
1,165,2016-03-22,3
2,165,2016-05-25,11
3,215,2016-05-24,1
4,285,2016-05-01,1


In [90]:
u2_data = pd.merge(u1_data, normal, how='left', on=['User_id', 'Date_received'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)
u2_data.fillna(0, inplace=True)

print(u2_data.shape)
u2_data.head()

(1053282, 11)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received,label,u_1,u_2
0,4,1433,8735,10,30.0,0.166667,0.0,2016-02-14,0,0.0,0.0
1,4,1469,2902,10,0.0,0.0,0.95,2016-06-07,0,1.0,0.0
2,35,3381,11951,0,200.0,0.1,0.0,2016-01-29,0,0.0,0.0
3,35,3381,9776,0,10.0,0.5,0.0,2016-01-29,0,0.0,0.0
4,35,3381,11951,0,200.0,0.1,0.0,2016-01-30,0,2.0,0.0


In [21]:
'''
用户参与过的满减的平均满减额度 u_3
'''
# 使用 u_2 的版本
# achieve_data = u2_data.groupby(['User_id']).mean()
# achieve_data['User_id'] = sorted(list(set(u2_data['User_id'].tolist())))
# achieve_data = achieve_data[['User_id', 'achieve']]
# achieve_data.columns = ['User_id','u_3']
# u3_data = pd.merge(u2_data, achieve_data, how='left', on='User_id', left_on=None, right_on=None,
#       left_index=False, right_index=False, sort=True,
#       suffixes=('_x', '_y'), copy=True, indicator=False)

# print(u3_data.shape)
# u3_data.head()

achieve_data = u1_data.groupby(['User_id']).mean()
achieve_data['User_id'] = sorted(list(set(u1_data['User_id'].tolist())))
achieve_data = achieve_data[['User_id', 'achieve']]
achieve_data.columns = ['User_id','u_3']
u3_data = pd.merge(u1_data, achieve_data, how='left', on='User_id', left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=True, indicator=False)

print(u3_data.shape)
u3_data.head()

(1053282, 11)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received,label,u_1,u_3
0,4,1433,8735,10,30.0,0.166667,0.0,2016-02-14,0,0.0,15.0
1,4,1469,2902,10,0.0,0.0,0.95,2016-06-07,0,1.0,15.0
2,35,3381,11951,0,200.0,0.1,0.0,2016-01-29,0,0.0,177.5
3,35,3381,9776,0,10.0,0.5,0.0,2016-01-29,0,0.0,177.5
4,35,3381,11951,0,200.0,0.1,0.0,2016-01-30,0,2.0,177.5


In [22]:
'''
* 该用户之前是否领取过对应的优惠券 u_4
'''
used_user = list(set(user_data.loc[(user_data['Coupon_id'] != 'null')]['User_id'].tolist()))
used_user_data = user_data.loc[user_data['User_id'].isin(used_user)]

In [23]:
used_user_data_group = pd.merge(used_user_data, used_user_data, how='left', on=['User_id', 'Coupon_id'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)

In [24]:
# 提取出同一用户在不同领取时间上的交互数据
used_user_data_group = used_user_data_group[['User_id', 'Coupon_id', 'Date_received_x', 'Date_received_y']]
used_user_data_group = used_user_data_group.loc[used_user_data_group['Date_received_x'] != used_user_data_group['Date_received_y']]
used_user_data_group.head()

Unnamed: 0,User_id,Coupon_id,Date_received_x,Date_received_y
3,35,11951,20160129,20160130
4,35,11951,20160130,20160129
323,448,10927,20160409,20160312
324,448,10927,20160409,20160522
325,448,10927,20160312,20160409


In [25]:
used_user_data_group['Date_received_x'] = used_user_data_group['Date_received_x'].apply(lambda row: datetime.datetime.strptime(row, '%Y%m%d'))
used_user_data_group['Date_received_y'] = used_user_data_group['Date_received_y'].apply(lambda row: datetime.datetime.strptime(row, '%Y%m%d'))

In [26]:
# 因为计算 date_delta = Date_received_x - Date_received_y
# 所以取出 date_delta > 0 的数据，此时需要的是Date_received_x（也可以是 date_delta < 0, 此时需要的是 此时需要的是Date_received_y）
used_user_data_group['date_delta'] = used_user_data_group['Date_received_x'] - used_user_data_group['Date_received_y']
used_user_data_group['date_delta'] = used_user_data_group['date_delta'].apply(lambda row: row.days)

In [27]:
used_user_data_group = used_user_data_group.loc[used_user_data_group['date_delta'] > 0]
used_user_data_group.sort_values(by='User_id', inplace=True)

# 算出当前领取日期下的最近一次领取日期间隔
used_user_data_group = used_user_data_group.groupby(by=['User_id', 'Coupon_id', 'Date_received_x']).min()
used_user_data_group.reset_index(level=[0, 1, 2], inplace=True)

In [28]:
# 查询信息用于生成 u_5data
u5_count = used_user_data_group.groupby(['User_id', 'Coupon_id', 'Date_received_x']).count()
u5_count.reset_index(level=[0,1,2], inplace=True)
u5_count = u5_count[['User_id', 'Coupon_id', 'Date_received_x', 'date_delta']]
u5_count.columns = ['User_id', 'Coupon_id', 'Date_received', 'u_5']

In [29]:
# 再计算同一用户对同一种优惠券的平均最近一次领取日期间隔数据
used_user_data_group = used_user_data_group.groupby(['User_id', 'Coupon_id']).mean()
used_user_data_group.reset_index(level=[0,1], inplace=True)
used_user_data_group.columns  = ['User_id', 'Coupon_id', 'u_4']

In [30]:
u4_data = pd.merge(u3_data, used_user_data_group, how='left', on=['User_id', 'Coupon_id'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)
u4_data.fillna(0, inplace=True)

print(u4_data.shape)
u4_data

(1053282, 12)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received,label,u_1,u_3,u_4
0,4,1469,2902,10,0.0,0.000000,0.95,2016-06-07,0,1.0,15.000000,0.0
1,4,1433,8735,10,30.0,0.166667,0.00,2016-02-14,0,0.0,15.000000,0.0
2,35,3381,11951,0,200.0,0.100000,0.00,2016-01-29,0,0.0,177.500000,1.0
3,35,3381,11951,0,200.0,0.100000,0.00,2016-01-30,0,2.0,177.500000,1.0
4,35,3381,1807,0,300.0,0.100000,0.00,2016-01-30,0,2.0,177.500000,0.0
5,35,3381,9776,0,10.0,0.500000,0.00,2016-01-29,0,0.0,177.500000,0.0
6,36,5717,12349,8,20.0,0.250000,0.00,2016-01-25,0,0.0,25.000000,0.0
7,36,1041,13490,4,30.0,0.166667,0.00,2016-01-25,0,0.0,25.000000,0.0
8,64,2146,11173,2,100.0,0.100000,0.00,2016-01-29,0,0.0,100.000000,0.0
9,110,6454,14031,10,100.0,0.100000,0.00,2016-01-31,0,0.0,133.333333,0.0


In [101]:
'''
生成u_5 该用户之前领取过多少个对应的优惠券
'''

u5_data = pd.merge(u4_data, u5_count, how='left', on=['User_id', 'Coupon_id', 'Date_received'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)
u5_data.fillna(0, inplace=True)

print(u5_data.shape)
u5_data.head()

(1053282, 14)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received,label,u_1,u_2,u_3,u_4,u_5
0,4,1469,2902,10,0.0,0.0,0.95,2016-06-07,0,1.0,0.0,15.0,0.0,0.0
1,4,1433,8735,10,30.0,0.166667,0.0,2016-02-14,0,0.0,0.0,15.0,0.0,0.0
2,35,3381,11951,0,200.0,0.1,0.0,2016-01-29,0,0.0,0.0,177.5,1.0,0.0
3,35,3381,11951,0,200.0,0.1,0.0,2016-01-30,0,2.0,0.0,177.5,1.0,1.0
4,35,3381,1807,0,300.0,0.1,0.0,2016-01-30,0,2.0,0.0,177.5,0.0,0.0


In [102]:
'''
去除 u_2, 测试数据中用不到
'''
u5_data.drop('u_2', axis=1, inplace=True)
u5_data.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received,label,u_1,u_3,u_4,u_5
0,4,1469,2902,10,0.0,0.0,0.95,2016-06-07,0,1.0,15.0,0.0,0.0
1,4,1433,8735,10,30.0,0.166667,0.0,2016-02-14,0,0.0,15.0,0.0,0.0
2,35,3381,11951,0,200.0,0.1,0.0,2016-01-29,0,0.0,177.5,1.0,0.0
3,35,3381,11951,0,200.0,0.1,0.0,2016-01-30,0,2.0,177.5,1.0,1.0
4,35,3381,1807,0,300.0,0.1,0.0,2016-01-30,0,2.0,177.5,0.0,0.0


In [103]:
'''
在该商户中进行过消费的次数 m_1
'''
u5_data = u5_data.convert_objects(convert_numeric=True)
merchant_data = user_data.loc[(user_data['Coupon_id'] == 'null') & (user_data['Date'] != 'null')]
merchant_data = merchant_data.groupby(['Merchant_id']).count()
merchant_data.reset_index(level=[0], inplace=True)
merchant_data = merchant_data[['Merchant_id', 'Coupon_id']]
merchant_data.columns = ['Merchant_id', 'm_1']
merchant_data.head()

  after removing the cwd from sys.path.


Unnamed: 0,Merchant_id,m_1
0,2,2
1,3,8
2,4,33
3,5,35
4,6,53


In [104]:
m_1data = pd.merge(u5_data, merchant_data, how='left', on=['Merchant_id'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)
m_1data.fillna(0, inplace=True)
m_1data.sort_values(by='User_id', inplace=True)

print(m_1data.shape)
m_1data.head()

(1053282, 14)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received,label,u_1,u_3,u_4,u_5,m_1
180152,4,1433,8735,10,30.0,0.166667,0.0,2016-02-14,0,0.0,15.0,0.0,0.0,8137.0
201328,4,1469,2902,10,0.0,0.0,0.95,2016-06-07,0,1.0,15.0,0.0,0.0,11757.0
410148,35,3381,11951,0,200.0,0.1,0.0,2016-01-29,0,0.0,177.5,1.0,0.0,18796.0
410151,35,3381,9776,0,10.0,0.5,0.0,2016-01-29,0,0.0,177.5,0.0,0.0,18796.0
410149,35,3381,11951,0,200.0,0.1,0.0,2016-01-30,0,2.0,177.5,1.0,1.0,18796.0


In [105]:
'''
商户当前平均 Discount_rate m_2, 平均 Coupon_rate m_3, 平均 Distance m_4, 平均 achieve m_5
'''
ave_discount_data = m_1data.groupby('Merchant_id').mean()
ave_discount_data.reset_index(level=[0], inplace=True)
ave_discount_data = ave_discount_data[['Merchant_id', 'Discount_rate', 'Coupon_rate', 'Distance', 'achieve']]
ave_discount_data.columns = ['Merchant_id', 'm_2', 'm_3', 'm_4', 'm_5']
ave_discount_data.head()

Unnamed: 0,Merchant_id,m_2,m_3,m_4,m_5
0,2,0.0,0.2,8.142857,100.0
1,3,0.0,0.333333,6.5,30.0
2,4,0.0,0.166667,1.571429,30.0
3,5,0.0,0.25,4.142857,20.0
4,8,0.0,0.5,7.0,100.0


In [106]:
m_2_3_4_5data = pd.merge(m_1data, ave_discount_data, how='left', on=['Merchant_id'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)
m_2_3_4_5data.fillna(0, inplace=True)
m_2_3_4_5data.sort_values(by='User_id', inplace=True)

print(m_2_3_4_5data.shape)
m_2_3_4_5data.head()

(1053282, 18)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received,label,u_1,u_3,u_4,u_5,m_1,m_2,m_3,m_4,m_5
180152,4,1433,8735,10,30.0,0.166667,0.0,2016-02-14,0,0.0,15.0,0.0,0.0,8137.0,0.000748,0.188991,5.638345,27.279523
201328,4,1469,2902,10,0.0,0.0,0.95,2016-06-07,0,1.0,15.0,0.0,0.0,11757.0,0.149498,0.283277,3.640217,58.884454
410148,35,3381,11951,0,200.0,0.1,0.0,2016-01-29,0,0.0,177.5,1.0,0.0,18796.0,0.0,0.133688,3.296262,207.468209
410151,35,3381,1807,0,300.0,0.1,0.0,2016-01-30,0,2.0,177.5,0.0,0.0,18796.0,0.0,0.133688,3.296262,207.468209
410149,35,3381,9776,0,10.0,0.5,0.0,2016-01-29,0,0.0,177.5,0.0,0.0,18796.0,0.0,0.133688,3.296262,207.468209


In [107]:
'''
当前的 Discout_rate - 商家历史平均 Discout_rate m_6
当前的 Coupon_rate - 商家历史平均 Coupon_rate m_7
当前的 Distance - 商家历史平均 Distance m_8
当前的 achieve - 商家历史平均 achieve m_9
'''
m_2_3_4_5data['m_6'] = (m_2_3_4_5data['Discount_rate'] - m_2_3_4_5data['m_2'])
m_2_3_4_5data['m_7'] = 100 * (m_2_3_4_5data['Coupon_rate'] - m_2_3_4_5data['m_3'])
m_2_3_4_5data['m_8'] = m_2_3_4_5data['Distance'] - m_2_3_4_5data['m_4']
m_2_3_4_5data['m_9'] = m_2_3_4_5data['achieve'] - m_2_3_4_5data['m_5']

print(m_2_3_4_5data.shape)
m_2_3_4_5data.head()

(1053282, 22)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received,label,u_1,...,u_5,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9
180152,4,1433,8735,10,30.0,0.166667,0.0,2016-02-14,0,0.0,...,0.0,8137.0,0.000748,0.188991,5.638345,27.279523,-0.000748,-2.232455,4.361655,2.720477
201328,4,1469,2902,10,0.0,0.0,0.95,2016-06-07,0,1.0,...,0.0,11757.0,0.149498,0.283277,3.640217,58.884454,0.800502,-28.327713,6.359783,-58.884454
410148,35,3381,11951,0,200.0,0.1,0.0,2016-01-29,0,0.0,...,0.0,18796.0,0.0,0.133688,3.296262,207.468209,0.0,-3.368774,-3.296262,-7.468209
410151,35,3381,1807,0,300.0,0.1,0.0,2016-01-30,0,2.0,...,0.0,18796.0,0.0,0.133688,3.296262,207.468209,0.0,-3.368774,-3.296262,92.531791
410149,35,3381,9776,0,10.0,0.5,0.0,2016-01-29,0,0.0,...,0.0,18796.0,0.0,0.133688,3.296262,207.468209,0.0,36.631226,-3.296262,-197.468209


In [108]:
m_2_3_4_5data.drop(['User_id', 'Merchant_id', 'Date_received', 'Coupon_id'], axis=1, inplace=True)
m_2_3_4_5data = m_2_3_4_5data[['Distance', 'achieve', 'Coupon_rate', 'Discount_rate', 'u_1', 'u_3', 'u_4', 'u_5', 'm_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6', 'm_7', 'm_8', 'm_9', 'label']]
m_2_3_4_5data.head()

Unnamed: 0,Distance,achieve,Coupon_rate,Discount_rate,u_1,u_3,u_4,u_5,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,label
180152,10,30.0,0.166667,0.0,0.0,15.0,0.0,0.0,8137.0,0.000748,0.188991,5.638345,27.279523,-0.000748,-2.232455,4.361655,2.720477,0
201328,10,0.0,0.0,0.95,1.0,15.0,0.0,0.0,11757.0,0.149498,0.283277,3.640217,58.884454,0.800502,-28.327713,6.359783,-58.884454,0
410148,0,200.0,0.1,0.0,0.0,177.5,1.0,0.0,18796.0,0.0,0.133688,3.296262,207.468209,0.0,-3.368774,-3.296262,-7.468209,0
410151,0,300.0,0.1,0.0,2.0,177.5,0.0,0.0,18796.0,0.0,0.133688,3.296262,207.468209,0.0,-3.368774,-3.296262,92.531791,0
410149,0,10.0,0.5,0.0,0.0,177.5,0.0,0.0,18796.0,0.0,0.133688,3.296262,207.468209,0.0,36.631226,-3.296262,-197.468209,0


In [109]:
m_2_3_4_5data_norm = (m_2_3_4_5data - m_2_3_4_5data.min()) / (m_2_3_4_5data.max() - m_2_3_4_5data.min())
m_2_3_4_5data_norm.head()

Unnamed: 0,Distance,achieve,Coupon_rate,Discount_rate,u_1,u_3,u_4,u_5,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,label
180152,0.909091,0.1,0.25,0.0,0.0,0.05,0.0,0.0,0.269259,0.000787,0.283487,0.512577,0.090932,0.49785,0.506758,0.703049,0.417533,0.0
201328,0.909091,0.0,0.0,1.0,0.008403,0.05,0.0,0.0,0.389047,0.157366,0.424916,0.330929,0.196282,0.921137,0.19737,0.795715,0.289044,0.0
410148,0.0,0.666667,0.15,0.0,0.0,0.591667,0.006711,0.0,0.621972,0.0,0.200532,0.29966,0.691561,0.498245,0.493285,0.347903,0.396282,0.0
410151,0.0,1.0,0.15,0.0,0.016807,0.591667,0.0,0.0,0.621972,0.0,0.200532,0.29966,0.691561,0.498245,0.493285,0.347903,0.604852,0.0
410149,0.0,0.033333,0.75,0.0,0.0,0.591667,0.0,0.0,0.621972,0.0,0.200532,0.29966,0.691561,0.498245,0.967529,0.347903,0.0,0.0


In [110]:
m_2_3_4_5data_norm['label'].value_counts()

0.0    988887
1.0     64395
Name: label, dtype: int64

In [111]:
from sklearn.model_selection import train_test_split
train = m_2_3_4_5data_norm
X = train.iloc[:, 0:17]
y = train['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=0)

In [None]:
'''





训练数据处理截止
====================================


'''

In [372]:
'''
使用 GBDT
'''
from  sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
clf = GradientBoostingClassifier(learning_rate=0.01, n_estimators=30, max_depth=7, subsample=0.7)
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores_cvs = cross_validate(clf, X_test, y_test, cv=5, scoring='roc_auc', return_train_score = True)
sorted(scores_cvs.keys())
print(scores_cvs['test_score'])
print(scores_cvs['test_score'].mean())

[0.87514403 0.87643327 0.87655714 0.88131489 0.87448073]
0.8767860109599239


In [373]:
from sklearn.metrics import roc_auc_score
clf.fit(X_train, y_train)
pred = clf.predict_proba(X_test)

In [374]:
res = pd.DataFrame(pred, columns=['neg', 'pos'])
# res['pos'] = res['pos'].apply(lambda row: 1 if  row>0.857 else 0)
pred = res['pos'].tolist()
roc_auc_score(y_test, pred)

0.8718960951082084

In [395]:
'''
使用XGBoost
'''
from xgboost import XGBClassifier
model = XGBClassifier(eta=0.001, reg_lambda=2, subsample=0.7, max_depth=7, num_class=2, objective="multi:softprob")
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores_cvs = cross_validate(model, X, y, cv=5, scoring='roc_auc', return_train_score = True)
sorted(scores_cvs.keys())
print(scores_cvs['test_score'])
print(scores_cvs['test_score'].mean())

[0.80101227 0.8627529  0.7546227  0.79281886 0.9216683 ]
0.8265750072321015


In [119]:
'''

=====================================
开始处理预测数据

处理Discout_rate
'''

# 处理 positive_data 的 Discout_rate
# 把 Discount_rate 切分

test = pd.read_csv('data/test_revised.csv')

a = test['Discount_rate'].str.split(':', expand=True)
a.columns = ['achieve', 'reduce']
a['achieve'] = pd.to_numeric(a['achieve'])

# 填充 reduce == null，即源数据是折扣比例
a['reduce'].fillna(0, inplace=True)
a['reduce'] = pd.to_numeric(a['reduce'])

# 生成 满减比例
a['Coupon_rate'] = a['reduce'] / a['achieve']
a['Discount_rate'] = a['achieve']
a.loc[a['Coupon_rate'] != 0, 'Discount_rate'] = 0

a.drop(['reduce'], axis=1, inplace=True)
a.loc[a['Discount_rate'] != 0, 'achieve'] = 0

test.drop(['Discount_rate'], axis=1, inplace=True)

test = pd.concat([test, a], axis=1)

test = test[['User_id', 'Merchant_id', 'Coupon_id', 'Distance', 'achieve', 'Coupon_rate', 'Discount_rate', 'Date_received']]
test.shape

(113640, 8)

In [120]:
'''
处理 Distance
'''
# 把 Distance 中的 null 替换为 11
test.loc[test['Distance'] == 'null', 'Distance'] = 11
test.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received
0,4129537,450,9983,1,30.0,0.166667,0.0,20160712
1,6949378,1300,3429,11,30.0,0.166667,0.0,20160706
2,2166529,7113,6928,5,200.0,0.1,0.0,20160727
3,2166529,7113,1808,5,100.0,0.1,0.0,20160727
4,6172162,7605,6500,2,30.0,0.033333,0.0,20160708


In [121]:
'''
处理 Date_received
'''
test['Date_received'] = test['Date_received'].astype('str')
test['Date_received'] = test['Date_received'].apply(lambda row: datetime.datetime.strptime(row, '%Y%m%d'))
test.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received
0,4129537,450,9983,1,30.0,0.166667,0.0,2016-07-12
1,6949378,1300,3429,11,30.0,0.166667,0.0,2016-07-06
2,2166529,7113,6928,5,200.0,0.1,0.0,2016-07-27
3,2166529,7113,1808,5,100.0,0.1,0.0,2016-07-27
4,6172162,7605,6500,2,30.0,0.033333,0.0,2016-07-08


In [122]:
'''
生成 u_1 用户领取优惠券之前领取过的优惠券数量
'''
test.sort_values(by='User_id', inplace=True)

# 通过连接得到同一个用户的不同领取时间的交互信息
refine_data_merge = pd.merge(test, test, how='left', on=['User_id'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)
refine_data_merge = refine_data_merge[['User_id', 'Coupon_id_x',  'Date_received_x', 'Date_received_y']]
refine_data_merge = refine_data_merge.loc[(refine_data_merge['Date_received_x'] != refine_data_merge['Date_received_y'])]

# 提取领取时间之前（Date_delta > 0）的数据
refine_data_merge['Date_delta'] = refine_data_merge['Date_received_x'] - refine_data_merge['Date_received_y']
refine_data_merge['Date_delta'] = refine_data_merge['Date_delta'].apply(lambda row: row.days)
refine_data_merge = refine_data_merge.loc[refine_data_merge['Date_delta'] > 0]
refine_data_merge = refine_data_merge.groupby(['User_id', 'Date_received_x']).count()
refine_data_merge.reset_index(level=[0,1], inplace=True)
refine_data_merge = refine_data_merge[['User_id', 'Date_received_x', 'Date_delta']]
refine_data_merge.columns = ['User_id', 'Date_received', 'count']

# 统计同一个用户 Date_received 相同的 数目
dulp_received_date_data = test.groupby(['User_id', 'Date_received']).count()
dulp_received_date_data.reset_index(level=[0,1], inplace=True)
dulp_received_date_data = dulp_received_date_data[['User_id', 'Date_received', 'Coupon_id']]
dulp_received_date_data.columns = ['User_id', 'Date_received', 'dulp_count']

refine_dulp_merge = pd.merge(refine_data_merge, dulp_received_date_data, how='left', on=['User_id', 'Date_received'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=True, indicator=False)
refine_dulp_merge

refine_dulp_merge['u_1'] = refine_dulp_merge['count'] / refine_dulp_merge['dulp_count']
refine_dulp_merge.drop(['count', 'dulp_count'], axis=1, inplace=True)

u1_data_t = pd.merge(test, refine_dulp_merge, how='left', on=['User_id', 'Date_received'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)
u1_data_t.fillna(0, inplace=True)
u1_data_t.shape

(113640, 9)

In [123]:
'''
生成u_3
'''
achieve_data = u1_data.groupby(['User_id']).mean()
achieve_data['User_id'] = sorted(list(set(u1_data['User_id'].tolist())))
achieve_data = achieve_data[['User_id', 'achieve']]
achieve_data.columns = ['User_id','u_3']
u3_data_t = pd.merge(u1_data_t, achieve_data, how='left', on='User_id', left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=True, indicator=False)
u3_data_t.shape

(113640, 10)

In [124]:
'''
生成u_4 用户之前上一次领取对应优惠券的平均时间间隔天数
'''

# 在预测数据中，就假设所有的数据都是用户领取优惠券的数据，但是在训练数据中要提取出领取优惠券的用户数据
used_user_data_group = pd.merge(test, test, how='left', on=['User_id', 'Coupon_id'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)

# 提取出同一用户在不同领取时间上的交互数据
used_user_data_group = used_user_data_group[['User_id', 'Coupon_id', 'Date_received_x', 'Date_received_y']]
used_user_data_group = used_user_data_group.loc[used_user_data_group['Date_received_x'] != used_user_data_group['Date_received_y']]

# 因为计算 date_delta = Date_received_x - Date_received_y
# 所以取出 date_delta > 0 的数据，此时需要的是Date_received_x（也可以是 date_delta < 0, 此时需要的是 此时需要的是Date_received_y）
used_user_data_group['date_delta'] = used_user_data_group['Date_received_x'] - used_user_data_group['Date_received_y']
used_user_data_group['date_delta'] = used_user_data_group['date_delta'].apply(lambda row: row.days)
used_user_data_group = used_user_data_group.loc[used_user_data_group['date_delta'] > 0]
used_user_data_group.sort_values(by='User_id', inplace=True)

# 算出当前领取日期下的最近一次领取日期间隔
used_user_data_group = used_user_data_group.groupby(by=['User_id', 'Coupon_id', 'Date_received_x']).min()
used_user_data_group.reset_index(level=[0, 1, 2], inplace=True)

# 查询信息用于生成 u_5data
u5_count = used_user_data_group.groupby(['User_id', 'Coupon_id', 'Date_received_x']).count()
u5_count.reset_index(level=[0,1,2], inplace=True)
u5_count = u5_count[['User_id', 'Coupon_id', 'Date_received_x', 'date_delta']]
u5_count.columns = ['User_id', 'Coupon_id', 'Date_received', 'u_5']

# 再计算同一用户对同一种优惠券的平均最近一次领取日期间隔数据
used_user_data_group = used_user_data_group.groupby(['User_id', 'Coupon_id']).mean()
used_user_data_group.reset_index(level=[0,1], inplace=True)
used_user_data_group.columns  = ['User_id', 'Coupon_id', 'u_4']

u4_data_t = pd.merge(u3_data_t, used_user_data_group, how='left', on=['User_id', 'Coupon_id'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)
u4_data_t.fillna(0, inplace=True)

print(u4_data_t.shape)
u4_data_t.head()

(113640, 11)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received,u_1,u_3,u_4
0,209,5032,825,1,20.0,0.25,0.0,2016-07-21,0.0,0.0,0.0
1,209,5032,7557,1,20.0,0.25,0.0,2016-07-21,0.0,0.0,0.0
2,215,599,5488,11,20.0,0.05,0.0,2016-07-03,0.0,30.0,0.0
3,316,2436,3992,0,30.0,0.166667,0.0,2016-07-21,0.0,50.0,0.0
4,417,3507,12465,0,50.0,0.02,0.0,2016-07-12,0.0,72.0,0.0


In [125]:
'''
生成u_5 该用户之前领取过多少个对应的优惠券
'''

u5_data_t = pd.merge(u4_data_t, u5_count, how='left', on=['User_id', 'Coupon_id', 'Date_received'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)
u5_data_t.fillna(0, inplace=True)

print(u5_data_t.shape)
u5_data_t.head()

(113640, 12)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received,u_1,u_3,u_4,u_5
0,209,5032,825,1,20.0,0.25,0.0,2016-07-21,0.0,0.0,0.0,0.0
1,209,5032,7557,1,20.0,0.25,0.0,2016-07-21,0.0,0.0,0.0,0.0
2,215,599,5488,11,20.0,0.05,0.0,2016-07-03,0.0,30.0,0.0,0.0
3,316,2436,3992,0,30.0,0.166667,0.0,2016-07-21,0.0,50.0,0.0,0.0
4,417,3507,12465,0,50.0,0.02,0.0,2016-07-12,0.0,72.0,0.0,0.0


In [126]:
'''
生成 m_1 在该商户中进行过消费的人数
'''
# 将 u5_data 中的 Distance 从 object 转为 int
u5_data_t = u5_data_t.convert_objects(convert_numeric=True)

merchant_data = test
merchant_data = merchant_data.groupby(['Merchant_id']).count()
merchant_data.reset_index(level=[0], inplace=True)
merchant_data = merchant_data[['Merchant_id', 'Coupon_id']]
merchant_data.columns = ['Merchant_id', 'm_1']

m_1data_t = pd.merge(u5_data_t, merchant_data, how='left', on=['Merchant_id'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)
m_1data_t.fillna(0, inplace=True)

# merge操作之后会按关键字 Merchant_id 排序，用 User_id 重新排回来
m_1data_t.sort_values(by='User_id', inplace=True)
m_1data_t.shape

  """


(113640, 13)

In [127]:
'''
商户当前平均 Discount_rate m_2, 平均 Coupon_rate m_3, 平均 Distance m_4, 平均 achieve m_5
'''
ave_discount_data = m_1data_t.groupby('Merchant_id').mean()
ave_discount_data.reset_index(level=[0], inplace=True)
ave_discount_data = ave_discount_data[['Merchant_id', 'Discount_rate', 'Coupon_rate', 'Distance', 'achieve']]
ave_discount_data.columns = ['Merchant_id', 'm_2', 'm_3', 'm_4', 'm_5']
ave_discount_data.head()

m_2_3_4_5data_t = pd.merge(m_1data_t, ave_discount_data, how='left', on=['Merchant_id'], left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=False, indicator=False)
m_2_3_4_5data_t.fillna(0, inplace=True)
m_2_3_4_5data_t.sort_values(by='User_id', inplace=True)

print(m_2_3_4_5data_t.shape)
m_2_3_4_5data_t.head()

(113640, 17)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received,u_1,u_3,u_4,u_5,m_1,m_2,m_3,m_4,m_5
80414,209,5032,825,1,20.0,0.25,0.0,2016-07-21,0.0,0.0,0.0,0.0,95,0.0,0.252632,3.694737,20.315789
80415,209,5032,7557,1,20.0,0.25,0.0,2016-07-21,0.0,0.0,0.0,0.0,95,0.0,0.252632,3.694737,20.315789
13610,215,599,5488,11,20.0,0.05,0.0,2016-07-03,0.0,30.0,0.0,0.0,44,0.043182,0.047727,3.090909,19.090909
61543,316,2436,3992,0,30.0,0.166667,0.0,2016-07-21,0.0,50.0,0.0,0.0,2070,0.0,0.15504,3.944928,28.937198
68087,417,3507,12465,0,50.0,0.02,0.0,2016-07-12,0.0,72.0,0.0,0.0,45,0.0,0.02,4.155556,50.0


In [128]:
'''
当前的 Discout_rate - 商家历史平均 Discout_rate m_6
当前的 Coupon_rate - 商家历史平均 Coupon_rate m_7
当前的 Distance - 商家历史平均 Distance m_8
当前的 achieve - 商家历史平均 achieve m_9
'''
m_2_3_4_5data_t['m_6'] = (m_2_3_4_5data_t['Discount_rate'] - m_2_3_4_5data_t['m_2'])
m_2_3_4_5data_t['m_7'] = 100 * (m_2_3_4_5data_t['Coupon_rate'] - m_2_3_4_5data_t['m_3'])
m_2_3_4_5data_t['m_8'] = m_2_3_4_5data_t['Distance'] - m_2_3_4_5data_t['m_4']
m_2_3_4_5data_t['m_9'] = m_2_3_4_5data_t['achieve'] - m_2_3_4_5data_t['m_5']

print(m_2_3_4_5data_t.shape)
m_2_3_4_5data_t.head()

(113640, 21)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,achieve,Coupon_rate,Discount_rate,Date_received,u_1,u_3,...,u_5,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9
80414,209,5032,825,1,20.0,0.25,0.0,2016-07-21,0.0,0.0,...,0.0,95,0.0,0.252632,3.694737,20.315789,0.0,-0.2631579,-2.694737,-0.315789
80415,209,5032,7557,1,20.0,0.25,0.0,2016-07-21,0.0,0.0,...,0.0,95,0.0,0.252632,3.694737,20.315789,0.0,-0.2631579,-2.694737,-0.315789
13610,215,599,5488,11,20.0,0.05,0.0,2016-07-03,0.0,30.0,...,0.0,44,0.043182,0.047727,3.090909,19.090909,-0.043182,0.2272727,7.909091,0.909091
61543,316,2436,3992,0,30.0,0.166667,0.0,2016-07-21,0.0,50.0,...,0.0,2070,0.0,0.15504,3.944928,28.937198,0.0,1.162641,-3.944928,1.062802
68087,417,3507,12465,0,50.0,0.02,0.0,2016-07-12,0.0,72.0,...,0.0,45,0.0,0.02,4.155556,50.0,0.0,-1.040834e-15,-4.155556,0.0


In [129]:
m_2_3_4_5data_t.drop(['User_id', 'Merchant_id', 'Date_received', 'Coupon_id'], axis=1, inplace=True)
m_2_3_4_5data_t = m_2_3_4_5data_t[['Distance', 'achieve', 'Coupon_rate', 'Discount_rate', 'u_1', 'u_3', 'u_4', 'u_5', 'm_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6', 'm_7', 'm_8', 'm_9']]

print(m_2_3_4_5data_t.shape)
m_2_3_4_5data_t.head()

(113640, 17)


Unnamed: 0,Distance,achieve,Coupon_rate,Discount_rate,u_1,u_3,u_4,u_5,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9
80414,1,20.0,0.25,0.0,0.0,0.0,0.0,0.0,95,0.0,0.252632,3.694737,20.315789,0.0,-0.2631579,-2.694737,-0.315789
80415,1,20.0,0.25,0.0,0.0,0.0,0.0,0.0,95,0.0,0.252632,3.694737,20.315789,0.0,-0.2631579,-2.694737,-0.315789
13610,11,20.0,0.05,0.0,0.0,30.0,0.0,0.0,44,0.043182,0.047727,3.090909,19.090909,-0.043182,0.2272727,7.909091,0.909091
61543,0,30.0,0.166667,0.0,0.0,50.0,0.0,0.0,2070,0.0,0.15504,3.944928,28.937198,0.0,1.162641,-3.944928,1.062802
68087,0,50.0,0.02,0.0,0.0,72.0,0.0,0.0,45,0.0,0.02,4.155556,50.0,0.0,-1.040834e-15,-4.155556,0.0


In [130]:
'''
对数据进标准化
'''
m_2_3_4_5data_t_norm = (m_2_3_4_5data_t - m_2_3_4_5data_t.min()) / (m_2_3_4_5data_t.max() - m_2_3_4_5data_t.min())

print(m_2_3_4_5data_t_norm.shape)
m_2_3_4_5data_t_norm.head()

(113640, 17)


Unnamed: 0,Distance,achieve,Coupon_rate,Discount_rate,u_1,u_3,u_4,u_5,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9
80414,0.090909,0.04,0.375,0.0,0.0,0.0,0.0,0.0,0.003138,0.0,0.378947,0.335885,0.040632,0.448204,0.407805,0.355159,0.479683
80415,0.090909,0.04,0.375,0.0,0.0,0.0,0.0,0.0,0.003138,0.0,0.378947,0.335885,0.040632,0.448204,0.407805,0.355159,0.479683
13610,1.0,0.04,0.075,0.0,0.0,0.1,0.0,0.0,0.001435,0.045455,0.071591,0.280992,0.038182,0.422876,0.413852,0.870685,0.484434
61543,0.0,0.06,0.25,0.0,0.0,0.166667,0.0,0.0,0.069066,0.0,0.23256,0.35863,0.057874,0.448204,0.425387,0.294379,0.485031
68087,0.0,0.1,0.03,0.0,0.0,0.24,0.0,0.0,0.001469,0.0,0.03,0.377778,0.1,0.448204,0.41105,0.284139,0.480908


In [388]:
'''
使用模型进行预测
'''
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
clf = GradientBoostingClassifier(learning_rate=0.01, n_estimators=30, max_depth=7, subsample=0.7)
clf.fit(X, y)
test_pred = clf.predict_proba(m_2_3_4_5data_normt)

array([[0.09506052, 0.90493948],
       [0.09506052, 0.90493948],
       [0.10748803, 0.89251197],
       ...,
       [0.08834459, 0.91165541],
       [0.08723669, 0.91276331],
       [0.08834459, 0.91165541]])

In [117]:
'''
使用XGBoost 进行预测
'''
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from  sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

d_total_train = xgb.DMatrix(data=X, label=y)
dtrain = xgb.DMatrix(data=X_train, label=y_train)
watchlist = [(d_total_train, 'total')]

params={'booster':'gbtree',
        'objective': 'rank:pairwise',
        'eval_metric':'auc',
        'gamma':0.1,
#       'min_child_weighteval_metric':1.1,
        'max_depth':7,
        'lambda':100,
        'subsample':0.7,
        'colsample_bytree':0.7,
        'colsample_bylevel':0.7,
        'eta': 0.01,
#       'tree_method':'exact',
        'random_state ':0,
        }

bst = xgb.train(params, d_total_train, num_boost_round=200, evals=watchlist) #  dtrain是训练数据集

[0]	total-auc:0.829495
[1]	total-auc:0.86118
[2]	total-auc:0.86702
[3]	total-auc:0.868528
[4]	total-auc:0.869928
[5]	total-auc:0.870661
[6]	total-auc:0.87113
[7]	total-auc:0.870559
[8]	total-auc:0.871177
[9]	total-auc:0.87019
[10]	total-auc:0.870279
[11]	total-auc:0.869964
[12]	total-auc:0.870298
[13]	total-auc:0.869356
[14]	total-auc:0.869858
[15]	total-auc:0.86934
[16]	total-auc:0.868925
[17]	total-auc:0.868996
[18]	total-auc:0.868846
[19]	total-auc:0.868398
[20]	total-auc:0.869375
[21]	total-auc:0.869865
[22]	total-auc:0.86969
[23]	total-auc:0.870238
[24]	total-auc:0.870767
[25]	total-auc:0.871089
[26]	total-auc:0.871522
[27]	total-auc:0.871655
[28]	total-auc:0.871842
[29]	total-auc:0.871981
[30]	total-auc:0.872398
[31]	total-auc:0.872567
[32]	total-auc:0.872413
[33]	total-auc:0.872604
[34]	total-auc:0.872619
[35]	total-auc:0.872823
[36]	total-auc:0.872938
[37]	total-auc:0.873026
[38]	total-auc:0.873268
[39]	total-auc:0.873436
[40]	total-auc:0.873484
[41]	total-auc:0.873394
[42]	tot

In [118]:
dval = xgb.DMatrix(data=X_test)
pred = bst.predict(dval)
roc_auc_score(y_test, pred)

0.8765230498972455

In [131]:
'''
将预测结果和数据信息结合，生成最终的结果DataFrame
'''
from sklearn.preprocessing import MinMaxScaler
dtest = xgb.DMatrix(data=m_2_3_4_5data_t_norm)
pred = bst.predict(dtest)

dataset3 = pd.read_csv('data/test_revised.csv')
dataset3_preds = dataset3[['User_id','Coupon_id','Date_received']]
dataset3_preds['pred'] = bst.predict(dtest)
dataset3_preds['pred'] = MinMaxScaler().fit_transform(dataset3_preds[['pred']])
# dataset3_preds.sort_values(by=['Coupon_id'], inplace=True)

dataset3_preds.to_csv("xgb_preds2.csv",index=None,header=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [112]:
'''
使用XGBoost
'''
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from  sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

d_total_train = xgb.DMatrix(data=X, label=y)
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)
watchlist = [(dtrain, 'train')]

params={'booster':'gbtree',
        'objective': 'rank:pairwise',
        'eval_metric':'auc',
        'gamma':0.1,
#       'min_child_weighteval_metric':1.1,
        'max_depth':10,
        'lambda':100,
        'subsample':0.7,
        'colsample_bytree':0.7,
        'colsample_bylevel':0.7,
        'eta': 0.01,
#       'tree_method':'exact',
        'random_state ':0,
        }

# bst = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist) #  dtrain是训练数据集
# pred = bst.predict(dtest)
# roc_auc_score(y_test, pred)

cv = KFold(n_splits=5, shuffle=True, random_state=0)
xgb.cv(params, dtrain, num_boost_round=100, stratified=False, folds=cv, metrics='auc', obj=None, feval=None, maximize=False, early_stopping_rounds=20, fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True, seed=0, callbacks=None, shuffle=True)

# model = XGBClassifier(eta=0.001, reg_lambda=2, subsample=0.7, max_depth=7, num_class=2, objective="multi:softprob")

# scores_cvs = cross_validate(model, X, y, cv=5, scoring='roc_auc', return_train_score = True)
# sorted(scores_cvs.keys())
# print(scores_cvs['test_score'])
# print(scores_cvs['test_score'].mean())

Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
0,0.865555,0.001909,0.875319,0.001890
1,0.875242,0.001640,0.884148,0.000502
2,0.878586,0.001983,0.887215,0.000892
3,0.880397,0.000841,0.888803,0.000542
4,0.881162,0.000400,0.889788,0.000755
5,0.881842,0.000553,0.890629,0.000204
6,0.882085,0.000673,0.890894,0.000145
7,0.882229,0.000716,0.891091,0.000282
8,0.882482,0.000535,0.891472,0.000077
9,0.882563,0.000825,0.891635,0.000346


In [99]:
from sklearn.preprocessing import MinMaxScaler
a = pd.DataFrame(data=list(pred), columns=['pred'])
a = (a - a.min()) / (a.max() - a.min())
a['y_test'] = y_test.tolist()
a.sort_values(by='pred', ascending=False)
a.loc[(a['pred'] > 0.5) & (a['y_test'] == 1)]

Unnamed: 0,pred,y_test
0,0.882022,1.0
2,1.000000,1.0
3,0.842875,1.0
5,0.634523,1.0
6,0.637379,1.0
7,0.929099,1.0
8,0.545333,1.0
9,0.556009,1.0
10,0.819032,1.0
11,0.997089,1.0


In [13]:
'''
用户参与过的满减的平均满减额度
'''
received_data = u2_data.loc[(u2_data['Coupon_id'] != 'null') & (offline_data['Date'] == 'null')]
# 处理 positive_data 的 Discout_rate
# 把 Discount_rate 切分
a = received_data['Discount_rate'].str.split(':', expand=True)
a.columns = ['achieve', 'reduce']
a['achieve'] = pd.to_numeric(a['achieve'])

# 填充 reduce == null，即源数据是折扣比例
a['reduce'].fillna(0, inplace=True)
a['reduce'] = pd.to_numeric(a['reduce'])

# 生成 满减比例
a['Coupon_rate'] = a['reduce'] / a['achieve']
a['Discount_rate'] = a['achieve']
a.loc[a['Coupon_rate'] != 0, 'Discount_rate'] = 0

a.drop(['reduce'], axis=1, inplace=True)
a.loc[a['Discount_rate'] != 0, 'achieve'] = 0
received_data.drop(['Discount_rate'], axis=1, inplace=True)
received_data = pd.concat([received_data, a], axis=1)

received_data.sort_values(by='User_id', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [16]:
u2_data.drop(['Discount_rate'], axis=1, inplace=True)

In [18]:
achieve_discount_rate_data = received_data[['Coupon_id', 'achieve', 'Discount_rate']]
u3_data = pd.merge(u2_data, achieve_discount_rate_data, how='left', on='Coupon_id', left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=True, indicator=False)

MemoryError: 

In [140]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
achieve_list = achieve_data['achieve'].tolist()
res = le.fit_transform(achieve_list)
achieve_data['u_3'] = res
achieve_data.drop(['achieve'], axis=1, inplace=True)
achieve_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,User_id,u_3
679792,4,4
679793,4,0
678051,35,9
678050,35,2
678052,35,8
678053,35,8
1264847,36,3
1264846,36,4
97505,64,6
1263168,110,6


In [137]:
u3_data = pd.merge(u2_data, achieve_data, how='left', on='User_id', left_on=None, right_on=None,
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=True, indicator=False)
u3_data.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,u_1,u_2,u_3
0,4,1433,8735,30:5,10,2016-02-14 00:00:00,,2,0.0,4.0
1,4,1433,8735,30:5,10,2016-02-14 00:00:00,,2,0.0,0.0
2,4,1469,2902,0.95,10,2016-06-07 00:00:00,,2,0.0,4.0
3,4,1469,2902,0.95,10,2016-06-07 00:00:00,,2,0.0,0.0
4,35,3381,1807,300:30,0,2016-01-30 00:00:00,,4,0.0,9.0
