# 特征工程

从以下几个维度进行展开处理：

## 用户基本特征：

获取基本的用户特征，基于用户本身属性多为类别特征的特点，对age,sex,usr_lv_cd进行独热编码操作，对于用户注册时间暂时不处理

## 商品基本特征

根据商品文件获取基本的特征

针对属性a1,a2,a3进行独热编码

商品类别和品牌直接作为特征

## 评论特征

分时间段，

对评论数进行独热编码

## 行为特征

分时间段

对行为类别进行独热编码

分别按照用户-类别行为分组和用户-类别-商品行为分组统计，然后计算

用户对同类别下其他商品的行为计数

不同时间累积的行为计数（3,5,7,10,15,21,30）

## 累积用户特征

分时间段

用户不同行为的

购买转化率

均值

## 用户近期行为特征

在上面针对用户进行累积特征提取的基础上，分别提取用户近一个月、近三天的特征，然后提取一个月内用户除去最近三天的行为占据一个月的行为的比重

## 用户对同类别下各种商品的行为

用户对各个类别的各项行为操作统计

用户对各个类别操作行为统计占对所有类别操作行为统计的比重

## 累积商品特征

分时间段

针对商品的不同行为的

购买转化率

均值

## 类别特征

分时间段下各个商品类别的

购买转化率

均值

# 数据加载

## 导包

In [1]:
from datetime import datetime
from datetime import timedelta
import pandas as pd
import numpy as np
import gc

## 变量声明

In [2]:
action_1_path = 'data/JData_Action_201602.csv'
action_2_path = 'data/JData_Action_201603.csv'
action_3_path = 'data/JData_Action_201604.csv'
comment_path = 'data/JData_Comment.csv'
product_path = 'data/JData_Product.csv'
user_path = 'data/JData_User.csv'

## 定义函数提取数据

In [3]:
def get_actions_1():
    action = pd.read_csv(action_1_path)
    action[['user_id','sku_id','model_id','type','cate','brand']] = action[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')
    return action
def get_actions_2():
    action = pd.read_csv(action_2_path)
    action[['user_id','sku_id','model_id','type','cate','brand']] = action[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')
    return action
def get_actions_3():
    action = pd.read_csv(action_3_path)
    action[['user_id','sku_id','model_id','type','cate','brand']] = action[['user_id','sku_id','model_id','type','cate','brand']].astype('float32')
    return action

# 读取并拼接所有行为记录文件
def get_all_action():
    action_1 = get_actions_1()
    action_2 = get_actions_2()
    action_3 = get_actions_3()
    actions = pd.concat([action_1, action_2, action_3]) # type: pd.DataFrame
    return actions

# 获取某个时间段的行为记录
def get_actions(start_date, end_date, all_actions):
    """
    :param start_date:开始日期
    :param end_date:结束日期
    :return: actions: 返回数据
    """
    actions = all_actions[(all_actions.time >= start_date) & (all_actions.time < end_date)].copy()
    return actions

# 用户特征

## 用户基本特征

获取基本的用户特征，基于用户本身属性多为类别特征的特点，对age,sex,usr_lv_cd进行独热编码操作，对于用户注册时间暂时不处理

In [4]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(["paris", "paris", "tokyo", "amsterdam"])
le.fit_transform(["paris", "paris", "tokyo", "amsterdam"])

array([1, 1, 2, 0])

In [5]:
user = pd.read_csv(user_path)
display(user.head())
le = preprocessing.LabelEncoder()    
age_df = le.fit_transform(user['age']) # numpy数组
display(age_df[:5])
del user,age_df
gc.collect()

Unnamed: 0,user_id,age,sex,user_lv_cd,user_reg_tm
0,200001,6.0,2.0,5,2016-01-26
1,200002,-1.0,0.0,1,2016-01-26
2,200003,4.0,1.0,4,2016-01-26
3,200004,-1.0,2.0,1,2016-01-26
4,200005,2.0,0.0,4,2016-01-26


array([6, 0, 4, 0, 2])

0

In [6]:
from sklearn import preprocessing

def get_basic_user_feat():
    # 针对年龄的中文字符问题处理，首先是读入的时候编码，填充空值，然后将其数值化，最后独热编码，此外对于sex也进行了数值类型转换
    user = pd.read_csv(user_path)
    user.head(5)
    user.dropna(axis=0, how='any',inplace=True)
    user.head(5)
    user['sex'] = user['sex'].astype(int)    
    user['age'] = user['age'].astype(int)
    le = preprocessing.LabelEncoder()    
    age_df = le.fit_transform(user['age'])

    age_df = pd.get_dummies(age_df, prefix='age')
    sex_df = pd.get_dummies(user['sex'], prefix='sex')
    user_lv_df = pd.get_dummies(user['user_lv_cd'], prefix='user_lv_cd')
    user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1)
    return user

In [7]:
user = get_basic_user_feat()
display(user.head())
del user
gc.collect()

Unnamed: 0,user_id,age_0,age_1,age_2,age_3,age_4,age_5,age_6,sex_0,sex_1,sex_2,user_lv_cd_1,user_lv_cd_2,user_lv_cd_3,user_lv_cd_4,user_lv_cd_5
0,200001.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,200002.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,200003.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,200004.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,200005.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


23172

# 商品特征

## 商品基本特征

根据商品文件获取基本的特征，针对属性a1,a2,a3进行独热编码，商品类别和品牌直接作为特征

In [8]:
def get_basic_product_feat():
    product = pd.read_csv(product_path)
    attr1_df = pd.get_dummies(product["a1"], prefix="a1")
    attr2_df = pd.get_dummies(product["a2"], prefix="a2")
    attr3_df = pd.get_dummies(product["a3"], prefix="a3")
    product = pd.concat([product[['sku_id', 'cate', 'brand']], attr1_df, attr2_df, attr3_df], axis=1)
    return product

# 评论特征

分时间段

对评论数进行独热编码

In [9]:
def get_comments_product_feat(end_date):
    comments = pd.read_csv(comment_path)
    comments = comments[comments.dt <= end_date]# 某日期之前的评论数据
    df = pd.get_dummies(comments['comment_num'], prefix='comment_num')
    # 为了防止某个时间段不具备评论数为0的情况（测试集出现过这种情况）
    for i in range(0, 5):
        if 'comment_num_' + str(i) not in df.columns:
            df['comment_num_' + str(i)] = 0
    df = df[['comment_num_0', 'comment_num_1', 'comment_num_2', 'comment_num_3', 'comment_num_4']]
    
    comments = pd.concat([comments, df], axis=1) # type: pd.DataFrame

    comments = comments[['sku_id', 'has_bad_comment', 'bad_comment_rate','comment_num_0', 'comment_num_1', 
                         'comment_num_2', 'comment_num_3', 'comment_num_4']]
    return comments

评论数据转换

In [10]:
start_date = '2016-02-01'
end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
end_date = end_date.strftime('%Y-%m-%d')

display(start_date)
comments = get_comments_product_feat(end_date)
display(comments.head(),comments.shape)
del comments
gc.collect()

'2016-02-01'

Unnamed: 0,sku_id,has_bad_comment,bad_comment_rate,comment_num_0,comment_num_1,comment_num_2,comment_num_3,comment_num_4
0,1000,1,0.0417,0,0,0,1,0
1,10000,0,0.0,0,0,1,0,0
2,100011,1,0.0376,0,0,0,0,1
3,100018,0,0.0,0,0,0,1,0
4,100020,0,0.0,0,0,0,1,0


(46546, 8)

4254

# 行为特征

## 用户-类别-商品计数统计

分时间段

对行为类别进行独热编码

分别按照用户-类别行为分组和用户-类别-商品行为分组统计，然后计算

用户对同类别下其他商品的行为计数

针对用户对同类别下目标商品的行为计数与该时间段的行为均值作差

### 函数定义

In [11]:
def get_action_feat(start_date, end_date, all_actions, day):
    actions = get_actions(start_date, end_date, all_actions)
    actions = actions[['user_id', 'sku_id', 'cate','type']]
    # 对行为类别进行独热编码
    prefix = 'action_before_%s' % day
    df = pd.get_dummies(actions['type'], prefix=prefix)
    actions = pd.concat([actions, df], axis=1)
    
    # 分组统计，用户-类别-商品,不同用户对不同类别下商品的行为计数
    actions = actions.groupby(['user_id', 'cate','sku_id'], as_index=False).sum()
    # 分组统计，用户-类别，不同用户对不同商品类别的行为计数
    user_cate = actions.groupby(['user_id','cate'], as_index=False).sum()
    del user_cate['sku_id']
    del user_cate['type']
    # 数据合并
    actions = pd.merge(actions, user_cate, how='left', on=['user_id','cate'])
    
    # 前述两种分组含有相同名称的不同行为的计数，系统会自动针对名称调整添加后缀,x,y
    # 所以这里作差统计的是同一类别下其他商品的行为计数
    # 不同时间累积的行为计数（3,5,7,10,15,21,30）表示时间间隔天数
    actions[prefix+'_1.0_y'] = actions[prefix+'_1.0_y'] - actions[prefix+'_1.0_x']
    actions[prefix+'_2.0_y'] = actions[prefix+'_2.0_y'] - actions[prefix+'_2.0_x']
    actions[prefix+'_3.0_y'] = actions[prefix+'_3.0_y'] - actions[prefix+'_3.0_x']
    actions[prefix+'_4.0_y'] = actions[prefix+'_4.0_y'] - actions[prefix+'_4.0_x']
    actions[prefix+'_5.0_y'] = actions[prefix+'_5.0_y'] - actions[prefix+'_5.0_x']
    actions[prefix+'_6.0_y'] = actions[prefix+'_6.0_y'] - actions[prefix+'_6.0_x']
    
    
    # 统计用户对不同类别下商品计数与该类别下商品行为计数均值（对时间）的差值
    actions[prefix+'minus_mean_1'] = actions[prefix+'_1.0_x'] - (actions[prefix+'_1.0_x']/day)
    actions[prefix+'minus_mean_2'] = actions[prefix+'_2.0_x'] - (actions[prefix+'_2.0_x']/day)
    actions[prefix+'minus_mean_3'] = actions[prefix+'_3.0_x'] - (actions[prefix+'_3.0_x']/day)
    actions[prefix+'minus_mean_4'] = actions[prefix+'_4.0_x'] - (actions[prefix+'_4.0_x']/day)
    actions[prefix+'minus_mean_5'] = actions[prefix+'_5.0_x'] - (actions[prefix+'_5.0_x']/day)
    actions[prefix+'minus_mean_6'] = actions[prefix+'_6.0_x'] - (actions[prefix+'_6.0_x']/day)
    del actions['type']
    return actions

### 代码解读

加载一定时间段内所有数据

In [12]:
all_actions = get_all_action()
start_date = '2016-02-01'
end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
end_date = end_date.strftime('%Y-%m-%d')
# 获取一定时间段内数据
actions = get_actions(start_date, end_date, all_actions)
display(actions.head(),actions.shape)
del all_actions
gc.collect()

Unnamed: 0,user_id,sku_id,time,model_id,type,cate,brand
29,272629.0,107774.0,2016-02-01 00:00:00,,1.0,10.0,36.0
30,272629.0,107774.0,2016-02-01 00:00:00,,1.0,10.0,36.0
31,272629.0,107774.0,2016-02-01 00:00:00,0.0,6.0,10.0,36.0
32,272629.0,107774.0,2016-02-01 00:00:00,,1.0,10.0,36.0
33,272629.0,107774.0,2016-02-01 00:00:00,216.0,6.0,10.0,36.0


(1005110, 7)

305

In [13]:
# 提取部分特征
actions = actions[['user_id', 'sku_id', 'cate','type']]
# 对行为类别进行独热编码
df = pd.get_dummies(actions['type'], prefix='action_before_%s' %3)
display(df.head())

# 数据合并
actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
display(actions.head(),actions.shape)
del df
gc.collect()

# 分组统计，用户-类别-商品,不同用户对不同类别下商品的行为计数
actions = actions.groupby(['user_id', 'cate','sku_id'], as_index=False).sum()
display(actions.head(),actions.shape)

Unnamed: 0,action_before_3_1.0,action_before_3_2.0,action_before_3_3.0,action_before_3_4.0,action_before_3_5.0,action_before_3_6.0
29,1,0,0,0,0,0
30,1,0,0,0,0,0
31,0,0,0,0,0,1
32,1,0,0,0,0,0
33,0,0,0,0,0,1


Unnamed: 0,user_id,sku_id,cate,type,action_before_3_1.0,action_before_3_2.0,action_before_3_3.0,action_before_3_4.0,action_before_3_5.0,action_before_3_6.0
29,272629.0,107774.0,10.0,1.0,1,0,0,0,0,0
30,272629.0,107774.0,10.0,1.0,1,0,0,0,0,0
31,272629.0,107774.0,10.0,6.0,0,0,0,0,0,1
32,272629.0,107774.0,10.0,1.0,1,0,0,0,0,0
33,272629.0,107774.0,10.0,6.0,0,0,0,0,0,1


(1005110, 10)

Unnamed: 0,user_id,cate,sku_id,type,action_before_3_1.0,action_before_3_2.0,action_before_3_3.0,action_before_3_4.0,action_before_3_5.0,action_before_3_6.0
0,200002.0,4.0,7199.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0
1,200002.0,4.0,28973.0,40.0,4.0,0.0,0.0,0.0,0.0,6.0
2,200002.0,4.0,73364.0,24.0,6.0,0.0,0.0,0.0,0.0,3.0
3,200002.0,4.0,118303.0,38.0,2.0,0.0,0.0,0.0,0.0,6.0
4,200002.0,4.0,149851.0,32.0,2.0,0.0,0.0,0.0,0.0,5.0


(119134, 10)

简单代码演示groupby

In [14]:
import pandas as pd
df = pd.DataFrame(data={'books':['bk1','bk1','bk1','bk2','bk2','bk3'], 
                        'price': [12,12,12,15,15,17],
                        'num':[2,1,1,4,2,2]})
display(df)
display(df.groupby('books',as_index=True).sum())
display(df.groupby('books',as_index=False).sum())

Unnamed: 0,books,price,num
0,bk1,12,2
1,bk1,12,1
2,bk1,12,1
3,bk2,15,4
4,bk2,15,2
5,bk3,17,2


Unnamed: 0_level_0,price,num
books,Unnamed: 1_level_1,Unnamed: 2_level_1
bk1,36,4
bk2,30,6
bk3,17,2


Unnamed: 0,books,price,num
0,bk1,36,4
1,bk2,30,6
2,bk3,17,2


分组统计用户-类别，不同用户对不同商品类别的行为计数

In [15]:
# 分组统计，用户-类别，不同用户对不同商品类别的行为计数
user_cate = actions.groupby(['user_id','cate'], as_index=False).sum()
del user_cate['sku_id']
del user_cate['type']
display(user_cate.head(),user_cate.shape)
actions = pd.merge(actions, user_cate, how='left', on=['user_id','cate'])
del user_cate
gc.collect()
display(actions.head(),actions.shape)

Unnamed: 0,user_id,cate,action_before_3_1.0,action_before_3_2.0,action_before_3_3.0,action_before_3_4.0,action_before_3_5.0,action_before_3_6.0
0,200002.0,4.0,16.0,0.0,0.0,0.0,0.0,20.0
1,200002.0,5.0,4.0,0.0,0.0,0.0,0.0,6.0
2,200002.0,7.0,4.0,0.0,0.0,0.0,0.0,3.0
3,200002.0,8.0,4.0,0.0,0.0,0.0,0.0,12.0
4,200003.0,4.0,8.0,0.0,0.0,0.0,0.0,12.0


(35577, 8)

Unnamed: 0,user_id,cate,sku_id,type,action_before_3_1.0_x,action_before_3_2.0_x,action_before_3_3.0_x,action_before_3_4.0_x,action_before_3_5.0_x,action_before_3_6.0_x,action_before_3_1.0_y,action_before_3_2.0_y,action_before_3_3.0_y,action_before_3_4.0_y,action_before_3_5.0_y,action_before_3_6.0_y
0,200002.0,4.0,7199.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,20.0
1,200002.0,4.0,28973.0,40.0,4.0,0.0,0.0,0.0,0.0,6.0,16.0,0.0,0.0,0.0,0.0,20.0
2,200002.0,4.0,73364.0,24.0,6.0,0.0,0.0,0.0,0.0,3.0,16.0,0.0,0.0,0.0,0.0,20.0
3,200002.0,4.0,118303.0,38.0,2.0,0.0,0.0,0.0,0.0,6.0,16.0,0.0,0.0,0.0,0.0,20.0
4,200002.0,4.0,149851.0,32.0,2.0,0.0,0.0,0.0,0.0,5.0,16.0,0.0,0.0,0.0,0.0,20.0


(119134, 16)

用户对同类别下其他商品的行为计数

In [16]:
prefix = 'action_before_%s' % 3
actions[prefix+'_1_y'] = actions[prefix+'_1.0_y'] - actions[prefix+'_1.0_x']
display(actions.head(),actions.shape)
del actions
gc.collect()

Unnamed: 0,user_id,cate,sku_id,type,action_before_3_1.0_x,action_before_3_2.0_x,action_before_3_3.0_x,action_before_3_4.0_x,action_before_3_5.0_x,action_before_3_6.0_x,action_before_3_1.0_y,action_before_3_2.0_y,action_before_3_3.0_y,action_before_3_4.0_y,action_before_3_5.0_y,action_before_3_6.0_y,action_before_3_1_y
0,200002.0,4.0,7199.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,20.0,14.0
1,200002.0,4.0,28973.0,40.0,4.0,0.0,0.0,0.0,0.0,6.0,16.0,0.0,0.0,0.0,0.0,20.0,12.0
2,200002.0,4.0,73364.0,24.0,6.0,0.0,0.0,0.0,0.0,3.0,16.0,0.0,0.0,0.0,0.0,20.0,10.0
3,200002.0,4.0,118303.0,38.0,2.0,0.0,0.0,0.0,0.0,6.0,16.0,0.0,0.0,0.0,0.0,20.0,14.0
4,200002.0,4.0,149851.0,32.0,2.0,0.0,0.0,0.0,0.0,5.0,16.0,0.0,0.0,0.0,0.0,20.0,14.0


(119134, 17)

0

## 用户-行为

### 累积用户特征

分时间段

用户不同行为的

购买转化率

均值

#### 函数定义

In [17]:
def get_accumulate_user_feat(end_date, all_actions, day):
    start_date = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=day)
    start_date = start_date.strftime('%Y-%m-%d')
    prefix = 'user_action_%s' % day

    actions = get_actions(start_date, end_date, all_actions)
    df = pd.get_dummies(actions['type'], prefix=prefix) # 独热编码

    actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())
    actions = pd.concat([actions[['user_id', 'date']], df], axis=1)
    del df
    gc.collect()
    # 分组统计，按用户分组，统计用户各项行为的转化率、均值
    actions = actions.groupby(['user_id'], as_index=False).sum()
    actions[prefix + '_1_ratio'] =  np.log(1 + actions[prefix + '_4.0']) - np.log(1 + actions[prefix +'_1.0'])
    actions[prefix + '_2_ratio'] =  np.log(1 + actions[prefix + '_4.0']) - np.log(1 + actions[prefix +'_2.0'])
    actions[prefix + '_3_ratio'] =  np.log(1 + actions[prefix + '_4.0']) - np.log(1 + actions[prefix +'_3.0'])
    actions[prefix + '_5_ratio'] =  np.log(1 + actions[prefix + '_4.0']) - np.log(1 + actions[prefix +'_5.0'])
    actions[prefix + '_6_ratio'] =  np.log(1 + actions[prefix + '_4.0']) - np.log(1 + actions[prefix +'_6.0'])
    # 均值
    actions[prefix + '_1_mean'] = actions[prefix + '_1.0'] / day
    actions[prefix + '_2_mean'] = actions[prefix + '_2.0'] / day
    actions[prefix + '_3_mean'] = actions[prefix + '_3.0'] / day
    actions[prefix + '_4_mean'] = actions[prefix + '_4.0'] / day
    actions[prefix + '_5_mean'] = actions[prefix + '_5.0'] / day
    actions[prefix + '_6_mean'] = actions[prefix + '_6.0'] / day
    return actions

In [18]:
np.log2(16) - np.log2(32)

-1.0

In [19]:
np.log2(16/32)

-1.0

#### 代码解读

加载一定时间段内所有数据

In [20]:
prefix = 'user_action_%s' % 3
all_actions = get_all_action()
start_date = '2016-02-01'
end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
end_date = end_date.strftime('%Y-%m-%d')
# 获取一定时间段内数据
actions = get_actions(start_date, end_date, all_actions)
display(actions.head(),actions.shape)
del all_actions
gc.collect()

Unnamed: 0,user_id,sku_id,time,model_id,type,cate,brand
29,272629.0,107774.0,2016-02-01 00:00:00,,1.0,10.0,36.0
30,272629.0,107774.0,2016-02-01 00:00:00,,1.0,10.0,36.0
31,272629.0,107774.0,2016-02-01 00:00:00,0.0,6.0,10.0,36.0
32,272629.0,107774.0,2016-02-01 00:00:00,,1.0,10.0,36.0
33,272629.0,107774.0,2016-02-01 00:00:00,216.0,6.0,10.0,36.0


(1005110, 7)

2686

用户行为统计计数

In [21]:
df = pd.get_dummies(actions['type'], prefix=prefix)
display(df.head(),df.shape)
actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())
actions = pd.concat([actions[['user_id', 'date']], df], axis=1)
actions = actions.groupby(['user_id'],as_index=False).sum()
display(actions.head(),actions.shape)

Unnamed: 0,user_action_3_1.0,user_action_3_2.0,user_action_3_3.0,user_action_3_4.0,user_action_3_5.0,user_action_3_6.0
29,1,0,0,0,0,0
30,1,0,0,0,0,0
31,0,0,0,0,0,1
32,1,0,0,0,0,0
33,0,0,0,0,0,1


(1005110, 6)

Unnamed: 0,user_id,user_action_3_1.0,user_action_3_2.0,user_action_3_3.0,user_action_3_4.0,user_action_3_5.0,user_action_3_6.0
0,200002.0,28.0,0.0,0.0,0.0,0.0,41.0
1,200003.0,20.0,0.0,0.0,0.0,0.0,31.0
2,200008.0,8.0,0.0,0.0,0.0,0.0,20.0
3,200023.0,1.0,0.0,0.0,0.0,0.0,0.0
4,200030.0,8.0,0.0,0.0,0.0,0.0,17.0


(22210, 7)

不同行为转购率和均值

In [22]:
actions[prefix + '_1_ratio'] =  np.log(1 + actions[prefix + '_4.0']) - np.log(1 + actions[prefix +'_1.0'])
actions[prefix + '_1_mean'] = actions[prefix + '_1.0'] / 3
actions.head(20)

Unnamed: 0,user_id,user_action_3_1.0,user_action_3_2.0,user_action_3_3.0,user_action_3_4.0,user_action_3_5.0,user_action_3_6.0,user_action_3_1_ratio,user_action_3_1_mean
0,200002.0,28.0,0.0,0.0,0.0,0.0,41.0,-3.367296,9.333333
1,200003.0,20.0,0.0,0.0,0.0,0.0,31.0,-3.044522,6.666667
2,200008.0,8.0,0.0,0.0,0.0,0.0,20.0,-2.197225,2.666667
3,200023.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.693147,0.333333
4,200030.0,8.0,0.0,0.0,0.0,0.0,17.0,-2.197225,2.666667
5,200031.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0
6,200033.0,17.0,0.0,0.0,1.0,0.0,0.0,-2.197225,5.666667
7,200038.0,48.0,1.0,0.0,1.0,0.0,58.0,-3.198673,16.0
8,200044.0,70.0,0.0,0.0,0.0,0.0,96.0,-4.26268,23.333333
9,200045.0,36.0,0.0,0.0,0.0,0.0,54.0,-3.610918,12.0


### 用户近期行为特征

在上面针对用户进行累积特征提取的基础上，分别提取用户近一个月、近三天的特征，然后提取一个月内用户除去最近三天的行为占据一个月的行为的比重

In [23]:
def get_recent_user_feat(end_date, all_actions):
    actions_3 = get_accumulate_user_feat(end_date, all_actions, 3)
    actions_30 = get_accumulate_user_feat(end_date, all_actions, 30)
    actions = pd.merge(actions_3, actions_30, how ='left', on='user_id')
    del actions_3
    del actions_30
    gc.collect()
    
    actions['recent_action1'] =  np.log(1 + actions['user_action_30_1.0']-actions['user_action_3_1.0']) - np.log(1 + actions['user_action_30_1.0'])
    actions['recent_action2'] =  np.log(1 + actions['user_action_30_2.0']-actions['user_action_3_2.0']) - np.log(1 + actions['user_action_30_2.0'])
    actions['recent_action3'] =  np.log(1 + actions['user_action_30_3.0']-actions['user_action_3_3.0']) - np.log(1 + actions['user_action_30_3.0'])
    actions['recent_action4'] =  np.log(1 + actions['user_action_30_4.0']-actions['user_action_3_4.0']) - np.log(1 + actions['user_action_30_4.0'])
    actions['recent_action5'] =  np.log(1 + actions['user_action_30_5.0']-actions['user_action_3_5.0']) - np.log(1 + actions['user_action_30_5.0'])
    actions['recent_action6'] =  np.log(1 + actions['user_action_30_6.0']-actions['user_action_3_6.0']) - np.log(1 + actions['user_action_30_6.0'])
    
    return actions

### 用户对大类别商品交互行为特征工程

用户对各个类别的各项行为操作统计

用户对各个类别操作行为统计占对所有类别操作行为统计的比重

#### 函数定义

In [24]:
#增加了用户对不同类别的交互特征
def get_user_cate_feature(start_date, end_date, all_actions):
    actions = get_actions(start_date, end_date, all_actions)
    actions = actions[['user_id', 'cate', 'type']]
    df = pd.get_dummies(actions['type'], prefix='type')
    actions = pd.concat([actions[['user_id', 'cate']], df], axis=1)
    
    actions = actions.groupby(['user_id', 'cate']).sum()
    
    actions = actions.unstack()
    actions.columns = actions.columns.swaplevel(0, 1)
    actions.columns = actions.columns.droplevel()
    
    actions.columns = [
        'cate_4_type1', 'cate_5_type1', 'cate_6_type1', 'cate_7_type1',
        'cate_8_type1', 'cate_9_type1', 'cate_10_type1', 'cate_11_type1',
        'cate_4_type2', 'cate_5_type2', 'cate_6_type2', 'cate_7_type2',
        'cate_8_type2', 'cate_9_type2', 'cate_10_type2', 'cate_11_type2',
        'cate_4_type3', 'cate_5_type3', 'cate_6_type3', 'cate_7_type3',
        'cate_8_type3', 'cate_9_type3', 'cate_10_type3', 'cate_11_type3',
        'cate_4_type4', 'cate_5_type4', 'cate_6_type4', 'cate_7_type4',
        'cate_8_type4', 'cate_9_type4', 'cate_10_type4', 'cate_11_type4',
        'cate_4_type5', 'cate_5_type5', 'cate_6_type5', 'cate_7_type5',
        'cate_8_type5', 'cate_9_type5', 'cate_10_type5', 'cate_11_type5',
        'cate_4_type6', 'cate_5_type6', 'cate_6_type6', 'cate_7_type6',
        'cate_8_type6', 'cate_9_type6', 'cate_10_type6', 'cate_11_type6']
    
    actions = actions.fillna(0)
    
    actions['cate_action_sum'] = actions.sum(axis=1)
    
    # 用户对各个类别操作行为统计占对所有类别操作行为统计的比重
    actions['cate8_percentage'] = (
        actions['cate_8_type1'] + actions['cate_8_type2'] +
        actions['cate_8_type3'] + actions['cate_8_type4'] +
        actions['cate_8_type5'] + actions['cate_8_type6']) / actions['cate_action_sum']
    
    actions['cate4_percentage'] = (
        actions['cate_4_type1'] + actions['cate_4_type2'] +
        actions['cate_4_type3'] + actions['cate_4_type4'] +
        actions['cate_4_type5'] + actions['cate_4_type6']) / actions['cate_action_sum']
    actions['cate5_percentage'] = (
        actions['cate_5_type1'] + actions['cate_5_type2'] +
        actions['cate_5_type3'] + actions['cate_5_type4'] +
        actions['cate_5_type5'] + actions['cate_5_type6']) / actions['cate_action_sum']
    actions['cate6_percentage'] = (
        actions['cate_6_type1'] + actions['cate_6_type2'] +
        actions['cate_6_type3'] + actions['cate_6_type4'] +
        actions['cate_6_type5'] + actions['cate_6_type6']) / actions['cate_action_sum']
    actions['cate7_percentage'] = (
        actions['cate_7_type1'] + actions['cate_7_type2'] +
        actions['cate_7_type3'] + actions['cate_7_type4'] +
        actions['cate_7_type5'] + actions['cate_7_type6']) / actions['cate_action_sum']
    actions['cate9_percentage'] = (
        actions['cate_9_type1'] + actions['cate_9_type2'] +
        actions['cate_9_type3'] + actions['cate_9_type4'] +
        actions['cate_9_type5'] + actions['cate_9_type6']) / actions['cate_action_sum']
    actions['cate10_percentage'] = (
        actions['cate_10_type1'] + actions['cate_10_type2'] +
        actions['cate_10_type3'] + actions['cate_10_type4'] +
        actions['cate_10_type5'] + actions['cate_10_type6']) / actions['cate_action_sum']
    actions['cate11_percentage'] = (
        actions['cate_11_type1'] + actions['cate_11_type2'] +
        actions['cate_11_type3'] + actions['cate_11_type4'] +
        actions['cate_11_type5'] + actions['cate_11_type6']) / actions['cate_action_sum']

    actions['cate8_type1_percentage'] = np.log(
        1 + actions['cate_8_type1']) - np.log(
            1 + actions['cate_8_type1'] + actions['cate_4_type1'] +
            actions['cate_5_type1'] + actions['cate_6_type1'] +
            actions['cate_7_type1'] + actions['cate_9_type1'] +
            actions['cate_10_type1'] + actions['cate_11_type1'])

    actions['cate8_type2_percentage'] = np.log(
        1 + actions['cate_8_type2']) - np.log(
            1 + actions['cate_8_type2'] + actions['cate_4_type2'] +
            actions['cate_5_type2'] + actions['cate_6_type2'] +
            actions['cate_7_type2'] + actions['cate_9_type2'] +
            actions['cate_10_type2'] + actions['cate_11_type2'])
    actions['cate8_type3_percentage'] = np.log(
        1 + actions['cate_8_type3']) - np.log(
            1 + actions['cate_8_type3'] + actions['cate_4_type3'] +
            actions['cate_5_type3'] + actions['cate_6_type3'] +
            actions['cate_7_type3'] + actions['cate_9_type3'] +
            actions['cate_10_type3'] + actions['cate_11_type3'])
    actions['cate8_type4_percentage'] = np.log(
        1 + actions['cate_8_type4']) - np.log(
            1 + actions['cate_8_type4'] + actions['cate_4_type4'] +
            actions['cate_5_type4'] + actions['cate_6_type4'] +
            actions['cate_7_type4'] + actions['cate_9_type4'] +
            actions['cate_10_type4'] + actions['cate_11_type4'])
    actions['cate8_type5_percentage'] = np.log(
        1 + actions['cate_8_type5']) - np.log(
            1 + actions['cate_8_type5'] + actions['cate_4_type5'] +
            actions['cate_5_type5'] + actions['cate_6_type5'] +
            actions['cate_7_type5'] + actions['cate_9_type5'] +
            actions['cate_10_type5'] + actions['cate_11_type5'])
    actions['cate8_type6_percentage'] = np.log(
        1 + actions['cate_8_type6']) - np.log(
            1 + actions['cate_8_type6'] + actions['cate_4_type6'] +
            actions['cate_5_type6'] + actions['cate_6_type6'] +
            actions['cate_7_type6'] + actions['cate_9_type6'] +
            actions['cate_10_type6'] + actions['cate_11_type6'])
    
    actions['user_id'] = actions.index
    actions = actions[[
        'user_id', 'cate8_percentage', 'cate4_percentage', 'cate5_percentage',
        'cate6_percentage', 'cate7_percentage', 'cate9_percentage',
        'cate10_percentage', 'cate11_percentage', 'cate8_type1_percentage',
        'cate8_type2_percentage', 'cate8_type3_percentage',
        'cate8_type4_percentage', 'cate8_type5_percentage',
        'cate8_type6_percentage']]
    return actions

#### 代码解读

加载一定时间段内所有数据

In [25]:
prefix = 'user_action_%s' % 3
all_actions = get_all_action()
start_date = '2016-02-01'
end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
end_date = end_date.strftime('%Y-%m-%d')
# 获取一定时间段内数据
actions = get_actions(start_date, end_date, all_actions)
actions = actions[['user_id', 'cate', 'type']]
display(actions.head(),actions.shape)
del all_actions
gc.collect()

Unnamed: 0,user_id,cate,type
29,272629.0,10.0,1.0
30,272629.0,10.0,1.0
31,272629.0,10.0,6.0
32,272629.0,10.0,1.0
33,272629.0,10.0,6.0


(1005110, 3)

2932

用户类别分组聚合

In [26]:
df = pd.get_dummies(actions['type'], prefix='type')
actions = pd.concat([actions[['user_id', 'cate']], df], axis=1)
actions = actions.groupby(['user_id', 'cate']).sum()
actions.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,type_1.0,type_2.0,type_3.0,type_4.0,type_5.0,type_6.0
user_id,cate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
200002.0,4.0,16.0,0.0,0.0,0.0,0.0,20.0
200002.0,5.0,4.0,0.0,0.0,0.0,0.0,6.0
200002.0,7.0,4.0,0.0,0.0,0.0,0.0,3.0
200002.0,8.0,4.0,0.0,0.0,0.0,0.0,12.0
200003.0,4.0,8.0,0.0,0.0,0.0,0.0,12.0
200003.0,8.0,12.0,0.0,0.0,0.0,0.0,19.0
200008.0,7.0,8.0,0.0,0.0,0.0,0.0,20.0
200023.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0
200030.0,4.0,8.0,0.0,0.0,0.0,0.0,17.0
200031.0,10.0,0.0,0.0,0.0,0.0,0.0,20.0


行索引变列索引

In [27]:
actions = actions.unstack()
actions.head()

Unnamed: 0_level_0,type_1.0,type_1.0,type_1.0,type_1.0,type_1.0,type_1.0,type_1.0,type_1.0,type_2.0,type_2.0,...,type_5.0,type_5.0,type_6.0,type_6.0,type_6.0,type_6.0,type_6.0,type_6.0,type_6.0,type_6.0
cate,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,4.0,5.0,...,10.0,11.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
200002.0,16.0,4.0,,4.0,4.0,,,,0.0,0.0,...,,,20.0,6.0,,3.0,12.0,,,
200003.0,8.0,,,,12.0,,,,0.0,,...,,,12.0,,,,19.0,,,
200008.0,,,,8.0,,,,,,,...,,,,,,20.0,,,,
200023.0,,,,,1.0,,,,,,...,,,,,,,0.0,,,
200030.0,8.0,,,,,,,,0.0,,...,,,17.0,,,,,,,


交换列索引层级

In [28]:
actions.columns = actions.columns.swaplevel(0, 1)
actions.head()

cate,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,4.0,5.0,...,10.0,11.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
Unnamed: 0_level_1,type_1.0,type_1.0,type_1.0,type_1.0,type_1.0,type_1.0,type_1.0,type_1.0,type_2.0,type_2.0,...,type_5.0,type_5.0,type_6.0,type_6.0,type_6.0,type_6.0,type_6.0,type_6.0,type_6.0,type_6.0
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
200002.0,16.0,4.0,,4.0,4.0,,,,0.0,0.0,...,,,20.0,6.0,,3.0,12.0,,,
200003.0,8.0,,,,12.0,,,,0.0,,...,,,12.0,,,,19.0,,,
200008.0,,,,8.0,,,,,,,...,,,,,,20.0,,,,
200023.0,,,,,1.0,,,,,,...,,,,,,,0.0,,,
200030.0,8.0,,,,,,,,0.0,,...,,,17.0,,,,,,,


删除第一层列索引

In [29]:
actions.columns = actions.columns.droplevel()
actions.head()

Unnamed: 0_level_0,type_1.0,type_1.0,type_1.0,type_1.0,type_1.0,type_1.0,type_1.0,type_1.0,type_2.0,type_2.0,...,type_5.0,type_5.0,type_6.0,type_6.0,type_6.0,type_6.0,type_6.0,type_6.0,type_6.0,type_6.0
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200002.0,16.0,4.0,,4.0,4.0,,,,0.0,0.0,...,,,20.0,6.0,,3.0,12.0,,,
200003.0,8.0,,,,12.0,,,,0.0,,...,,,12.0,,,,19.0,,,
200008.0,,,,8.0,,,,,,,...,,,,,,20.0,,,,
200023.0,,,,,1.0,,,,,,...,,,,,,,0.0,,,
200030.0,8.0,,,,,,,,0.0,,...,,,17.0,,,,,,,


列索引重新赋值

In [30]:
actions.columns = [
        'cate_4_type1', 'cate_5_type1', 'cate_6_type1', 'cate_7_type1',
        'cate_8_type1', 'cate_9_type1', 'cate_10_type1', 'cate_11_type1',
        'cate_4_type2', 'cate_5_type2', 'cate_6_type2', 'cate_7_type2',
        'cate_8_type2', 'cate_9_type2', 'cate_10_type2', 'cate_11_type2',
        'cate_4_type3', 'cate_5_type3', 'cate_6_type3', 'cate_7_type3',
        'cate_8_type3', 'cate_9_type3', 'cate_10_type3', 'cate_11_type3',
        'cate_4_type4', 'cate_5_type4', 'cate_6_type4', 'cate_7_type4',
        'cate_8_type4', 'cate_9_type4', 'cate_10_type4', 'cate_11_type4',
        'cate_4_type5', 'cate_5_type5', 'cate_6_type5', 'cate_7_type5',
        'cate_8_type5', 'cate_9_type5', 'cate_10_type5', 'cate_11_type5',
        'cate_4_type6', 'cate_5_type6', 'cate_6_type6', 'cate_7_type6',
        'cate_8_type6', 'cate_9_type6', 'cate_10_type6', 'cate_11_type6']
actions.head()

Unnamed: 0_level_0,cate_4_type1,cate_5_type1,cate_6_type1,cate_7_type1,cate_8_type1,cate_9_type1,cate_10_type1,cate_11_type1,cate_4_type2,cate_5_type2,...,cate_10_type5,cate_11_type5,cate_4_type6,cate_5_type6,cate_6_type6,cate_7_type6,cate_8_type6,cate_9_type6,cate_10_type6,cate_11_type6
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200002.0,16.0,4.0,,4.0,4.0,,,,0.0,0.0,...,,,20.0,6.0,,3.0,12.0,,,
200003.0,8.0,,,,12.0,,,,0.0,,...,,,12.0,,,,19.0,,,
200008.0,,,,8.0,,,,,,,...,,,,,,20.0,,,,
200023.0,,,,,1.0,,,,,,...,,,,,,,0.0,,,
200030.0,8.0,,,,,,,,0.0,,...,,,17.0,,,,,,,


空数据填充并求和

In [31]:
actions = actions.fillna(0)
display(actions.head())
actions['cate_action_sum'] = actions.sum(axis=1)
actions.head()

Unnamed: 0_level_0,cate_4_type1,cate_5_type1,cate_6_type1,cate_7_type1,cate_8_type1,cate_9_type1,cate_10_type1,cate_11_type1,cate_4_type2,cate_5_type2,...,cate_10_type5,cate_11_type5,cate_4_type6,cate_5_type6,cate_6_type6,cate_7_type6,cate_8_type6,cate_9_type6,cate_10_type6,cate_11_type6
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200002.0,16.0,4.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,20.0,6.0,0.0,3.0,12.0,0.0,0.0,0.0
200003.0,8.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,12.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0
200008.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0
200023.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200030.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,cate_4_type1,cate_5_type1,cate_6_type1,cate_7_type1,cate_8_type1,cate_9_type1,cate_10_type1,cate_11_type1,cate_4_type2,cate_5_type2,...,cate_11_type5,cate_4_type6,cate_5_type6,cate_6_type6,cate_7_type6,cate_8_type6,cate_9_type6,cate_10_type6,cate_11_type6,cate_action_sum
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200002.0,16.0,4.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,6.0,0.0,3.0,12.0,0.0,0.0,0.0,69.0
200003.0,8.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,12.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,51.0
200008.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,28.0
200023.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
200030.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0


用户对类别8操作行为统计占对所有类别操作行为统计的比重

In [32]:
actions['cate8_percentage'] = (
        actions['cate_8_type1'] + actions['cate_8_type2'] +
        actions['cate_8_type3'] + actions['cate_8_type4'] +
        actions['cate_8_type5'] + actions['cate_8_type6']) / actions['cate_action_sum']
actions.head()

Unnamed: 0_level_0,cate_4_type1,cate_5_type1,cate_6_type1,cate_7_type1,cate_8_type1,cate_9_type1,cate_10_type1,cate_11_type1,cate_4_type2,cate_5_type2,...,cate_4_type6,cate_5_type6,cate_6_type6,cate_7_type6,cate_8_type6,cate_9_type6,cate_10_type6,cate_11_type6,cate_action_sum,cate8_percentage
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200002.0,16.0,4.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,...,20.0,6.0,0.0,3.0,12.0,0.0,0.0,0.0,69.0,0.231884
200003.0,8.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,12.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,51.0,0.607843
200008.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,28.0,0.0
200023.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
200030.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0


类别8-交互1占总交互1比例

In [33]:
actions['cate8_type1_percentage'] = np.log(1 + actions['cate_8_type1'])- np.log(
            1 + actions['cate_8_type1'] + actions['cate_4_type1'] +
            actions['cate_5_type1'] + actions['cate_6_type1'] +
            actions['cate_7_type1'] + actions['cate_9_type1'] +
            actions['cate_10_type1'] + actions['cate_11_type1'])
actions.head()

Unnamed: 0_level_0,cate_4_type1,cate_5_type1,cate_6_type1,cate_7_type1,cate_8_type1,cate_9_type1,cate_10_type1,cate_11_type1,cate_4_type2,cate_5_type2,...,cate_5_type6,cate_6_type6,cate_7_type6,cate_8_type6,cate_9_type6,cate_10_type6,cate_11_type6,cate_action_sum,cate8_percentage,cate8_type1_percentage
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200002.0,16.0,4.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,...,6.0,0.0,3.0,12.0,0.0,0.0,0.0,69.0,0.231884,-1.757858
200003.0,8.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,19.0,0.0,0.0,0.0,51.0,0.607843,-0.479573
200008.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,20.0,0.0,0.0,0.0,0.0,28.0,0.0,-2.197225
200023.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
200030.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,-2.197225


In [34]:
actions['user_id'] = actions.index
actions.head()

Unnamed: 0_level_0,cate_4_type1,cate_5_type1,cate_6_type1,cate_7_type1,cate_8_type1,cate_9_type1,cate_10_type1,cate_11_type1,cate_4_type2,cate_5_type2,...,cate_6_type6,cate_7_type6,cate_8_type6,cate_9_type6,cate_10_type6,cate_11_type6,cate_action_sum,cate8_percentage,cate8_type1_percentage,user_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200002.0,16.0,4.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,12.0,0.0,0.0,0.0,69.0,0.231884,-1.757858,200002.0
200003.0,8.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,19.0,0.0,0.0,0.0,51.0,0.607843,-0.479573,200003.0
200008.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,28.0,0.0,-2.197225,200008.0
200023.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,200023.0
200030.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,-2.197225,200030.0


## 商品-行为

### 累积商品特征（函数定义

分时间段

针对商品的不同行为的

购买转化率

均值

In [35]:
def get_accumulate_product_feat(start_date, end_date, all_actions):
    actions = get_actions(start_date, end_date, all_actions)
    df = pd.get_dummies(actions['type'], prefix='product_action')
    # 按照商品-日期分组，计算某个时间段该商品的各项行为的标准差
    actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())
    actions = pd.concat([actions[['sku_id', 'date']], df], axis=1)

    actions = actions.groupby(['sku_id'], as_index=False).sum()
    # 时间间隔，起始时间 终止时间，间隔
    days_interal = (datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(start_date, '%Y-%m-%d')).days
    
    # 针对商品分组，计算购买转化率
    actions['product_action_1_ratio'] =  np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_1.0'])
    actions['product_action_2_ratio'] =  np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_2.0'])
    actions['product_action_3_ratio'] =  np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_3.0'])
    actions['product_action_5_ratio'] =  np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_5.0'])
    actions['product_action_6_ratio'] =  np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_6.0'])
    # 计算各种行为的均值
    actions['product_action_1_mean'] = actions['product_action_1.0'] / days_interal
    actions['product_action_2_mean'] = actions['product_action_2.0'] / days_interal
    actions['product_action_3_mean'] = actions['product_action_3.0'] / days_interal
    actions['product_action_4_mean'] = actions['product_action_4.0'] / days_interal
    actions['product_action_5_mean'] = actions['product_action_5.0'] / days_interal
    actions['product_action_6_mean'] = actions['product_action_6.0'] / days_interal
    return actions

### 代码解读

加载一定时间段内所有数据

In [36]:
prefix = 'user_action_%s' % 3
all_actions = get_all_action()
start_date = '2016-02-01'
end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
end_date = end_date.strftime('%Y-%m-%d')
# 获取一定时间段内数据
actions = get_actions(start_date, end_date, all_actions)
display(actions.head(),actions.shape)
del all_actions
gc.collect()

Unnamed: 0,user_id,sku_id,time,model_id,type,cate,brand
29,272629.0,107774.0,2016-02-01 00:00:00,,1.0,10.0,36.0
30,272629.0,107774.0,2016-02-01 00:00:00,,1.0,10.0,36.0
31,272629.0,107774.0,2016-02-01 00:00:00,0.0,6.0,10.0,36.0
32,272629.0,107774.0,2016-02-01 00:00:00,,1.0,10.0,36.0
33,272629.0,107774.0,2016-02-01 00:00:00,216.0,6.0,10.0,36.0


(1005110, 7)

5285

商品分组聚合

In [37]:
df = pd.get_dummies(actions['type'], prefix='product_action')
actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())
actions = pd.concat([actions[['sku_id', 'date']], df], axis=1)
actions = actions.groupby(['sku_id'], as_index=False).sum()
actions.head()

Unnamed: 0,sku_id,product_action_1.0,product_action_2.0,product_action_3.0,product_action_4.0,product_action_5.0,product_action_6.0
0,2.0,2.0,0.0,0.0,0.0,0.0,3.0
1,37.0,2.0,0.0,0.0,0.0,0.0,3.0
2,40.0,4.0,0.0,0.0,0.0,0.0,9.0
3,50.0,8.0,0.0,2.0,0.0,0.0,14.0
4,52.0,87.0,0.0,1.0,0.0,0.0,112.0


In [38]:
actions.head(50)

Unnamed: 0,sku_id,product_action_1.0,product_action_2.0,product_action_3.0,product_action_4.0,product_action_5.0,product_action_6.0
0,2.0,2.0,0.0,0.0,0.0,0.0,3.0
1,37.0,2.0,0.0,0.0,0.0,0.0,3.0
2,40.0,4.0,0.0,0.0,0.0,0.0,9.0
3,50.0,8.0,0.0,2.0,0.0,0.0,14.0
4,52.0,87.0,0.0,1.0,0.0,0.0,112.0
5,68.0,4.0,0.0,0.0,0.0,0.0,26.0
6,131.0,4.0,0.0,0.0,0.0,0.0,4.0
7,149.0,1.0,0.0,0.0,0.0,0.0,1.0
8,156.0,90.0,3.0,1.0,0.0,0.0,129.0
9,169.0,127.0,2.0,2.0,0.0,1.0,169.0


商品不同行为的转购率和均值计算

In [39]:
days_interal = (datetime.strptime(end_date, '%Y-%m-%d') - 
                datetime.strptime(start_date, '%Y-%m-%d')).days
print('时间间隔',days_interal)
actions['product_action_1_ratio'] =  np.log(1 + actions['product_action_4.0']) - np.log(1 + actions['product_action_1.0'])
actions['product_action_1_mean'] = actions['product_action_1.0'] / days_interal
actions.head()

时间间隔 3


Unnamed: 0,sku_id,product_action_1.0,product_action_2.0,product_action_3.0,product_action_4.0,product_action_5.0,product_action_6.0,product_action_1_ratio,product_action_1_mean
0,2.0,2.0,0.0,0.0,0.0,0.0,3.0,-1.098612,0.666667
1,37.0,2.0,0.0,0.0,0.0,0.0,3.0,-1.098612,0.666667
2,40.0,4.0,0.0,0.0,0.0,0.0,9.0,-1.609438,1.333333
3,50.0,8.0,0.0,2.0,0.0,0.0,14.0,-2.197225,2.666667
4,52.0,87.0,0.0,1.0,0.0,0.0,112.0,-4.477337,29.0


## 类别特征

分时间段下各个商品类别的

购买转化率

均值

In [40]:
def get_accumulate_cate_feat(start_date, end_date, all_actions):
    actions = get_actions(start_date, end_date, all_actions)
    actions['date'] = pd.to_datetime(actions['time']).apply(lambda x: x.date())
    df = pd.get_dummies(actions['type'], prefix='cate_action')
    actions = pd.concat([actions[['cate','date']], df], axis=1)
    
    # 按照类别分组，统计各个商品类别下行为的转化率
    actions = actions.groupby(['cate'], as_index=False).sum()
    
    days_interal = (datetime.strptime(end_date, '%Y-%m-%d')-datetime.strptime(start_date, '%Y-%m-%d')).days

    actions['cate_action_1_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_1.0']))
    actions['cate_action_2_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_2.0']))
    actions['cate_action_3_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_3.0']))
    actions['cate_action_5_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_5.0']))
    actions['cate_action_6_ratio'] =(np.log(1 + actions['cate_action_4.0']) - np.log(1 + actions['cate_action_6.0']))
    # 按照类别分组，统计各个商品类别下行为在一段时间的均值
    actions['cate_action_1_mean'] = actions['cate_action_1.0'] / days_interal
    actions['cate_action_2_mean'] = actions['cate_action_2.0'] / days_interal
    actions['cate_action_3_mean'] = actions['cate_action_3.0'] / days_interal
    actions['cate_action_4_mean'] = actions['cate_action_4.0'] / days_interal
    actions['cate_action_5_mean'] = actions['cate_action_5.0'] / days_interal
    actions['cate_action_6_mean'] = actions['cate_action_6.0'] / days_interal
    return actions

# 构造训练集/测试集

## 构造训练集

标签,采用滑动窗口的方式，构造训练集的时候针对产生购买的行为标记为1

整合特征

### 函数调用数据查看

购买行为标记

In [41]:
def get_labels(start_date, end_date, all_actions):
    actions = get_actions(start_date, end_date, all_actions)
    
    # 修改为预测购买了商品8的用户预测
    actions = actions[(actions['type'] == 4) & (actions['cate'] == 8)]
    actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()
    actions['label'] = 1
    actions = actions[['user_id', 'sku_id', 'label']]
    return actions

查看用户数据结构

In [42]:
# 查看全部数据
all_actions = get_all_action()
print ("get all actions!")
display(all_actions.head(),all_actions.shape)
del all_actions
gc.collect()

get all actions!


Unnamed: 0,user_id,sku_id,time,model_id,type,cate,brand
0,266079.0,138778.0,2016-01-31 23:59:02,,1.0,8.0,403.0
1,266079.0,138778.0,2016-01-31 23:59:03,0.0,6.0,8.0,403.0
2,200719.0,61226.0,2016-01-31 23:59:07,,1.0,8.0,30.0
3,200719.0,61226.0,2016-01-31 23:59:08,0.0,6.0,8.0,30.0
4,263587.0,72348.0,2016-01-31 23:59:08,,1.0,5.0,159.0


(50601736, 7)

2117

In [44]:
# 用户特征
user = get_basic_user_feat()
print ('get_basic_user_feat finsihed')
display(user.head(),user.shape)
del user
gc.collect()

get_basic_user_feat finsihed


Unnamed: 0,user_id,age_0,age_1,age_2,age_3,age_4,age_5,age_6,sex_0,sex_1,sex_2,user_lv_cd_1,user_lv_cd_2,user_lv_cd_3,user_lv_cd_4,user_lv_cd_5
0,200001.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,200002.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,200003.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,200004.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,200005.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


(105321, 16)

1539

In [45]:
# 商品基本特征
product = get_basic_product_feat()
print ('get_basic_product_feat finsihed')
display(product.head(),product.shape)
del product
gc.collect()

get_basic_product_feat finsihed


Unnamed: 0,sku_id,cate,brand,a1_-1,a1_1,a1_2,a1_3,a2_-1,a2_1,a2_2,a3_-1,a3_1,a3_2
0,10,8,489,0,0,0,1,0,1,0,0,1,0
1,100002,8,489,0,0,0,1,0,0,1,0,0,1
2,100003,8,30,0,1,0,0,1,0,0,1,0,0
3,100006,8,545,0,1,0,0,0,0,1,0,1,0
4,10001,8,244,1,0,0,0,0,1,0,0,0,1


(24187, 13)

30

In [46]:
# 用户近期行为特征
start_date = '2016-02-01'
end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
end_date = end_date.strftime('%Y-%m-%d') # 转为字符串
all_actions = get_all_action()
user_acc = get_recent_user_feat(end_date, all_actions)
display(user_acc.head(),user_acc.shape)
del all_actions,user_acc
gc.collect()
print ('get_recent_user_feat finsihed')

Unnamed: 0,user_id,user_action_3_1.0,user_action_3_2.0,user_action_3_3.0,user_action_3_4.0,user_action_3_5.0,user_action_3_6.0,user_action_3_1_ratio,user_action_3_2_ratio,user_action_3_3_ratio,...,user_action_30_3_mean,user_action_30_4_mean,user_action_30_5_mean,user_action_30_6_mean,recent_action1,recent_action2,recent_action3,recent_action4,recent_action5,recent_action6
0,200002.0,28.0,0.0,0.0,0.0,0.0,41.0,-3.367296,0.0,0.0,...,0.0,0.0,0.0,1.366667,-3.367296,0.0,0.0,0.0,0.0,-3.73767
1,200003.0,20.0,0.0,0.0,0.0,0.0,31.0,-3.044522,0.0,0.0,...,0.0,0.0,0.0,1.033333,-3.044522,0.0,0.0,0.0,0.0,-3.465736
2,200008.0,8.0,0.0,0.0,0.0,0.0,20.0,-2.197225,0.0,0.0,...,0.0,0.0,0.0,0.666667,-2.197225,0.0,0.0,0.0,0.0,-3.044522
3,200023.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.693147,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.693147,0.0,0.0,0.0,0.0,0.0
4,200030.0,8.0,0.0,0.0,0.0,0.0,17.0,-2.197225,0.0,0.0,...,0.0,0.0,0.0,0.566667,-2.197225,0.0,0.0,0.0,0.0,-2.890372


(22210, 41)

get_recent_user_feat finsihed


### 构造训练集

特征工程-构建函数创建新特征

In [47]:
start_date = '2016-02-01'
end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
end_date = end_date.strftime('%Y-%m-%d') # 转为字符串
all_actions = get_all_action()
user_cate = get_user_cate_feature(start_date, end_date, all_actions)
display(user_cate.head())
user_cate = user_cate.reset_index(drop = True)# 处理索引
display(user_cate.head())
del all_actions,user_cate
gc.collect()

Unnamed: 0_level_0,user_id,cate8_percentage,cate4_percentage,cate5_percentage,cate6_percentage,cate7_percentage,cate9_percentage,cate10_percentage,cate11_percentage,cate8_type1_percentage,cate8_type2_percentage,cate8_type3_percentage,cate8_type4_percentage,cate8_type5_percentage,cate8_type6_percentage
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
200002.0,200002.0,0.231884,0.521739,0.144928,0.0,0.101449,0.0,0.0,0.0,-1.757858,0.0,0.0,0.0,0.0,-1.17272
200003.0,200003.0,0.607843,0.392157,0.0,0.0,0.0,0.0,0.0,0.0,-0.479573,0.0,0.0,0.0,0.0,-0.470004
200008.0,200008.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-2.197225,0.0,0.0,0.0,0.0,-3.044522
200023.0,200023.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200030.0,200030.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.197225,0.0,0.0,0.0,0.0,-2.890372


Unnamed: 0,user_id,cate8_percentage,cate4_percentage,cate5_percentage,cate6_percentage,cate7_percentage,cate9_percentage,cate10_percentage,cate11_percentage,cate8_type1_percentage,cate8_type2_percentage,cate8_type3_percentage,cate8_type4_percentage,cate8_type5_percentage,cate8_type6_percentage
0,200002.0,0.231884,0.521739,0.144928,0.0,0.101449,0.0,0.0,0.0,-1.757858,0.0,0.0,0.0,0.0,-1.17272
1,200003.0,0.607843,0.392157,0.0,0.0,0.0,0.0,0.0,0.0,-0.479573,0.0,0.0,0.0,0.0,-0.470004
2,200008.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-2.197225,0.0,0.0,0.0,0.0,-3.044522
3,200023.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,200030.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.197225,0.0,0.0,0.0,0.0,-2.890372


0

In [48]:
def make_actions(user, product, all_actions, start_date):
    end_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=3)
    end_date = end_date.strftime('%Y-%m-%d')
    # 修正get_accumulate_product_feat,get_accumulate_cate_feat的时间跨度
    start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=30)
    start_days = start_days.strftime('%Y-%m-%d')
    print (end_date)
    user_acc = get_recent_user_feat(end_date, all_actions)
    print ('get_recent_user_feat finsihed')
    
    user_cate = get_user_cate_feature(start_date, end_date, all_actions)
    user_cate = user_cate.reset_index(drop = True)# 处理索引
    print ('get_user_cate_feature finished')
    
    product_acc = get_accumulate_product_feat(start_days, end_date, all_actions)
    print ('get_accumulate_product_feat finsihed')
    cate_acc = get_accumulate_cate_feat(start_days, end_date, all_actions)
    print ('get_accumulate_cate_feat finsihed')
    comment_acc = get_comments_product_feat(end_date)
    print ('get_comments_product_feat finished')
    # 标记
    test_start_date = end_date
    test_end_date = datetime.strptime(test_start_date, '%Y-%m-%d') + timedelta(days=5)
    test_end_date = test_end_date.strftime('%Y-%m-%d')
    labels = get_labels(test_start_date, test_end_date, all_actions)
    print ("get labels")
    
    actions = None
    for i in (3, 5, 7, 10, 15, 21, 30):
        start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=i)
        start_days = start_days.strftime('%Y-%m-%d')
        if actions is None:
            actions = get_action_feat(start_days, end_date, all_actions, i)
        else:
            # 注意这里的拼接key
            actions = pd.merge(actions, get_action_feat(start_days, end_date, all_actions, i), 
                               how='left',
                               on=['user_id', 'sku_id', 'cate'])

    actions = pd.merge(actions, user, how='left', on='user_id')
    actions = pd.merge(actions, user_acc, how='left', on='user_id')
    actions = pd.merge(actions, user_cate, how='left', on='user_id')
    # 注意这里的拼接key
    actions = pd.merge(actions, product, how='left', on=['sku_id', 'cate'])
    actions = pd.merge(actions, product_acc, how='left', on='sku_id')
    actions = pd.merge(actions, cate_acc, how='left', on='cate')
    actions = pd.merge(actions, comment_acc, how='left', on='sku_id')
    actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])
    # 主要是填充拼接商品基本特征、评论特征、标签之后的空值
    actions = actions.fillna(0)
    # 采样
    action_postive = actions[actions['label'] == 1] # 购买
    action_negative = actions[actions['label'] == 0] # 没有购买
    del actions
    neg_len = len(action_postive) * 10 # 负样本是正样本数量的10倍
    action_negative = action_negative.sample(n=neg_len)
    action_sample = pd.concat([action_postive, action_negative], ignore_index=True)    
    return action_sample

构造训练数据集

In [49]:
def make_train_set(start_date, setNums ,f_path, all_actions):
    train_actions = None
    user = get_basic_user_feat()
    print ('get_basic_user_feat finsihed')
    product = get_basic_product_feat()
    print ('get_basic_product_feat finsihed')
    # 滑窗,构造多组训练集/验证集
    for i in range(setNums):
        print(start_date)
        if train_actions is None:
            train_actions = make_actions(user, product, all_actions, start_date)
        else:
            train_actions = pd.concat([train_actions, 
                                       make_actions(user, product, all_actions, start_date)],
                                      ignore_index=True)
        # 接下来每次移动一天
        start_date = datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=1)
        start_date = start_date.strftime('%Y-%m-%d')
        print ("round {0}/{1} over!".format(i+1, setNums))
    train_actions.to_csv(f_path, index=False)
    del train_actions
    
# 训练集 & 验证集
start_date = '2016-03-01'
all_actions = get_all_action()
make_train_set(start_date, 20, 'train_set.csv',all_actions)
del all_actions
gc.collect()

get_basic_user_feat finsihed
get_basic_product_feat finsihed
2016-03-01
2016-03-04
get_recent_user_feat finsihed
get_user_cate_feature finished
get_accumulate_product_feat finsihed
get_accumulate_cate_feat finsihed
get_comments_product_feat finished
get labels
round 1/20 over!
2016-03-02
2016-03-05
get_recent_user_feat finsihed
get_user_cate_feature finished
get_accumulate_product_feat finsihed
get_accumulate_cate_feat finsihed
get_comments_product_feat finished
get labels
round 2/20 over!
2016-03-03
2016-03-06
get_recent_user_feat finsihed
get_user_cate_feature finished
get_accumulate_product_feat finsihed
get_accumulate_cate_feat finsihed
get_comments_product_feat finished
get labels
round 3/20 over!
2016-03-04
2016-03-07
get_recent_user_feat finsihed
get_user_cate_feature finished
get_accumulate_product_feat finsihed
get_accumulate_cate_feat finsihed
get_comments_product_feat finished
get labels
round 4/20 over!
2016-03-05
2016-03-08
get_recent_user_feat finsihed
get_user_cate_featu

0

## 构造测试集

In [50]:
# 测试集
val_start_date = '2016-04-01'
all_actions = get_all_action()
make_train_set(val_start_date, 3, 'test_set.csv',all_actions)
del all_actions
gc.collect()

get_basic_user_feat finsihed
get_basic_product_feat finsihed
2016-04-01
2016-04-04
get_recent_user_feat finsihed
get_user_cate_feature finished
get_accumulate_product_feat finsihed
get_accumulate_cate_feat finsihed
get_comments_product_feat finished
get labels
round 1/3 over!
2016-04-02
2016-04-05
get_recent_user_feat finsihed
get_user_cate_feature finished
get_accumulate_product_feat finsihed
get_accumulate_cate_feat finsihed
get_comments_product_feat finished
get labels
round 2/3 over!
2016-04-03
2016-04-06
get_recent_user_feat finsihed
get_user_cate_feature finished
get_accumulate_product_feat finsihed
get_accumulate_cate_feat finsihed
get_comments_product_feat finished
get labels
round 3/3 over!


0