# 特征工程

构建手工特征，从全局用户、全局商家、特定用户针对特定商家、特定商家对于特定用户四个方面构建。构建完后保存到本地。

In [1]:
import numpy as np
import pandas as pd

# 基本路径信息
path_format1_user_info = '/home/brian/Projects/Repurchase/data_format1/user_info_format1.csv'
path_format1_user_log = '/home/brian/Projects/Repurchase/data_format1/user_log_format1.csv'
path_format1_train = '/home/brian/Projects/Repurchase/data_format1/train_format1.csv'
path_format1_test = '/home/brian/Projects/Repurchase/data_format1/test_format1.csv'

with open (path_format1_train, 'r') as f_split_train:
    split_train_data = pd.read_csv(f_split_train)

with open (path_format1_test, 'r') as f_split_test:
    split_test_data = pd.read_csv(f_split_test)

with open(path_format1_user_info, 'r') as f_user_info:           
    user_info = pd.read_csv(f_user_info)

with open(path_format1_user_log, 'r') as f_user_log:
    user_log = pd.read_csv(f_user_log)

## 一、画像创建

### 1.1 建立全局用户画像

In [2]:
# 用全局的数据特征建立用户的整体画像，包括label=-1和label=1的数据,这里要保证它是可迁移的，这样可以直接对test data做类似的操作

# 等待建立的全局用户特征包含如下：
# 建立用户在所有商家浏览不同商品总数的整体画像
# 建立用户在所有商家浏览不同商品类别数的整体画像
# 建立用户在所有商家的平均浏览天数
# 建立用户在所有商家的平均活动次数
# 建立用户在所有商家的平均单击次数
# 建立用户在所有商家的平均加入购物车次数
# 建立用户在所有商家的平均购买次数
# 建立用户在所有商家的平均加入收藏夹次数
# 建立用户在所有商家的平均购买单击率
# 建立用户在所有商家的平均购买活动率


# 建立用户年龄和性别的画像
split_train_data_feature = pd.merge(split_train_data, user_info, left_on =['user_id'], right_on=['user_id'], how='left')
split_train_data_feature.head(10)

Unnamed: 0,user_id,merchant_id,label,age_range,gender
0,34176,3906,0,6.0,0.0
1,34176,121,0,6.0,0.0
2,34176,4356,1,6.0,0.0
3,34176,2217,0,6.0,0.0
4,230784,4818,0,0.0,0.0
5,362112,2618,0,4.0,1.0
6,34944,2051,0,5.0,0.0
7,231552,3828,1,5.0,0.0
8,231552,2124,0,5.0,0.0
9,232320,1168,0,4.0,1.0


In [3]:
# 建立用户在所有商家浏览不同商品总数的整体画像
global_user_feature_items = user_log.groupby(['user_id','seller_id','item_id']).count().reset_index()[['user_id','item_id']]
global_user_feature_items = global_user_feature_items.groupby(['user_id']).count().reset_index()
global_user_feature_items.rename(columns={'item_id':'user_items'},inplace=True)
global_user_feature_items.head(10)

Unnamed: 0,user_id,user_items
0,1,12
1,2,43
2,3,45
3,4,28
4,5,87
5,6,166
6,7,13
7,8,46
8,9,41
9,10,39


In [4]:
# 建立用户在所有商家浏览不同商品类别总数的整体画像
global_user_feature_cats = user_log.groupby(['user_id','seller_id','cat_id']).count().reset_index()[['user_id','cat_id']]
global_user_feature_cats = global_user_feature_cats.groupby(['user_id']).count().reset_index()
global_user_feature_cats.rename(columns={'cat_id':'user_cats'},inplace=True)
global_user_feature_cats.head(10)

Unnamed: 0,user_id,user_cats
0,1,9
1,2,16
2,3,31
3,4,16
4,5,69
5,6,104
6,7,10
7,8,29
8,9,29
9,10,24


In [5]:
# 建立用户浏览不同商家总数的整体画像
global_user_feature_stores = user_log.groupby(['user_id','seller_id']).count().reset_index()[['user_id','seller_id']]
global_user_feature_stores = global_user_feature_stores.groupby(['user_id']).count().reset_index()
global_user_feature_stores.rename(columns={'seller_id':'user_stores'},inplace=True)
global_user_feature_stores.head(10)

Unnamed: 0,user_id,user_stores
0,1,9
1,2,14
2,3,23
3,4,12
4,5,56
5,6,79
6,7,6
7,8,23
8,9,23
9,10,17


In [6]:
# 建立用户在各个商家上的浏览总天数的整体画像
global_user_feature_browse_days = user_log.groupby(['user_id','seller_id','time_stamp']).count().reset_index()[['user_id','time_stamp']]
global_user_feature_browse_days = global_user_feature_browse_days.groupby(['user_id']).count().reset_index()
global_user_feature_browse_days.rename(columns={'time_stamp':'user_browse_days'},inplace=True)
global_user_feature_browse_days.head(10)


Unnamed: 0,user_id,user_browse_days
0,1,10
1,2,14
2,3,26
3,4,17
4,5,84
5,6,102
6,7,9
7,8,32
8,9,34
9,10,24


In [7]:
# 建立用户在所有商家上activity_log的整体画像，对稠密的数据用比例显示，对系数的数据采用原值
global_user_feature_actions = user_log.groupby(['user_id','action_type']).count().reset_index()[['user_id','action_type','item_id']]
global_user_feature_actions.rename(columns={'item_id':'times'},inplace=True)    # 用来统计它乘积倍数的一个属性
global_user_feature_actions.head(10)

Unnamed: 0,user_id,action_type,times
0,1,0,27
1,1,2,6
2,2,0,47
3,2,2,14
4,2,3,2
5,3,0,63
6,3,2,4
7,3,3,1
8,4,0,49
9,4,2,1


In [8]:
# 建立用户全部单击次数user_clicks的整体画像
global_user_feature_clicks_index = global_user_feature_actions['action_type'] == 0
global_user_feature_actions['user_clicks'] = global_user_feature_clicks_index * global_user_feature_actions['times']
global_user_feature_actions.head(10)

Unnamed: 0,user_id,action_type,times,user_clicks
0,1,0,27,27
1,1,2,6,0
2,2,0,47,47
3,2,2,14,0
4,2,3,2,0
5,3,0,63,63
6,3,2,4,0
7,3,3,1,0
8,4,0,49,49
9,4,2,1,0


In [9]:
# 建立用户全部加入购物车user_carts的整体画像
global_user_feature_carts_index = global_user_feature_actions['action_type'] == 1
global_user_feature_actions['user_carts'] = global_user_feature_carts_index * global_user_feature_actions['times']
global_user_feature_actions.head(10)

Unnamed: 0,user_id,action_type,times,user_clicks,user_carts
0,1,0,27,27,0
1,1,2,6,0,0
2,2,0,47,47,0
3,2,2,14,0,0
4,2,3,2,0,0
5,3,0,63,63,0
6,3,2,4,0,0
7,3,3,1,0,0
8,4,0,49,49,0
9,4,2,1,0,0


In [10]:
# 建立用户全部购买user_purchases的整体画像
global_user_feature_purchases_index = global_user_feature_actions['action_type'] == 2
global_user_feature_actions['user_purchases'] = global_user_feature_purchases_index * global_user_feature_actions['times']
global_user_feature_actions.head(10)

Unnamed: 0,user_id,action_type,times,user_clicks,user_carts,user_purchases
0,1,0,27,27,0,0
1,1,2,6,0,0,6
2,2,0,47,47,0,0
3,2,2,14,0,0,14
4,2,3,2,0,0,0
5,3,0,63,63,0,0
6,3,2,4,0,0,4
7,3,3,1,0,0,0
8,4,0,49,49,0,0
9,4,2,1,0,0,1


In [11]:
# 建立用户全部收藏的user_collections的整体画像
global_user_feature_collections_index = global_user_feature_actions['action_type'] == 3
global_user_feature_actions['user_collections'] = global_user_feature_collections_index * global_user_feature_actions['times']
global_user_feature_actions.head(10)

Unnamed: 0,user_id,action_type,times,user_clicks,user_carts,user_purchases,user_collections
0,1,0,27,27,0,0,0
1,1,2,6,0,0,6,0
2,2,0,47,47,0,0,0
3,2,2,14,0,0,14,0
4,2,3,2,0,0,0,2
5,3,0,63,63,0,0,0
6,3,2,4,0,0,4,0
7,3,3,1,0,0,0,1
8,4,0,49,49,0,0,0
9,4,2,1,0,0,1,0


In [12]:
# 压缩user_id的特征维度，生成对应于user_id的特征
global_user_feature_actions = global_user_feature_actions.groupby(['user_id']).sum().reset_index()[['user_id','user_clicks','user_carts','user_purchases','user_collections']]
global_user_feature_actions.head(10)

Unnamed: 0,user_id,user_clicks,user_carts,user_purchases,user_collections
0,1,27,0,6,0
1,2,47,0,14,2
2,3,63,0,4,1
3,4,49,0,1,0
4,5,150,0,13,10
5,6,217,0,17,15
6,7,6,0,8,0
7,8,61,0,23,7
8,9,79,0,4,4
9,10,56,0,7,1


In [13]:
# 建立用户全部点击次数的user_logs特征画像
global_user_feature_actions['user_logs'] = global_user_feature_actions['user_clicks'] + global_user_feature_actions['user_carts'] + global_user_feature_actions['user_purchases'] + global_user_feature_actions['user_collections']
global_user_feature_actions.head(10)

Unnamed: 0,user_id,user_clicks,user_carts,user_purchases,user_collections,user_logs
0,1,27,0,6,0,33
1,2,47,0,14,2,63
2,3,63,0,4,1,68
3,4,49,0,1,0,50
4,5,150,0,13,10,173
5,6,217,0,17,15,249
6,7,6,0,8,0,14
7,8,61,0,23,7,91
8,9,79,0,4,4,87
9,10,56,0,7,1,64


In [14]:
# 合并所有的用户全局特征
global_user_feature = pd.merge(global_user_feature_items, global_user_feature_cats, left_on='user_id',right_on='user_id',how='left')
global_user_feature = pd.merge(global_user_feature, global_user_feature_stores, left_on='user_id',right_on='user_id',how='left')
global_user_feature = pd.merge(global_user_feature, global_user_feature_browse_days, left_on='user_id',right_on='user_id',how='left')
global_user_feature = pd.merge(global_user_feature, global_user_feature_actions, left_on='user_id', right_on='user_id',how='left')
global_user_feature.head(10)

Unnamed: 0,user_id,user_items,user_cats,user_stores,user_browse_days,user_clicks,user_carts,user_purchases,user_collections,user_logs
0,1,12,9,9,10,27,0,6,0,33
1,2,43,16,14,14,47,0,14,2,63
2,3,45,31,23,26,63,0,4,1,68
3,4,28,16,12,17,49,0,1,0,50
4,5,87,69,56,84,150,0,13,10,173
5,6,166,104,79,102,217,0,17,15,249
6,7,13,10,6,9,6,0,8,0,14
7,8,46,29,23,32,61,0,23,7,91
8,9,41,29,23,34,79,0,4,4,87
9,10,39,24,17,24,56,0,7,1,64


In [15]:
# 处理生成的用户全局整体画像，把数据多的部分变成比例，其他的不变

global_user_feature['user_items_per_store'] = global_user_feature['user_items'] / global_user_feature['user_stores']
global_user_feature['user_cats_per_store'] = global_user_feature['user_cats'] / global_user_feature['user_stores']
global_user_feature['user_days_per_store'] = global_user_feature['user_browse_days'] / global_user_feature['user_stores']
global_user_feature['user_logs_per_store'] = global_user_feature['user_logs'] / global_user_feature['user_stores']
global_user_feature['user_clicks_per_store'] = global_user_feature['user_clicks'] / global_user_feature['user_stores']
global_user_feature['user_carts_per_store'] = global_user_feature['user_carts'] / global_user_feature['user_stores']
global_user_feature['user_purchases_per_store'] = global_user_feature['user_purchases'] / global_user_feature['user_stores']
global_user_feature['user_collections_per_store'] = global_user_feature['user_collections'] / global_user_feature['user_stores']
# 单击、收藏、加购物车和全部操作与购买的比值特征
global_user_feature['user_click_purchase_ratio'] = global_user_feature['user_clicks'] / global_user_feature['user_purchases']
global_user_feature['user_collection_purchase_ratio'] = global_user_feature['user_collections'] / global_user_feature['user_purchases']
global_user_feature['user_cart_purchase_ratio'] = global_user_feature['user_carts'] / global_user_feature['user_purchases']
global_user_feature['user_log_purchase_ratio'] = global_user_feature['user_logs'] / global_user_feature['user_purchases']
global_user_feature.head(10)

Unnamed: 0,user_id,user_items,user_cats,user_stores,user_browse_days,user_clicks,user_carts,user_purchases,user_collections,user_logs,...,user_days_per_store,user_logs_per_store,user_clicks_per_store,user_carts_per_store,user_purchases_per_store,user_collections_per_store,user_click_purchase_ratio,user_collection_purchase_ratio,user_cart_purchase_ratio,user_log_purchase_ratio
0,1,12,9,9,10,27,0,6,0,33,...,1.111111,3.666667,3.0,0.0,0.666667,0.0,4.5,0.0,0.0,5.5
1,2,43,16,14,14,47,0,14,2,63,...,1.0,4.5,3.357143,0.0,1.0,0.142857,3.357143,0.142857,0.0,4.5
2,3,45,31,23,26,63,0,4,1,68,...,1.130435,2.956522,2.73913,0.0,0.173913,0.043478,15.75,0.25,0.0,17.0
3,4,28,16,12,17,49,0,1,0,50,...,1.416667,4.166667,4.083333,0.0,0.083333,0.0,49.0,0.0,0.0,50.0
4,5,87,69,56,84,150,0,13,10,173,...,1.5,3.089286,2.678571,0.0,0.232143,0.178571,11.538462,0.769231,0.0,13.307692
5,6,166,104,79,102,217,0,17,15,249,...,1.291139,3.151899,2.746835,0.0,0.21519,0.189873,12.764706,0.882353,0.0,14.647059
6,7,13,10,6,9,6,0,8,0,14,...,1.5,2.333333,1.0,0.0,1.333333,0.0,0.75,0.0,0.0,1.75
7,8,46,29,23,32,61,0,23,7,91,...,1.391304,3.956522,2.652174,0.0,1.0,0.304348,2.652174,0.304348,0.0,3.956522
8,9,41,29,23,34,79,0,4,4,87,...,1.478261,3.782609,3.434783,0.0,0.173913,0.173913,19.75,1.0,0.0,21.75
9,10,39,24,17,24,56,0,7,1,64,...,1.411765,3.764706,3.294118,0.0,0.411765,0.058824,8.0,0.142857,0.0,9.142857


In [16]:
# 将上述全局的特征融合到待测试或者待训练的user_id上
split_train_data_feature = pd.merge(split_train_data_feature, global_user_feature, left_on='user_id', right_on='user_id',how='left')
split_train_data_feature.head(10)

Unnamed: 0,user_id,merchant_id,label,age_range,gender,user_items,user_cats,user_stores,user_browse_days,user_clicks,...,user_days_per_store,user_logs_per_store,user_clicks_per_store,user_carts_per_store,user_purchases_per_store,user_collections_per_store,user_click_purchase_ratio,user_collection_purchase_ratio,user_cart_purchase_ratio,user_log_purchase_ratio
0,34176,3906,0,6.0,0.0,256,156,109,170,410,...,1.559633,4.137615,3.761468,0.0,0.311927,0.06422,12.058824,0.205882,0.0,13.264706
1,34176,121,0,6.0,0.0,256,156,109,170,410,...,1.559633,4.137615,3.761468,0.0,0.311927,0.06422,12.058824,0.205882,0.0,13.264706
2,34176,4356,1,6.0,0.0,256,156,109,170,410,...,1.559633,4.137615,3.761468,0.0,0.311927,0.06422,12.058824,0.205882,0.0,13.264706
3,34176,2217,0,6.0,0.0,256,156,109,170,410,...,1.559633,4.137615,3.761468,0.0,0.311927,0.06422,12.058824,0.205882,0.0,13.264706
4,230784,4818,0,0.0,0.0,31,22,20,25,47,...,1.25,2.7,2.35,0.0,0.35,0.0,6.714286,0.0,0.0,7.714286
5,362112,2618,0,4.0,1.0,57,22,17,20,86,...,1.176471,5.235294,5.058824,0.0,0.176471,0.0,28.666667,0.0,0.0,29.666667
6,34944,2051,0,5.0,0.0,21,11,10,11,30,...,1.1,3.3,3.0,0.0,0.3,0.0,10.0,0.0,0.0,11.0
7,231552,3828,1,5.0,0.0,94,36,12,18,141,...,1.5,12.916667,11.75,0.0,1.0,0.166667,11.75,0.166667,0.0,12.916667
8,231552,2124,0,5.0,0.0,94,36,12,18,141,...,1.5,12.916667,11.75,0.0,1.0,0.166667,11.75,0.166667,0.0,12.916667
9,232320,1168,0,4.0,1.0,63,41,30,35,70,...,1.166667,3.7,2.333333,0.0,0.266667,1.1,8.75,4.125,0.0,13.875


### 1.2 建立全局的商家画像

In [17]:
# 建立全局的商家画像
# 主要等待建立的商家全局画像有如下：
# 1.商家平均被用户浏览的不同商品的总个数
# 2.商家平均被用户浏览的不同商品的总类别数
# 3.商家平均被用户浏览的不同品牌的总类别数
# 4.商家平均被用户浏览的天数
# 5.商家平均被用户交互的次数
# 6.商家平均被用户点击的次数
# 7.商家平均被用户加入购物车的次数
# 8.商家平均被用户购买的次数
# 9.商家平均被用户收藏的次数
# 10.商家被用户的活动购买比
# 11.商家被用户的点击购买比

In [18]:
# 建立商家含有的不同商品的个数整体画像
global_merchant_feature_items = user_log.groupby(['user_id','seller_id','item_id']).count().reset_index()[['seller_id','item_id']]
global_merchant_feature_items = global_merchant_feature_items.groupby(['seller_id']).count().reset_index()
global_merchant_feature_items.rename(columns={'item_id':'merchant_items'},inplace=True)
global_merchant_feature_items.head(10)

Unnamed: 0,seller_id,merchant_items
0,1,202327
1,2,1285
2,3,1748
3,4,1951
4,5,5684
5,6,969
6,7,2178
7,8,2432
8,9,1432
9,10,11648


In [19]:
# 建立商家含有的不同商品类别个数的整体画像
global_merchant_feature_cats = user_log.groupby(['user_id','seller_id','cat_id']).count().reset_index()[['seller_id','cat_id']]
global_merchant_feature_cats = global_merchant_feature_cats.groupby(['seller_id']).count().reset_index()
global_merchant_feature_cats.rename(columns={'cat_id':'merchant_cats'},inplace=True)
global_merchant_feature_cats.head(10)

Unnamed: 0,seller_id,merchant_cats
0,1,77701
1,2,1053
2,3,1170
3,4,1562
4,5,4051
5,6,688
6,7,1701
7,8,1510
8,9,1115
9,10,7169


In [20]:
# 商家含有的不同商品品牌个数的整体画像

### 分两步：一、先用每个商家对应销售的最多的brand填补空缺值； 二、构建画像

# 第一步：填补空缺值
# 从统计信息中提取brand缺失部分对应的用户和商家
missing_brand_id = user_log[user_log['brand_id'].isnull()][['user_id','seller_id']].sort_index()

# 从统计信息中取出相同的特征
global_merchant_feature_brands = user_log.groupby(['seller_id','brand_id']).count().reset_index()[['seller_id','brand_id','item_id']]
global_merchant_feature_brands = global_merchant_feature_brands.sort_values('item_id',ascending=True).drop_duplicates(['seller_id'],keep='last').sort_index()
missing_brand_id = pd.merge(missing_brand_id, global_merchant_feature_brands, left_on='seller_id', right_on='seller_id', how='left')
missing_brand_id.drop(columns = ['item_id'], inplace=True)
# 将缺失值填充回原数据
mask = user_log['brand_id'].isnull()
user_log.loc[mask,'brand_id'] = missing_brand_id['brand_id'].values

In [21]:
# 第二步：构建画像
global_merchant_feature_brands = user_log.groupby(['user_id','seller_id','brand_id']).count().reset_index()[['seller_id','brand_id']]
global_merchant_feature_brands = global_merchant_feature_brands.groupby(['seller_id']).count().reset_index()
global_merchant_feature_brands.rename(columns={'brand_id':'merchant_brands'},inplace=True)
global_merchant_feature_brands.head(10)

Unnamed: 0,seller_id,merchant_brands
0,1,32576
1,2,936
2,3,1136
3,4,1483
4,5,3652
5,6,660
6,7,1675
7,8,1239
8,9,1189
9,10,6435


In [22]:
# 商家被不同用户浏览的总天数的整体画像
global_merchant_feature_browse_days = user_log.groupby(['user_id','seller_id','time_stamp']).count().reset_index()[['seller_id','time_stamp']]
global_merchant_feature_browse_days = global_merchant_feature_browse_days.groupby(['seller_id']).count().reset_index()
global_merchant_feature_browse_days.rename(columns={'time_stamp':'merchant_days'},inplace=True)
global_merchant_feature_browse_days.head(10)

Unnamed: 0,seller_id,merchant_days
0,1,63839
1,2,1131
2,3,1391
3,4,1669
4,5,4330
5,6,789
6,7,1902
7,8,1560
8,9,1130
9,10,6862


In [23]:
# 建立商家被不同用户访问总数的整体画像
global_merchant_feature_users = user_log.groupby(['user_id','seller_id']).count().reset_index()[['user_id','seller_id']]
global_merchant_feature_users = global_merchant_feature_users.groupby(['seller_id']).count().reset_index()
global_merchant_feature_users.rename(columns={'user_id':'merchant_users'},inplace=True)
global_merchant_feature_users.head(10)

Unnamed: 0,seller_id,merchant_users
0,1,30796
1,2,936
2,3,1136
3,4,1481
4,5,3652
5,6,659
6,7,1608
7,8,1239
8,9,941
9,10,5517


In [24]:
# 建立商家在所有用户上activity_log的整体画像，对稠密的数据用比例显示，对系数的数据采用原值
global_merchant_feature_actions = user_log.groupby(['seller_id','action_type']).count().reset_index()[['seller_id','action_type','item_id']]
global_merchant_feature_actions.rename(columns={'item_id':'times'},inplace=True)    # 用来统计它倍乘的一个属性
global_merchant_feature_actions.head(10)

Unnamed: 0,seller_id,action_type,times
0,1,0,308236
1,1,1,444
2,1,2,17705
3,1,3,12755
4,2,0,2030
5,2,1,8
6,2,2,189
7,2,3,144
8,3,0,2399
9,3,1,4


In [25]:
# 建立商家全部被单击次数merchant_clicks的整体画像
global_merchant_feature_clicks_index = global_merchant_feature_actions['action_type'] == 0
global_merchant_feature_actions['merchant_clicks'] = global_merchant_feature_clicks_index * global_merchant_feature_actions['times']
global_merchant_feature_actions.head(10)

Unnamed: 0,seller_id,action_type,times,merchant_clicks
0,1,0,308236,308236
1,1,1,444,0
2,1,2,17705,0
3,1,3,12755,0
4,2,0,2030,2030
5,2,1,8,0
6,2,2,189,0
7,2,3,144,0
8,3,0,2399,2399
9,3,1,4,0


In [26]:
# 建立商家全部被加入购物车次数merchant_carts的整体画像
global_merchant_feature_carts_index = global_merchant_feature_actions['action_type'] == 1
global_merchant_feature_actions['merchant_carts'] = global_merchant_feature_carts_index * global_merchant_feature_actions['times']
global_merchant_feature_actions.head(10)

Unnamed: 0,seller_id,action_type,times,merchant_clicks,merchant_carts
0,1,0,308236,308236,0
1,1,1,444,0,444
2,1,2,17705,0,0
3,1,3,12755,0,0
4,2,0,2030,2030,0
5,2,1,8,0,8
6,2,2,189,0,0
7,2,3,144,0,0
8,3,0,2399,2399,0
9,3,1,4,0,4


In [27]:
# 建立商家全部被购买的merchant_purchases的整体画像
global_merchant_feature_purchases_index = global_merchant_feature_actions['action_type'] == 2
global_merchant_feature_actions['merchant_purchases'] = global_merchant_feature_purchases_index * global_merchant_feature_actions['times']
global_merchant_feature_actions.head(10)

Unnamed: 0,seller_id,action_type,times,merchant_clicks,merchant_carts,merchant_purchases
0,1,0,308236,308236,0,0
1,1,1,444,0,444,0
2,1,2,17705,0,0,17705
3,1,3,12755,0,0,0
4,2,0,2030,2030,0,0
5,2,1,8,0,8,0
6,2,2,189,0,0,189
7,2,3,144,0,0,0
8,3,0,2399,2399,0,0
9,3,1,4,0,4,0


In [28]:
# 建立商家全部被收藏的merchant_collections的整体画像
global_merchant_feature_collections_index = global_merchant_feature_actions['action_type'] == 3
global_merchant_feature_actions['merchant_collections'] = global_merchant_feature_collections_index * global_merchant_feature_actions['times']
global_merchant_feature_actions.head(10)

Unnamed: 0,seller_id,action_type,times,merchant_clicks,merchant_carts,merchant_purchases,merchant_collections
0,1,0,308236,308236,0,0,0
1,1,1,444,0,444,0,0
2,1,2,17705,0,0,17705,0
3,1,3,12755,0,0,0,12755
4,2,0,2030,2030,0,0,0
5,2,1,8,0,8,0,0
6,2,2,189,0,0,189,0
7,2,3,144,0,0,0,144
8,3,0,2399,2399,0,0,0
9,3,1,4,0,4,0,0


In [29]:
# 压缩seller_id的特征维度，生成对应于seller_id的特征
global_merchant_feature_actions = global_merchant_feature_actions.groupby(['seller_id']).sum().reset_index()[['seller_id','merchant_clicks','merchant_carts','merchant_purchases','merchant_collections']]
global_merchant_feature_actions.head(10)

Unnamed: 0,seller_id,merchant_clicks,merchant_carts,merchant_purchases,merchant_collections
0,1,308236,444,17705,12755
1,2,2030,8,189,144
2,3,2399,4,67,175
3,4,2646,2,294,164
4,5,7483,9,144,556
5,6,1390,6,82,120
6,7,3521,7,399,179
7,8,3540,3,395,306
8,9,2096,7,94,65
9,10,19125,64,1133,866


In [30]:
# 建立商家全部被活动的merchant_logs总画像
global_merchant_feature_actions['merchant_logs'] = global_merchant_feature_actions['merchant_clicks'] + global_merchant_feature_actions['merchant_carts'] + global_merchant_feature_actions['merchant_purchases'] + global_merchant_feature_actions['merchant_collections']
global_merchant_feature_actions.head(10)

Unnamed: 0,seller_id,merchant_clicks,merchant_carts,merchant_purchases,merchant_collections,merchant_logs
0,1,308236,444,17705,12755,339140
1,2,2030,8,189,144,2371
2,3,2399,4,67,175,2645
3,4,2646,2,294,164,3106
4,5,7483,9,144,556,8192
5,6,1390,6,82,120,1598
6,7,3521,7,399,179,4106
7,8,3540,3,395,306,4244
8,9,2096,7,94,65,2262
9,10,19125,64,1133,866,21188


In [31]:
# 合成商家全局的特征向量
global_merchant_feature = pd.merge(global_merchant_feature_items, global_merchant_feature_cats, left_on='seller_id',right_on='seller_id',how='left')
global_merchant_feature = pd.merge(global_merchant_feature, global_merchant_feature_brands, left_on = 'seller_id', right_on='seller_id', how='left')
global_merchant_feature = pd.merge(global_merchant_feature, global_merchant_feature_browse_days, left_on = 'seller_id', right_on='seller_id', how='left')
global_merchant_feature = pd.merge(global_merchant_feature, global_merchant_feature_users, left_on ='seller_id', right_on='seller_id', how='left')
global_merchant_feature = pd.merge(global_merchant_feature, global_merchant_feature_actions, left_on='seller_id', right_on='seller_id', how='left')
global_merchant_feature.head(10)

Unnamed: 0,seller_id,merchant_items,merchant_cats,merchant_brands,merchant_days,merchant_users,merchant_clicks,merchant_carts,merchant_purchases,merchant_collections,merchant_logs
0,1,202327,77701,32576,63839,30796,308236,444,17705,12755,339140
1,2,1285,1053,936,1131,936,2030,8,189,144,2371
2,3,1748,1170,1136,1391,1136,2399,4,67,175,2645
3,4,1951,1562,1483,1669,1481,2646,2,294,164,3106
4,5,5684,4051,3652,4330,3652,7483,9,144,556,8192
5,6,969,688,660,789,659,1390,6,82,120,1598
6,7,2178,1701,1675,1902,1608,3521,7,399,179,4106
7,8,2432,1510,1239,1560,1239,3540,3,395,306,4244
8,9,1432,1115,1189,1130,941,2096,7,94,65,2262
9,10,11648,7169,6435,6862,5517,19125,64,1133,866,21188


In [32]:
# 对稠密的特征建立比例的特征矢量，而对稀疏的特征则保留原样

global_merchant_feature['merchant_items_per_user'] = global_merchant_feature['merchant_items'] / global_merchant_feature['merchant_users']
global_merchant_feature['merchant_cats_per_user'] = global_merchant_feature['merchant_cats'] / global_merchant_feature['merchant_users']
global_merchant_feature['merchant_brands_per_user'] = global_merchant_feature['merchant_brands'] / global_merchant_feature['merchant_users']
global_merchant_feature['merchant_days_per_user'] = global_merchant_feature['merchant_days'] / global_merchant_feature['merchant_users']
global_merchant_feature['merchant_clicks_per_user'] = global_merchant_feature['merchant_clicks'] / global_merchant_feature['merchant_users']
global_merchant_feature['merchant_carts_per_user'] = global_merchant_feature['merchant_carts'] / global_merchant_feature['merchant_users']
global_merchant_feature['merchant_purchases_per_user'] = global_merchant_feature['merchant_purchases'] / global_merchant_feature['merchant_users']
global_merchant_feature['merchant_collections_per_user'] = global_merchant_feature['merchant_collections'] / global_merchant_feature['merchant_users']
# 单击、收藏、加购物车和全部操作与购买的比值特征
global_merchant_feature['merchant_click_purchase_ratio'] = global_merchant_feature['merchant_clicks'] / global_merchant_feature['merchant_purchases']
global_merchant_feature['merchant_collection_purchase_ratio'] = global_merchant_feature['merchant_collections'] / global_merchant_feature['merchant_purchases']
global_merchant_feature['merchant_cart_purchase_ratio'] = global_merchant_feature['merchant_carts'] / global_merchant_feature['merchant_purchases']
global_merchant_feature['merchant_log_purchase_ratio'] = global_merchant_feature['merchant_logs'] / global_merchant_feature['merchant_purchases']
global_merchant_feature.head(10)

Unnamed: 0,seller_id,merchant_items,merchant_cats,merchant_brands,merchant_days,merchant_users,merchant_clicks,merchant_carts,merchant_purchases,merchant_collections,...,merchant_brands_per_user,merchant_days_per_user,merchant_clicks_per_user,merchant_carts_per_user,merchant_purchases_per_user,merchant_collections_per_user,merchant_click_purchase_ratio,merchant_collection_purchase_ratio,merchant_cart_purchase_ratio,merchant_log_purchase_ratio
0,1,202327,77701,32576,63839,30796,308236,444,17705,12755,...,1.0578,2.072964,10.008962,0.014417,0.574912,0.414177,17.409545,0.720418,0.025078,19.155041
1,2,1285,1053,936,1131,936,2030,8,189,144,...,1.0,1.208333,2.168803,0.008547,0.201923,0.153846,10.740741,0.761905,0.042328,12.544974
2,3,1748,1170,1136,1391,1136,2399,4,67,175,...,1.0,1.224472,2.111796,0.003521,0.058979,0.154049,35.80597,2.61194,0.059701,39.477612
3,4,1951,1562,1483,1669,1481,2646,2,294,164,...,1.00135,1.126941,1.786631,0.00135,0.198515,0.110736,9.0,0.557823,0.006803,10.564626
4,5,5684,4051,3652,4330,3652,7483,9,144,556,...,1.0,1.185652,2.049014,0.002464,0.03943,0.152245,51.965278,3.861111,0.0625,56.888889
5,6,969,688,660,789,659,1390,6,82,120,...,1.001517,1.197269,2.109256,0.009105,0.124431,0.182094,16.95122,1.463415,0.073171,19.487805
6,7,2178,1701,1675,1902,1608,3521,7,399,179,...,1.041667,1.182836,2.189677,0.004353,0.248134,0.111318,8.824561,0.448622,0.017544,10.290727
7,8,2432,1510,1239,1560,1239,3540,3,395,306,...,1.0,1.25908,2.857143,0.002421,0.318805,0.246973,8.962025,0.774684,0.007595,10.744304
8,9,1432,1115,1189,1130,941,2096,7,94,65,...,1.263549,1.20085,2.227418,0.007439,0.099894,0.069075,22.297872,0.691489,0.074468,24.06383
9,10,11648,7169,6435,6862,5517,19125,64,1133,866,...,1.166395,1.243792,3.466558,0.011601,0.205365,0.156969,16.879965,0.764342,0.056487,18.700794


In [33]:
# 统一商家col
global_merchant_feature.rename(columns={'seller_id':'merchant_id'},inplace=True)

In [34]:
### 新特征构建商家关于性别和年龄的特征merchant_gender、avg_gender

# 由于性别和年龄的缺失基数较大，但占比较小，这里只用非缺失值进行预估，还是很充足的
mask_age_gender = ~(np.sum(user_info.isnull(),axis=1) > 0)
user_info_process = user_info.loc[mask_age_gender, :]

global_merchant_feature_users = user_log.groupby(['user_id','seller_id']).count().reset_index()[['user_id','seller_id','item_id']]
global_merchant_feature_users.rename(columns={'item_id':'counts'},inplace=True)   

user_info_overall = pd.merge(user_info_process, global_merchant_feature_users, left_on='user_id',right_on='user_id',how='left')

global_merchant_feature_gender = user_info_overall.groupby(['gender','seller_id']).sum().reset_index()[['gender','seller_id','counts']]

In [35]:
# 组件关于商家的年龄全局特征(它表达的是访问这家店的平均年龄范围)

# 取出年龄的全局特征
global_merchant_feature_age = user_info_overall.groupby(['age_range','seller_id','counts']).sum().reset_index()[['age_range','seller_id','counts']]
# 定义乘积因子
global_merchant_feature_age['times'] = global_merchant_feature_age['age_range'] * global_merchant_feature_age['counts']
# 计算商家被所有用户访问的年龄总和
global_merchant_feature_age1 = global_merchant_feature_age.groupby(['seller_id']).sum().reset_index()[['seller_id','times']]
# 计算用户被所有商家访问的次数
global_merchant_feature_age= global_merchant_feature_age.groupby(['seller_id']).sum().reset_index()[['seller_id','counts']]

# 生成每个商家被访问的年龄特征
global_merchant_feature['avg_gender'] = global_merchant_feature_age1['times'].values / global_merchant_feature_age['counts'].values

In [36]:
# 组建关于商家的性别全局性别特征(它表达的是有百分之多少的男性用户访问这家店)

# 统计商家被不同性别用户的访问总数
global_merchant_feature_gender = user_info_overall.groupby(['gender','seller_id']).sum().reset_index()[['gender','seller_id','counts']]
# 统计商家被所有用户的访问总数
global_merchant_feature_gender1 = global_merchant_feature_gender.groupby(['seller_id']).sum().reset_index()[['seller_id','counts']]
# 定义乘积因子加权求和
global_merchant_feature_gender['times'] = global_merchant_feature_gender['gender'] * global_merchant_feature_gender['counts']
# 算得每个商家被访问的所有性别总和
global_merchant_feature_gender = global_merchant_feature_gender.groupby(['seller_id']).sum().reset_index()[['seller_id','times']]

# 生成每个商家被访问的性别特征
global_merchant_feature['merchant_gender'] = global_merchant_feature_gender['times'].values / global_merchant_feature_gender1['counts'].values

In [37]:
# 把全局的商家特征融合到训练特征或测试特征上面
split_train_data_feature = pd.merge(split_train_data_feature, global_merchant_feature, left_on='merchant_id', right_on='merchant_id',how='left')
split_train_data_feature.columns
split_train_data_feature.head(10)

Unnamed: 0,user_id,merchant_id,label,age_range,gender,user_items,user_cats,user_stores,user_browse_days,user_clicks,...,merchant_clicks_per_user,merchant_carts_per_user,merchant_purchases_per_user,merchant_collections_per_user,merchant_click_purchase_ratio,merchant_collection_purchase_ratio,merchant_cart_purchase_ratio,merchant_log_purchase_ratio,avg_gender,merchant_gender
0,34176,3906,0,6.0,0.0,256,156,109,170,410,...,2.555422,0.004812,0.070459,0.165149,36.268293,2.343902,0.068293,39.680488,3.708402,0.163514
1,34176,121,0,6.0,0.0,256,156,109,170,410,...,6.611015,0.011069,0.437288,0.246912,15.118201,0.564644,0.025314,16.708159,3.783862,0.285673
2,34176,4356,1,6.0,0.0,256,156,109,170,410,...,2.671635,0.007014,0.422183,0.085927,6.328141,0.203531,0.016615,7.548287,3.716368,0.410391
3,34176,2217,0,6.0,0.0,256,156,109,170,410,...,3.096028,0.005987,0.220569,0.245999,14.036549,1.115292,0.027143,16.178984,3.054318,0.200127
4,230784,4818,0,0.0,0.0,31,22,20,25,47,...,5.769067,0.0172,0.3644,0.2612,15.831687,0.716795,0.047201,17.595682,3.136461,0.654746
5,362112,2618,0,4.0,1.0,57,22,17,20,86,...,2.337384,0.003574,0.358828,0.170122,6.513944,0.474104,0.00996,7.998008,3.303172,0.350714
6,34944,2051,0,5.0,0.0,21,11,10,11,30,...,2.624289,0.005091,0.34696,0.191674,7.56366,0.552438,0.014674,9.130773,3.247082,0.25869
7,231552,3828,1,5.0,0.0,94,36,12,18,141,...,8.689311,0.011708,0.245567,0.555762,35.384648,2.263177,0.047677,38.695502,3.729013,0.190217
8,231552,2124,0,5.0,0.0,94,36,12,18,141,...,5.735449,0.005159,0.177244,0.338135,32.359143,1.907743,0.029105,35.295991,3.750152,0.230877
9,232320,1168,0,4.0,1.0,63,41,30,35,70,...,3.360438,0.004037,0.269031,0.14331,12.49089,0.53269,0.015005,14.038585,3.529982,0.338984


### 1.3 建立某一个用户针对特定商家的特定画像

In [38]:
# 建立某一个用户针对特定商家的特定画像
# 主要等待建立的商家全局画像有如下：
# 1.用户在指定商家中浏览的商品总数的特定画像
# 2.用户在指定商家中浏览的商品类别数的特定画像
# 3.用户在指定商家中浏览的商品品牌数的特定画像
# 4.用户在指定商家中浏览了几天的特定画像
# 5.用户在指定商家中的总操作数的特定画像
# 6.用户在指定商家中的单击总量的特定画像
# 7.用户在指定商家中的加入购物车总量的特定画像
# 8.用户在指定商家中的购买总量的特定画像
# 9.用户在指定商家中的加入收藏夹总量的特定画像

In [39]:
# 1.建立用户在一个商家浏览商品总数的特定画像

user_merchant_feature_items = user_log.groupby(['user_id', 'seller_id', 'item_id']).count().reset_index()[['user_id','seller_id','item_id']]
user_merchant_feature_items = user_merchant_feature_items.groupby(['user_id', 'seller_id']).count().reset_index()
user_merchant_feature_items.rename(columns={'item_id':'user_merchant_items'},inplace=True)
user_merchant_feature_items.head(10)

Unnamed: 0,user_id,seller_id,user_merchant_items
0,1,471,1
1,1,739,1
2,1,925,1
3,1,1019,1
4,1,1156,1
5,1,2245,4
6,1,4026,1
7,1,4177,1
8,1,4335,1
9,2,420,15


In [40]:
# 2.建立用户在一个商家浏览商品类别数的特定画像

user_merchant_feature_cats = user_log.groupby(['user_id', 'seller_id', 'cat_id']).count().reset_index()[['user_id','seller_id','cat_id']]
user_merchant_feature_cats = user_merchant_feature_cats.groupby(['user_id', 'seller_id']).count().reset_index()
user_merchant_feature_cats.rename(columns={'cat_id':'user_merchant_cats'},inplace=True)
user_merchant_feature_cats.head(10)

Unnamed: 0,user_id,seller_id,user_merchant_cats
0,1,471,1
1,1,739,1
2,1,925,1
3,1,1019,1
4,1,1156,1
5,1,2245,1
6,1,4026,1
7,1,4177,1
8,1,4335,1
9,2,420,2


In [41]:
# 3.建立用户在一个商家浏览商品品牌数的特定画像
user_merchant_feature_brands = user_log.groupby(['user_id', 'seller_id', 'brand_id']).count().reset_index()[['user_id','seller_id','brand_id']]
user_merchant_feature_brands = user_merchant_feature_brands.groupby(['user_id', 'seller_id']).count().reset_index()
user_merchant_feature_brands.rename(columns={'brand_id':'user_merchant_brands'},inplace=True)
user_merchant_feature_brands.head(10)

Unnamed: 0,user_id,seller_id,user_merchant_brands
0,1,471,1
1,1,739,1
2,1,925,1
3,1,1019,1
4,1,1156,1
5,1,2245,1
6,1,4026,1
7,1,4177,1
8,1,4335,1
9,2,420,2


In [42]:
# 4. 建立用户在一个商家浏览的天数的特定画像
user_merchant_feature_days = user_log.groupby(['user_id', 'seller_id', 'time_stamp']).count().reset_index()[['user_id','seller_id','time_stamp']]
user_merchant_feature_days = user_merchant_feature_days.groupby(['user_id', 'seller_id']).count().reset_index()
user_merchant_feature_days.rename(columns={'time_stamp':'user_merchant_days'},inplace=True)
user_merchant_feature_days.head(10)

Unnamed: 0,user_id,seller_id,user_merchant_days
0,1,471,1
1,1,739,1
2,1,925,1
3,1,1019,1
4,1,1156,1
5,1,2245,1
6,1,4026,2
7,1,4177,1
8,1,4335,1
9,2,420,1


In [43]:
# 5.建立用户在一个商家actions整体的特定画像
user_merchant_feature_actions = user_log.groupby(['user_id','seller_id','action_type']).count().reset_index()[['user_id', 'seller_id','action_type','item_id']]
user_merchant_feature_actions.rename(columns={'item_id':'times'},inplace=True)
user_merchant_feature_actions.head(10)

Unnamed: 0,user_id,seller_id,action_type,times
0,1,471,0,1
1,1,739,0,1
2,1,925,0,3
3,1,925,2,1
4,1,1019,0,10
5,1,1019,2,4
6,1,1156,0,1
7,1,2245,0,5
8,1,4026,0,4
9,1,4026,2,1


In [44]:
# 6.建立用户在一个商家单击次数的特定画像
user_merchant_clicks_index = user_merchant_feature_actions['action_type'] == 0
user_merchant_feature_actions['user_merchant_clicks'] = user_merchant_clicks_index * user_merchant_feature_actions['times']
user_merchant_feature_actions.head(10)

Unnamed: 0,user_id,seller_id,action_type,times,user_merchant_clicks
0,1,471,0,1,1
1,1,739,0,1,1
2,1,925,0,3,3
3,1,925,2,1,0
4,1,1019,0,10,10
5,1,1019,2,4,0
6,1,1156,0,1,1
7,1,2245,0,5,5
8,1,4026,0,4,4
9,1,4026,2,1,0


In [45]:
# 7.建立用户在一个商家加入购物车次数的特定画像
user_merchant_carts_index = user_merchant_feature_actions['action_type'] == 1
user_merchant_feature_actions['user_merchant_carts'] = user_merchant_carts_index * user_merchant_feature_actions['times']
user_merchant_feature_actions.head(10)

Unnamed: 0,user_id,seller_id,action_type,times,user_merchant_clicks,user_merchant_carts
0,1,471,0,1,1,0
1,1,739,0,1,1,0
2,1,925,0,3,3,0
3,1,925,2,1,0,0
4,1,1019,0,10,10,0
5,1,1019,2,4,0,0
6,1,1156,0,1,1,0
7,1,2245,0,5,5,0
8,1,4026,0,4,4,0
9,1,4026,2,1,0,0


In [46]:
# 8.建立用户在一个商家购买次数的特定画像
user_merchant_purchases_index = user_merchant_feature_actions['action_type'] == 2
user_merchant_feature_actions['user_merchant_purchases'] = user_merchant_purchases_index * user_merchant_feature_actions['times']
user_merchant_feature_actions.head(10)

Unnamed: 0,user_id,seller_id,action_type,times,user_merchant_clicks,user_merchant_carts,user_merchant_purchases
0,1,471,0,1,1,0,0
1,1,739,0,1,1,0,0
2,1,925,0,3,3,0,0
3,1,925,2,1,0,0,1
4,1,1019,0,10,10,0,0
5,1,1019,2,4,0,0,4
6,1,1156,0,1,1,0,0
7,1,2245,0,5,5,0,0
8,1,4026,0,4,4,0,0
9,1,4026,2,1,0,0,1


In [47]:
# 9.建立用户在一个商家次数的特定画像
user_merchant_collections_index = user_merchant_feature_actions['action_type'] == 3
user_merchant_feature_actions['user_merchant_collections'] = user_merchant_collections_index * user_merchant_feature_actions['times']
user_merchant_feature_actions.head(10)

Unnamed: 0,user_id,seller_id,action_type,times,user_merchant_clicks,user_merchant_carts,user_merchant_purchases,user_merchant_collections
0,1,471,0,1,1,0,0,0
1,1,739,0,1,1,0,0,0
2,1,925,0,3,3,0,0,0
3,1,925,2,1,0,0,1,0
4,1,1019,0,10,10,0,0,0
5,1,1019,2,4,0,0,4,0
6,1,1156,0,1,1,0,0,0
7,1,2245,0,5,5,0,0,0
8,1,4026,0,4,4,0,0,0
9,1,4026,2,1,0,0,1,0


In [48]:
# 压缩user_id和sell_id的维度
user_merchant_feature_actions = user_merchant_feature_actions.groupby(['user_id','seller_id']).sum().reset_index()
user_merchant_feature_actions.head(10)

Unnamed: 0,user_id,seller_id,action_type,times,user_merchant_clicks,user_merchant_carts,user_merchant_purchases,user_merchant_collections
0,1,471,0,1,1,0,0,0
1,1,739,0,1,1,0,0,0
2,1,925,2,4,3,0,1,0
3,1,1019,2,14,10,0,4,0
4,1,1156,0,1,1,0,0,0
5,1,2245,0,5,5,0,0,0
6,1,4026,2,5,4,0,1,0
7,1,4177,0,1,1,0,0,0
8,1,4335,0,1,1,0,0,0
9,2,420,2,26,23,0,3,0


In [49]:
# 10.建立用户在一个商家总交互次数的特定画像
user_merchant_feature_actions['user_merchant_logs'] = user_merchant_feature_actions['user_merchant_clicks'] + user_merchant_feature_actions['user_merchant_carts'] + user_merchant_feature_actions['user_merchant_purchases'] + user_merchant_feature_actions['user_merchant_collections']
user_merchant_feature_actions.head(10)

Unnamed: 0,user_id,seller_id,action_type,times,user_merchant_clicks,user_merchant_carts,user_merchant_purchases,user_merchant_collections,user_merchant_logs
0,1,471,0,1,1,0,0,0,1
1,1,739,0,1,1,0,0,0,1
2,1,925,2,4,3,0,1,0,4
3,1,1019,2,14,10,0,4,0,14
4,1,1156,0,1,1,0,0,0,1
5,1,2245,0,5,5,0,0,0,5
6,1,4026,2,5,4,0,1,0,5
7,1,4177,0,1,1,0,0,0,1
8,1,4335,0,1,1,0,0,0,1
9,2,420,2,26,23,0,3,0,26


In [50]:
# 丢掉不需要的特征
user_merchant_feature_actions.columns
user_merchant_feature_actions.drop(columns=['action_type','times'], inplace=True)
user_merchant_feature_actions.head(10)

Unnamed: 0,user_id,seller_id,user_merchant_clicks,user_merchant_carts,user_merchant_purchases,user_merchant_collections,user_merchant_logs
0,1,471,1,0,0,0,1
1,1,739,1,0,0,0,1
2,1,925,3,0,1,0,4
3,1,1019,10,0,4,0,14
4,1,1156,1,0,0,0,1
5,1,2245,5,0,0,0,5
6,1,4026,4,0,1,0,5
7,1,4177,1,0,0,0,1
8,1,4335,1,0,0,0,1
9,2,420,23,0,3,0,26


In [51]:
# 合并上述已经生成的特征
user_merchant_feature = pd.merge(user_merchant_feature_items, user_merchant_feature_cats, left_on=['user_id','seller_id'], right_on=['user_id','seller_id'], how='left')
user_merchant_feature = pd.merge(user_merchant_feature, user_merchant_feature_brands, left_on=['user_id','seller_id'], right_on=['user_id','seller_id'], how='left')
user_merchant_feature = pd.merge(user_merchant_feature, user_merchant_feature_days, left_on=['user_id','seller_id'], right_on=['user_id','seller_id'], how='left')
user_merchant_feature = pd.merge(user_merchant_feature, user_merchant_feature_actions, left_on=['user_id','seller_id'], right_on=['user_id','seller_id'], how='left')
user_merchant_feature.head(10)

Unnamed: 0,user_id,seller_id,user_merchant_items,user_merchant_cats,user_merchant_brands,user_merchant_days,user_merchant_clicks,user_merchant_carts,user_merchant_purchases,user_merchant_collections,user_merchant_logs
0,1,471,1,1,1,1,1,0,0,0,1
1,1,739,1,1,1,1,1,0,0,0,1
2,1,925,1,1,1,1,3,0,1,0,4
3,1,1019,1,1,1,1,10,0,4,0,14
4,1,1156,1,1,1,1,1,0,0,0,1
5,1,2245,4,1,1,1,5,0,0,0,5
6,1,4026,1,1,1,2,4,0,1,0,5
7,1,4177,1,1,1,1,1,0,0,0,1
8,1,4335,1,1,1,1,1,0,0,0,1
9,2,420,15,2,2,1,23,0,3,0,26


In [52]:
# 改一下merchant_id
user_merchant_feature.rename(columns={'seller_id':'merchant_id'},inplace=True)

In [53]:
# 将用户针对商家的特定画像进行合并 
split_train_data_feature = pd.merge(split_train_data_feature, user_merchant_feature, left_on=['user_id','merchant_id'], right_on=['user_id','merchant_id'],how='left')
split_train_data_feature.head(10)

Unnamed: 0,user_id,merchant_id,label,age_range,gender,user_items,user_cats,user_stores,user_browse_days,user_clicks,...,merchant_gender,user_merchant_items,user_merchant_cats,user_merchant_brands,user_merchant_days,user_merchant_clicks,user_merchant_carts,user_merchant_purchases,user_merchant_collections,user_merchant_logs
0,34176,3906,0,6.0,0.0,256,156,109,170,410,...,0.163514,20,6,1,9,36,0,1,2,39
1,34176,121,0,6.0,0.0,256,156,109,170,410,...,0.285673,1,1,1,3,13,0,1,0,14
2,34176,4356,1,6.0,0.0,256,156,109,170,410,...,0.410391,2,1,1,2,12,0,6,0,18
3,34176,2217,0,6.0,0.0,256,156,109,170,410,...,0.200127,1,1,1,1,1,0,1,0,2
4,230784,4818,0,0.0,0.0,31,22,20,25,47,...,0.654746,1,1,1,3,7,0,1,0,8
5,362112,2618,0,4.0,1.0,57,22,17,20,86,...,0.350714,1,1,1,1,0,0,1,0,1
6,34944,2051,0,5.0,0.0,21,11,10,11,30,...,0.25869,2,1,1,1,2,0,1,0,3
7,231552,3828,1,5.0,0.0,94,36,12,18,141,...,0.190217,48,15,1,3,78,0,5,0,83
8,231552,2124,0,5.0,0.0,94,36,12,18,141,...,0.230877,4,1,1,1,6,0,1,0,7
9,232320,1168,0,4.0,1.0,63,41,30,35,70,...,0.338984,1,1,1,2,2,0,1,1,4


In [54]:
split_train_data_feature.columns

Index(['user_id', 'merchant_id', 'label', 'age_range', 'gender', 'user_items',
       'user_cats', 'user_stores', 'user_browse_days', 'user_clicks',
       'user_carts', 'user_purchases', 'user_collections', 'user_logs',
       'user_items_per_store', 'user_cats_per_store', 'user_days_per_store',
       'user_logs_per_store', 'user_clicks_per_store', 'user_carts_per_store',
       'user_purchases_per_store', 'user_collections_per_store',
       'user_click_purchase_ratio', 'user_collection_purchase_ratio',
       'user_cart_purchase_ratio', 'user_log_purchase_ratio', 'merchant_items',
       'merchant_cats', 'merchant_brands', 'merchant_days', 'merchant_users',
       'merchant_clicks', 'merchant_carts', 'merchant_purchases',
       'merchant_collections', 'merchant_logs', 'merchant_items_per_user',
       'merchant_cats_per_user', 'merchant_brands_per_user',
       'merchant_days_per_user', 'merchant_clicks_per_user',
       'merchant_carts_per_user', 'merchant_purchases_per_user',
  

### 1.4 建立特定商家对某一个用户的特定画像

在下两节中，训练集和测试集分别在其它特征生成后，在其它特征基础上直接创建。详见节2.1， 3.1。

# 二、训练集特征工程处理

### 2.1 微调，并建立特定商家对某一个用户的特定画像

In [55]:
# 微调：对user_merchant_feature进行一些局部改进
split_train_data_feature['user_merchant_click_purchase_ratio'] = split_train_data_feature['user_click_purchase_ratio'] * split_train_data_feature['user_merchant_purchases'] - split_train_data_feature['user_merchant_clicks']
split_train_data_feature['user_merchant_collection_purchase_ratio'] = split_train_data_feature['user_collection_purchase_ratio'] * split_train_data_feature['user_merchant_purchases'] - split_train_data_feature['user_merchant_collections']
split_train_data_feature['user_merchant_cart_purchase_ratio'] = split_train_data_feature['user_cart_purchase_ratio'] * split_train_data_feature['user_merchant_purchases'] - split_train_data_feature['user_merchant_carts']
split_train_data_feature['user_merchant_log_purchase_ratio'] = split_train_data_feature['user_log_purchase_ratio'] * split_train_data_feature['user_merchant_purchases'] - split_train_data_feature['user_merchant_logs']

# 建立特定商家对某一个用户的特定画像：以商家为中心进行的特征评估
split_train_data_feature['merchant_user_items'] = split_train_data_feature['user_merchant_items'] - split_train_data_feature['merchant_items_per_user']
split_train_data_feature['merchant_user_cats'] = split_train_data_feature['user_merchant_cats'] - split_train_data_feature['merchant_cats_per_user']
split_train_data_feature['merchant_user_days'] = split_train_data_feature['user_merchant_days'] - split_train_data_feature['merchant_days_per_user']
split_train_data_feature['merchant_user_clicks'] = split_train_data_feature['user_merchant_clicks'] - split_train_data_feature['merchant_clicks_per_user']
split_train_data_feature['merchant_user_carts'] = split_train_data_feature['user_merchant_carts'] - split_train_data_feature['merchant_carts_per_user']
split_train_data_feature['merchant_user_purchases'] = split_train_data_feature['user_merchant_purchases'] - split_train_data_feature['merchant_purchases_per_user']
split_train_data_feature['merchant_user_collections'] = split_train_data_feature['user_merchant_collections'] - split_train_data_feature['merchant_collections_per_user']
split_train_data_feature['merchant_user_logs'] = split_train_data_feature['user_merchant_logs'] - split_train_data_feature['user_logs_per_store']
split_train_data_feature['merchant_user_click_purchase_ratio'] = split_train_data_feature['merchant_click_purchase_ratio'] * split_train_data_feature['user_merchant_purchases'] - split_train_data_feature['user_merchant_clicks']
split_train_data_feature['merchant_user_collection_purchase_ratio'] = split_train_data_feature['merchant_collection_purchase_ratio'] * split_train_data_feature['user_merchant_purchases'] - split_train_data_feature['user_merchant_collections']
split_train_data_feature['merchant_user_cart_purchase_ratio'] = split_train_data_feature['merchant_cart_purchase_ratio'] * split_train_data_feature['user_merchant_purchases'] - split_train_data_feature['user_merchant_carts']
split_train_data_feature['merchant_user_log_purchase_ratio'] = split_train_data_feature['merchant_log_purchase_ratio'] * split_train_data_feature['user_merchant_purchases'] - split_train_data_feature['user_merchant_logs']

In [56]:
# 调整age和gender的一些属性问题，尤其注意
# 由于1存在的样本实在太少，做缺失值的方式进行处理
split_train_data_feature['age_range'].replace(8, 7, inplace=True)
split_train_data_feature['age_range'].replace(1, np.nan, inplace=True)

In [57]:
split_train_data_feature.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260864 entries, 0 to 260863
Data columns (total 75 columns):
 #   Column                                   Non-Null Count   Dtype  
---  ------                                   --------------   -----  
 0   user_id                                  260864 non-null  int64  
 1   merchant_id                              260864 non-null  int64  
 2   label                                    260864 non-null  int64  
 3   age_range                                259598 non-null  float64
 4   gender                                   257153 non-null  float64
 5   user_items                               260864 non-null  int64  
 6   user_cats                                260864 non-null  int64  
 7   user_stores                              260864 non-null  int64  
 8   user_browse_days                         260864 non-null  int64  
 9   user_clicks                              260864 non-null  int64  
 10  user_carts                      

### 2.2 建立模型预测、填补训练集age_range, gender缺失值

* age_range的预测为多分类问题，创建一个较为复杂的模型，随机森林模型。以无缺失值的数据来建模，然后预测缺失值，填补。
* gender为二分类问题，创建一个较为简单的模型，决策树模型。训练填补过程同上。

Note: 建模规则：首先构建无缺失值数据特征与age_range/gender的皮尔逊相关系数矩阵，然后仅挑选相关性最高的20维特征进行建模。这样做的考虑是：全部特征都用可能会带入过多噪声到age_range/gender的预测值中，进而影响后续主任务。

In [58]:
# 一、填补age_range
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

train_feature_age = split_train_data_feature.drop(columns=['gender', 'user_merchant_brands'])
age_corr = train_feature_age.corr(method='pearson')['age_range'].sort_values(ascending=False)
high_corr_idxs = age_corr.index.drop(['user_id', 'merchant_id'])[1:21]

train_feature_age_normal = train_feature_age[np.isnan(train_feature_age['age_range'])==False]
train_feature_age_nan = train_feature_age[np.isnan(train_feature_age['age_range'])==True]

train_feature_age_normal_label = train_feature_age_normal['age_range']
train_feature_age_normal_data = train_feature_age_normal[high_corr_idxs]
train_feature_age_nan_data = train_feature_age_nan[high_corr_idxs]

rfc = RandomForestClassifier(n_estimators=30, criterion='gini', max_depth=6, random_state=0)
cv_results = cross_validate(rfc, train_feature_age_normal_data, train_feature_age_normal_label, cv=3,
                            verbose=2, return_train_score=True, return_estimator=True)
cv_results

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   5.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.6s remaining:    0.0s


[CV] END .................................................... total time=   5.1s
[CV] END .................................................... total time=   5.0s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   16.8s finished


{'fit_time': array([4.78290844, 4.83998489, 4.79777503]),
 'score_time': array([0.25203061, 0.27391958, 0.24913025]),
 'estimator': [RandomForestClassifier(max_depth=6, n_estimators=30, random_state=0),
  RandomForestClassifier(max_depth=6, n_estimators=30, random_state=0),
  RandomForestClassifier(max_depth=6, n_estimators=30, random_state=0)],
 'test_score': array([0.29619914, 0.29490483, 0.29688439]),
 'train_score': array([0.30058648, 0.30195013, 0.30103544])}

In [59]:
rfc_estimator = cv_results['estimator'][2]
age_nan_pred = rfc_estimator.predict(train_feature_age_nan_data)
train_feature_age_nan['age_range'] = age_nan_pred
age_range = pd.concat([train_feature_age_normal[['user_id', 'merchant_id', 'age_range']],
                       train_feature_age_nan[['user_id', 'merchant_id','age_range']]], axis=0)

split_train_data_feature.drop(columns='age_range', inplace=True)
split_train_data_feature = pd.merge(split_train_data_feature, age_range, on=['user_id', 'merchant_id'])
# split_train_data_feature.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_feature_age_nan['age_range'] = age_nan_pred


In [60]:
# 二、填补gender

# 数据处理
train_feature_gender = split_train_data_feature.drop(columns=['age_range', 'user_merchant_brands']) # 
gender_corr = train_feature_gender.corr(method='pearson')['gender'].sort_values(ascending=False)
high_corr_idxs = gender_corr.index.drop(['user_id', 'merchant_id'])[1:21]

train_feature_gender_normal = train_feature_gender[np.isnan(train_feature_gender['gender'])==False]
train_feature_gender_nan = train_feature_gender[np.isnan(train_feature_gender['gender'])==True]

train_feature_gender_normal_label = train_feature_gender_normal['gender']
train_feature_gender_normal_data = train_feature_gender_normal[high_corr_idxs]
train_feature_gender_nan_data = train_feature_gender_nan[high_corr_idxs]

# 模型训练
cv_results = cross_validate(rfc, train_feature_gender_normal_data, train_feature_gender_normal_label, cv=3,
                            verbose=2, return_train_score=True, return_estimator=True)
cv_results

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   3.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.3s remaining:    0.0s


[CV] END .................................................... total time=   3.7s
[CV] END .................................................... total time=   3.8s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   12.7s finished


{'fit_time': array([3.66841936, 3.55272722, 3.64899683]),
 'score_time': array([0.18543482, 0.19356751, 0.17845964]),
 'estimator': [RandomForestClassifier(max_depth=6, n_estimators=30, random_state=0),
  RandomForestClassifier(max_depth=6, n_estimators=30, random_state=0),
  RandomForestClassifier(max_depth=6, n_estimators=30, random_state=0)],
 'test_score': array([0.70709769, 0.70936093, 0.70717594]),
 'train_score': array([0.70983755, 0.70953423, 0.70971675])}

In [61]:
# 预测，整合
rfc_estimator = cv_results['estimator'][1]
gender_nan_pred = rfc_estimator.predict(train_feature_gender_nan_data)
train_feature_gender_nan['gender'] = gender_nan_pred
gender = pd.concat([train_feature_gender_normal[['user_id', 'merchant_id', 'gender']],
                    train_feature_gender_nan[['user_id', 'merchant_id','gender']]], axis=0)

split_train_data_feature.drop(columns='gender', inplace=True)
split_train_data_feature = pd.merge(split_train_data_feature, gender, on=['user_id', 'merchant_id'])
split_train_data_feature.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_feature_gender_nan['gender'] = gender_nan_pred


<class 'pandas.core.frame.DataFrame'>
Int64Index: 260864 entries, 0 to 260863
Data columns (total 75 columns):
 #   Column                                   Non-Null Count   Dtype  
---  ------                                   --------------   -----  
 0   user_id                                  260864 non-null  int64  
 1   merchant_id                              260864 non-null  int64  
 2   label                                    260864 non-null  int64  
 3   user_items                               260864 non-null  int64  
 4   user_cats                                260864 non-null  int64  
 5   user_stores                              260864 non-null  int64  
 6   user_browse_days                         260864 non-null  int64  
 7   user_clicks                              260864 non-null  int64  
 8   user_carts                               260864 non-null  int64  
 9   user_purchases                           260864 non-null  int64  
 10  user_collections                

**复制一遍正样本(label=1)以减轻数据分布偏差的影响**

In [62]:
# 复制一遍label=1的数据
rep_idxs = split_train_data_feature['label'] == 1
split_train_data_feature = pd.concat([split_train_data_feature,
                                      split_train_data_feature[rep_idxs]], axis=0)
split_train_data_feature.shape

(276816, 75)

In [63]:
split_train_data_feature.to_csv('./trainset_final.csv', index=False)

## 三、测试集特征工程处理

### 3.1 构建测试集特征集，微调，并建立特定商家对某一个用户的特定画像

In [64]:
# 构建测试集特征集
# 在真实的测试数据上进行预测

# 加入基本的用户信息
split_test_data_feature = pd.merge(split_test_data, user_info, left_on = 'user_id',right_on = 'user_id',how='left')

# 加入全局的用户特征画像
split_test_data_feature = pd.merge(split_test_data_feature, global_user_feature, left_on = 'user_id', right_on = 'user_id', how='left')

# 加入全局的商家特征画像
split_test_data_feature = pd.merge(split_test_data_feature, global_merchant_feature, left_on = 'merchant_id', right_on = 'merchant_id', how='left')

# 加入全局的商家针对用户的特征画像
split_test_data_feature = pd.merge(split_test_data_feature, user_merchant_feature, left_on = ['user_id','merchant_id'], right_on = ['user_id','merchant_id'], how='left')

In [65]:
# 同样的做局部改进的处理
# 微调：对user_merchant_feature进行一些局部改进
split_test_data_feature['user_merchant_click_purchase_ratio'] = split_test_data_feature['user_click_purchase_ratio'] * split_test_data_feature['user_merchant_purchases'] - split_test_data_feature['user_merchant_clicks']
split_test_data_feature['user_merchant_collection_purchase_ratio'] = split_test_data_feature['user_collection_purchase_ratio'] * split_test_data_feature['user_merchant_purchases'] - split_test_data_feature['user_merchant_collections']
split_test_data_feature['user_merchant_cart_purchase_ratio'] = split_test_data_feature['user_cart_purchase_ratio'] * split_test_data_feature['user_merchant_purchases'] - split_test_data_feature['user_merchant_carts']
split_test_data_feature['user_merchant_log_purchase_ratio'] = split_test_data_feature['user_log_purchase_ratio'] * split_test_data_feature['user_merchant_purchases'] - split_test_data_feature['user_merchant_logs']

# 建立特定商家对某一个用户的特定画像：以商家为中心进行的特征评估
split_test_data_feature['merchant_user_items'] = split_test_data_feature['user_merchant_items'] - split_test_data_feature['merchant_items_per_user']
split_test_data_feature['merchant_user_cats'] = split_test_data_feature['user_merchant_cats'] - split_test_data_feature['merchant_cats_per_user']
split_test_data_feature['merchant_user_days'] = split_test_data_feature['user_merchant_days'] - split_test_data_feature['merchant_days_per_user']
split_test_data_feature['merchant_user_clicks'] = split_test_data_feature['user_merchant_clicks'] - split_test_data_feature['merchant_clicks_per_user']
split_test_data_feature['merchant_user_carts'] = split_test_data_feature['user_merchant_carts'] - split_test_data_feature['merchant_carts_per_user']
split_test_data_feature['merchant_user_purchases'] = split_test_data_feature['user_merchant_purchases'] - split_test_data_feature['merchant_purchases_per_user']
split_test_data_feature['merchant_user_collections'] = split_test_data_feature['user_merchant_collections'] - split_test_data_feature['merchant_collections_per_user']
split_test_data_feature['merchant_user_logs'] = split_test_data_feature['user_merchant_logs'] - split_test_data_feature['user_logs_per_store']
split_test_data_feature['merchant_user_click_purchase_ratio'] = split_test_data_feature['merchant_click_purchase_ratio'] * split_test_data_feature['user_merchant_purchases'] - split_test_data_feature['user_merchant_clicks']
split_test_data_feature['merchant_user_collection_purchase_ratio'] = split_test_data_feature['merchant_collection_purchase_ratio'] * split_test_data_feature['user_merchant_purchases'] - split_test_data_feature['user_merchant_collections']
split_test_data_feature['merchant_user_cart_purchase_ratio'] = split_test_data_feature['merchant_cart_purchase_ratio'] * split_test_data_feature['user_merchant_purchases'] - split_test_data_feature['user_merchant_carts']
split_test_data_feature['merchant_user_log_purchase_ratio'] = split_test_data_feature['merchant_log_purchase_ratio'] * split_test_data_feature['user_merchant_purchases'] - split_test_data_feature['user_merchant_logs']

In [66]:
# 调整age和gender的一些属性问题
# 由于1存在的样本实在太少，做缺失值的方式进行处理
split_test_data_feature['age_range'].replace(8, 7, inplace=True)
split_test_data_feature['age_range'].replace(1, np.nan, inplace=True) ###

split_test_data_feature.drop(columns='prob', inplace=True)

### 3.2 建立模型预测、填补测试集age_range, gender缺失值

In [67]:
# 一、填补age_range

test_feature_age = split_test_data_feature.drop(columns=['gender', 'user_merchant_brands'])
age_corr = test_feature_age.corr(method='pearson')['age_range'].sort_values(ascending=False)
high_corr_idxs = age_corr.index.drop(['user_id', 'merchant_id'])[1:21]

test_feature_age_normal = test_feature_age[np.isnan(test_feature_age['age_range'])==False]
test_feature_age_nan = test_feature_age[np.isnan(test_feature_age['age_range'])==True]

test_feature_age_normal_label = test_feature_age_normal['age_range']
test_feature_age_normal_data = test_feature_age_normal[high_corr_idxs]
test_feature_age_nan_data = test_feature_age_nan[high_corr_idxs]

rfc = RandomForestClassifier(n_estimators=30, criterion='gini', max_depth=6, random_state=0)
cv_results = cross_validate(rfc, test_feature_age_normal_data, test_feature_age_normal_label, cv=3,
                            verbose=2, return_train_score=True, return_estimator=True)
cv_results

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   4.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.4s remaining:    0.0s


[CV] END .................................................... total time=   4.6s
[CV] END .................................................... total time=   4.7s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   15.6s finished


{'fit_time': array([4.61459517, 4.3444407 , 4.41479945]),
 'score_time': array([0.2420783 , 0.23457527, 0.24058652]),
 'estimator': [RandomForestClassifier(max_depth=6, n_estimators=30, random_state=0),
  RandomForestClassifier(max_depth=6, n_estimators=30, random_state=0),
  RandomForestClassifier(max_depth=6, n_estimators=30, random_state=0)],
 'test_score': array([0.29416581, 0.29504567, 0.29375404]),
 'train_score': array([0.29950295, 0.29966844, 0.30084475])}

In [68]:
rfc_estimator = cv_results['estimator'][2]
age_nan_pred = rfc_estimator.predict(test_feature_age_nan_data)
test_feature_age_nan['age_range'] = age_nan_pred
age_range = pd.concat([test_feature_age_normal[['user_id', 'merchant_id', 'age_range']],
                       test_feature_age_nan[['user_id', 'merchant_id','age_range']]], axis=0)

split_test_data_feature.drop(columns='age_range', inplace=True)
split_test_data_feature = pd.merge(split_test_data_feature, age_range, on=['user_id', 'merchant_id'])
# split_test_data_feature.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_feature_age_nan['age_range'] = age_nan_pred


In [69]:
# 二、填补gender

# 数据处理
test_feature_gender = split_test_data_feature.drop(columns=['age_range', 'user_merchant_brands'])
gender_corr = test_feature_gender.corr(method='pearson')['gender'].sort_values(ascending=False)
high_corr_idxs = gender_corr.index.drop(['user_id', 'merchant_id'])[1:21]

test_feature_gender_normal = test_feature_gender[np.isnan(test_feature_gender['gender'])==False]
test_feature_gender_nan = test_feature_gender[np.isnan(test_feature_gender['gender'])==True]

test_feature_gender_normal_label = test_feature_gender_normal['gender']
test_feature_gender_normal_data = test_feature_gender_normal[high_corr_idxs]
test_feature_gender_nan_data = test_feature_gender_nan[high_corr_idxs]

# 模型训练
cv_results = cross_validate(rfc, test_feature_gender_normal_data, test_feature_gender_normal_label, cv=3,
                            verbose=2, return_train_score=True, return_estimator=True)
cv_results

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   3.5s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.9s remaining:    0.0s


[CV] END .................................................... total time=   3.6s
[CV] END .................................................... total time=   3.5s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   11.7s finished


{'fit_time': array([3.32886672, 3.391222  , 3.37409282]),
 'score_time': array([0.18173575, 0.18148541, 0.16859603]),
 'estimator': [RandomForestClassifier(max_depth=6, n_estimators=30, random_state=0),
  RandomForestClassifier(max_depth=6, n_estimators=30, random_state=0),
  RandomForestClassifier(max_depth=6, n_estimators=30, random_state=0)],
 'test_score': array([0.70214599, 0.70511522, 0.7049522 ]),
 'train_score': array([0.70639024, 0.70648921, 0.70570906])}

In [70]:
# 预测，整合
rfc_estimator = cv_results['estimator'][1]
gender_nan_pred = rfc_estimator.predict(test_feature_gender_nan_data)
test_feature_gender_nan['gender'] = gender_nan_pred
gender = pd.concat([test_feature_gender_normal[['user_id', 'merchant_id', 'gender']],
                    test_feature_gender_nan[['user_id', 'merchant_id','gender']]], axis=0)

split_test_data_feature.drop(columns='gender', inplace=True)
split_test_data_feature = pd.merge(split_test_data_feature, gender, on=['user_id', 'merchant_id'])
# split_test_data_feature.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_feature_gender_nan['gender'] = gender_nan_pred


In [71]:
# 观察测试数据
split_test_data_feature = split_test_data_feature.drop(columns=['user_id','merchant_id'])
split_test_data_feature

Unnamed: 0,user_items,user_cats,user_stores,user_browse_days,user_clicks,user_carts,user_purchases,user_collections,user_logs,user_items_per_store,...,merchant_user_carts,merchant_user_purchases,merchant_user_collections,merchant_user_logs,merchant_user_click_purchase_ratio,merchant_user_collection_purchase_ratio,merchant_user_cart_purchase_ratio,merchant_user_log_purchase_ratio,age_range,gender
0,34,26,21,33,63,0,16,2,81,1.619048,...,-0.003973,0.558090,-0.220378,-1.857143,5.749710,0.498695,0.008991,6.257396,0.0,0.0
1,65,48,37,43,71,0,6,0,77,1.756757,...,-0.004785,3.918660,-0.253589,7.918919,17.273230,1.172566,0.022124,18.467920,2.0,2.0
2,25,24,22,25,51,0,5,0,56,1.136364,...,-0.005742,0.553527,-0.158121,3.454545,1.593477,0.354157,0.012862,1.960496,6.0,0.0
3,25,24,22,25,51,0,5,0,56,1.136364,...,-0.003536,0.873717,-0.155715,8.454545,12.058717,1.233062,0.028004,13.319783,6.0,0.0
4,85,63,56,81,162,0,7,7,176,1.517857,...,-0.005055,0.892923,1.900276,46.857143,-16.042918,-1.068670,0.047210,-17.064378,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261472,1173,465,278,701,1770,0,26,208,2004,4.219424,...,-0.002167,0.834617,-0.099182,-2.208633,9.096070,0.599709,0.013100,9.708879,6.0,0.0
261473,29,20,17,20,46,0,8,1,55,1.705882,...,-0.003769,0.488065,-0.109296,-1.235294,4.580368,0.213497,0.007362,4.801227,7.0,1.0
261474,29,20,17,20,46,0,8,1,55,1.705882,...,-0.012390,3.656254,-0.209305,12.764706,25.733129,2.435583,0.144172,28.312883,7.0,1.0
261475,46,34,33,36,62,1,8,1,72,1.393939,...,-0.005851,0.839992,-0.080307,0.818182,14.150063,0.501892,0.036570,14.688525,0.0,0.0


In [72]:
split_test_data_feature.to_csv('./testset_final.csv', index=False)