In [1]:
import gc
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# 用户行为，使用format1进行加载
# 加载全量样本
"""
user_log = pd.read_csv('./data_format1/user_log_format1.csv', dtype={'time_stamp':'str'})
user_info = pd.read_csv('./data_format1/user_info_format1.csv')
train_data1 = pd.read_csv('./data_format1/train_format1.csv')
submission = pd.read_csv('./data_format1/test_format1.csv')
"""
# 加载小样本
user_log = pd.read_csv('./data_format1_small/sample_user_log.csv', dtype={'time_stamp':'str'})
user_info = pd.read_csv('./data_format1_small/sample_user_info.csv')
train_data1 = pd.read_csv('./data_format1_small/train.csv')
submission = pd.read_csv('./data_format1_small/test.csv')
train_data = pd.read_csv('./data_format2/train_format2.csv')

train_data1['origin'] = 'train'
submission['origin'] = 'test'
matrix = pd.concat([train_data1, submission], ignore_index=True, sort=False)
#print(matrix)

# 使用merchant_id（原列名seller_id）
user_log.rename(columns={'seller_id':'merchant_id'}, inplace=True)
# 格式化
user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')

# 对离散特征做LabelEncoder
lbe_merchant_id=LabelEncoder()
lbe_merchant_id.fit(np.r_[0,user_log['merchant_id'].values])
user_log['merchant_id']=lbe_merchant_id.transform(user_log['merchant_id'])
matrix['merchant_id']=lbe_merchant_id.transform(matrix['merchant_id'])

lbe_user_id=LabelEncoder()
user_log['user_id']=lbe_user_id.fit_transform(user_log['user_id'])
user_info['user_id']=lbe_user_id.transform(user_info['user_id'])
matrix['user_id']=lbe_user_id.transform(matrix['user_id'])

lbe_item_id=LabelEncoder()
user_log['item_id']=lbe_item_id.fit_transform(user_log['item_id'])
lbe_cat_id=LabelEncoder()
user_log['cat_id']=lbe_cat_id.fit_transform(user_log['cat_id'])
lbe_brand_id=LabelEncoder()
user_log['brand_id']=lbe_brand_id.fit_transform(user_log['brand_id'])

user_log['merchant_id'].max(),user_log['user_id'].max()
matrix = matrix.merge(user_info, on='user_id', how='left')

# 1 for <18; 2 for [18,24]; 3 for [25,29]; 4 for [30,34]; 5 for [35,39]; 6 for [40,49]; 7 and 8 for >= 50; 0 and NULL for unknown
matrix['age_range'].fillna(0, inplace=True)
# 0:female, 1:male, 2:unknown
matrix['gender'].fillna(2, inplace=True)
matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')
del user_info, train_data1
gc.collect()
#print(matrix)

# User特征处理
groups = user_log.groupby(['user_id'])
# 用户交互行为数量 u1
temp = groups.size().reset_index().rename(columns={0:'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')
# 使用agg 基于列的聚合操作，统计唯一值的个数 item_id, cat_id, merchant_id, brand_id
#temp = groups['item_id', 'cat_id', 'merchant_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'u2', 'cat_id':'u3', 'merchant_id':'u4', 'brand_id':'u5'})
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')

# 时间间隔特征 u6 按照小时
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.seconds/3600
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')
# 统计action_type为0，1，2，3的个数（原始操作，没有补0）
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')
#print(matrix)

# 商家特征处理
groups = user_log.groupby(['merchant_id'])
# 商家被交互行为数量 m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的user_id, item_id, cat_id, brand_id 唯一值
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'user_id':'m2', 'item_id':'m3', 'cat_id':'m4', 'brand_id':'m5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 按照merchant_id 统计随机负采样的个数
temp = train_data[train_data['label']==-1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
#print(matrix)

# 按照user_id, merchant_id分组
groups = user_log.groupby(['user_id', 'merchant_id'])
temp = groups.size().reset_index().rename(columns={0:'um1'}) #统计行为个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'um2', 'cat_id':'um3', 'brand_id':'um4'}) #统计item_id, cat_id, brand_id唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'um5', 1:'um6', 2:'um7', 3:'um8'})#统计不同action_type唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['first']).dt.seconds/3600
temp.drop(['first', 'last'], axis=1, inplace=True)
#print(temp)
#print('-'*100)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left') #统计时间间隔
#print(matrix)

#用户购买点击比
matrix['r1'] = matrix['u9']/matrix['u7'] 
#商家购买点击比
matrix['r2'] = matrix['m8']/matrix['m6'] 
#不同用户不同商家购买点击比
matrix['r3'] = matrix['um7']/matrix['um5']
matrix.fillna(0, inplace=True)
# # 修改age_range字段名称为 age_0, age_1, age_2... age_8
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)
#print(matrix)



In [2]:
lbe_action_type={0:1,1:2,2:3,3:4}
user_log['action_type']=user_log['action_type'].map(lbe_action_type)
# 用户行为sequence
user_log_1 = pd.DataFrame(user_log.sort_values('time_stamp', ascending=True).groupby('user_id')['merchant_id','action_type','time_stamp'].agg(lambda x:list(x)))
split_pot = list()
for session in user_log_1['time_stamp']:
    temp_list = list()
    for session_num in range(1,len(session)):
        time_num = (session[session_num] - session[session_num-1]).seconds/60
        if time_num >= 30 :
            temp_list.append(session_num)
    split_pot.append(temp_list)


In [3]:
max_len = 0
for count_1 in split_pot:
    temp = len(count_1)
    if temp > max_len:
        max_len = temp
#print(max_len)

In [None]:
#print(user_log_1['merchant_id'])
#print(split_pot)

In [4]:
n = 0
merchant_id_list_all = pd.DataFrame(columns = [range(max_len+1)])

for merchant_id_temp in user_log_1['merchant_id']:
    merchant_id_list_one = []
    split_pot_n = split_pot[n]
    if split_pot_n == []:
        merchant_id_list_one.append(merchant_id_temp)
    else:
        for count_2 in range(len(split_pot_n)):
            if count_2 == 0:
                merchant_id_list_one.append(merchant_id_temp[0:split_pot_n[count_2]])
            else:
                merchant_id_list_one.append(merchant_id_temp[split_pot_n[count_2-1]:split_pot_n[count_2]])
                if count_2 == len(split_pot_n)-1:
                    merchant_id_list_one.append(merchant_id_temp[split_pot_n[count_2]:])
    while len(merchant_id_list_one) < max_len+1:
        merchant_id_list_one.append([0])    
    merchant_id_list_one_temp = pd.Series(merchant_id_list_one)
    merchant_id_list_all = merchant_id_list_all.append(merchant_id_list_one_temp,ignore_index=True)
    n += 1

print(merchant_id_list_all)

      (0, 1, 2, 3, 4, 5, 6, 7)  \
0                          NaN   
1                          NaN   
2                          NaN   
3                          NaN   
4                          NaN   
...                        ...   
19107                      NaN   
19108                      NaN   
19109                      NaN   
19110                      NaN   
19111                      NaN   

                                                       0  \
0                                                 [4782]   
1      [34, 3759, 4078, 3759, 3759, 291, 3759, 3759, ...   
2      [2459, 2459, 2459, 2459, 2459, 2459, 2459, 245...   
3                              [567, 567, 567, 567, 567]   
4         [1553, 3733, 3733, 3738, 3738, 3738, 1961, 34]   
...                                                  ...   
19107  [3606, 3606, 3606, 962, 962, 3606, 3606, 3954,...   
19108                     [3954, 1102, 2883, 1102, 1102]   
19109                                       [2492, 

In [5]:
n = 0
action_type_list_all = pd.DataFrame(columns = [range(max_len+1)])

for action_type_temp in user_log_1['action_type']:
    action_type_list_one = []
    split_pot_n = split_pot[n]
    if split_pot_n == []:
        action_type_list_one.append(action_type_temp)
    else:
        for count_3 in range(len(split_pot_n)):
            if count_3 == 0:
                action_type_list_one.append(action_type_temp[0:split_pot_n[count_3]])
            else:
                action_type_list_one.append(action_type_temp[split_pot_n[count_3-1]:split_pot_n[count_3]])
                if count_3 == len(split_pot_n)-1:
                    action_type_list_one.append(action_type_temp[split_pot_n[count_3]:])
    while len(action_type_list_one) < max_len+1:
        action_type_list_one.append([0])    
    action_type_list_one_temp = pd.Series(action_type_list_one)
    action_type_list_all = action_type_list_all.append(action_type_list_one_temp,ignore_index=True)
    n += 1

print(action_type_list_all)

      (0, 1, 2, 3, 4, 5, 6, 7)  \
0                          NaN   
1                          NaN   
2                          NaN   
3                          NaN   
4                          NaN   
...                        ...   
19107                      NaN   
19108                      NaN   
19109                      NaN   
19110                      NaN   
19111                      NaN   

                                                       0  \
0                                                    [4]   
1      [3, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, ...   
2                   [1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 3, 1]   
3                                        [1, 1, 1, 1, 1]   
4                               [4, 3, 4, 3, 3, 3, 4, 4]   
...                                                  ...   
19107                        [3, 1, 1, 1, 1, 1, 3, 1, 1]   
19108                                    [1, 1, 1, 1, 1]   
19109                                             [

In [6]:
for count_2 in range(max_len+1):
    user_log_1['session_' + str(count_2) + '_merchant_id'] = merchant_id_list_all[count_2]
    user_log_1['session_' + str(count_2) + '_action_type'] = action_type_list_all[count_2]
print(user_log_1)

                                               merchant_id  \
user_id                                                      
0        [4782, 2378, 3496, 2378, 202, 3295, 3295, 2378...   
1        [34, 3759, 4078, 3759, 3759, 291, 3759, 3759, ...   
2        [2459, 2459, 2459, 2459, 2459, 2459, 2459, 245...   
3        [567, 567, 567, 567, 567, 886, 78, 1523, 1523,...   
4        [1553, 3733, 3733, 3738, 3738, 3738, 1961, 34,...   
...                                                    ...   
19107    [3606, 3606, 3606, 962, 962, 3606, 3606, 3954,...   
19108    [3954, 1102, 2883, 1102, 1102, 606, 606, 606, ...   
19109    [2492, 2283, 2763, 1719, 36, 36, 36, 1719, 171...   
19110    [1352, 1352, 1352, 1352, 1352, 1352, 1352, 135...   
19111    [4950, 4950, 702, 254, 3833, 1687, 546, 1910, ...   

                                               action_type  \
user_id                                                      
0        [4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
1      

In [7]:
matrix = matrix.merge(user_log_1, on='user_id', how='left')
print(matrix)

       user_id  merchant_id_x label origin  prob    u1   u2  u3  u4  u5  ...  \
0        16497           1203   0.0  train   0.0    46   29  12  16  16  ...   
1         1950            946   0.0  train   0.0   365  198  46  46  45  ...   
2        10829           2278   0.0  train   0.0    47   31  14  15  17  ...   
3         7974            951   0.0  train   0.0   234  105  23  35  36  ...   
4        14604           1892   0.0  train   0.0   186  106  34  40  39  ...   
...        ...            ...   ...    ...   ...   ...  ...  ..  ..  ..  ...   
23888     2157           1748   nan   test   0.0   128   97  28  39  40  ...   
23889     2673            798   nan   test   0.0  1286  540  55  93  96  ...   
23890    11847            639   nan   test   0.0     9    8   7   7   7  ...   
23891    11847           3953   nan   test   0.0     9    8   7   7   7  ...   
23892    19079           2954   nan   test   0.0   197   85  36  39  40  ...   

                                   sess

In [8]:
matrix.to_csv('matrix.csv',index = 0)