In [52]:
import os 
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm
from tqdm.contrib import tzip
from random import randint

https://github.com/ZyDaDa/LogisticRegression-CTR


## 读取数据

In [53]:
# Directory containing the two CSV files read below.
data_folder = 'data_format1'

# Build both paths first, then load the interaction log and the user profile table.
log_csv_path = os.path.join(data_folder, 'user_log_format1.csv')
info_csv_path = os.path.join(data_folder, 'user_info_format1.csv')
user_log_df = pd.read_csv(log_csv_path)
user_info_df = pd.read_csv(info_csv_path)

In [54]:
# Interaction history of the users; this log serves as the full dataset.
user_log_df.head(n=5)

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [55]:
# User profile table; it provides the user-side features.
user_info_df.head(n=5)

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [56]:
# Work with only part of the log: keep the first 1/50 of its rows.
use_num = len(user_log_df) // 50
user_log_df = user_log_df.head(use_num)

## 数据处理
1. 低频用户/物品过滤：过滤掉交互次数小于5的用户/物品
2. 特征处理：Onehot、Embedding、Normalization...
3. 数据集划分

In [57]:
# Low-frequency filtering.
# Minimum number of interactions an item / a user must have to be kept.
filted_cnt = 5

# Filter items: keep only rows whose item occurs at least `filted_cnt` times.
counts = user_log_df['item_id'].value_counts()
user_log_df = user_log_df[user_log_df['item_id'].isin(counts[counts >= filted_cnt].index)]

# Filter users: counts are taken on the item-filtered log. This is a single
# pass, so an item can drop below the threshold again once sparse users are
# removed.
counts = user_log_df['user_id'].value_counts()
# .copy() detaches the result from the frame it was sliced from, so the column
# assignments performed on it later do not raise SettingWithCopyWarning.
user_log_df = user_log_df[user_log_df['user_id'].isin(counts[counts >= filted_cnt].index)].copy()

# Keep profile rows only for the users that are still present in the log.
user_info_df = user_info_df[user_info_df['user_id'].isin(user_log_df['user_id'].unique())].copy()

In [58]:
# Feature processing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Item-side categorical columns: label-encode each one and shift the codes by
# one so that index 0 stays free for padding. `feature_num` records, per
# column, the largest code plus one.
feature_num = {}
for col in ('item_id', 'cat_id', 'seller_id', 'brand_id'):
    enc = LabelEncoder()
    encoded = enc.fit_transform(user_log_df[col])
    user_log_df[col] = encoded + 1
    feature_num[col] = user_log_df[col].max() + 1

# User ids: fit the encoder on the profile table, then apply the same mapping
# to the interaction log so both frames share one id space (starting at 0).
enc = LabelEncoder()
enc.fit(user_info_df['user_id'])
user_info_df['user_id'] = enc.transform(user_info_df['user_id'])
user_log_df['user_id'] = enc.transform(user_log_df['user_id'])
feature_num['user_id'] = user_info_df['user_id'].max() + 1

# Dense user feature matrix: one-hot age_range columns followed by a single
# gender column.
enc = OneHotEncoder()
user_age = enc.fit_transform(user_info_df[['age_range']]).toarray()
# NOTE(review): missing gender is filled with 0 - confirm that 0 is the
# intended code for "unknown" and not a real gender value.
user_gender = user_info_df['gender'].fillna(0).to_numpy()
all_user_feature = np.hstack([user_age, user_gender[:, None]])
all_user_feature.shape

(7146, 10)

In [59]:
# Map every encoded item id to its feature dict. The log is walked row by row,
# so for an item that occurs several times the last row seen wins.
item_columns = ['item_id', 'cat_id', 'seller_id', 'brand_id']
item_rows = user_log_df[item_columns].to_numpy()

item_feature = {}
for row in tqdm(item_rows, ncols=80):
    item_id, cat_id, seller_id, brand_id = row
    item_feature[item_id] = dict(cat_id=cat_id,
                                 seller_id=seller_id,
                                 item_id=item_id,
                                 brand_id=brand_id)

# Map every encoded user id to its row of the dense user feature matrix.
user_feature = {
    uid: ufeat
    for uid, ufeat in tzip(user_info_df['user_id'].values, all_user_feature, ncols=80)
}

100%|███████████████████████████████| 774625/774625 [00:02<00:00, 331849.88it/s]
100%|██████████████████████████████████| 7146/7146 [00:00<00:00, 1187076.57it/s]


In [60]:
# Train/test split with negative sampling.
# For every user the interactions are ordered by time_stamp; the last
# `test_num` of them are held out for testing, the rest are used for training.
# Each split also receives randomly drawn items the user never interacted
# with, labelled 0.
train_set = []

test_set = []

test_num = 2 # number of interactions per user held out for testing
neg_sample_rate = 1 # negatives sampled per training positive
neg_sample_seed = 42 # fixed seed so every run draws the same negatives

rng = np.random.default_rng(neg_sample_seed)
item_id_end = feature_num['item_id'] # exclusive upper bound of the item ids (ids start at 1)

for u, u_df in tqdm(user_log_df.sort_values(by='time_stamp').groupby('user_id'),ncols=80):
    user_log = u_df[['user_id','item_id']].to_numpy()
    pos_label = np.ones(shape=(len(user_log),1))
    user_log = np.concatenate([user_log,pos_label],-1) # user_id, item_id, label(1)

    # Distinct items of this user; sampled negatives must avoid all of them.
    pos_items = np.unique(u_df['item_id'].to_numpy())
    if len(pos_items) >= item_id_end - 1:
        # Without this guard the rejection loop below would never terminate.
        raise ValueError(f'user {u} interacted with every item; no negative can be sampled')

    # Explicit split sizes (instead of negative slice bounds) keep the split
    # correct even when test_num is 0.
    train_pos_num = max(len(user_log) - test_num, 0)
    neg_num = int(train_pos_num * neg_sample_rate) + test_num
    train_neg_num = neg_num - test_num

    # Rejection sampling: draw all candidates at once, then redraw only the
    # ones that collide with a positive item until none is left.
    neg_items = rng.integers(1, item_id_end, size=neg_num)
    collide = np.isin(neg_items, pos_items)
    while collide.any():
        neg_items[collide] = rng.integers(1, item_id_end, size=int(collide.sum()))
        collide = np.isin(neg_items, pos_items)

    neg_sample = np.zeros(shape=(neg_num,3)) # user_id, item_id, label(0)
    neg_sample[:,0] = u # same user on every row
    neg_sample[:,1] = neg_items

    train_set.append(user_log[:train_pos_num])
    test_set.append(user_log[train_pos_num:])

    train_set.append(neg_sample[:train_neg_num])
    test_set.append(neg_sample[train_neg_num:])

100%|██████████████████████████████████████| 7146/7146 [00:10<00:00, 673.74it/s]


In [61]:
# Stack the per-user chunks into one (rows, 3) array per split.
# The isinstance guard makes the cell safe to re-run: concatenating an already
# stacked 2-D array along axis 0 would silently flatten it into a 1-D array.
if isinstance(train_set, list):
    train_set = np.concatenate(train_set, axis=0)
if isinstance(test_set, list):
    test_set = np.concatenate(test_set, axis=0)

In [62]:
# Number of rows in the training and the test split.
n_train, n_test = len(train_set), len(test_set)
(n_train, n_test)

(1520666, 28584)

In [63]:
# Save all prepared data into a single pickle file.
# The context manager closes the file handle as soon as the dump has finished,
# so the data is flushed to disk and the handle is not leaked.
with open('all_data.pkl','wb') as pkl_file:
    pickle.dump((train_set, test_set, feature_num, item_feature, user_feature), pkl_file)