In [1]:
# date: 2020/5/12
# author: zonnoz
# description: 数据探索，数据路径可能需修改

import pandas as pd
import numpy as np
from tqdm import tqdm 
import warnings
warnings.filterwarnings("ignore")

## 查看用户数据

In [21]:
# Load the user-feature table. Column names are supplied explicitly through
# `names`, and `nrows=None` reads every row of the file.
# NOTE(review): the path is relative to the notebook's working directory; the
# header comment of this notebook says data paths may need to be adjusted.
underexpose_user_feat = pd.read_csv("./data/underexpose_train/underexpose_user_feat.csv", 
                                    names=["user_id", "user_age_level", "user_gender", "user_city_level"], 
                                    nrows=None, 
                                    sep=",")

In [22]:
underexpose_user_feat.head()

Unnamed: 0,user_id,user_age_level,user_gender,user_city_level
0,17,8.0,M,4.0
1,26,7.0,M,2.0
2,35,6.0,F,4.0
3,40,6.0,M,1.0
4,49,6.0,M,1.0


### 用户 user_id 个数

In [23]:
underexpose_user_feat.user_id.count()

6789

In [24]:
underexpose_user_feat.user_id.nunique()

6786

In [29]:
# Boolean mask of rows whose user_id occurs more than once; keep=False marks
# every occurrence (not only the later repeats), so conflicting records can be
# compared side by side.
underexpose_user_feat_duplicate_row = underexpose_user_feat.duplicated(subset=['user_id'], keep=False) 
# Select the flagged rows with all of their columns.
underexpose_user_feat_duplicate_data = underexpose_user_feat.loc[underexpose_user_feat_duplicate_row,:]
print(underexpose_user_feat_duplicate_data)

      user_id  user_age_level user_gender  user_city_level
1466    14818             3.0           M              3.0
1467    14818             2.0           M              3.0
5733    23453             5.0           F              2.0
5734    23453             5.0           F              5.0
6513    32152             1.0           F              6.0
6514    32152             2.0           F              6.0


- 三个 user_id（14818、23453、32152）各出现两次，且两条记录的信息不一致：14818 与 32152 的 user_age_level 不同，23453 的 user_city_level 不同

### 用户年龄等级 user_age_level 分布 

In [31]:
underexpose_user_feat.user_age_level.value_counts(dropna=False,ascending=False,sort=False)

 8.0     294
 7.0     890
 6.0     817
 5.0    1363
 4.0    1425
NaN       83
 2.0     869
 3.0     816
 1.0     232
Name: user_age_level, dtype: int64

### 用户 user_gender 分布

In [32]:
underexpose_user_feat.user_gender.value_counts(dropna=False,ascending=False,sort=False)

NaN      81
F      5211
M      1497
Name: user_gender, dtype: int64

### 用户 user_city_level 分布

In [33]:
underexpose_user_feat.user_city_level.value_counts(dropna=False,ascending=False,sort=False)

 4.0     541
 2.0    1515
 1.0     760
 6.0    1870
NaN       22
 3.0    1242
 5.0     839
Name: user_city_level, dtype: int64

## 查看商品候选集

In [36]:
# Parse the item-feature file line by line into three parallel lists.
# NOTE(review): presumably each line looks like `item_id,[txt...],[img...]`
# (which is why pd.read_csv is not used directly) - confirm against the file.
list_item_id = []
list_txt_vec = []
list_img_vec = []

with open("./data/underexpose_train/underexpose_item_feat.csv") as f:
    for line in tqdm(f):
        # Splitting on ",[" yields: [item_id, first vector, second vector].
        line_split = line.strip().split(",[")
        # item_id is kept as a string here; it is cast to int later on.
        list_item_id.append(line_split[0])
        # Drop the closing "]"; each vector stays a comma-separated string.
        list_txt_vec.append(line_split[1].strip("]"))
        list_img_vec.append(line_split[2].strip("]"))

# One row per parsed line: item_id, txt_vec, img_vec (all string-valued).
underexpose_item_feat = pd.DataFrame({"item_id": list_item_id, "txt_vec": list_txt_vec, "img_vec": list_img_vec})

108916it [00:03, 31905.31it/s]


In [37]:
underexpose_item_feat.head()

Unnamed: 0,item_id,txt_vec,img_vec
0,42844,"4.514945030212402, -2.3837196826934814, 0.5004...","-2.8722801208496094, 1.4587551355361938, 2.579..."
1,67898,"-2.0029051303863525, -0.9298805594444275, 0.79...","-0.07052088528871536, -1.4393335580825806, 0.7..."
2,66446,"4.221673011779785, -1.4971394538879395, 1.1335...","-5.18036413192749, -0.38824713230133057, -0.03..."
3,63651,"2.6579699516296387, -0.941863477230072, 1.1215...","-1.077273964881897, 2.8394529819488525, 1.1834..."
4,46824,"3.192194938659668, -1.9366759061813354, 1.1999...","-3.2601945400238037, 0.49889034032821655, 2.58..."


### 查看商品 item_id 个数

In [38]:
underexpose_item_feat.item_id.count()

108916

In [39]:
underexpose_item_feat.item_id.nunique()

108916

## 用户点击商品数据

In [41]:
# Phases 0..5, and for each phase a (phase, csv_path) pair pointing at that
# phase's training click log.
list_phase_range = list(range(6))
list_train_click_file = [
    (phase, "./data/underexpose_train/underexpose_train_click-" + str(phase) + ".csv")
    for phase in list_phase_range
]

In [45]:
list_train_click_file

[(0, './data/underexpose_train/underexpose_train_click-0.csv'),
 (1, './data/underexpose_train/underexpose_train_click-1.csv'),
 (2, './data/underexpose_train/underexpose_train_click-2.csv'),
 (3, './data/underexpose_train/underexpose_train_click-3.csv'),
 (4, './data/underexpose_train/underexpose_train_click-4.csv'),
 (5, './data/underexpose_train/underexpose_train_click-5.csv')]

In [43]:
def get_train_click(list_train_click_file):
    """Read the per-phase training click logs and stack them into one frame.

    Args:
        list_train_click_file: iterable of ``(phase, csv_path)`` pairs. Each
            CSV is read with the column names user_id, item_id, time.

    Returns:
        A DataFrame with columns user_id, item_id, time, phase. The per-file
        row index is kept as-is, so index labels repeat across phases.
    """
    ordered_columns = ["user_id", "item_id", "time", "phase"]

    def _read_phase(phase, file):
        # Load one phase and tag every row with the phase it came from.
        frame = pd.read_csv(file, sep=",", nrows=None, names=["user_id", "item_id", "time"])
        return frame.assign(phase=phase)[ordered_columns]

    per_phase_frames = [_read_phase(phase, file) for phase, file in tqdm(list_train_click_file)]
    return pd.concat(per_phase_frames)

In [46]:
underexpose_train_click = get_train_click(list_train_click_file)

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:02<00:00,  2.15it/s]


### 查看用户点击商品数据

In [47]:
len(underexpose_train_click)

1546265

In [48]:
underexpose_train_click.head()

Unnamed: 0,user_id,item_id,time,phase
0,4965,18,0.983763,0
1,20192,34,0.983772,0
2,30128,91,0.98378,0
3,29473,189,0.98393,0
4,10625,225,0.983925,0


In [49]:
underexpose_train_click.sort_values(['user_id','time']).head(20)

Unnamed: 0,user_id,item_id,time,phase
19829,1,78142,0.983742,0
236132,1,26646,0.983757,0
20480,1,89568,0.983763,0
19709,1,76240,0.98377,0
108033,1,87533,0.98379,0
56362,1,78380,0.98379,0
159250,1,85492,0.983875,0
160489,1,85492,0.983875,2
20968,1,97795,0.983877,0
111177,1,18522,0.983887,0


## 用户点击商品数据-测试集

In [58]:
# Phases 0..5, with one (phase, csv_path) pair per phase for the observed test
# clicks and another for the query-time files.
list_phase_range = list(range(6))
list_test_phase_file = [
    (phase, "./data/underexpose_test/underexpose_test_click/underexpose_test_click-" + str(phase) + ".csv")
    for phase in list_phase_range
]
list_test_phase_query_file = [
    (phase, "./data/underexpose_test/underexpose_test_qtime/underexpose_test_qtime-" + str(phase) + ".csv")
    for phase in list_phase_range
]

In [59]:
list_test_phase_file

[(0,
  './data/underexpose_test/underexpose_test_click/underexpose_test_click-0.csv'),
 (1,
  './data/underexpose_test/underexpose_test_click/underexpose_test_click-1.csv'),
 (2,
  './data/underexpose_test/underexpose_test_click/underexpose_test_click-2.csv'),
 (3,
  './data/underexpose_test/underexpose_test_click/underexpose_test_click-3.csv'),
 (4,
  './data/underexpose_test/underexpose_test_click/underexpose_test_click-4.csv'),
 (5,
  './data/underexpose_test/underexpose_test_click/underexpose_test_click-5.csv')]

In [60]:
list_test_phase_query_file

[(0,
  './data/underexpose_test/underexpose_test_qtime/underexpose_test_qtime-0.csv'),
 (1,
  './data/underexpose_test/underexpose_test_qtime/underexpose_test_qtime-1.csv'),
 (2,
  './data/underexpose_test/underexpose_test_qtime/underexpose_test_qtime-2.csv'),
 (3,
  './data/underexpose_test/underexpose_test_qtime/underexpose_test_qtime-3.csv'),
 (4,
  './data/underexpose_test/underexpose_test_qtime/underexpose_test_qtime-4.csv'),
 (5,
  './data/underexpose_test/underexpose_test_qtime/underexpose_test_qtime-5.csv')]

In [61]:
def get_test_click(list_test_phase_file, list_test_phase_query_file):
    """Combine observed test clicks and query rows into one frame.

    Args:
        list_test_phase_file: iterable of ``(phase, csv_path)`` pairs for the
            test click logs (columns user_id, item_id, time).
        list_test_phase_query_file: iterable of ``(phase, csv_path)`` pairs for
            the query-time files (columns user_id, query_time).

    Returns:
        A DataFrame with columns user_id, item_id, time, phase. Click rows come
        first, followed by query rows; query rows carry ``item_id == -999`` and
        their query_time in the ``time`` column. Per-file index labels are kept.
    """
    ordered_columns = ["user_id", "item_id", "time", "phase"]
    frames = []

    # Observed test clicks, tagged with their phase.
    for phase, file in tqdm(list_test_phase_file):
        clicks = pd.read_csv(file, sep=",", nrows=None, names=["user_id", "item_id", "time"])
        frames.append(clicks.assign(phase=phase)[ordered_columns])

    # Query rows: no item is known, so item_id is set to the marker -999.
    for phase, file in tqdm(list_test_phase_query_file):
        queries = pd.read_csv(file, sep=",", nrows=None, names=["user_id", "query_time"])
        queries = queries.rename(columns={"query_time": "time"}).assign(phase=phase, item_id=-999)
        frames.append(queries[ordered_columns])

    return pd.concat(frames)[ordered_columns]

In [62]:
underexpose_test_click = get_test_click(list_test_phase_file, list_test_phase_query_file)

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 20.74it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 82.26it/s]


In [75]:
underexpose_test_click.head()

Unnamed: 0,user_id,item_id,time,phase
0,1133,221,0.983812,0
1,17864,253,0.983783,0
2,6941,309,0.983785,0
3,34089,358,0.983781,0
4,21659,536,0.983793,0


In [77]:
len(underexpose_test_click[underexpose_test_click['item_id']<0])

10260

In [80]:
underexpose_test_click[underexpose_test_click['item_id']<0].user_id.nunique()

10260

### 查看待预测的点击商品数据集

In [81]:
len(underexpose_test_click[underexpose_test_click["item_id"]!=-999])

146486

In [82]:
len(underexpose_test_click[underexpose_test_click["item_id"]==-999])

10260

In [83]:
underexpose_test_click.head()

Unnamed: 0,user_id,item_id,time,phase
0,1133,221,0.983812,0
1,17864,253,0.983783,0
2,6941,309,0.983785,0
3,34089,358,0.983781,0
4,21659,536,0.983793,0


## 建模数据准备

### 序列数据处理

In [84]:
underexpose_train_click.head()

Unnamed: 0,user_id,item_id,time,phase
0,4965,18,0.983763,0
1,20192,34,0.983772,0
2,30128,91,0.98378,0
3,29473,189,0.98393,0
4,10625,225,0.983925,0


In [85]:
# Result layout: {phase: {user_id: {"item_seq": [], "time_seq": [], "diff_time_seq": []}}}
def deal_click_data(underexpose_click_data, time_scale=10**4):
    """Group clicks into per-phase, per-user, time-ordered sequences.

    Args:
        underexpose_click_data: DataFrame with columns user_id, item_id, time
            and phase. The caller's frame is not modified.
        time_scale: factor applied to the gap between consecutive clicks
            before it is stored in ``diff_time_seq`` (default ``10**4``).

    Returns:
        Nested dict ``{phase: {user_id: {"item_seq", "time_seq",
        "diff_time_seq"}}}``. Within one (phase, user) the sequences follow
        ascending ``time``; ``diff_time_seq`` has one entry fewer than
        ``time_seq`` because the first click has no predecessor.
    """
    ordered = underexpose_click_data.sort_values(["user_id", "phase", "time"])
    dict_user_phase_action = {}

    # Walk the columns in lockstep rather than DataFrame.iterrows(): iterrows
    # builds a Series per row (slow on millions of rows) and upcasts the
    # integer id columns to float before they are cast back with int().
    rows = zip(ordered["user_id"], ordered["item_id"], ordered["time"], ordered["phase"])
    for user_id, item_id, click_time, phase in tqdm(rows, total=len(ordered)):
        user_id, item_id, click_time, phase = int(user_id), int(item_id), float(click_time), int(phase)

        phase_users = dict_user_phase_action.setdefault(phase, {})
        sequences = phase_users.get(user_id)
        if sequences is None:
            # First click of this user in this phase: nothing to diff against.
            sequences = {"item_seq": [], "time_seq": [], "diff_time_seq": []}
            phase_users[user_id] = sequences
        else:
            gap = (click_time - sequences["time_seq"][-1]) * time_scale
            sequences["diff_time_seq"].append(gap)
        sequences["item_seq"].append(item_id)
        sequences["time_seq"].append(click_time)
    return dict_user_phase_action

In [86]:
dict_train_user_phase_action = deal_click_data(underexpose_train_click)

1546265it [01:27, 17699.08it/s]


In [87]:
dict_test_user_phase_action = deal_click_data(underexpose_test_click)

156746it [00:08, 18312.93it/s]


## 查看处理好的序列数据

### 训练数据

In [88]:
dict_train_user_phase_action.keys()

dict_keys([0, 2, 3, 4, 5, 1])

In [89]:
# 每个阶段的 user 数
len(dict_train_user_phase_action[0].keys())

16842

In [90]:
dict_train_user_phase_action[0][1].keys()

dict_keys(['item_seq', 'time_seq', 'diff_time_seq'])

In [91]:
len(dict_train_user_phase_action[0][1]["item_seq"])

13

In [92]:
len(dict_train_user_phase_action[0][1]["time_seq"])

13

In [93]:
dict_train_user_phase_action[0][1]["diff_time_seq"]

[0.15036582135641297,
 0.06781414004120023,
 0.06995285001254459,
 0.19074264676177677,
 0.00020819300394769868,
 0.8595342885475699,
 0.016434629716011173,
 0.09419787189268725,
 0.0009589496041684242,
 0.0030724240551727178,
 0.0671453988720927,
 0.4826923299483532]

### 测试数据

In [94]:
dict_test_user_phase_action.keys()

dict_keys([1, 2, 3, 0, 4, 5])

In [95]:
len(dict_test_user_phase_action[0].keys())

1663

In [96]:
dict_test_user_phase_action[0][22].keys()

dict_keys(['item_seq', 'time_seq', 'diff_time_seq'])

In [98]:
len(dict_test_user_phase_action[0][22]["item_seq"])

18

In [99]:
len(dict_test_user_phase_action[0][22]["time_seq"])

18

In [100]:
dict_test_user_phase_action[0][22]["diff_time_seq"]

[0.0007570654769750718,
 0.00022081076411772926,
 0.0005867257435454576,
 0.00016403085223437586,
 0.2536484789639193,
 0.001526748711899728,
 0.00011986869941083,
 0.0889236490742018,
 0.1566810092046289,
 0.0005993435014950421,
 0.43729994576358067,
 0.027254357148898123,
 0.03961345104719349,
 0.05773255144836398,
 0.020188412703792125,
 0.5290688992520831,
 0.0004416215282354585]

In [101]:
# dict_user_feat: {user_id: {"user_age_level": idx, "user_gender": idx, "user_city_level": idx}}
# dict_user_age_level / dict_user_gender / dict_user_city_level map each raw
# value to the integer index assigned to it (in order of first appearance).
def deal_user_feat_data(user_feat_data):
    """Index-encode the three categorical user features.

    Missing values are replaced by -1 first, so "missing" gets an index of its
    own in every vocabulary. When a user_id appears on several rows, the last
    row read determines that user's encoded features.

    Args:
        user_feat_data: DataFrame with columns user_id, user_age_level,
            user_gender and user_city_level.

    Returns:
        Tuple ``(dict_user_feat, dict_user_age_level, dict_user_gender,
        dict_user_city_level)``.
    """
    filled = user_feat_data.fillna(-1)

    dict_user_feat = {}
    dict_user_age_level = {}
    dict_user_gender = {}
    dict_user_city_level = {}

    for _, record in tqdm(filled.iterrows()):
        user_id = int(record["user_id"])
        age_level = int(record["user_age_level"])
        gender = record["user_gender"]
        city_level = int(record["user_city_level"])

        # setdefault(value, len(vocab)) hands out the next free index the
        # first time a value is seen and returns the existing one afterwards.
        age_index = dict_user_age_level.setdefault(age_level, len(dict_user_age_level))
        gender_index = dict_user_gender.setdefault(gender, len(dict_user_gender))
        city_index = dict_user_city_level.setdefault(city_level, len(dict_user_city_level))

        encoded = dict_user_feat.setdefault(user_id, {})
        encoded["user_age_level"] = age_index
        encoded["user_gender"] = gender_index
        encoded["user_city_level"] = city_index

    return dict_user_feat, dict_user_age_level, dict_user_gender, dict_user_city_level

In [102]:
dict_user_feat, dict_user_age_level, dict_user_gender, dict_user_city_level = deal_user_feat_data(underexpose_user_feat)

6789it [00:00, 12963.74it/s]


In [103]:
dict_user_feat[17]

{'user_age_level': 0, 'user_gender': 0, 'user_city_level': 0}

In [104]:
dict_user_age_level

{8: 0, 7: 1, 6: 2, 5: 3, 4: 4, -1: 5, 2: 6, 3: 7, 1: 8}

In [105]:
dict_user_gender

{'M': 0, 'F': 1, -1: 2}

In [106]:
dict_user_city_level

{4: 0, 2: 1, 1: 2, 6: 3, -1: 4, 3: 5, 5: 6}

## 商品数据处理

In [107]:
underexpose_item_feat.head()

Unnamed: 0,item_id,txt_vec,img_vec
0,42844,"4.514945030212402, -2.3837196826934814, 0.5004...","-2.8722801208496094, 1.4587551355361938, 2.579..."
1,67898,"-2.0029051303863525, -0.9298805594444275, 0.79...","-0.07052088528871536, -1.4393335580825806, 0.7..."
2,66446,"4.221673011779785, -1.4971394538879395, 1.1335...","-5.18036413192749, -0.38824713230133057, -0.03..."
3,63651,"2.6579699516296387, -0.941863477230072, 1.1215...","-1.077273964881897, 2.8394529819488525, 1.1834..."
4,46824,"3.192194938659668, -1.9366759061813354, 1.1999...","-3.2601945400238037, 0.49889034032821655, 2.58..."


In [108]:
# dict_item_feat: {item_id: {"txt_vec": [...], "img_vec": [...]}}
def deal_item_feat_data(item_feat_data):
    """Turn the string-valued item-feature frame into a dict of float vectors.

    Args:
        item_feat_data: DataFrame with columns item_id, txt_vec and img_vec,
            where each vector is a comma-separated string of numbers.

    Returns:
        ``{item_id (int): {"txt_vec": [float, ...], "img_vec": [float, ...]}}``.
        If an item_id repeats, the last row read wins.
    """
    def _to_floats(text):
        # "1.0, 2.0" -> [1.0, 2.0]; float() tolerates the padding spaces.
        return [float(token) for token in text.split(",")]

    dict_item_feat = {}
    for _, record in tqdm(item_feat_data.iterrows()):
        item_id = int(record["item_id"])
        txt_vec = _to_floats(record["txt_vec"])
        img_vec = _to_floats(record["img_vec"])

        entry = dict_item_feat.setdefault(item_id, {})
        entry["txt_vec"] = txt_vec
        entry["img_vec"] = img_vec
    return dict_item_feat

In [109]:
dict_item_feat = deal_item_feat_data(underexpose_item_feat)

108916it [00:26, 4178.99it/s]


In [110]:
dict_item_feat[1].keys()

dict_keys(['txt_vec', 'img_vec'])

In [111]:
len(dict_item_feat[1]["txt_vec"])

128

In [112]:
len(dict_item_feat[1]["img_vec"])

128

## id 转化为 embedding 特征

### user_id 与 item_id

In [114]:
def get_node_user_item(underexpose_train_click, underexpose_test_click):
    """Build the user-item interaction table with contiguous integer indices.

    Rows from the training frame are processed first, then rows from the test
    frame. Rows whose ``item_id`` is not positive are skipped; this is how the
    ``-999`` query-marker rows of the test frame are excluded.

    Args:
        underexpose_train_click: DataFrame with at least user_id and item_id.
        underexpose_test_click: DataFrame with at least user_id and item_id.

    Returns:
        Tuple ``(node_user_item, dict_user_id, dict_item_id)`` where
        ``node_user_item`` has columns user_id, item_id, index_user_id,
        index_item_id, score (score = number of clicks of that user on that
        item), and the two dicts map raw ids to 0-based indices assigned in
        order of first appearance.
    """
    dict_user_id = {}
    dict_item_id = {}
    records = []

    for underexpose_click in (underexpose_train_click, underexpose_test_click):
        # Only the id columns are needed, so iterate over them directly
        # instead of materialising every row with DataFrame.iterrows().
        id_pairs = zip(underexpose_click["user_id"], underexpose_click["item_id"])
        for user_id, item_id in tqdm(id_pairs, total=len(underexpose_click)):
            user_id, item_id = int(user_id), int(item_id)
            if item_id <= 0:
                # Query placeholder row, not a real click.
                continue
            # setdefault(id, len(mapping)) assigns the next free index on the
            # first sighting and returns the existing index afterwards.
            index_user_id = dict_user_id.setdefault(user_id, len(dict_user_id))
            index_item_id = dict_item_id.setdefault(item_id, len(dict_item_id))
            records.append((user_id, item_id, index_user_id, index_item_id, 1))

    key_columns = ["user_id", "item_id", "index_user_id", "index_item_id"]
    node_user_item = pd.DataFrame(records, columns=key_columns + ["score"])
    # Click frequency of each user on each item.
    node_user_item = node_user_item.groupby(key_columns)["score"].sum().reset_index()
    return node_user_item, dict_user_id, dict_item_id

In [115]:
node_user_item, dict_user_id, dict_item_id = get_node_user_item(underexpose_train_click, underexpose_test_click)

1546265it [01:25, 18173.03it/s]
156746it [00:08, 18112.50it/s]


In [116]:
node_user_item.index_user_id.min(), node_user_item.index_user_id.max()

(0, 29570)

In [117]:
node_user_item.index_item_id.min(), node_user_item.index_item_id.max()

(0, 89472)

## 查看 underexpose_user_feat、underexpose_train_click、underexpose_test_click 的用户

### 三者各自用户数

In [124]:
underexpose_user_feat.user_id.nunique(), underexpose_train_click.user_id.nunique(), underexpose_test_click.user_id.nunique()

(6786, 29128, 10260)

### 点击数据集的用户数

In [130]:
len((set(underexpose_test_click.user_id.astype(str)) | set(underexpose_train_click.user_id.astype(str))))

29571

### 点击数据集(train+test) 与 用户数据集 的共同用户数

In [118]:
len((set(underexpose_test_click.user_id.astype(str)) | set(underexpose_train_click.user_id.astype(str)))
    & set(underexpose_user_feat.user_id.astype(str)))

5844

### 点击数据集(train+test) - 用户数据集 的差异用户数

In [119]:
len((set(underexpose_test_click.user_id.astype(str)) | set(underexpose_train_click.user_id.astype(str))) 
    - set(underexpose_user_feat.user_id.astype(str)))

23727

### 用户数据集 - 点击数据集(train+test) 的差异用户数

In [120]:
len(set(underexpose_user_feat.user_id.astype(str)) 
    - (set(underexpose_test_click.user_id.astype(str)) | set(underexpose_train_click.user_id.astype(str))))

942

### 三者并集

In [123]:
len(set(underexpose_user_feat.user_id.astype(str)) | set(underexpose_test_click.user_id.astype(str)) | set(underexpose_train_click.user_id.astype(str)))

30513

## 查看 underexpose_item_feat、underexpose_train_click、underexpose_test_click 的商品

### 三者各自商品数

In [128]:
underexpose_item_feat.item_id.nunique(), underexpose_train_click.item_id.nunique(), underexpose_test_click.item_id.nunique()

(108916, 89468, 56957)

### 点击数据商品数（注意：test 点击数据含预测占位符 item_id = -999，因此下面的计数比真实商品数多 1）

In [131]:
len((set(underexpose_test_click.item_id.astype(str)) | set(underexpose_train_click.item_id.astype(str))))

89474

### 点击数据集(train+test) 与 商品数据集 的共同商品数

In [129]:
len((set(underexpose_test_click.item_id.astype(str)) | set(underexpose_train_click.item_id.astype(str)))
    & set(underexpose_item_feat.item_id.astype(str)))

83442

### 点击数据集(train+test) - 商品数据集 的差异商品数（计数中包含预测占位符 -999）

In [132]:
len((set(underexpose_test_click.item_id.astype(str)) | set(underexpose_train_click.item_id.astype(str)))
    - set(underexpose_item_feat.item_id.astype(str)))

6032

### 商品数据集 - 点击数据集(train+test) 的差异商品数

In [133]:
len(set(underexpose_item_feat.item_id.astype(str))
    - (set(underexpose_test_click.item_id.astype(str)) | set(underexpose_train_click.item_id.astype(str))))

25474

### 三者并集

In [135]:
len(set(underexpose_item_feat.item_id.astype(str)) | set(underexpose_test_click.item_id.astype(str)) | set(underexpose_train_click.item_id.astype(str)))

114948