In [1]:
import pandas as pd
import numpy as np
import math
import gc
from collections import defaultdict

In [2]:
# Training action log, read from its HDF5 store in read-only mode.
jdata_action_train = pd.read_hdf('./data/action_train.h5', key='df', mode='r')

# The four raw comma-separated tables, read in the order the names are unpacked.
jdata_comment, jdata_product, jdata_shop, jdata_user = [
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        './raw/jdata_comment.csv',
        './raw/jdata_product.csv',
        './raw/jdata_shop.csv',
        './raw/jdata_user.csv',
    )
]

In [3]:
# Enrich every action row with its product's columns (the cells below rely on
# `cate` and `brand`) through a left join on sku_id.
df = jdata_action_train.merge(jdata_product, on='sku_id', how='left')
# Attempt an int32 cast of both columns; with errors='ignore' a column whose
# cast fails is kept with its original dtype instead of raising.
df = df.astype({'brand': 'int32', 'cate': 'int32'}, errors='ignore')

In [4]:
# Popularity tables for brand and cate: {value: number of action rows}.
# Original author's note: there are far fewer categories than brands.
cnt_brand, cnt_cate = (
    df[column].value_counts().to_dict() for column in ('brand', 'cate')
)

In [20]:
# Per-user activity table used to drop sparse users and keep the interaction
# matrix denser: one row per user_id with `item_cnts` = number of that user's
# action rows with a non-null sku_id, most active users first.
# NOTE(review): the original note described this as a count of purchased items,
# but no filter on the action `type` is applied here - confirm the intent.
data_user = (
    df.groupby('user_id')['sku_id']
    .count()
    .sort_values(ascending=False)
    .rename('item_cnts')
    .reset_index()
)

In [21]:
# Restrict the action log to users with at least 5 counted rows, order it by
# action_time and keep only the columns needed downstream.
tmp = data_user.loc[data_user['item_cnts'] >= 5]
df_new = df.merge(tmp, on='user_id', how='inner')
df_new = df_new.sort_values(by='action_time')
df_new = df_new[['user_id', 'sku_id', 'brand', 'cate', 'type']]
# Drop the helper frame and trigger a collection pass to release memory.
del tmp
gc.collect()

1512

In [8]:
# Turn every action into a rating weight according to its `type` code.
df_new['score'] = df_new['type'].map({1:0.1, 2:0.5, 3:0.2, 4:0.3, 5:0.4})
# Final training frame: one row per (user_id, brand) holding the summed score,
# with brand cast to int32 when the cast is possible (left unchanged otherwise).
df_train = (
    df_new.groupby(['user_id', 'brand'])['score']
    .sum()
    .reset_index()
    .astype({'brand': 'int32'}, errors='ignore')
)

In [8]:
# Users that interacted with each brand: {brand: set of user_id}.
train_brand = df_train.groupby('brand')['user_id'].apply(set).to_dict()

# Brand-to-brand similarity, stored twice:
#   SIM_brand[b1][b2]      - nested-dict version of the similarity matrix
#   SIM_brand_list[b1][b2] - dense row per brand, indexed directly by brand id
# sim(b1, b2) = |U(b1) & U(b2)| / sqrt(|U(b1)| * |U(b2)|), where U(b) is the
# set of users of brand b. A brand gets no similarity with itself.
SIM_brand = defaultdict(dict)   # dict version of the similarity matrix
SIM_brand_list = defaultdict(lambda : np.zeros(12000))

# The dense rows are indexed by brand id, so every id must fit in the 12000
# slots. Check up front: a negative id would silently alias a slot counted from
# the end of the row, and a too-large id would only fail in the middle of the
# long pairwise loop.
bad_brands = [b for b in train_brand if not 0 <= b < 12000]
if bad_brands:
    raise ValueError(
        'brand ids must lie in [0, 12000) to index the similarity rows; '
        'offending ids (first 10): {}'.format(bad_brands[:10])
    )

cnt = 0
train_brand_items = list(train_brand.items())
for brand1, users1 in train_brand_items:
    cnt += 1
    if cnt % 2000 == 0:
        print('Finished {} brands..'.format(cnt))
    # The similarity is symmetric, so each unordered pair is computed once and
    # written in both directions. After the increment, `cnt` is the position of
    # the first brand that comes after brand1.
    for brand2, users2 in train_brand_items[cnt:]:
        sim = len(users1 & users2) / math.sqrt(len(users1) * len(users2))
        SIM_brand[brand1][brand2] = sim
        SIM_brand[brand2][brand1] = sim
        SIM_brand_list[brand1][brand2] = sim
        SIM_brand_list[brand2][brand1] = sim

Finished 2000 brands..
Finished 4000 brands..
Finished 6000 brands..
Finished 8000 brands..
Finished 10000 brands..


In [9]:
### For every user_id, the list of brands from that user's action rows
### (one entry per row, so a brand can appear several times).
action_brand = df_new['brand'].groupby(df_new['user_id']).apply(list).to_dict()

In [10]:
### To save time, the next step only scores users that occur in test1:
### load the test action log and collect the user_id of each row with type == 2.
jdata_action_test1 = pd.read_hdf('./data/action_test1.h5', key='df', mode='r')
users_test1 = jdata_action_test1.loc[jdata_action_test1['type'] == 2, 'user_id'].tolist()

In [11]:
# For evaluation, pre-compute one score vector per test user: the sum of the
# similarity rows of every brand in that user's training action log.
# Rec[u][b] is then the accumulated similarity between user u's history and
# brand b.
Rec = defaultdict(lambda : np.zeros(12000))
users_in_train = action_brand.keys()
users_in_test_train = set()   # test users that also have a training history
cnt = 0
for u in users_test1:
    cnt += 1
    if cnt % 5000 == 0:
        print('Finished {} users!'.format(cnt))
    # Skip users without a training history. Also skip users that were already
    # processed: users_test1 holds one entry per matching test row, and scoring
    # a repeated user again would add the same similarity rows a second time.
    if u not in users_in_train or u in users_in_test_train:
        continue
    users_in_test_train.add(u)
    user_scores = Rec[u]   # creates the zero vector for this user
    for brand in action_brand[u]:
        # .get() instead of [] so that a brand without a similarity row (for
        # example a missing/NaN brand) does not make the defaultdict allocate
        # and keep a fresh all-zero row on every lookup.
        sim_row = SIM_brand_list.get(brand)
        if sim_row is not None:
            user_scores += sim_row   # in-place update of Rec[u]

Finished 5000 users!
Finished 10000 users!
Finished 15000 users!
Finished 20000 users!
Finished 25000 users!
Finished 30000 users!
Finished 35000 users!
Finished 40000 users!
Finished 45000 users!
Finished 50000 users!
Finished 55000 users!


In [12]:
def Recommend(user, k):
    """Return the indices of the k highest scores in ``Rec[user]`` as a set.

    Parameters
    ----------
    user : hashable
        Key looked up in the module-level ``Rec`` mapping.
    k : int
        Number of indices to return.

    Returns
    -------
    set of int
        Indices of the top-k entries of the user's score vector, in no
        particular order. Empty when ``k <= 0``; all indices when ``k`` is at
        least the vector length.
    """
    # Without this guard k == 0 would return *every* index, because the slice
    # [-0:] selects the whole array.
    if k <= 0:
        return set()
    scores = Rec[user]
    n_scores = len(scores)
    # np.argpartition raises when k exceeds the array length; clamp instead.
    if k >= n_scores:
        return set(range(n_scores))
    # argpartition puts the k largest entries in the last k positions
    # (unordered) without fully sorting the vector.
    top_idx = np.argpartition(scores, -k)[-k:]
    return set(top_idx.tolist())

In [14]:
# Evaluation: hand the Recommend function to the project-local Metrics helper
# and ask for top-3 recommendations.
# NOTE(review): the printed result is a list of two numbers, presumably
# [precision, recall] given the method name - confirm against Metrics.py.
from Metrics import metrics
mtc = metrics()
mtc.PrecisonRecall(Recommend, 3)

Generate Testset successfully....


[0.023749618786215308, 0.008258930050972049]