In [1]:
# import packages
import time, math, os
from tqdm import tqdm
import gc
import pickle
import random
from datetime import datetime
from operator import itemgetter
import numpy as np
import pandas as pd
import warnings
from collections import defaultdict
from utils import *
warnings.filterwarnings('ignore')

In [2]:
data_path = '../data_path/'
save_path = '../tmp_results/'

In [3]:
all_click_df = get_all_click_sample(data_path)
all_click_df[0:2]

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
24,199992,272143,1507029683969,4,1,17,1,25,2
25,199992,348111,1507029702470,4,1,17,1,25,2


In [11]:
def itemcf_sim(df):
    """
        文章与文章之间的相似性矩阵计算
        :param df: 数据表
        :item_created_time_dict:  文章创建时间的字典
        return : 文章与文章的相似性矩阵
        思路: 基于物品的协同过滤(详细请参考上一期推荐系统基础的组队学习)， 在多路召回部分会加上关联规则的召回策略
    """
    
    user_item_time_dict = get_user_item_time(df)
    
    # 计算物品相似度
    i2i_sim = {}
    item_cnt = defaultdict(int)
    for user, item_time_list in tqdm(user_item_time_dict.items()):
        # 在基于商品的协同过滤优化的时候可以考虑时间因素
        for i, i_click_time in item_time_list:
            item_cnt[i] += 1
            i2i_sim.setdefault(i, {})
            for j, j_click_time in item_time_list:
                if(i == j):
                    continue
                i2i_sim[i].setdefault(j, 0)
                
                i2i_sim[i][j] += 1 / math.log(len(item_time_list) + 1)
                
    i2i_sim_ = i2i_sim.copy()
    for i, related_items in i2i_sim.items():
        for j, wij in related_items.items():
            i2i_sim_[i][j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])
    
    # 将得到的相似性矩阵保存到本地
    pickle.dump(i2i_sim_, open(save_path + 'itemcf_i2i_sim.pkl', 'wb'))
    
    return i2i_sim_

i2i_sim = itemcf_sim(all_click_df)

100%|██████████| 10000/10000 [00:00<00:00, 23290.96it/s]


In [15]:
len(i2i_sim[50644])

245

In [18]:
# 基于商品的召回i2i
def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click):
    """
        基于文章协同过滤的召回
        :param user_id: 用户id
        :param user_item_time_dict: 字典, 根据点击时间获取用户的点击文章序列   {user1: {item1: time1, item2: time2..}...}
        :param i2i_sim: 字典，文章相似性矩阵
        :param sim_item_topk: 整数， 选择与当前文章最相似的前k篇文章
        :param recall_item_num: 整数， 最后的召回文章数量
        :param item_topk_click: 列表，点击次数最多的文章列表，用户召回补全        
        return: 召回的文章列表 {item1:score1, item2: score2...}
        注意: 基于物品的协同过滤(详细请参考上一期推荐系统基础的组队学习)， 在多路召回部分会加上关联规则的召回策略
    """
    
    # 获取用户历史交互的文章
    user_hist_items = user_item_time_dict[user_id]
    
    item_rank = {}
    for loc, (i, click_time) in enumerate(user_hist_items):
        for j, wij in sorted(i2i_sim[i].items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]:
            if j in user_hist_items:
                continue
                
            item_rank.setdefault(j, 0)
            item_rank[j] +=  wij
    
    # 不足10个，用热门商品补全
    if len(item_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            if item in item_rank.items(): # 填充的item应该不在原来的列表中
                continue
            item_rank[item] = - i - 100 # 随便给个负数就行
            if len(item_rank) == recall_item_num:
                break
    
    item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]
        
    return item_rank


In [19]:
import collections
# 定义
user_recall_items_dict = collections.defaultdict(dict)

# 获取 用户 - 文章 - 点击时间的字典
user_item_time_dict = get_user_item_time(all_click_df)

# 去取文章相似度
i2i_sim = pickle.load(open(save_path + 'itemcf_i2i_sim.pkl', 'rb'))

# 相似文章的数量
sim_item_topk = 10

# 召回文章数量
recall_item_num = 10

# 用户热度补全
item_topk_click = get_item_topk_click(all_click_df, k=50)

for user in tqdm(all_click_df['user_id'].unique()):
    user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, 
                                                        sim_item_topk, recall_item_num, item_topk_click)




100%|██████████| 10000/10000 [00:05<00:00, 1835.32it/s]


In [20]:
user_recall_items_dict

defaultdict(dict,
            {199992: [(234698, 0.4629845081963152),
              (235870, 0.450998173159896),
              (300755, 0.43935014561520125),
              (336221, 0.398714163978004),
              (156964, 0.32558046175154975),
              (300082, 0.31952781125572205),
              (95716, 0.3089749124573355),
              (277492, 0.30402523992610564),
              (48403, 0.2865121262339104),
              (277491, 0.27993842203651603)],
             199986: [(198567, 1.628996061162192),
              (293433, 1.628996061162192),
              (286259, 1.6289960611621919),
              (59002, 1.098529758769651),
              (5408, 1.098529758769651),
              (57605, 0.815016937384775),
              (5366, 0.395173453055193),
              (70217, 0.2854397595773914),
              (42225, 0.2790553132756236),
              (58434, 0.24014991860579388)],
             199968: [(284020, 0.22755980665670933),
              (62726, 0.19493562262564004),


In [21]:
# 将字典的形式转换成df
user_item_score_list = []

for user, items in tqdm(user_recall_items_dict.items()):
    for item, score in items:
        user_item_score_list.append([user, item, score])

recall_df = pd.DataFrame(user_item_score_list, columns=['user_id', 'click_article_id', 'pred_score'])

100%|██████████| 10000/10000 [00:00<00:00, 60421.51it/s]


In [22]:
recall_df

Unnamed: 0,user_id,click_article_id,pred_score
0,199992,234698,0.462985
1,199992,235870,0.450998
2,199992,300755,0.439350
3,199992,336221,0.398714
4,199992,156964,0.325580
...,...,...,...
99995,7,66056,0.107155
99996,7,63772,0.107155
99997,7,63651,0.107155
99998,7,211401,0.102744


In [25]:
# 生成提交文件
def submit(recall_df, topk=5, model_name=None):
    recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])
    recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # 判断是不是每个用户都有5篇文章及以上
    tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())
    assert tmp.min() >= topk
    
    del recall_df['pred_score']
    submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()
    
    submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]
    # 按照提交格式定义列名
    submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', 
                                                  3: 'article_3', 4: 'article_4', 5: 'article_5'})
    
    save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'
    submit.to_csv(save_name, index=False, header=True)


In [27]:
# 获取测试集
tst_click = pd.read_csv(data_path + 'testA_click_log.csv')
tst_users = tst_click['user_id'].unique()

# 从所有的召回数据中将测试集中的用户选出来
tst_recall = recall_df[recall_df['user_id'].isin(tst_users)]

# 生成提交文件
submit(tst_recall, topk=5, model_name='itemcf_baseline')


Unnamed: 0,user_id,click_article_id,pred_score


In [30]:
recall_df[0:7]

Unnamed: 0,user_id,click_article_id,pred_score
0,199992,234698,0.462985
1,199992,235870,0.450998
2,199992,300755,0.43935
3,199992,336221,0.398714
4,199992,156964,0.32558
5,199992,300082,0.319528
6,199992,95716,0.308975


In [34]:

submit = recall_df[recall_df['rank'] <= 5].set_index(['user_id', 'rank']).unstack(-1).reset_index()

submit

Unnamed: 0_level_0,user_id,click_article_id,click_article_id,click_article_id,click_article_id,click_article_id
rank,Unnamed: 1_level_1,1.0,2.0,3.0,4.0,5.0
0,7,291633,36162,30760,364017,68865
1,12,291633,214085,159192,209682,68865
2,30,107299,59761,205958,8172,196383
3,33,50644,218028,202355,215613,196383
4,51,85267,277170,36162,217714,129640
...,...,...,...,...,...,...
9995,199831,199198,64329,198659,277278,181147
9996,199912,199198,64329,198659,156560,199197
9997,199968,284020,62726,62615,62348,62608
9998,199986,198567,293433,286259,59002,5408
