In [81]:
# Row labels (users) and column labels (items) of the purchase matrix.
users = ["User" + str(n) for n in range(1, 6)]
items = ["Item" + letter for letter in "ABCDE"]

# Purchase records: one row per user, one column per item
# (1 = the user bought the item, 0 = not bought).
datasets = [
    [1, 0, 1, 1, 0],  # User1
    [1, 0, 0, 1, 1],  # User2
    [1, 0, 1, 0, 0],  # User3
    [0, 1, 0, 1, 1],  # User4
    [1, 1, 1, 0, 1],  # User5
]

In [83]:
import pandas as pd
# Wrap the raw purchase records in a labelled user x item table:
# rows are indexed by user, columns by item.
df = pd.DataFrame(data=datasets, index=users, columns=items)
df

Unnamed: 0,ItemA,ItemB,ItemC,ItemD,ItemE
User1,1,0,1,1,0
User2,1,0,0,1,1
User3,1,0,1,0,0
User4,0,1,0,1,1
User5,1,1,1,0,1


In [84]:
# Compute the Jaccard similarity coefficient between two specific columns
from sklearn.metrics import jaccard_score

# Item-to-item similarity: of the 5 users who bought ItemA or ItemB (union),
# only 1 bought both (intersection), so the coefficient is 1 / 5 = 0.2.
jaccard_score(df['ItemA'],df['ItemB'])   # intersection = 1, union = 5

0.2

# User-Based CF

基于用户的协同过滤

In [85]:
# 计算两两数据的杰卡德相似系数
from sklearn.metrics.pairwise import pairwise_distances

# Pairwise Jaccard similarity between users (rows of df):
# similarity = 1 - Jaccard distance.
# Jaccard is a boolean metric, so the purchase matrix is cast to bool explicitly;
# handing integers to pairwise_distances makes scikit-learn convert the data
# itself and emit a DataConversionWarning.
user_similar = 1 - pairwise_distances(df.values.astype(bool), metric='jaccard')
# Label both axes from df.index so the labels always match the rows compared.
user_similar = pd.DataFrame(user_similar, columns=df.index, index=df.index)
user_similar



Unnamed: 0,User1,User2,User3,User4,User5
User1,1.0,0.5,0.666667,0.2,0.4
User2,0.5,1.0,0.25,0.5,0.4
User3,0.666667,0.25,1.0,0.0,0.5
User4,0.2,0.5,0.0,1.0,0.4
User5,0.4,0.4,0.5,0.4,1.0


In [53]:
# For every user, the two most similar *other* users
top2_users = {}

for u in user_similar.index:
    # Row `u` holds u's similarity to every user; drop the entry for u itself.
    neighbours = user_similar.loc[u].drop([u])
    # Sort by similarity, highest first. Several neighbours can share the same
    # score, and the default quicksort is not stable, so a stable sort is used
    # to make the tie-breaking (and therefore the top 2) reproducible.
    ranked = neighbours.sort_values(ascending=False, kind="mergesort")
    # Keep the labels of the two most similar users.
    top2_users[u] = list(ranked.index[:2])

# Display the result
top2_users

{'User1': ['User3', 'User2'],
 'User2': ['User1', 'User4'],
 'User3': ['User1', 'User5'],
 'User4': ['User2', 'User5'],
 'User5': ['User3', 'User1']}

In [77]:
import numpy as np

# Boolean user x item matrix: True where the user bought the item
# (zeros and missing values both count as "not bought").
purchased = df.fillna(0).ne(0)

# Set of purchased items per user, computed once up front instead of being
# re-derived for every (user, similar user) pair.
bought_by = {
    user: set(purchased.columns[purchased.loc[user].values])
    for user in purchased.index
}

# Recommendations per user: {user: set of recommended items}
recommended_results = {}

# Build the recommendations from top2_users
for u, sim_users in top2_users.items():
    # Candidates: everything bought by any of the most similar users ...
    candidates = set().union(*(bought_by[sim_user] for sim_user in sim_users))
    # ... minus the items the current user has already bought.
    recommended_results[u] = candidates - bought_by[u]

In [78]:
# Display the user-based recommendations: a dict mapping each user to a set of items
recommended_results

{'User1': {'ItemE'},
 'User2': {'ItemB', 'ItemC'},
 'User3': {'ItemB', 'ItemD', 'ItemE'},
 'User4': {'ItemA', 'ItemC'},
 'User5': {'ItemD'}}

# Item-Based CF

基于物品的协同过滤

In [98]:
# Pairwise Jaccard similarity between items. The matrix is transposed first so
# that every item becomes a row; similarity = 1 - Jaccard distance.
# Jaccard is a boolean metric, so the data is cast to bool explicitly; passing
# integers makes scikit-learn convert the data itself and emit a
# DataConversionWarning.
item_similar = 1 - pairwise_distances(df.values.T.astype(bool), metric='jaccard')
# Label both axes from df.columns so the labels always match the items compared.
item_similar = pd.DataFrame(item_similar, columns=df.columns, index=df.columns)
item_similar



Unnamed: 0,ItemA,ItemB,ItemC,ItemD,ItemE
ItemA,1.0,0.2,0.75,0.4,0.4
ItemB,0.2,1.0,0.25,0.25,0.666667
ItemC,0.75,0.25,1.0,0.2,0.2
ItemD,0.4,0.25,0.2,1.0,0.5
ItemE,0.4,0.666667,0.2,0.5,1.0


In [89]:
# For every item, the two most similar *other* items
top2_items = {}

for i in item_similar.index:
    # Row `i` holds i's similarity to every item; drop the entry for i itself.
    neighbours = item_similar.loc[i].drop([i])
    # Sort by similarity, highest first. Several items can share the same score,
    # and the default quicksort is not stable, so a stable sort is used to make
    # the tie-breaking (and therefore the top 2) reproducible.
    ranked = neighbours.sort_values(ascending=False, kind="mergesort")
    # Keep the labels of the two most similar items.
    top2_items[i] = list(ranked.index[:2])

top2_items

{'ItemA': ['ItemC', 'ItemD'],
 'ItemB': ['ItemE', 'ItemC'],
 'ItemC': ['ItemA', 'ItemB'],
 'ItemD': ['ItemE', 'ItemA'],
 'ItemE': ['ItemB', 'ItemD']}

In [95]:
# Boolean user x item matrix: True where the user bought the item
# (zeros and missing values both count as "not bought").
purchased = df.fillna(0).ne(0)

# Recommendations per user: {user: set of recommended items}
recommended_results = {}

# Build the recommendations for every user
for u in df.index:
    # Items this user has already bought (derived once, reused for filtering below).
    already_bought = set(purchased.columns[purchased.loc[u].values])

    # Candidates: the most similar items of everything the user has bought.
    candidates = set()
    for i in already_bought:
        candidates.update(top2_items[i])

    # Do not recommend items the user already owns.
    recommended_results[u] = candidates - already_bought

In [96]:
# Display the item-based recommendations: a dict mapping each user to a set of items
recommended_results

{'User1': {'ItemB', 'ItemE'},
 'User2': {'ItemB', 'ItemC'},
 'User3': {'ItemB', 'ItemD'},
 'User4': {'ItemA', 'ItemC'},
 'User5': {'ItemD'}}