In [2]:
#基于协同过滤的推荐算法（Collaborative Filtering）
#User-Based CF 基于用户的协同过滤推荐

import numpy as np
import pandas as pd

users = ['User1', 'User2', 'User3', 'User4', 'User5']
items = ['ItemA', 'ItemB', 'ItemC', 'ItemD', 'ItemE']

# 1.构造用户购买记录数据集
datasets = [
    [1,0,1,1,0],
    [1,0,0,1,1],
    [1,0,1,0,0],
    [0,1,0,1,1],
    [1,1,1,0,1],
]
df = pd.DataFrame(datasets, columns=items, index=users)
print(df)

# 2.计算用户间相似度
user_similar = []
for user in df.index:
    # 计算每个用户和其他用户间的相似度
    similarity_list = []
    for other_user in df.index:
        user_list = list(df.loc[user])
        other_user_list = list(df.loc[other_user])
        common_num = sum([ 1 for i in range(len(user_list)) if user_list[i] + other_user_list[i] == 2 ])
        similarity = (common_num / sum(user_list)) * (common_num / sum(other_user_list))
        similarity_list.append(similarity)
    # 构建用户间相似度矩阵
    user_similar.append(similarity_list)

user_similar = pd.DataFrame(user_similar, columns=users, index=users)

print("用户之间的两两相似度：")
print(user_similar)

# 3.构造每个用户相似度topN的用户字典
topN_users = {}
for i in user_similar.index:
    # 取出每一列数据，并删除自身，然后排序数据
    _df = user_similar.loc[i].drop([i])
    _df_sorted = _df.sort_values(ascending=False)

    top2 = list(_df_sorted.index[:2])
    topN_users[i] = top2

print("Top2相似用户：")
print(topN_users)

# 4.构建推荐结果
rs_results = {}
for user, sim_users in topN_users.items():
    # 存储每个用户推荐结果
    rs_result = set()   
    # 找到最相似的topN用户购买过的物品
    for sim_user in sim_users:
        rs_result = rs_result.union(set(df.loc[sim_user].replace(0, np.nan).dropna().index))
    # 过滤掉已经购买过的物品
    rs_result -= set(df.loc[user].replace(0, np.nan).dropna().index)

    rs_results[user] = rs_result

print("最终推荐结果：")
print(rs_results)

       ItemA  ItemB  ItemC  ItemD  ItemE
User1      1      0      1      1      0
User2      1      0      0      1      1
User3      1      0      1      0      0
User4      0      1      0      1      1
User5      1      1      1      0      1
用户之间的两两相似度：
          User1     User2     User3     User4     User5
User1  1.000000  0.444444  0.666667  0.111111  0.333333
User2  0.444444  1.000000  0.166667  0.444444  0.333333
User3  0.666667  0.166667  1.000000  0.000000  0.500000
User4  0.111111  0.444444  0.000000  1.000000  0.333333
User5  0.333333  0.333333  0.500000  0.333333  1.000000
Top2相似用户：
{'User1': ['User3', 'User2'], 'User2': ['User1', 'User4'], 'User3': ['User1', 'User5'], 'User4': ['User2', 'User5'], 'User5': ['User3', 'User1']}
最终推荐结果：
{'User1': {'ItemE'}, 'User2': {'ItemB', 'ItemC'}, 'User3': {'ItemB', 'ItemE', 'ItemD'}, 'User4': {'ItemA', 'ItemC'}, 'User5': {'ItemD'}}


In [1]:
#Item-Based CF 基于物品的协同过滤推荐
import numpy as np
import pandas as pd

users = ['User1', 'User2', 'User3', 'User4', 'User5']
items = ['ItemA', 'ItemB', 'ItemC', 'ItemD', 'ItemE']

# 1.构造用户购买记录数据集
datasets = [
    [1,0,1,1,0],
    [1,0,0,1,1],
    [1,0,1,0,0],
    [0,1,0,1,1],
    [1,1,1,0,1],
]
df = pd.DataFrame(datasets, columns=items, index=users)
print(df)

# 2.计算物品间相似度
item_similar = []
for item in df.T.index:
    # 计算每个用户和其他用户间的相似度
    similarity_list = []
    for other_item in df.T.index:
        item_list = list(df.T.loc[item])
        other_item_list = list(df.T.loc[other_item])
        common_num = sum([ 1 for i in range(len(item_list)) if item_list[i] + other_item_list[i] == 2 ])
        similarity = (common_num / sum(item_list)) * (common_num / sum(other_item_list))
        similarity_list.append(similarity)
    # 构建用户间相似度矩阵
    item_similar.append(similarity_list)

item_similar = pd.DataFrame(item_similar, columns=items, index=items)

print("物品之间的两两相似度：")
print(item_similar)

# 3.构造每个物品相似度topN的物品字典
topN_items = {}
for i in item_similar.index:
    # 取出每一列数据，并删除自身，然后排序数据
    _df = item_similar.loc[i].drop([i])
    _df_sorted = _df.sort_values(ascending=False)

    top2 = list(_df_sorted.index[:2])
    topN_items[i] = top2

print("Top2相似物品：")
print(topN_items)

# 4.构建推荐结果
rs_results = {}
for user in df.index:
    # 存储每个用户推荐结果
    rs_result = set() 
    # 根据每个物品找出最相似的topN物品
    for item in df.loc[user].replace(0, np.nan).dropna().index: 
        rs_result = rs_result.union(topN_items[item])
    # 过滤掉用户已经购买过的物品
    rs_result -= set(df.loc[user].replace(0, np.nan).dropna().index)
    
    rs_results[user] = rs_result

print("最终推荐结果：")
print(rs_results)

       ItemA  ItemB  ItemC  ItemD  ItemE
User1      1      0      1      1      0
User2      1      0      0      1      1
User3      1      0      1      0      0
User4      0      1      0      1      1
User5      1      1      1      0      1
物品之间的两两相似度：
          ItemA     ItemB     ItemC     ItemD     ItemE
ItemA  1.000000  0.125000  0.750000  0.333333  0.333333
ItemB  0.125000  1.000000  0.166667  0.166667  0.666667
ItemC  0.750000  0.166667  1.000000  0.111111  0.111111
ItemD  0.333333  0.166667  0.111111  1.000000  0.444444
ItemE  0.333333  0.666667  0.111111  0.444444  1.000000
Top2相似物品：
{'ItemA': ['ItemC', 'ItemD'], 'ItemB': ['ItemE', 'ItemC'], 'ItemC': ['ItemA', 'ItemB'], 'ItemD': ['ItemE', 'ItemA'], 'ItemE': ['ItemB', 'ItemD']}
最终推荐结果：
{'User1': {'ItemB', 'ItemE'}, 'User2': {'ItemB', 'ItemC'}, 'User3': {'ItemB', 'ItemD'}, 'User4': {'ItemA', 'ItemC'}, 'User5': {'ItemD'}}


In [2]:
df.T.index

Index(['ItemA', 'ItemB', 'ItemC', 'ItemD', 'ItemE'], dtype='object')

In [3]:
item_list = list(df.T.loc['ItemA'])

In [4]:
item_list

[1, 1, 1, 0, 1]

In [5]:
other_item_list = list(df.T.loc['ItemB'])

In [6]:
other_item_list

[0, 0, 0, 1, 1]

In [7]:
item = [item_list,other_item_list]
item

[[1, 1, 1, 0, 1], [0, 0, 0, 1, 1]]

In [8]:
np.linalg.norm(item,ord=2)

2.1010029896154583

In [9]:
np.dot(df, df.T)

array([[3, 2, 2, 1, 2],
       [2, 3, 1, 2, 2],
       [2, 1, 2, 0, 2],
       [1, 2, 0, 3, 2],
       [2, 2, 2, 2, 4]], dtype=int64)

In [17]:
# 计算矩阵的行向量范数
norms = np.linalg.norm(df, axis=1)
norms.shape

(5,)

In [18]:
norms[:, np.newaxis].shape

(5, 1)

In [13]:
# 归一化矩阵的行向量
normalized_matrix = df / norms[:, np.newaxis]
normalized_matrix

Unnamed: 0,ItemA,ItemB,ItemC,ItemD,ItemE
User1,0.57735,0.0,0.57735,0.57735,0.0
User2,0.57735,0.0,0.0,0.57735,0.57735
User3,0.707107,0.0,0.707107,0.0,0.0
User4,0.0,0.57735,0.0,0.57735,0.57735
User5,0.5,0.5,0.5,0.0,0.5


In [14]:
np.dot(normalized_matrix, normalized_matrix.T)

array([[1.        , 0.66666667, 0.81649658, 0.33333333, 0.57735027],
       [0.66666667, 1.        , 0.40824829, 0.66666667, 0.57735027],
       [0.81649658, 0.40824829, 1.        , 0.        , 0.70710678],
       [0.33333333, 0.66666667, 0.        , 1.        , 0.57735027],
       [0.57735027, 0.57735027, 0.70710678, 0.57735027, 1.        ]])

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
r = cosine_similarity(df,df)
r

array([[1.        , 0.66666667, 0.81649658, 0.33333333, 0.57735027],
       [0.66666667, 1.        , 0.40824829, 0.66666667, 0.57735027],
       [0.81649658, 0.40824829, 1.        , 0.        , 0.70710678],
       [0.33333333, 0.66666667, 0.        , 1.        , 0.57735027],
       [0.57735027, 0.57735027, 0.70710678, 0.57735027, 1.        ]])

In [26]:
std = np.sqrt(norms)
std

array([1.31607401, 1.31607401, 1.18920712, 1.31607401, 1.41421356])

In [29]:
norms = np.linalg.norm(df, ord=2)
d = std*std.T
d

array([1.73205081, 1.73205081, 1.41421356, 1.73205081, 2.        ])

In [32]:
np.dot(df, df.T)/d[:,None]

array([[1.73205081, 1.15470054, 1.15470054, 0.57735027, 1.15470054],
       [1.15470054, 1.73205081, 0.57735027, 1.15470054, 1.15470054],
       [1.41421356, 0.70710678, 1.41421356, 0.        , 1.41421356],
       [0.57735027, 1.15470054, 0.        , 1.73205081, 1.15470054],
       [1.        , 1.        , 1.        , 1.        , 2.        ]])

In [33]:
vec1 = df
vec2 = df

In [36]:
cos_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
print(cos_sim)

[[0.13333333 0.06666667 0.13333333 0.13333333 0.06666667]
 [0.13333333 0.13333333 0.13333333 0.13333333 0.13333333]
 [0.13333333 0.         0.13333333 0.06666667 0.        ]
 [0.13333333 0.13333333 0.06666667 0.13333333 0.2       ]
 [0.26666667 0.06666667 0.2        0.13333333 0.13333333]]


In [40]:
list(df.loc['User1'])

[1, 0, 1, 1, 0]

In [42]:
import numpy as np

def cosine_similarity(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    norm1 = np.linalg.norm(vector1)
    norm2 = np.linalg.norm(vector2)
    similarity = dot_product / (norm1 * norm2)
    return similarity

# 示例向量
vector1 = np.array([1, 2, 3])
vector2 = np.array([4, 5, 6])

# 计算余弦相似度
similarity = cosine_similarity(vector1, vector2)
print("余弦相似度:", similarity)

余弦相似度: 0.8164965809277259


In [43]:
df.index

Index(['User1', 'User2', 'User3', 'User4', 'User5'], dtype='object')

In [44]:
similarity_list = []
for user in df.index:
    # 计算每个用户和其他用户间的相似度
    similarity = []
    for other_user in df.index:
        vector1 = list(df.loc[user])
        vector2 = list(df.loc[other_user])
        # 计算余弦相似度
        similarity.append(cosine_similarity(vector1, vector2))
    similarity_list.append(similarity)

In [46]:
np.array(similarity_list)

array([[1.        , 0.66666667, 0.81649658, 0.33333333, 0.57735027],
       [0.66666667, 1.        , 0.40824829, 0.66666667, 0.57735027],
       [0.81649658, 0.40824829, 1.        , 0.        , 0.70710678],
       [0.33333333, 0.66666667, 0.        , 1.        , 0.57735027],
       [0.57735027, 0.57735027, 0.70710678, 0.57735027, 1.        ]])