In [1]:
import numpy as np
import pandas as pd


#### 相似度计算

In [91]:
def euclidSim(inA, inB):
    """
    Euclidean-distance similarity.

    Computes the norm of (inA - inB) and maps it to 1 / (1 + distance),
    so identical inputs give 1.0 and the value shrinks towards 0 as the
    inputs move apart.
    """
    distance = np.linalg.norm(inA - inB)
    similarity = 1.0 / (1.0 + distance)
    return similarity
def pearsSim(inA, inB):
    """
    Pearson-correlation similarity, rescaled from [-1, 1] to [0, 1].

    np.corrcoef returns a 2x2 correlation matrix whose diagonal is 1; the
    off-diagonal entry [0, 1] is the correlation between inA and inB.
    rowvar=0 tells NumPy that observations run down the rows.
    If the correlation is NaN (e.g. one input has zero variance), 0 is returned.
    """
    corrMatrix = np.corrcoef(inA, inB, rowvar=0)
    corr = corrMatrix[0, 1]
    if np.isnan(corr):
        return 0
    return 0.5 + 0.5 * corr
def cosSim(inA, inB):
    """
    Cosine similarity, rescaled from [-1, 1] to [0, 1].

    inA and inB are vectors of equal length; column-vector matrices
    (as sliced from an np.matrix) and plain 1-D arrays are both accepted.
    Returns 0 when either vector has zero norm, otherwise
    0.5 + 0.5 * cos(angle between inA and inB).
    """
    # Flatten to 1-D arrays so the dot product is a true scalar. The previous
    # float(inA.T * inB) converted a 1x1 matrix to a scalar (deprecated since
    # NumPy 1.25) and raised for 1-D array inputs.
    vecA = np.asarray(inA).ravel()
    vecB = np.asarray(inB).ravel()
    num = float(np.dot(vecA, vecB))
    denom = np.linalg.norm(vecA) * np.linalg.norm(vecB)
    if denom == 0:
        return 0
    return 0.5 + 0.5 * (num / denom)
    
def loadExData():
    """
    Return a small 7x5 example rating table as a fresh list of lists.

    A value of 0 is treated as "not rated" by the estimators in this notebook.
    """
    ratings = [
        [0, 0, 0, 2, 2],
        [0, 0, 0, 3, 3],
        [0, 0, 0, 1, 1],
        [1, 1, 1, 0, 0],
        [2, 2, 2, 0, 0],
        [5, 5, 5, 0, 0],
        [1, 1, 1, 0, 0],
    ]
    return ratings
def loadExData2():
    """
    Return a larger, sparser 11x11 example rating table as a fresh list of lists.

    A value of 0 is treated as "not rated" by the estimators in this notebook.
    """
    ratings = [
        [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
        [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
        [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
        [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
        [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
        [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
        [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
        [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
        [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
        [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
        [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0],
    ]
    return ratings

In [82]:
# Wrap the example ratings in an np.matrix: the code below relies on matrix
# semantics (`.A`, `.I`, and `*` as matrix multiplication).
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
my_data = np.asmatrix(loadExData())

In [83]:
# Compare columns 0 and 4 of my_data under each of the three similarity measures.
print('欧式距离', euclidSim(my_data[:, 0], my_data[:, 4]))
print('皮尔逊相关系数', pearsSim(my_data[:, 0], my_data[:, 4]))
print('余弦相似度', cosSim(my_data[:, 0], my_data[:, 4]))

欧式距离 0.12973190755680383
皮尔逊相关系数 0.20596538173840329
余弦相似度 0.5


#### 基于物品相似度的推荐系统

In [88]:
def standEst(dataMat, user, sim, item):
    """
    Estimate the rating `user` would give to the unrated `item`.

    For every item the user has rated, the similarity between that item and
    `item` is computed over the users who rated both; the estimate is the
    similarity-weighted average of the user's own ratings.

    dataMat -- np.matrix of ratings (rows: users, columns: items, 0 = unrated)
    user    -- row index of the user
    sim     -- similarity function taking two column vectors
    item    -- column index of the item to estimate
    Returns 0 when the accumulated similarity is 0.
    """
    numItems = dataMat.shape[1]
    # Running totals of the similarity weights and of weight * rating.
    weightSum = 0
    weightedRatings = 0
    for other in range(numItems):
        rating = dataMat[user, other]
        if rating != 0:
            # `other` is an item this user has rated; `item` is the one being estimated.
            # Collect the rows (users) that rated both `item` and `other`.
            bothRated = np.logical_and(dataMat[:, item].A > 0, dataMat[:, other].A > 0)
            raters = np.nonzero(bothRated)[0]
            if len(raters) > 0:
                weight = sim(dataMat[raters, item], dataMat[raters, other])
            else:
                weight = 0
            weightSum += weight
            # The similarity acts as the weight on this rating.
            weightedRatings += weight * rating
    return 0 if weightSum == 0 else weightedRatings / weightSum
        
    
def recommend(dataMat, user, k=3, sim=cosSim, estMethod=standEst):
    """
    Return up to k (item, estimatedScore) pairs for items `user` has not rated.

    dataMat   -- np.matrix of ratings (rows: users, columns: items, 0 = unrated)
    user      -- row index of the user to recommend for
    k         -- maximum number of recommendations to return
    sim       -- similarity function handed to estMethod
    estMethod -- callable (dataMat, user, sim, item) -> estimated rating

    The list is sorted by estimated score, highest first. If the user has no
    unrated items, the string 'you rated everything' is returned instead.
    """
    # Look up unrated items in the matrix that was passed in. The original
    # read the notebook-global `my_data` here, silently ignoring `dataMat`.
    unratedItems = np.nonzero(dataMat[user, :].A == 0)[1]
    if len(unratedItems) == 0:
        return 'you rated everything'
    # Estimate a score for every item the user has not rated yet.
    itemScores = [(item, estMethod(dataMat, user, sim, item)) for item in unratedItems]
    return sorted(itemScores, key=lambda pair: pair[1], reverse=True)[:k]

In [85]:
# Overwrite a few entries in place so that some users have unrated items to recommend.
my_data[0, 1] = 4
my_data[0, 0] = 4
my_data[1, 0] = 4
my_data[2, 0] = 4
my_data[3, 3] = 2
my_data

matrix([[4, 4, 0, 2, 2],
        [4, 0, 0, 3, 3],
        [4, 0, 0, 1, 1],
        [1, 1, 1, 2, 0],
        [2, 2, 2, 0, 0],
        [5, 5, 5, 0, 0],
        [1, 1, 1, 0, 0]])

In [29]:
# Column indices where row 2 of my_data equals 0 (the unrated items of user 2).
np.nonzero(my_data[2, :].A == 0)[1]

array([1, 2])

In [30]:
# Dimensions of the rating matrix as (rows, columns).
my_data.shape

(7, 5)

In [92]:
# Recommendations for user 2 with the defaults from recommend's signature (k=3, cosSim, standEst).
recommend(my_data, 2)


[(2, 2.5), (1, 2.0243290220056256)]

In [90]:
# Same query, but item similarity is measured with euclidSim.
recommend(my_data, user=2, sim=euclidSim)

[(2, 3.0), (1, 2.8266504712098603)]

In [65]:
# Same query with pearsSim. NumPy may emit RuntimeWarnings from np.corrcoef here;
# pearsSim maps a NaN correlation to 0.
recommend(my_data, user=2, sim=pearsSim)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


[(1, 4.0), (2, 4.0)]

In [53]:
# Recommendations for user 3 with pearsSim; the default-similarity and euclidSim
# variants are kept below, commented out, for comparison.
# print(recommend(my_data, user=3))
# print(recommend(my_data, user=3, sim=euclidSim))
print(recommend(my_data, user=3, sim=pearsSim))

[(4, 0)]


  c /= stddev[:, None]
  c /= stddev[None, :]


#### 利用SVD提高推荐效果

In [54]:
# Load the larger example and compute its singular value decomposition.
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
my_data2 = np.asmatrix(loadExData2())
U, Sigma, VT = np.linalg.svd(my_data2)

In [55]:
# Singular values returned by np.linalg.svd for my_data2.
Sigma

array([15.77075346, 11.40670395, 11.03044558,  4.84639758,  3.09292055,
        2.58097379,  1.00413543,  0.72817072,  0.43800353,  0.22082113,
        0.07367823])

In [58]:
# Squared singular values and their total.
Sigma2 = Sigma ** 2
sum(Sigma2)

541.9999999999994

In [59]:
# Target: keep at least 90% of the total of the squared singular values.
sum(Sigma2) * 0.9

487.7999999999995

In [60]:
# The first three squared singular values alone already reach that target,
# so keeping only the first three is enough.
sum(Sigma2[:3])

500.5002891275791

#### 基于SVD的评分估计

In [96]:
def svdEst(dataMat, user, sim, item, numSV=4):
    """
    Estimate the rating `user` would give `item`, comparing items in a
    reduced space obtained from the SVD of the rating matrix.

    dataMat -- rating matrix (rows: users, columns: items, 0 = unrated);
               an np.matrix or anything np.asmatrix accepts
    user    -- row index of the user
    sim     -- similarity function taking two column vectors
    item    -- column index of the item to estimate
    numSV   -- positive number of leading singular values to keep (default 4,
               the value previously hard-coded); clamped to the number available
    Returns the similarity-weighted average of the user's ratings, or 0 when
    the accumulated similarity is 0. Prints each similarity it computes.
    """
    # np.asmatrix replaces np.mat (removed in NumPy 2.0); it returns the same
    # object for matrix input and lets plain ndarrays be passed as well.
    dataMat = np.asmatrix(dataMat)
    n = dataMat.shape[1]
    simTotal = 0
    ratSimTotal = 0
    U, Sigma, _ = np.linalg.svd(dataMat)
    # Never ask for more singular values than the decomposition produced;
    # the old fixed 4 failed to broadcast for matrices with fewer than 4.
    k = min(numSV, len(Sigma))
    SigmaK = np.asmatrix(np.diag(Sigma[:k]))
    # One row per item, expressed in the k-dimensional reduced space.
    xformedItems = dataMat.T * np.asmatrix(U[:, :k]) * SigmaK.I
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0:
            continue
        similarity = sim(xformedItems[item, :].T, xformedItems[j, :].T)
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    else:
        return ratSimTotal / simTotal


#     print(Sigma4)

In [98]:
# Recommend for user 1, estimating ratings with svdEst (which prints every similarity it computes).
recommend(my_data, user=1, sim=cosSim, estMethod=svdEst)

the 1 and 0 similarity is: 0.498142
the 1 and 3 similarity is: 0.498131
the 1 and 4 similarity is: 0.509974
the 2 and 0 similarity is: 0.552670
the 2 and 3 similarity is: 0.552976
the 2 and 4 similarity is: 0.217301


[(2, 3.4177569186592374), (1, 3.330717154558564)]

In [69]:
# Scratch check: multiplying np.eye(4) by a length-4 list broadcasts across
# columns, yielding a diagonal matrix holding those four values.
a = [1, 2, 3, 4, 5, 6]
np.eye(4) * a[:4]

array([[1., 0., 0., 0.],
       [0., 2., 0., 0.],
       [0., 0., 3., 0.],
       [0., 0., 0., 4.]])