# 第5章 ユーザベース協調フィルタリング

# 準備

In [1]:
import pprint
import numpy as np
np.set_printoptions(precision=3)

# Number of neighbor users kept for each target user
K_USERS = 3
# Similarity threshold a neighbor must reach to be kept
THETA = 0

# Rating matrix: one row per user, one column per item.
# np.nan marks a rating that has not been given.
R = np.array([
    [np.nan, 4, 3, 1, 2, np.nan],
    [5, 5, 4, np.nan, 3, 3],
    [4, np.nan, 5, 3, 2, np.nan],
    [np.nan, 3, np.nan, 2, 1, 1],
    [2, 1, 2, 4, np.nan, 3],
])

# User IDs and item IDs (row and column indices of R)
U = np.arange(R.shape[0])
I = np.arange(R.shape[1])

# Ui[i]: IDs of the users who rated item i
Ui = [U[~np.isnan(R[:, i])] for i in I]
# Iu[u]: IDs of the items rated by user u
Iu = [I[~np.isnan(R[u, :])] for u in U]

# Mean rating of each user, ignoring missing entries
ru_mean = np.nanmean(R, axis=1)
# Mean-centered rating matrix (each row shifted by that user's mean)
R2 = R - ru_mean[:, np.newaxis]

# ピアソンの相関係数

## 01 ピアソンの相関係数（分子）
## 02 ピアソンの相関係数の算出（分母左部）
## 03 ピアソンの相関係数の算出（分母右部）

In [2]:
def pearson1(u, v):
    """
    Return the Pearson correlation coefficient between user u and user v,
    computed from the rating matrix R.

    Only items rated by both users contribute. Deviations are taken from
    each user's mean rating (ru_mean). The numerator and the two factors
    of the denominator are printed as they are computed.

    Parameters
    ----------
    u : int
        ID of user u
    v : int
        ID of user v

    Returns
    -------
    float
        Pearson correlation coefficient

    Notes
    -----
    The denominator is not guarded; if it is zero the division is
    performed as-is.
    """
    # Items that both users have rated
    common = np.intersect1d(Iu[u], Iu[v])

    # Deviation of each common rating from the user's own mean
    dev_u = R[u, common] - ru_mean[u]
    dev_v = R[v, common] - ru_mean[v]

    # Numerator: sum of products of the paired deviations
    num = np.sum(dev_u * dev_v)
    print('num = {}'.format(num))

    # Denominator, left factor: root of user u's summed squared deviations
    den_u = np.sqrt(np.sum([d**2 for d in dev_u]))
    print('den_u = {:.3f}'.format(den_u))

    # Denominator, right factor: the same quantity for user v
    den_v = np.sqrt(np.sum([d**2 for d in dev_v]))
    print('den_v = {:.3f}'.format(den_v))

    return num / (den_u * den_v)

In [3]:
# Correlation between user 0 and user 1, computed from R
u, v = 0, 1
prsn = pearson1(u, v)
print('pearson1({}, {}) = {:.3f}'.format(u, v, prsn))

num = 2.0
den_u = 1.658
den_v = 1.414
pearson1(0, 1) = 0.853


# 平均中心化評価値行列に基づくピアソンの相関係数

## 04 ピアソンの相関係数（分子）

## 05 ピアソンの相関係数の算出（分母左部）

## 06 ピアソンの相関係数の算出（分母右部）

In [4]:
def pearson2(u, v):
    """
    Return the Pearson correlation coefficient between user u and user v,
    computed from the mean-centered rating matrix R2.

    Only items rated by both users contribute. The numerator and the two
    factors of the denominator are printed as they are computed.

    Parameters
    ----------
    u : int
        ID of user u
    v : int
        ID of user v

    Returns
    -------
    float
        Pearson correlation coefficient

    Notes
    -----
    The denominator is not guarded; if it is zero the division is
    performed as-is.
    """
    # Items that both users have rated
    common = np.intersect1d(Iu[u], Iu[v])

    # Mean-centered ratings of each user on the common items
    dev_u = R2[u, common]
    dev_v = R2[v, common]

    # Numerator: sum of products of the paired centered ratings
    num = np.sum(dev_u * dev_v)
    print('num = {}'.format(num))

    # Denominator, left factor: root of user u's summed squares
    den_u = np.sqrt(np.sum([d**2 for d in dev_u]))
    print('den_u = {:.3f}'.format(den_u))

    # Denominator, right factor: the same quantity for user v
    den_v = np.sqrt(np.sum([d**2 for d in dev_v]))
    print('den_v = {:.3f}'.format(den_v))

    return num / (den_u * den_v)

In [5]:
# Correlation between user 0 and user 1, computed from R2
u, v = 0, 1
prsn = pearson2(u, v)
print('pearson2({}, {}) = {:.3f}'.format(u, v, prsn))

num = 2.0
den_u = 1.658
den_v = 1.414
pearson2(0, 1) = 0.853


# ユーザ-ユーザ類似度行列

In [6]:
def pearson2(u, v):
    """
    Return the Pearson correlation coefficient between user u and user v,
    computed from the mean-centered rating matrix R2.

    Only items rated by both users contribute. This version prints
    nothing.

    Parameters
    ----------
    u : int
        ID of user u
    v : int
        ID of user v

    Returns
    -------
    float
        Pearson correlation coefficient

    Notes
    -----
    The denominator is not guarded; if it is zero the division is
    performed as-is.
    """
    # Items that both users have rated
    common = np.intersect1d(Iu[u], Iu[v])

    # Mean-centered ratings of each user on the common items
    dev_u = R2[u, common]
    dev_v = R2[v, common]

    # Numerator: sum of products of the paired centered ratings
    num = np.sum(dev_u * dev_v)
    # Denominator factors: root of each user's summed squares
    den_u = np.sqrt(np.sum([d**2 for d in dev_u]))
    den_v = np.sqrt(np.sum([d**2 for d in dev_v]))

    return num / (den_u * den_v)

In [7]:
def sim(u, v):
    """
    User similarity function: return the similarity between user u and
    user v.

    The similarity is the value returned by pearson2 for the same pair.

    Parameters
    ----------
    u : int
        ID of user u
    v : int
        ID of user v

    Returns
    -------
    float
        User similarity
    """
    similarity = pearson2(u, v)
    return similarity

## 07 ユーザ-ユーザ類似度行列

In [8]:
# User-user similarity matrix: S[u, v] holds sim(u, v) for every pair,
# including the diagonal and both (u, v) and (v, u).
S = np.zeros((U.size,) * 2)
for u in U:
    for v in U:
        S[u, v] = sim(u, v)
print('S = \n{}'.format(S))

S = 
[[ 1.     0.853  0.623  0.582 -0.997]
 [ 0.853  1.     0.649  0.968 -0.853]
 [ 0.623  0.649  1.     0.8   -0.569]
 [ 0.582  0.968  0.8    1.    -0.551]
 [-0.997 -0.853 -0.569 -0.551  1.   ]]


# 類似ユーザの選定

## 08 類似度上位k人のユーザ集合

## 09 類似度がしきい値以上のユーザ集合

In [9]:
# Per-user dictionary {other user: similarity}, with the target user
# itself left out of its own entry
Uu = {u: {v: S[u, v] for v in U if v != u} for u in U}
print('Uu = ')
pprint.pprint(Uu)

# Keep only the K_USERS most similar users (highest similarity first)
Uu = {
    u: dict(sorted(cands.items(), key=lambda kv: kv[1], reverse=True)[:K_USERS])
    for u, cands in Uu.items()
}
print('Uu = ')
pprint.pprint(Uu)

# Drop the users whose similarity is below the threshold THETA
Uu = {
    u: {other: value for other, value in cands.items() if value >= THETA}
    for u, cands in Uu.items()
}
print('Uu = ')
pprint.pprint(Uu)

# Final form: for each user, an array with the IDs of its similar users
Uu = {u: np.array(list(cands)) for u, cands in Uu.items()}
print('Uu = ')
pprint.pprint(Uu)

Uu = 
{0: {1: 0.8528028654224417,
     2: 0.6225430174794672,
     3: 0.5816750507471109,
     4: -0.9968461286620518},
 1: {0: 0.8528028654224417,
     2: 0.6488856845230501,
     3: 0.9684959969581863,
     4: -0.8528028654224418},
 2: {0: 0.6225430174794672,
     1: 0.6488856845230501,
     3: 0.7999999999999998,
     4: -0.5685352436149611},
 3: {0: 0.5816750507471109,
     1: 0.9684959969581863,
     2: 0.7999999999999998,
     4: -0.550920031004556},
 4: {0: -0.9968461286620518,
     1: -0.8528028654224418,
     2: -0.5685352436149611,
     3: -0.550920031004556}}
Uu = 
{0: {1: 0.8528028654224417, 2: 0.6225430174794672, 3: 0.5816750507471109},
 1: {0: 0.8528028654224417, 2: 0.6488856845230501, 3: 0.9684959969581863},
 2: {0: 0.6225430174794672, 1: 0.6488856845230501, 3: 0.7999999999999998},
 3: {0: 0.5816750507471109, 1: 0.9684959969581863, 2: 0.7999999999999998},
 4: {1: -0.8528028654224418, 2: -0.5685352436149611, 3: -0.550920031004556}}
Uu = 
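{0: {1: 0.8528028654224417, 2: 0.6225430174794672, 3: 0.5816750507471109},
 1: {0: 0.8528028654224417, 2: 0.6488856845230501, 3: 0.9684959969581863},
 2: {0: 0.6225430174794672, 1: 0.6488856845230501, 3: 0.7999999999999998},
 3: {0: 0.5816750507471109, 1: 0.9684959969581863, 2: 0.7999999999999998},
 4: {}}
Uu = 
{0: array([1, 2, 3]),
 1: array([3, 0, 2]),
 2: array([3, 1, 0]),
 3: array([1, 2, 0]),
 4: array([], dtype=float64)}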

# 嗜好予測

## 10 類似ユーザ集合の中でアイテムiを評価済みのユーザ集合

## 11 予測評価値

In [10]:
def predict(u, i):
    """
    Prediction function: return the predicted rating of user u for item i.

    The prediction is user u's mean rating plus the similarity-weighted
    average of the mean-centered ratings (R2) that u's similar users
    (Uu[u]) gave to item i. The set of similar users who rated the item
    is printed.

    When no similar user has rated the item, or when the similarities of
    those users sum to zero in absolute value (so the weighted average
    would be 0/0), user u's mean rating is returned instead.

    Parameters
    ----------
    u : int
        ID of user u
    i : int
        ID of item i

    Returns
    -------
    float
        Predicted rating of user u for item i
    """
    # Similar users of u who have actually rated item i
    Uui = np.intersect1d(Uu[u], Ui[i])
    print('U{}{} = {}'.format(u, i, Uui))

    # No usable neighbor: fall back to the user's own mean rating
    if Uui.size <= 0:
        return ru_mean[u]

    weights = S[u, Uui]
    norm = np.sum(np.abs(weights))
    # Every neighbor has similarity exactly 0 (possible because the
    # threshold filter keeps scores equal to THETA): dividing would give
    # nan, so use the same fallback as for an empty neighbor set.
    if norm == 0:
        return ru_mean[u]

    rui_pred = ru_mean[u] + np.sum(weights * R2[Uui, i]) / norm
    return rui_pred

In [11]:
# Predict two of user 0's missing ratings: item 0, then item 5
for u, i in [(0, 0), (0, 5)]:
    print('r{}{} = {:.3f}'.format(u, i, predict(u, i)))

U00 = [1 2]
r00 = 3.289
U05 = [1 3]
r05 = 1.601


# 評価値行列の補完

## 12 評価値行列の補完

In [12]:
# Completed rating matrix: start from a copy of R and replace every
# missing entry with its predicted rating; given ratings are kept.
R3 = R.copy()
for u in U:
    for i in I:
        if not np.isnan(R3[u, i]):
            continue
        R3[u, i] = predict(u, i)
print('R\'\' = \n{}'.format(R3))

U00 = [1 2]
U05 = [1 3]
U13 = [0 2 3]
U21 = [0 1 3]
U25 = [1 3]
U30 = [1 2]
U32 = [0 1 2]
U44 = []
R'' = 
[[3.289 4.    3.    1.    2.    1.601]
 [5.    5.    4.    3.449 3.    3.   ]
 [4.    4.747 5.    3.    2.    2.638]
 [2.524 3.    2.384 2.    1.    1.   ]
 [2.    1.    2.    4.    2.4   3.   ]]
