In [9]:
import numpy as np
import pandas as pd


In [10]:
from tqdm import trange


class UserBased:
    mu: np.ndarray
    sim: np.ndarray

    def __init__(self, zero_mean: bool = True, beta: int = 1, idf: bool = False, verbosity: int = 0):
        """

        :param zero_mean:
        :param beta: Discounting parameter
        :param idf: Enable inverse document frequency management
        """
        self.zero_mean = zero_mean
        self.beta = beta
        self.idf = idf
        self.verbosity = verbosity

    def fit(self, r: np.ndarray):
        m, n = r.shape
        if self.zero_mean:
            self.mu = np.nanmean(r, axis=1)
        else:
            self.mu = np.zeros(m)

        self.sim = np.zeros((m, m))

        if self.idf:
            idf = np.log(1 + m / (~np.isnan(r)).sum(axis=0))
        else:
            idf = np.ones(n)

        if self.verbosity > 0:
            print(idf)

        for i in trange(m):
            for j in range(m):
                mask = ~np.isnan(r[i, :]) & ~np.isnan(r[j, :])

                si = r[i, mask] - self.mu[i]
                sj = r[j, mask] - self.mu[j]

                self.sim[i][j] = (si * sj * idf[mask]).sum() / (
                        np.sqrt((idf[mask] * (si ** 2)).sum()) * np.sqrt((idf[mask] * (sj ** 2)).sum()))

                total_intersection = mask.sum()

                self.sim[i][j] *= min(total_intersection, self.beta) / self.beta

        return self.sim

    def predict(self, r: np.array, u: int, top_k: int = 3) -> np.ndarray:
        """

        :param r: Rating matrix
        :param u: User u
        :param top_k: Top k neighbourhood
        :return: Calculated Rating of each item
        """

        _, n = r.shape

        score = np.zeros(n)

        for j in trange(n):
            score[j] = self.predict1(r, u, j, top_k)

        return score

    def predict1(self, r: np.array, u: int, j: int, top_k: int = 3) -> float:
        _, n = r.shape

        users_rated_j = np.nonzero(~np.isnan(r[:, j]))[0]

        topk_users = users_rated_j[self.sim[u, users_rated_j].argsort()[::-1][:top_k]]

        mean_centered_topk_user_rate = r[topk_users, j] - self.mu[topk_users]

        w = self.sim[u, topk_users]

        return np.dot(mean_centered_topk_user_rate, w) / np.abs(w).sum() + self.mu[u]



In [None]:
df = pd.read_csv('https://files.grouplens.org/datasets/movielens/ml-100k/u.data', delimiter=r'\t', engine='python',
                 names=['user_id', 'item_id', 'rating', 'timestamp'])
r = df.pivot(index='user_id', columns='item_id', values='rating').values

In [11]:
r = np.array([[7, 6, 7, 4, 5, 4],
              [6, 7, np.nan, 4, 3, 4],
              [np.nan, 3, 3, 1, 1, np.nan],
              [1, 2, 3, 3, 3, 4],
              [1, np.nan, 1, 2, 3, 3]])

In [12]:
irow, jcol = np.where(~np.isnan(r))
idx = np.random.choice(np.arange(3), 3, replace=False)
test_irow = irow[idx]
test_jcol = jcol[idx]

[[nan nan nan  4.  5.  4.]
 [ 6.  7. nan  4.  3.  4.]
 [nan  3.  3.  1.  1. nan]
 [ 1.  2.  3.  3.  3.  4.]
 [ 1. nan  1.  2.  3.  3.]]


In [13]:
r_copy = r.copy()
for i in test_irow:
    for j in test_jcol:
        r_copy[i][j] = np.nan




In [20]:
r_not_nan = np.argwhere(~np.isnan(r_copy))
print(r_not_nan)
print(r_not_nan[:,0])
def get_users_rated_item(r_not_nan, item):
    return r_not_nan[r_not_nan[:,0] == item][:, 1]


[[0 3]
 [0 4]
 [0 5]
 [1 0]
 [1 1]
 [1 3]
 [1 4]
 [1 5]
 [2 1]
 [2 2]
 [2 3]
 [2 4]
 [3 0]
 [3 1]
 [3 2]
 [3 3]
 [3 4]
 [3 5]
 [4 0]
 [4 2]
 [4 3]
 [4 4]
 [4 5]]
[0 0 0 1 1 1 1 1 2 2 2 2 3 3 3 3 3 3 4 4 4 4 4]


In [15]:
user = UserBased()
sim = user.fit(r_copy)


100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1245.27it/s]


In [23]:
r_not_nan = np.argwhere(~np.isnan(r_copy))

def get_users_rated_item(r_not_nan, item):
    users = []
    for i in r_not_nan:
        if i[1] == item:
            users.append(i[0])
    return users

def get_items_rated_by_user(r_not_nan, user):
    items = []
    for i in r_not_nan:
        if i[0] == user:
            items.append(i[1])
    return items


print(sim.shape)
def get_topk_users(users_rated_item, u, k):
    topk = {}
    for i in users_rated_item:
        topk[i] = sim[i][u]
    topk = dict(sorted(topk.items(), key=lambda item: item[1]))
    topk.popitem()

    return list(reversed(list(topk)))[:k]
# item = 5
# user = 2
# k = 3
# top_k_users = get_topk_users(get_users_rated_item(r_not_nan, item), user, k)
# top_k_users


(5, 5)


In [None]:
users = get_users_rated_item(r_not_nan, 3)

get_topk_users(users, 1, 2)

In [120]:
# mu = np.nanmean(r_copy, axis=1)
#
# m, n = r_copy.shape
# r_pred = np.empty([m, n])
# r_pred[:] = np.nan
# k = 2
# w = np.random.random(k).T
# alpha = 0.001
#
#
# for iteration in range(100):
#     for idx in r_not_nan:
#         u = idx[0]
#         i = idx[1]
#         top_k_users = get_topk_users(users_rated_item=get_users_rated_item(r_not_nan, i), u=u, k=k)
#         r_pred[u, i]= np.dot(w, (r_copy[top_k_users, i] - mu[top_k_users])).sum() + mu[u]
#     print(np.power((r_copy- r_pred), 2))
#     g_wu = -2 * (r_copy[u, i] - r_pred[u, i]) * (r_copy[i , j] - mu[u])
#     w += alpha * g_wu

def predict_user(r, u, k, alpha):

    r_not_nan = np.argwhere(~np.isnan(r))
    mu = np.nanmean(r, axis=1)
    items = get_items_rated_by_user(r_not_nan, u)
    w = np.random.random(k).T
    r_pred = np.empty(n)
    r_pred[:] = np.nan

    for iteration in range(100):
        diff = np.empty(r_copy.shape[1])
        for j in items:
            top_k_users = get_topk_users(users_rated_item=get_users_rated_item(r_not_nan, j), u=u, k=k)
            r_pred[j] = mu[u] + np.dot(w, (r[top_k_users, j]) - mu[u]).sum()
            np.append(diff, (r[u] - r_pred) * (r[u, j] - mu[u]))

        g_wu = -2 * diff.sum()
        print(g_wu)
        w = w - (g_wu * alpha)

    for j in items:
            top_k_users = get_topk_users(users_rated_item=get_users_rated_item(r_not_nan, j), u=u, k=k)
            r_pred[j] = mu[u] + np.dot(w, (r[top_k_users, j]) - mu[u]).sum()
    return r_pred


In [119]:
for u in range(r_copy.shape[0]):
    u0 = predict_user(r_copy, u, 2, 0.001)
    print(u0)


[nan nan nan nan nan nan]
[nan nan nan nan nan nan]
[nan nan nan nan nan nan]
[nan nan nan nan nan nan]
[nan nan nan nan nan nan]


In [121]:
predict_user(r_copy, 0, 2, 0.001)


nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


array([nan, nan, nan, nan, nan, nan])