In [1]:
import itertools

import numpy as np
import pandas as pd
from surprise import Dataset, KNNBasic
from tqdm import tqdm

In [2]:
data = pd.read_table("ml-100k/u.data", header=None, usecols=[0, 1, 2], names=["user_id", "item_id", "rating"])

In [3]:
# ndarrayの行番号とユーザーID,列番号とアイテムIDを対応付ける辞書を作成
user_id2row_num = {}
row_num2user_id = {}
for i, user_id in enumerate(set(list(data["user_id"]))):
    user_id2row_num[user_id] = i
    row_num2user_id[i] = user_id

item_id2column_num = {}
column_num2item_id = {}
for i, item_id in enumerate(set(list(data["item_id"]))):
    item_id2column_num[item_id] = i
    column_num2item_id[i] = item_id

In [4]:
data.head()

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [5]:
# ユーザー×アイテムの評価行列を作成
rate_matrix = np.zeros((len(user_id2row_num), len(item_id2column_num)))
for row in tqdm(data.itertuples(), total=data.shape[0]):
    rate_matrix[user_id2row_num[row.user_id], item_id2column_num[row.item_id]] = row.rating

100%|██████████████████████████████████████████████████████| 100000/100000 [00:00<00:00, 855624.19it/s]


In [6]:
data.query("user_id == 1").sort_values("item_id")

Unnamed: 0,user_id,item_id,rating
32236,1,1,5
23171,1,2,3
83307,1,3,4
62631,1,4,3
47638,1,5,3
...,...,...,...
34550,1,268,5
25138,1,269,5
27216,1,270,5
35409,1,271,2


In [7]:
rate_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [8]:
# ユーザー×ユーザーの組み合わせを列挙
row_list = [i for i in range(len(row_num2user_id))]
comb = [c for c in itertools.combinations(row_list, 2)]

In [39]:
for i in range(7):
    print(i)

0
1
2
3
4
5
6


In [42]:
# それぞれのコアにできるだけ均等になるように担当の組み合わせを配布
print(len(comb))
comb_list = np.array_split(comb, 7)

444153


In [43]:
len(comb_list)

7

In [10]:
# コサイン類似度を計算
def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [25]:
sim_list = []
# for c in np.array_split(comb, 7)[0]:
for c in tqdm(comb):
    sim_dict = {}
    sim_dict["comb"] = c
    sim_dict["sim"] = cos_sim(rate_matrix[c[0]], rate_matrix[c[1]])
    sim_list.append(sim_dict)

100%|███████████████████████████████████████████████████████| 444153/444153 [00:05<00:00, 86698.62it/s]


In [26]:
# ユーザー×ユーザー行列に反映
sim_matrix = np.zeros((len(user_id2row_num), len(user_id2row_num)))
for s in sim_list:
    sim_matrix[s["comb"][0], s["comb"][1]] = s["sim"]

In [27]:
sim_matrix

array([[0.        , 0.16693098, 0.04745954, ..., 0.14861694, 0.17950788,
        0.39817474],
       [0.        , 0.        , 0.11059132, ..., 0.16148478, 0.17226781,
        0.10579788],
       [0.        , 0.        , 0.        , ..., 0.10124256, 0.13341615,
        0.02655587],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.1016418 ,
        0.09511958],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.18246466],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [60]:
# 類似度が上位k件の行リストを作成し、それらのユーザーの評価を平均し、ソート
k = 5
predict_user_id = 1
predict_row_num = user_id2row_num[predict_user_id]


top_k_rows = np.argsort(sim_matrix[predict_row_num])[::-1][:k]

topk_mean_ratings = np.mean(rate_matrix[top_k_rows, :], axis=0)
for i in np.argsort(topk_mean_ratings)[::-1]:
    if rate_matrix[predict_row_num, i] == 0:
        print(column_num2item_id[i], topk_mean_ratings[i])
        break


# top_k_user_ids= [row_num2user_id[i] for i in top_k_rows]

474 4.0


In [58]:
np.argsort(topk_mean_ratings)[::-1]

array([ 173,   49,   55, ...,  771,  768, 1356])

In [56]:
kkk

array([5., 3., 4., ..., 0., 0., 0.])

In [53]:
topk_mean_ratings

array([4.2, 3.2, 1.4, ..., 0. , 0. , 0.6])

In [46]:
top_k_user_ids

[916, 864, 268, 92, 435]

In [47]:
# sim_matrix[0][np.argsort(sim_matrix[0])[::-1][:5]]
np.argsort(sim_matrix[0])[::-1][:5]

array([915, 863, 267,  91, 434])

In [37]:
max(sim_matrix[0])

0.569065731527988

In [14]:
sim_matrix(rate_matrix[0], rate_matrix[4])

0.37847518240569844

In [15]:
rate_matrix[0]

array([5., 3., 4., ..., 0., 0., 0.])

In [16]:
rate_matrix[1]

array([4., 0., 0., ..., 0., 0., 0.])

In [None]:
data.describe()

In [None]:
# Load the movielens-100k dataset
data = Dataset.load_builtin("ml-100k")

# Retrieve the trainset.
trainset = data.build_full_trainset()

In [None]:
# Build an algorithm, and train it.
algo = KNNBasic()
algo.fit(trainset)

In [None]:
uid = str(196)  # raw user id (as in the ratings file). They are **strings**!
iid = str(302)  # raw item id (as in the ratings file). They are **strings**!

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

In [None]:
iter_num = 100000000
result = 0
for i in range(iter_num):
    result = result + i

In [None]:
result

In [None]:
ls = [i for i in range(10000)]
c = itertools.combinations(ls, 2)

In [None]:
c

In [None]:
combinations = [comb for comb in c]

In [None]:
len(combinations)

In [None]:
import itertools

import numpy as np
import pandas as pd
from mpi4py import MPI
from surprise import Dataset, KNNBasic

# 設定値
core_num = 8
k = 5
predict_user_id = 1


comm = MPI.COMM_WORLD  # 並列処理開始

size = comm.Get_size()  # 並列処理に使用できるプロセッサ数
rank = comm.Get_rank()  # 各プロセッサのIDのようなもの

# コサイン類似度
def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


# ユーザー×アイテムの評価行列を作成
def get_rate_matrix():
    # データインポート
    data = pd.read_table("ml-100k/u.data", header=None, usecols=[0, 1, 2], names=["user_id", "item_id", "rating"])

    # ndarrayの行番号とユーザーID,列番号とアイテムIDを対応付ける辞書を作成
    user_id2row_num = {}
    row_num2user_id = {}
    for i, user_id in enumerate(set(list(data["user_id"]))):
        user_id2row_num[user_id] = i
        row_num2user_id[i] = user_id

    item_id2column_num = {}
    column_num2item_id = {}
    for i, item_id in enumerate(set(list(data["item_id"]))):
        item_id2column_num[item_id] = i
        column_num2item_id[i] = item_id

    # ユーザー×アイテムの評価行列を作成
    rate_matrix = np.zeros((len(user_id2row_num), len(item_id2column_num)))
    # for row in tqdm(data.itertuples(), total=data.shape[0]):
    for row in data.itertuples():
        rate_matrix[user_id2row_num[row.user_id], item_id2column_num[row.item_id]] = row.rating

    return rate_matrix, user_id2row_num, row_num2user_id, item_id2column_num, column_num2item_id


# プロセス0が担当領域の分割と各プロセスへの送信、類似度計算結果の取得を行い、結果を収集してpredictする
if rank == 0:
    rate_matrix, user_id2row_num, row_num2user_id, item_id2column_num, column_num2item_id = get_rate_matrix()

    # ユーザー×ユーザーの組み合わせを列挙
    row_list = [i for i in range(len(row_num2user_id))]
    comb = [c for c in itertools.combinations(row_list, 2)]

    # それぞれのコアにできるだけ均等になるように担当の組み合わせを配布
    # TODO:実行時に指定されたコア数で分割するように変更
    comb_list = np.array_split(comb, core_num - 1)
    for i in range(core_num - 1):
        comm.Send(rate_matrix, dest=i + 1)
        comm.Send(comb_list[i], dest=i + 1)

    # 類似度計算結果を格納するユーザー×ユーザー行列を作成
    sim_matrix = np.zeros((len(user_id2row_num), len(user_id2row_num)))
    for i in range(core_num - 1):
        comm.Recv(sim_list, source=i + 1)
        for s in sim_list:
            sim_matrix[s["comb"][0], s["comb"][1]] = s["sim"]

    # 類似度が上位k件のユーザーIDリストを作成
    predict_row_num = user_id2row_num[predict_user_id]
    top_k_rows = np.argsort(sim_matrix[predict_row_num])[::-1][:k]

    topk_mean_ratings = np.mean(rate_matrix[top_k_rows, :], axis=0)
    for i in np.argsort(topk_mean_ratings)[::-1]:
        if rate_matrix[predict_row_num, i] == 0:
            print(f"itemid:{column_num2item_id[i]},predicted_score:{topk_mean_ratings[i]}")
            break

else:
    # 各プロセスは担当の組み合わせの類似度を計算し、プロセス0へ返信
    comm.Recv(rate_matrix, source=0)
    comm.Recv(calc_combs, source=0)

    # コサイン類似度を計算
    sim_list = []
    for c in calc_combs:
        sim_dict = {}
        sim_dict["comb"] = c
        sim_dict["sim"] = cos_sim(rate_matrix[c[0]], rate_matrix[c[1]])
        sim_list.append(sim_dict)

    comm.Send(sim_list, dest=0)
    print("Hello world {0} / {1}".format(rank, size))