In [3]:
import polars as pl
from src.const.path import MIND_SMALL_TRAIN_DATASET_DIR
from src.utils.dataset.MINDDataFrame import MINDDataFrame


In [4]:
news_df, behavior_df = MINDDataFrame.read_df(
    MIND_SMALL_TRAIN_DATASET_DIR / "news.tsv", MIND_SMALL_TRAIN_DATASET_DIR / "behaviors.tsv"
)


In [17]:
# 各ユーザが各ニュースを見ている回数を計測。
behavior_df.select(["user_id", "news_id", "clicked"]).groupby(["user_id", "news_id"]).sum()


user_id,news_id,clicked
str,str,i32
"""U91836""","""N33677""",0
"""U73700""","""N23877""",0
"""U73700""","""N49712""",0
"""U73700""","""N11817""",0
"""U73700""","""N45509""",0
"""U73700""","""N47098""",0
"""U8125""","""N1940""",0
"""U8125""","""N261""",0
"""U8125""","""N10413""",0
"""U8125""","""N20678""",0


In [6]:
# 繰り返し見ているのに、見ていないのは負例の意思が強そうだなーと思う。
behavior_df.shape

(5843444, 6)

In [22]:
behavior_df["user_id"].to_list()


['U13740',
 'U13740',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U34670',
 'U34670',
 'U34670',
 'U34670',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125

In [8]:
import polars as pl
import numpy as np
from implicit.als import AlternatingLeastSquares
from pydantic import BaseModel
from typing import Tuple
import scipy
from src.utils.list import uniq


class MatrixIDMapper(BaseModel):
    user_id_to_idx_map: dict[str, int]
    news_id_to_idx_map: dict[str, int]
    idx_to_user_id: list[str]
    idx_to_news_id: list[str]


def create_feedback_matrix_from_behavior_log(
    behavior_df: pl.DataFrame,
) -> Tuple[scipy.sparse.spmatrix, MatrixIDMapper]:
    total_click_df = behavior_df.select(["user_id", "news_id", "clicked"]).groupby(["user_id", "news_id"]).sum()
    user_ids: list[str] = behavior_df["user_id"].to_list()
    news_ids: list[str] = behavior_df["news_id"].to_list()

    unique_user_id, unique_news_id = uniq(user_ids), uniq(news_ids)
    user_id_to_idx_map: dict[str, int] = {uid: i for i, uid in enumerate(unique_user_id)}
    news_id_to_idx_map: dict[str, int] = {uid: i for i, uid in enumerate(unique_news_id)}

    I = np.array([user_id_to_idx_map[uid] for uid in user_ids])
    J = np.array([news_id_to_idx_map[nid] for nid in news_ids])

    clicked = behavior_df["clicked"].to_numpy()
    matrix_shape = (len(unique_user_id), len(unique_news_id))

    feedback_matrix = scipy.sparse.csr_matrix((clicked, (I, J)), shape=matrix_shape)

    return feedback_matrix, MatrixIDMapper(
        **{
            "user_id_to_idx_map": user_id_to_idx_map,
            "news_id_to_idx_map": news_id_to_idx_map,
            "idx_to_user_id": unique_user_id,
            "idx_to_news_id": unique_news_id,
        }
    )


In [9]:
feedback_matrix, id_mapper = create_feedback_matrix_from_behavior_log(behavior_df)


In [10]:
from implicit.datasets.lastfm import get_lastfm
from implicit.als import AlternatingLeastSquares

model = AlternatingLeastSquares()
model.fit(feedback_matrix)


100%|██████████| 15/15 [01:09<00:00,  4.62s/it]


ids:[[ 2268  3332 15344 ...  4967  4770   539]
 [ 8172 19696 14300 ... 16736 11913  3984]
 [ 2165  3987  2044 ... 10845  3332 19924]
 ...
 [ 8977 17091 10185 ...  6981 15157 11336]
 [ 5762  1036 10206 ... 17828 13293 16979]
 [ 7263  6043  2397 ...  3870 15141 17538]]
scores:[[0.15188073 0.11302099 0.10399709 ... 0.04659994 0.04285727 0.0423349 ]
 [0.11888033 0.09270014 0.08965161 ... 0.05407768 0.05278782 0.05236803]
 [0.2516582  0.19072808 0.07353164 ... 0.02922386 0.02836019 0.0282329 ]
 ...
 [0.15867803 0.15364261 0.10977218 ... 0.05501303 0.05361199 0.05345459]
 [0.11432377 0.07975747 0.07757583 ... 0.06611562 0.06508169 0.06371599]
 [0.11520977 0.11195827 0.10168878 ... 0.04656731 0.04571471 0.04565423]]


In [11]:
user_ids = np.array([1, 3])
ids, scores = model.recommend(user_ids, feedback_matrix[user_ids], N=10, filter_already_liked_items=True)

print(f"ids:{ids}")
print(f"scores:{scores}")


ValueError: user_items must contain 1 row for every user in userids

In [47]:
id_mapper.idx_to_news_id[15846]

'N65185'