In [2]:
import polars as pl
from src.const.path import MIND_SMALL_TRAIN_DATASET_DIR
from src.utils.dataset.MINDDataFrame import MINDDataFrame


In [3]:
news_df, behavior_df = MINDDataFrame.read_df(
    MIND_SMALL_TRAIN_DATASET_DIR / "news.tsv", MIND_SMALL_TRAIN_DATASET_DIR / "behaviors.tsv"
)


In [19]:
import scipy
from numpy import array

I = array([0, 3, 1, 0])
J = array([0, 3, 1, 2])
V = array([4, 5, 7, 9])
A = sparse.coo_matrix((V, (I, J)), shape=(4, 4))


In [10]:
A.toarray()

array([[4, 0, 9, 0],
       [0, 7, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 5]])

In [17]:
# 各ユーザが各ニュースを見ている回数を計測。
behavior_df.select(["user_id", "news_id", "clicked"]).groupby(["user_id", "news_id"]).sum()


user_id,news_id,clicked
str,str,i32
"""U91836""","""N33677""",0
"""U73700""","""N23877""",0
"""U73700""","""N49712""",0
"""U73700""","""N11817""",0
"""U73700""","""N45509""",0
"""U73700""","""N47098""",0
"""U8125""","""N1940""",0
"""U8125""","""N261""",0
"""U8125""","""N10413""",0
"""U8125""","""N20678""",0


In [6]:
# 繰り返し見ているのに、見ていないのは負例の意思が強そうだなーと思う。
behavior_df.shape

(5843444, 6)

In [22]:
behavior_df["user_id"].to_list()


['U13740',
 'U13740',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U91836',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U73700',
 'U34670',
 'U34670',
 'U34670',
 'U34670',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125',
 'U8125

In [44]:
from src.utils.list import uniq
from pydantic import BaseModel
from typing import Tuple


class MatrixIDMapper(BaseModel):
    user_id_to_idx_map: dict[str, int]
    news_id_to_idx_map: dict[str, int]
    idx_to_user_id: list[str]
    idx_to_news_id: list[str]


def create_feedback_matrix_from_behavior_log(
    behavior_df: pl.DataFrame,
) -> Tuple[scipy.sparse.spmatrix, MatrixIDMapper]:
    total_click_df = behavior_df.select(["user_id", "news_id", "clicked"]).groupby(["user_id", "news_id"]).sum()
    user_ids: list[str] = behavior_df["user_id"].to_list()
    news_ids: list[str] = behavior_df["news_id"].to_list()

    unique_user_id, unique_news_id = uniq(user_ids), uniq(news_ids)
    user_id_to_idx_map: dict[str, int] = {uid: i for i, uid in enumerate(unique_user_id)}
    news_id_to_idx_map: dict[str, int] = {uid: i for i, uid in enumerate(unique_news_id)}

    I = np.array([user_id_to_idx_map[uid] for uid in user_ids])
    J = np.array([news_id_to_idx_map[nid] for nid in news_ids])

    clicked = behavior_df["clicked"].to_numpy()
    matrix_shape = (len(unique_user_id), len(unique_news_id))

    feedback_matrix = sparse.csr_matrix((clicked, (I, J)), shape=matrix_shape)

    return feedback_matrix, MatrixIDMapper(
        **{
            "user_id_to_idx_map": user_id_to_idx_map,
            "news_id_to_idx_map": news_id_to_idx_map,
            "idx_to_user_id": unique_user_id,
            "idx_to_news_id": unique_news_id,
        }
    )


In [45]:
feedback_matrix, id_mapper = create_feedback_matrix_from_behavior_log(behavior_df)


In [46]:
from implicit.datasets.lastfm import get_lastfm
from implicit.als import AlternatingLeastSquares

model = AlternatingLeastSquares()
model.fit(feedback_matrix)

userid = 0
ids, scores = model.recommend(userid, feedback_matrix[userid], N=10, filter_already_liked_items=True)

print(f"ids:{ids}")
print(f"scores:{scores}")


100%|██████████| 15/15 [01:00<00:00,  4.02s/it]

ids:[15846  5733 10197  4240  8428  4896 20003  2262 12513 10387]
scores:[0.04059085 0.03450556 0.02596695 0.02336278 0.02251694 0.02097143
 0.01799087 0.01706607 0.01695743 0.01463447]





In [47]:
id_mapper.idx_to_news_id[15846]

'N65185'