In [2]:
import pickle
from collections import Counter
from math import exp
from pathlib import Path

import pandas as pd
from implicit.als import AlternatingLeastSquares
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel
from rectools.models.utils import recommend_from_scores

## Загрузка данных

In [3]:
# Read the preprocessed interactions and expose the watch date under the
# column name RecTools uses for timestamps (Columns.Datetime).
interactions_df = pd.read_csv(
    "/content/drive/MyDrive/RecSys MTC/kion/interactions_processed.csv"
).rename(columns={"last_watch_dt": Columns.Datetime})

In [4]:
# Preview the first three rows to check the loaded columns.
interactions_df.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72
1,699317,1659,2021-05-29,8317,100
2,656683,7107,2021-05-09,10,0


## Предобработка

In [6]:
# Keep only interactions where at least MIN_WATCHED_PCT percent of the item was watched
# (presumably to drop accidental starts -- confirm the intent with the author).
MIN_WATCHED_PCT = 5
watched_enough = interactions_df["watched_pct"] >= MIN_WATCHED_PCT
interactions_df = interactions_df[watched_enough]

In [7]:
# Drop users with too few interactions.
# Counting is done with a vectorised value_counts() instead of a Python-level loop
# over a Counter; the order of the resulting id lists is not significant
# (they are only used for membership tests and for their lengths).
MIN_USER_INTERACTIONS = 4

interactions_per_user = interactions_df["user_id"].value_counts()
is_active = interactions_per_user >= MIN_USER_INTERACTIONS
active_users = interactions_per_user.index[is_active].tolist()
inactive_users = interactions_per_user.index[~is_active].tolist()

# .copy() detaches the result from the parent frame, so adding columns to it
# later does not raise SettingWithCopyWarning.
interactions_df = interactions_df[interactions_df["user_id"].isin(active_users)].copy()

len(active_users), len(inactive_users)

(286206, 507802)

Дадим эмпирический рейтинг взаимодействия с айтемом на основании процента просмотра:

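- 1 — просмотрено менее 10% (с учётом фильтра выше — от 5% до 10%)
- 2 — просмотрено от 10% до 30% (30% не включается)
- 3 — просмотрено от 30% до 60% (60% не включается)
- 4 — просмотрено от 60% до 85% (85% не включается)
- 5 — просмотрено от 85% до 100% включительно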

In [8]:
def watched_pct_to_score(pct: float) -> int:
    """Convert a watched percentage into an empirical 1-5 rating.

    Buckets (lower bound inclusive): [10, 30) -> 2, [30, 60) -> 3,
    [60, 85) -> 4, [85, 100] -> 5. Every other value -- below 10,
    above 100, or NaN -- gets the lowest rating, 1.
    """
    # Anything outside [10, 100] (including NaN, for which the comparison is False)
    # falls into the lowest bucket.
    if not (10 <= pct <= 100):
        return 1
    if pct < 30:
        return 2
    if pct < 60:
        return 3
    if pct < 85:
        return 4
    return 5

In [9]:
# Attach the 1-5 rating as the interaction weight.
# assign() builds a new frame instead of writing a column into a filtered slice
# (which triggers SettingWithCopyWarning); the function is passed directly,
# without a redundant lambda wrapper.
interactions_df = interactions_df.assign(
    **{Columns.Weight: interactions_df["watched_pct"].apply(watched_pct_to_score)}
)

In [10]:
# Check the last rows, which now include the computed weight column.
interactions_df.tail(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,watched_pct,weight
5476247,546862,9673,2021-04-13,2308,49,3
5476249,384202,16197,2021-04-19,6203,100,5
5476250,319709,4436,2021-08-15,3921,45,3


## ALS рекомендации и их объяснение для активных пользователей

In [11]:
# Build the RecTools dataset from the interaction columns only
# (Columns.Interactions selects the columns RecTools expects; extra columns are left out).
rectools_interactions = interactions_df[Columns.Interactions]
dataset = Dataset.construct(rectools_interactions)

In [12]:
# Hyper-parameters and run settings for the ALS model.
k_recs = 10
N_FACTORS = 64
RANDOM_STATE = 2023
NUM_THREADS = 16

# Underlying implicit ALS estimator, wrapped so it can be fitted on a RecTools Dataset.
base_als = AlternatingLeastSquares(
    factors=N_FACTORS,
    random_state=RANDOM_STATE,
    num_threads=NUM_THREADS,
)
als_model = ImplicitALSWrapperModel(model=base_als, verbose=1)

In [13]:
# Train the wrapped ALS model on the interactions dataset.
als_model.fit(dataset)

  0%|          | 0/15 [00:00<?, ?it/s]

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7fa11d157be0>

In [14]:
# Sparse user-item matrix in internal ids; it is passed to model.explain() later.
ui_csr = dataset.get_user_item_matrix()

# Plain-dict lookups between external (raw) ids and internal matrix indices.
user_ext_to_int = dataset.user_id_map.to_internal.to_dict()
item_int_to_ext = dataset.item_id_map.to_external.to_dict()
# Inverse of item_int_to_ext: external item id -> internal index.
item_ext_to_int = dict(zip(item_int_to_ext.values(), item_int_to_ext.keys()))

In [15]:
# Unwrap the underlying model object; its explain() method is called directly later on.
model = als_model.model

In [None]:
# Item metadata; only the id -> title lookup is needed for building explanations.
items_df = pd.read_csv("/content/drive/MyDrive/RecSys MTC/kion/items.csv")
item_to_title = items_df.set_index("item_id")["title"].to_dict()

In [None]:
# Persist everything needed to serve ALS explanations.
# One loop replaces six copy-pasted `with open(...)` blocks; each object is written to
# "<name>.pickle" inside ALS_ARTIFACTS_DIR, with the same file names as before.
ALS_ARTIFACTS_DIR = Path("/content/drive/MyDrive/RecSys MTC/prod/prod_models/als")
# Create the target directory if it is missing, so the first dump does not fail.
ALS_ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

als_artifacts = {
    "ui_csr": ui_csr,
    "user_ext_to_int": user_ext_to_int,
    "item_int_to_ext": item_int_to_ext,
    "item_ext_to_int": item_ext_to_int,
    "als_model": model,
    "item_to_title": item_to_title,
}

for artifact_name, artifact in als_artifacts.items():
    with open(ALS_ARTIFACTS_DIR / f"{artifact_name}.pickle", "wb") as f:
        pickle.dump(artifact, f)

In [90]:
def explain(user_id, item_id, threshold=0.05):
    """Explain why `item_id` is recommended to `user_id`.

    Looks up both external ids in the id mappings, asks the ALS model for the
    total score and the top contributing items (at most two), and turns them into
    a confidence percentage plus a short text citing the contributing titles.

    Args:
        user_id: external user id (a key of `user_ext_to_int`).
        item_id: external item id (a key of `item_ext_to_int`).
        threshold: minimal total score required to produce an explanation; also
            the minimal contribution for the second title to be mentioned.

    Returns:
        `(p, explanation)`, where `p` is an int percentage and `explanation` is a
        string, or `(None, None)` when the user or item is unknown, the total score
        is below `threshold`, or the model returns no contributing items.

    Raises:
        KeyError: if a contributing item has no entry in `item_to_title`.
    """
    if user_id not in user_ext_to_int or item_id not in item_ext_to_int:
        return None, None

    total_score, top_contributions, _ = model.explain(
        userid=user_ext_to_int[user_id],
        user_items=ui_csr,
        itemid=item_ext_to_int[item_id],
        N=2,
    )
    # Guard against an empty contribution list so the indexing below cannot fail.
    if total_score < threshold or len(top_contributions) == 0:
        return None, None

    # Logistic squashing of the raw score into the upper half of the percentage scale:
    # the sigmoid lies in (0, 1), so the result lies between 50 and 100.
    p = int((0.5 / (1 + exp(-(total_score * 5 - 1))) + 0.5) * 100)

    # The strongest contributor is always cited; the second one only if it exists
    # and its own contribution reaches the threshold.
    titles = [item_to_title[item_int_to_ext[top_contributions[0][0]]]]
    for contributing_item, contribution in top_contributions[1:2]:
        if contribution >= threshold:
            titles.append(item_to_title[item_int_to_ext[contributing_item]])

    explanation = "Рекомендуем тем, кому нравится " + " и ".join(
        f"«{title}»" for title in titles
    )
    return p, explanation

In [91]:
# Smoke test: print explanations for a small grid of users x items.
test_users = list(user_ext_to_int)[1000:1005]
test_items = list(item_ext_to_int)[1000:1005]
for user_id, item_id in ((u, i) for u in test_users for i in test_items):
    print(explain(user_id, item_id))


(None, None)
(66, 'Рекомендуем тем, кому нравится «Медиатор»')
(None, None)
(None, None)
(None, None)
(None, None)
(None, None)
(None, None)
(None, None)
(None, None)
(81, 'Рекомендуем тем, кому нравится «Балканский рубеж» и «Застава»')
(None, None)
(None, None)
(None, None)
(None, None)
(None, None)
(None, None)
(None, None)
(None, None)
(None, None)
(84, 'Рекомендуем тем, кому нравится «Коридор бессмертия» и «Легенда № 17»')
(None, None)
(None, None)
(None, None)
(None, None)
