In [1]:
import requests
import zipfile

import numpy as np
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt

from PIL import Image
from io import BytesIO
from textwrap import wrap
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import roc_auc_score, log_loss, ndcg_score
from pprint import pprint

In [9]:
from implicit.bpr import BayesianPersonalizedRanking

In [2]:
train = pl.read_parquet('../../data/lavka/train.parquet')
train = train.sample(fraction=0.3)

In [3]:
import implicit

In [4]:
df_interact = train.filter(
    pl.col("action_type").is_in(["AT_Purchase", "AT_CartUpdate", "AT_Click"])
)

In [5]:
from lavka_recsys.utils.matrix_operations import build_interaction_matrix

In [13]:
interaction_matrix, user2idx, idx2user, item2idx, idx2item = \
            build_interaction_matrix(df_interact,
                                    user_col="user_id",
                                    item_col="product_id",
                                    binary=True)

In [28]:
type(interaction_matrix)

scipy.sparse._csr.csr_matrix

In [10]:
bpr_model = BayesianPersonalizedRanking(
    factors=100,
    random_state=42
)

In [11]:
bpr_model.fit(interaction_matrix)

  0%|          | 0/100 [00:00<?, ?it/s]

In [26]:
grouped = train.to_pandas().groupby('user_id')

In [29]:
import numpy as np

# Dictionary to store scores for each request_id
request_scores = {}

for user_id, group in tqdm(grouped):
    # Skip cold users
    if user_id not in user2idx:
        continue
    
    # Converting ids to idxs
    user_idx = user2idx[user_id]
    product_ids = group['product_id'].values
    product_idxs = [item2idx[pid] for pid in product_ids if pid in item2idx]

    # Get scores for the specified products
    product_idxs, scores = bpr_model.recommend(
        userid=user_idx,
        user_items=interaction_matrix,
        N=len(product_idxs),
        filter_already_liked_items=False,
        items=product_idxs
    )

    # Store the results
    scores_list = [(idx2item[pidx], score) for pidx, score in zip(product_idxs, scores)]
    request_scores[user_id] = scores_list

  0%|          | 0/3446 [00:00<?, ?it/s]

In [31]:
train.schema

Schema([('action_type', String),
        ('city_name', String),
        ('position_in_request', Int64),
        ('product_category', String),
        ('product_id', UInt64),
        ('product_image', String),
        ('product_name', String),
        ('request_id', UInt64),
        ('source_type', String),
        ('store_id', UInt64),
        ('timestamp', Int64),
        ('user_id', UInt64)])

In [32]:
import polars as pl

# Flatten the dictionary into a list of records
records = []
for user_id, items in request_scores.items():
    for product_id, score in items:
        records.append({'user_id': user_id, 'product_id': product_id, 'score': score})

# Create a Polars DataFrame
schema = {
    'user_id': pl.UInt64,
    'product_id': pl.UInt64,
    'score': float
}
result_df = pl.DataFrame(records, schema=schema)

In [35]:
train.join(result_df, on=['user_id', 'product_id'], how='left').describe()

statistic,action_type,city_name,position_in_request,product_category,product_id,product_image,product_name,request_id,source_type,store_id,timestamp,user_id,score
str,str,str,f64,str,f64,str,str,f64,str,f64,f64,f64,f64
"""count""","""55083124""","""55083124""",52545440.0,"""55083124""",55083124.0,"""55083124""","""55083124""",53218782.0,"""53248783""",55083124.0,55083124.0,55083124.0,54909940.0
"""null_count""","""0""","""0""",2537684.0,"""0""",0.0,"""0""","""0""",1864342.0,"""1834341""",0.0,0.0,0.0,173184.0
"""mean""",,,20.677003,,9.203e+18,,,9.2093e+18,,9.5054e+18,1691300000.0,8.6518e+18,1.666702
"""std""",,,40.215798,,5.3232e+18,,,5.3362e+18,,5.7427e+18,9513400.0,5.0993e+18,1.101319
"""min""","""AT_CartUpdate""","""Екатеринбург""",0.0,"""L-карнитин""",135310000000000.0,"""https://avatars.mds.yandex.net…","""10 шведских фрикаделек с брусн…",4626500000000.0,"""ST_Catalog""",1275200000000000.0,1672500000.0,2849200000000000.0,-4.526204
"""25%""",,,3.0,,4.6563e+18,,,4.5903e+18,,4.197e+18,1683600000.0,4.4812e+18,0.954798
"""50%""",,,8.0,,9.0017e+18,,,9.2019e+18,,1.029e+19,1692300000.0,7.5464e+18,1.85258
"""75%""",,,21.0,,1.41e+19,,,1.3842e+19,,1.4641e+19,1699300000.0,1.3746e+19,2.468862
"""max""","""AT_View""","""Челябинск""",780.0,"""сахар коричневый рафинад/колот…",1.8445e+19,"""https://avatars.mds.yandex.net…","""хлеб Бородинский «Вкус хлеба» …",1.8447e+19,"""ST_Upsale""",1.837e+19,1706900000.0,1.8437e+19,5.417954
