# Рекомендательные системы

## Урок 6. Двухуровневые модели рекомендаций

**Примечание**. Это заготовка к практической работе №6. Планирую добавить материал в течение нескольких дней.

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import bsr_matrix

from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender

from lightgbm import LGBMClassifier

In [2]:
RETAIL_DATA = "../hw2/retail_train.csv.zip"
PRODUCT_DATA = "../hw2/product.csv"
DEMOGRAPHIC_DATA = "../hw5/hh_demographic.csv"
VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [3]:
ITEM_ID_COL = 'item_id'
USER_ID_COL = 'user_id'
ITEM_INDEX_COL = 'item_idx'
USER_INDEX_COL = 'user_idx'
WEEK_NUM_COL = 'week_no'
ACTUAL_COL = 'actual'
TOPK_PRECISION = 5
TOPK_RECALL = 50

### Библиотека

In [4]:
# Precision@K
def precision_at_k(recommended_list, bought_list, k=5):
    try:
        _rec_list = recommended_list[:k]
        _b_and_r = np.intersect1d(bought_list, _rec_list)
        return _b_and_r.size / len(_rec_list)
    except (ZeroDivisionError, TypeError):
        return 0.0

def mean_precision_at_k(df, rec, bought, k=5):
    _result = df.apply(
        lambda row: precision_at_k(row[rec], row[bought], k),
        axis=1
    )
    return np.mean(_result)

In [5]:
# Recall@K
def recall_at_k(recommended_list, bought_list, k=5):
    try:
        _rec_list = recommended_list[:k]
        _b_and_r = np.intersect1d(bought_list, _rec_list)
        return _b_and_r.size / len(bought_list)
    except (ZeroDivisionError, TypeError):
        return 0.0

def mean_recall_at_k(df, rec, bought, k=5):
    _result = df.apply(
        lambda row: recall_at_k(row[rec], row[bought], k),
        axis=1
    )
    return np.mean(_result)

In [6]:
def read_transform_csv(path, column_map={}, index=None):
    columns = pd.read_csv(path, nrows=0).columns
    _column_map = dict(zip(columns, columns.str.lower()))
    _column_map.update(column_map)
    _data = pd.read_csv(path).rename(columns=_column_map)
    if index is not None:
        return _data.set_index(index)    
    return _data

In [7]:
# Предфильтрация
def prefilter_items(data, prevalence_range = (0.05, 0.95), price_range = (1.0, 100.0)):
    # Уберем самые популярные товары и самые непопулярные товары
    pop_thr, unpop_thr = prevalence_range
    item_cum_counts = data[ITEM_ID_COL].value_counts().cumsum()
    max_count = item_cum_counts.values[-1]
    top_popular_mask = item_cum_counts < max_count * pop_thr
    top_uppopular_mask = item_cum_counts > max_count * unpop_thr
    blocked_items = item_cum_counts[top_popular_mask | top_uppopular_mask].index
    
    # Уберем товары, которые не продавались за последние 12 месяцев
    recent_sale_items = data[ITEM_ID_COL][data[WEEK_NUM_COL] > data[WEEK_NUM_COL].max() - 53]
    old_sale_items = np.setdiff1d(data[ITEM_ID_COL], recent_sale_items)
    blocked_items = np.union1d(blocked_items, old_sale_items)
    
    # Уберем слишком дешевые товары и слишком дорогие товары
    # Цена товара косвенно оценивается по sales_value
    min_price, max_price = price_range
    bad_price_items = (
        data
        .assign(price = lambda x: np.where(x['quantity'] > 0, x['sales_value'] / x['quantity'], 0.0))
        .groupby(ITEM_ID_COL)
        .agg(min_item_price=('price', 'min'), max_item_price=('price', 'max'))
        .query("min_item_price >= @max_price or max_item_price <= @min_price")
        .index
    )
    blocked_items = np.union1d(blocked_items, bad_price_items)
    return data[~np.isin(data[ITEM_ID_COL], blocked_items)].copy()

In [8]:
# RecResult - класс, аккумулирующий результаты работы разных рекомендательных моделей
class RecResult:
    def __init__(self, data_train, aggcol, aggfunc):
        agg_df = (
            data_train.groupby([USER_ID_COL, ITEM_ID_COL])
            .agg(interaction=(aggcol, aggfunc))
            .reset_index()
        )
        self.user_idx_id = pd.DataFrame(enumerate(np.sort(agg_df[USER_ID_COL].unique())), 
                                        columns=[USER_INDEX_COL, USER_ID_COL])
        self.item_idx_id = pd.DataFrame(enumerate(np.sort(agg_df[ITEM_ID_COL].unique())), 
                                        columns=[ITEM_INDEX_COL, ITEM_ID_COL])
        self.user_item_interaction = (
            agg_df
            .merge(self.user_idx_id, on=USER_ID_COL)
            .merge(self.item_idx_id, on=ITEM_ID_COL)
        )
        interaction = self.user_item_interaction['interaction'].astype(float)
        user_idx = self.user_item_interaction[USER_INDEX_COL]
        item_idx = self.user_item_interaction[ITEM_INDEX_COL]
        self.user_item_matrix = bsr_matrix((interaction, (user_idx, item_idx)), 
                                           shape=(user_idx.max()+1, item_idx.max()+1)).tocsr()
        
    def init_result(self, data_test):  # init result with actuals
        self.actual = data_test.groupby(USER_ID_COL).agg(actual=(ITEM_ID_COL, list))
        self.result = (
            self.actual
            .merge(self.user_idx_id, on=USER_ID_COL, how='inner')
            .filter([USER_ID_COL, USER_INDEX_COL, ACTUAL_COL])        
        )
        return self
    
    def _get_recommendations(self, model, userid, N):
        rec = model.recommend(userid, self.user_item_matrix.tocsr(), N=N,
                              filter_already_liked_items=False,
                              recalculate_user=False)
        rec_df = pd.DataFrame(rec, columns=[ITEM_INDEX_COL, 'model_score']).set_index(ITEM_INDEX_COL)
        return rec_df.join(self.item_idx_id)[ITEM_ID_COL].tolist()
        
    
    def add_all_recommendations(self, model, N=5, model_name=None, show_progress=False):
        if model_name is None:
            model_name = model.__class__.__name__
        
        model.fit(self.user_item_matrix.T, show_progress=show_progress)
        if hasattr(model, 'user_factors') and hasattr(model, 'item_factors'):
            fast_recs = model.user_factors @ model.item_factors.T
            item_ids = self.item_idx_id[ITEM_ID_COL].values
            rec_matrix = item_ids[np.argsort(-fast_recs)[:, :N]]
            rec_df = pd.DataFrame.from_records(
                np.expand_dims(rec_matrix, axis=1),
                columns = [model_name],
                index=self.user_idx_id[USER_INDEX_COL]
            )
            self.result = self.result.join(rec_df)
        else:        
            def _get_user_rec(userid):
                return self._get_recommendations(model=model, userid=userid, N=N) 
        
            self.result[model_name] = self.result[USER_INDEX_COL].apply(_get_user_rec)        
        return self

### Данные

In [9]:
%%time
item_features = read_transform_csv(PRODUCT_DATA, {'PRODUCT_ID': ITEM_ID_COL}, index=ITEM_ID_COL)
user_features = read_transform_csv(DEMOGRAPHIC_DATA, {'household_key': USER_ID_COL}, index=USER_ID_COL)

# train test split
data = pd.read_csv(RETAIL_DATA)
# берем данные для тренировки matching модели
data_train_matcher = data[data[WEEK_NUM_COL] < data[WEEK_NUM_COL].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]
# берем данные для валидации matching модели
data_val_matcher = data[(data[WEEK_NUM_COL] >= data[WEEK_NUM_COL].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data[WEEK_NUM_COL] < data[WEEK_NUM_COL].max() - (VAL_RANKER_WEEKS))]
# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
# берем данные для теста ranking, matching модели
data_val_ranker = data[data[WEEK_NUM_COL] >= data[WEEK_NUM_COL].max() - VAL_RANKER_WEEKS]

# Prefiltered
data_train_matcher_filtered = prefilter_items(data_train_matcher, prevalence_range = (0.05, 0.75), price_range = (1.8, 50.0))
print(f"Decreased # of items from {data_train_matcher[ITEM_ID_COL].nunique()}"
      f" to {data_train_matcher_filtered[ITEM_ID_COL].nunique()}")

Decreased # of items from 83685 to 5426
Wall time: 4.91 s


In [10]:
# Result Accumulator
recresult = RecResult(data_train_matcher_filtered, 'quantity', 'count').init_result(data_val_matcher)

In [11]:
%%time
own_recommender = ItemItemRecommender(K=1, num_threads=4)
als_recommender = AlternatingLeastSquares(factors=3, regularization=0.05, iterations=10)

recall_results = []

for k in (20, 50, 100, 200, 500):
    model_name=f"Own Rec, N={k}"
    recresult.add_all_recommendations(own_recommender, N=k, model_name=model_name)
    recall_results.append((model_name, mean_recall_at_k(recresult.result, model_name, 'actual', k=k)))
    model_name=f"ALS, N={k}"
    recresult.add_all_recommendations(als_recommender, N=k, model_name=model_name)
    recall_results.append((model_name, mean_recall_at_k(recresult.result, model_name, 'actual', k=k)))

Wall time: 32.6 s


In [12]:
pd.DataFrame(recall_results, columns=['Model', 'Recall']).sort_values('Recall', ascending=False)

Unnamed: 0,Model,Recall
9,"ALS, N=500",0.12623
7,"ALS, N=200",0.082332
8,"Own Rec, N=500",0.078483
6,"Own Rec, N=200",0.078476
4,"Own Rec, N=100",0.077368
2,"Own Rec, N=50",0.068262
5,"ALS, N=100",0.056417
0,"Own Rec, N=20",0.046991
3,"ALS, N=50",0.03803
1,"ALS, N=20",0.019701
