# Рекомендательные системы

## Урок 5. Ранжирование. Гибридные рекомендательные системы

## Практическое задание: подбор параметров гибридной модели

In [1]:
import warnings
# Ignore every warning category from here on. This keeps the notebook output
# clean, but it also hides deprecation and numerical warnings.
warnings.simplefilter(action='ignore')

In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import bsr_matrix, csr_matrix
from sklearn.model_selection import ParameterGrid
from tqdm.notebook import tqdm

In [3]:
from lightfm import LightFM

In [4]:
# Input file locations (relative paths).
RETAIL_DATA = "../hw2/retail_train.csv.zip"  # transactions; loaded with pd.read_csv
PRODUCT_DATA = "../hw2/product.csv"  # item attributes; its PRODUCT_ID column becomes item_id
DEMOGRAPHIC_DATA = "hh_demographic.csv"  # household attributes; household_key becomes user_id

### Библиотека

In [5]:
# Precision@K
def precision_at_k(recommended_list, bought_list, k=5):
    """Return the share of the first ``k`` recommendations that were bought.

    The denominator is the number of recommendations actually taken (at most
    ``k``), and matches are counted on unique item ids. Returns 0.0 when the
    recommendation list is empty or is not a sliceable sequence (for example
    a NaN placeholder).
    """
    try:
        top_k = recommended_list[:k]
        hits = np.intersect1d(bought_list, top_k).size
        return hits / len(top_k)
    except (ZeroDivisionError, TypeError):
        # Empty list -> division by zero; non-sequence -> TypeError.
        return 0.0

def mean_precision_at_k(df, rec, bought, k=5):
    """Average ``precision_at_k`` over the rows of ``df``.

    ``rec`` and ``bought`` are the names of the columns holding, per row, the
    recommended items and the items actually bought.
    """
    def row_precision(row):
        return precision_at_k(row[rec], row[bought], k)

    per_row = df.apply(row_precision, axis=1)
    return np.mean(per_row)

In [6]:
def read_transform_csv(path, column_map=None, index=None):
    """Load a CSV, lower-casing column names and applying optional renames.

    Parameters
    ----------
    path
        CSV location, passed to ``pandas.read_csv``.
    column_map : mapping, optional
        ``{original_name: new_name}``. Entries override the default
        lower-casing for the columns they mention.
    index : optional
        Column name(s), as they are called *after* renaming, to use as the
        index. When ``None`` the default index is kept.

    Returns
    -------
    pandas.DataFrame
        The loaded table with renamed columns (and index, if requested).
    """
    # The file is read once; the header for the rename map comes from the
    # loaded frame rather than from a separate header-only read.
    frame = pd.read_csv(path)
    renames = dict(zip(frame.columns, frame.columns.str.lower()))
    if column_map:
        # Explicit renames win over the lower-casing default.
        renames.update(column_map)
    frame = frame.rename(columns=renames)
    if index is not None:
        return frame.set_index(index)
    return frame

In [7]:
# Pre-filtering
def prefilter_items(data, prevalence_range = (0.05, 0.95), price_range = (1.0, 100.0)):
    """Return a copy of ``data`` without rows of items judged unsuitable.

    An item is dropped when at least one of these holds:

    * popularity: with items ordered by descending row count, its cumulative
      row count is below ``prevalence_range[0]`` or above
      ``prevalence_range[1]`` of the total number of rows;
    * recency: it has no row with ``week_no`` greater than the latest week
      minus 53;
    * price: its lowest per-row price is at or above ``price_range[1]``, or
      its highest per-row price is at or below ``price_range[0]``. The price
      is estimated as ``sales_value / quantity`` and taken as 0.0 when
      ``quantity`` is not positive.
    """
    item_ids = data['item_id']

    # Most popular and least popular items.
    head_share, tail_share = prevalence_range
    cumulative = item_ids.value_counts().cumsum()
    total_rows = cumulative.values[-1]
    outside_band = (cumulative < total_rows * head_share) | (cumulative > total_rows * tail_share)
    by_popularity = cumulative[outside_band].index

    # Items with no sales in roughly the last 12 months.
    latest_week = data['week_no'].max()
    sold_recently = item_ids[data['week_no'] > latest_week - 53]
    stale = np.setdiff1d(item_ids, sold_recently)

    # Items that are always too cheap or always too expensive.
    # The unit price is estimated indirectly from sales_value.
    cheapest_allowed, priciest_allowed = price_range
    unit_price = np.where(data['quantity'] > 0, data['sales_value'] / data['quantity'], 0.0)
    price_span = (
        data
        .assign(price=unit_price)
        .groupby('item_id')['price']
        .agg(['min', 'max'])
    )
    out_of_range = (price_span['min'] >= priciest_allowed) | (price_span['max'] <= cheapest_allowed)
    by_price = price_span[out_of_range].index

    blocked = np.union1d(np.union1d(by_popularity, stale), by_price)
    return data[~np.isin(item_ids, blocked)].copy()

In [8]:
# Build a helper table with the columns
# user_id, item_id, interaction, user_idx, item_idx
def get_user_item_interaction(data, aggcol, aggfunc):
    """Aggregate ``data`` per (user, item) pair and attach dense indices.

    ``interaction`` is ``aggfunc`` applied to ``aggcol`` within each
    ``(user_id, item_id)`` group. ``user_idx`` / ``item_idx`` are the
    zero-based positions of the ids in their sorted unique order.
    """
    pairs = (
        data.groupby(['user_id', 'item_id'])
        .agg(interaction=(aggcol, aggfunc))
        .reset_index()
    )
    indexed = pairs
    for entity in ('user', 'item'):
        id_col, idx_col = f'{entity}_id', f'{entity}_idx'
        ordered_ids = np.sort(pairs[id_col].unique())
        lookup = pd.DataFrame(enumerate(ordered_ids), columns=[idx_col, id_col])
        indexed = indexed.merge(lookup, on=id_col)
    return indexed

In [9]:
def get_user_item_matrix(user_item_interaction):
    """Build a sparse (BSR) user x item matrix from the interaction table.

    Rows are ``user_idx``, columns are ``item_idx`` and the stored values are
    ``interaction`` cast to float. The shape is the largest index plus one
    along each axis.
    """
    rows = user_item_interaction['user_idx']
    cols = user_item_interaction['item_idx']
    values = user_item_interaction['interaction'].astype(float)
    n_users = rows.max() + 1
    n_items = cols.max() + 1
    return bsr_matrix((values, (rows, cols)), shape=(n_users, n_items))

In [10]:
def get_feature_dummies(user_item_interaction, features, on='user'):
    """One-hot encode ``features`` for the ids in the interaction table.

    ``on`` selects the ``{on}_idx`` / ``{on}_id`` column pair. The output has
    one row per unique id, ordered by ascending ``{on}_idx``, and every column
    of ``features`` expanded into dummy columns. Ids absent from ``features``
    are kept by the left join as rows of missing values.
    """
    idx_col, id_col = f'{on}_idx', f'{on}_id'
    ordered_ids = user_item_interaction.sort_values(idx_col)[id_col].unique()
    aligned = pd.DataFrame(index=ordered_ids).join(features)
    return pd.get_dummies(aligned, columns=list(aligned.columns))

### Данные

In [11]:
%%time
# Load the side information; both tables are re-keyed to the ids used in the
# transaction data (item_id / user_id) and indexed by them.
item_features = read_transform_csv(PRODUCT_DATA, {'PRODUCT_ID': 'item_id'}, index='item_id')
user_features = read_transform_csv(DEMOGRAPHIC_DATA, {'household_key': 'user_id'}, index='user_id')

# Train/test split by week number: the latest weeks form the test set.
# NOTE(review): `>= max - test_size_weeks` keeps week numbers max-3 .. max,
# i.e. 4 distinct values if week_no is a consecutive integer, not 3 - confirm
# whether that is intended.
data = pd.read_csv(RETAIL_DATA)
test_size_weeks = 3
data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

# Items each user actually bought in the test period (one list per user)
actual = data_test.groupby('user_id').agg(actual=('item_id', list))

# Pre-filtered training data (tighter ranges than the function defaults)
data_train_filtered = prefilter_items(data_train, prevalence_range = (0.05, 0.75), price_range = (1.5, 50.0))

# Interaction strength = number of transaction rows per (user, item) pair
# ('count' over the quantity column, not the sum of quantities).
user_item_interaction = get_user_item_interaction(data_train_filtered, 'quantity', 'count')
user_item_matrix = get_user_item_matrix(user_item_interaction)

# Features: one-hot encoded, with rows ordered by user_idx / item_idx
user_feature_dummies = get_feature_dummies(user_item_interaction, user_features, on='user')
user_feature_csr = csr_matrix(user_feature_dummies.values)
item_feature_dummies = get_feature_dummies(user_item_interaction, item_features, on='item')
item_feature_csr = csr_matrix(item_feature_dummies.values)

print(f"Decreased # of items from {data_train['item_id'].nunique()} to {data_train_filtered['item_id'].nunique()}")

Decreased # of items from 86865 to 6047
Wall time: 6.28 s


### Модель

In [12]:
%%time
# LightFM Grid Search
model_param_grid = {
    'no_components': [4, 10, 40],
    'loss': ['logistic', 'bpr', 'warp'],
    'learning_rate': [0.01],
    'item_alpha': [0.001, 0.4],
    'user_alpha': [0.001, 0.1],
    'random_state': [42],
    'k': [1, 5],
    'n': [15],
    'max_sampled': [10, 100]
}
grid_len = np.prod([len(v) for v in model_param_grid.values()])

result = actual

with tqdm(desc="LightFM Grid Search", total=grid_len) as progress:
    for prm in ParameterGrid(model_param_grid):
        param_str = ", ".join(f"{p}={repr(prm[p])}" for p in model_param_grid)
        model = LightFM(**prm)
        model_name = f"{model.__class__.__name__}({param_str})"        
        model.fit(
            user_item_matrix.sign().tocsr(),  # 0/1
            sample_weight=user_item_matrix.tocoo(),
            user_features=user_feature_csr,
            item_features=item_feature_csr,
            epochs=20,
            verbose=False
        )
        predict_score = model.predict(
            user_ids = user_item_interaction['user_idx'].values,
            item_ids = user_item_interaction['item_idx'].values,
            user_features=user_feature_csr,
            item_features=item_feature_csr,
            num_threads=10
        )
        rec_df = (
            user_item_interaction
            .assign(score=predict_score)
            .sort_values(['user_id', 'score'], ascending=(True, False))
            .groupby('user_id')
            .head(5)
            .filter(['user_id', 'item_id'])
            .groupby('user_id')
            .agg(model_name=('item_id', list))
            .rename(columns={'model_name': model_name})
        )
        result = result.join(rec_df)
        progress.update(1)

LightFM Grid Search:   0%|          | 0/144 [00:00<?, ?it/s]

Wall time: 6h 21min 34s


In [14]:
%%time
# Выбираем лучшие параметры по precision@5
gen_metrics = (
    (model_name, mean_precision_at_k(result, model_name, 'actual'))
    for model_name in result.columns[1:]
)
(
    pd.DataFrame(gen_metrics, columns=['model', 'precision@5'])
    .sort_values('precision@5', ascending=False)
    .head(20)
    .style.set_properties(subset=['model'], width='750px')
)

Wall time: 26.3 s


Unnamed: 0,model,precision@5
53,"LightFM(no_components=40, loss='bpr', learning_rate=0.01, item_alpha=0.001, user_alpha=0.1, random_state=42, k=5, n=15, max_sampled=10)",0.136974
23,"LightFM(no_components=40, loss='bpr', learning_rate=0.01, item_alpha=0.001, user_alpha=0.1, random_state=42, k=1, n=15, max_sampled=100)",0.136974
59,"LightFM(no_components=40, loss='bpr', learning_rate=0.01, item_alpha=0.001, user_alpha=0.1, random_state=42, k=5, n=15, max_sampled=100)",0.136974
17,"LightFM(no_components=40, loss='bpr', learning_rate=0.01, item_alpha=0.001, user_alpha=0.1, random_state=42, k=1, n=15, max_sampled=10)",0.136974
22,"LightFM(no_components=40, loss='bpr', learning_rate=0.01, item_alpha=0.001, user_alpha=0.001, random_state=42, k=1, n=15, max_sampled=100)",0.135504
52,"LightFM(no_components=40, loss='bpr', learning_rate=0.01, item_alpha=0.001, user_alpha=0.001, random_state=42, k=5, n=15, max_sampled=10)",0.135504
58,"LightFM(no_components=40, loss='bpr', learning_rate=0.01, item_alpha=0.001, user_alpha=0.001, random_state=42, k=5, n=15, max_sampled=100)",0.135504
16,"LightFM(no_components=40, loss='bpr', learning_rate=0.01, item_alpha=0.001, user_alpha=0.001, random_state=42, k=1, n=15, max_sampled=10)",0.135504
15,"LightFM(no_components=10, loss='bpr', learning_rate=0.01, item_alpha=0.001, user_alpha=0.1, random_state=42, k=1, n=15, max_sampled=10)",0.135406
51,"LightFM(no_components=10, loss='bpr', learning_rate=0.01, item_alpha=0.001, user_alpha=0.1, random_state=42, k=5, n=15, max_sampled=10)",0.135406
