# Курсовой проект по рекомендательным системам: минимальное раб решение


In [15]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные функции
from best_rec_lib_1.metrics import precision_at_k
from best_rec_lib_1.utils import prefilter_items
from best_rec_lib_1.recommenders import MainRecommender, ItemItemRecommender


import warnings
warnings.filterwarnings('ignore')

In [54]:
data = pd.read_csv('../retail_train.csv')
item_features = pd.read_csv('../product.csv')
user_features = pd.read_csv('../hh_demographic.csv')
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# разбиваем датасет
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [55]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [56]:
# для 5000
n_items_before = data_train['item_id'].nunique()

data_train_5000 = prefilter_items(data_train, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_5000['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5001


In [57]:
len(data_train_5000["user_id"].unique())

2497

In [58]:
# для 20000
n_items_before = data_train['item_id'].nunique()

data_train_20000 = prefilter_items(data_train, item_features=item_features, take_n_popular=20000)

n_items_after = data_train_20000['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))
print(len(data_train_20000["user_id"].unique()))
# у меня получается количество уникальных юзеров одно и то же, не вижу смысла проводить анализ для большего топ N 

Decreased # items from 86865 to 20001
2497


In [59]:
print(len(data_train["user_id"].unique()))

2499


In [68]:
# модельки для 5000
recommender_5000 = MainRecommender(data_train_5000)
recommender_5000_tfidf = MainRecommender(data_train_5000, weighting="tfidf")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [63]:
%%time
# рекомендации    
result['own_recommender_5000'] = result['user_id'].map(lambda x: recommender_5000.get_own_recommendations(2497, N=5))
result['own_recommender_5000_tfidf'] = result['user_id'].map(lambda x: recommender_5000_tfidf.get_own_recommendations(2497, N=5))

result['als_recommender_5000'] = result['user_id'].map(lambda x: recommender_5000.get_als_recommendations(2497, N=5))
result['als_recommender_5000_tfidf'] = result['user_id'].map(lambda x: recommender_5000_tfidf.get_als_recommendations(2497, N=5))

result['su_recommender_5000'] = result['user_id'].map(lambda x: recommender_5000.get_similar_users_recommendation(2497, N=5))
result['su_recommender_5000_tfidf'] = result['user_id'].map(lambda x: recommender_5000_tfidf.get_similar_users_recommendation(2497, N=5))

# result.head(2)

CPU times: user 6.24 s, sys: 14.2 s, total: 20.5 s
Wall time: 3.7 s


In [65]:
result.head(2)

Unnamed: 0,user_id,actual,own_recommender_5000,own_recommender_5000_tfidf,als_recommender_5000,als_recommender_5000_tfidf,su_recommender_5000,su_recommender_5000_tfidf
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1029743, 1106523, 5569230, 916122, 1044078]","[1029743, 1106523, 5569230, 916122, 1044078]","[957951, 5585510, 5569471, 12810391, 5569230]","[957951, 8090532, 899624, 5569471, 8090521]","[1029743, 1106523, 5569230, 916122, 1044078]","[1029743, 1106523, 5569230, 916122, 1044078]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1029743, 1106523, 5569230, 916122, 1044078]","[1029743, 1106523, 5569230, 916122, 1044078]","[957951, 5585510, 5569471, 12810391, 5569230]","[957951, 8090532, 899624, 5569471, 8090521]","[1029743, 1106523, 5569230, 916122, 1044078]","[1029743, 1106523, 5569230, 916122, 1044078]"


Попробуем различные варианты генерации кандидатов. Какие из них дают наибольший map@k ?
- Отобираем 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна


In [67]:

# считаем map
    
print(f"own_recommender_5000_tfidf: {result.apply(lambda row: precision_at_k(row['own_recommender_5000_tfidf'], row['actual'], k=5), axis=1).mean()}")
print(f"als_recommender_5000_tfidf: {result.apply(lambda row: precision_at_k(row['als_recommender_5000_tfidf'], row['actual'], k=5), axis=1).mean()}")
print(f"su_recommender_5000_tfidf: {result.apply(lambda row: precision_at_k(row['su_recommender_5000_tfidf'], row['actual'], k=5), axis=1).mean()}")


own_recommender_5000_tfidf: 0.10842311459353485
als_recommender_5000_tfidf: 0.03858961802154773
su_recommender_5000_tfidf: 0.10842311459353485


# Бейзлайны

In [69]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 0 новых юзеров


### Random recommendation

In [70]:
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [71]:
%%time

items = data_train.item_id.unique()

result['random_recommendation'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))
# map5 = mean_average_precision_at_k(random_recommendation(items, n=5), data_train., data_test, K=5)
result.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual']), axis=1).mean()

CPU times: user 2.39 s, sys: 20.3 ms, total: 2.41 s
Wall time: 2.41 s


0.00039177277179236047

### Popularity-based recommendation

In [72]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [73]:
%%time

popular_recs = popularity_recommendation(data_train, n=5)

result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)
result.apply(lambda row: precision_at_k(row['popular_recommendation'], row['actual']), axis=1).mean()

CPU times: user 148 ms, sys: 35.9 ms, total: 184 ms
Wall time: 186 ms


0.15523996082272082

### Weighted random recommender

In [74]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    items = np.array(items_weights['item_id'])
    weights = np.array(items_weights['weight'])
    np.log(1 + weights)
    recs = np.random.choice(items, p=weights, size=n, replace=False)

    return recs.tolist()

In [75]:
%%time
items_weights = data_train.groupby('item_id')['sales_value'].sum().reset_index()
items_weights.rename(columns = {'sales_value' : 'weight'}, inplace = True)

sum = items_weights['weight'].sum()
for i in range(items_weights.shape[0]):
    items_weights['weight'][i] =  items_weights['weight'][i] / sum
    
result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights, n=5))

result.apply(lambda row: precision_at_k(row['weighted_random_recommendation'], row['actual']), axis=1).mean()

CPU times: user 5.66 s, sys: 49.1 ms, total: 5.71 s
Wall time: 5.72 s


0.02184133202742413

### Выводы по минимальному работающему решению
- самый лучший по качеству Popularity-based recommendation, его и возьмем как базовое решение (хотя по поставленному условию все равно не дотягивает)