In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.tasks.common_metric import mean_quantile_error

In [2]:
import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Импортируем функции из src
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items, features_generation
from src.recommenders import MainRecommender

In [3]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_val_lvl_2.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2277416,338,41260573635,636,840173,1,1.99,369,0.0,112,92,0.0,0.0
2277417,338,41260573635,636,1037348,1,0.89,369,-0.3,112,92,0.0,0.0


In [4]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [5]:
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [7]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Первый уровень модели.
# Генерируем кандидатов с помощью als recommendation
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=50))

In [8]:
users_lvl_2.head(4)

Unnamed: 0,user_id,candidates
0,2070,"[5569230, 5569471, 1008814, 5569374, 878442, 9..."
1,2021,"[1044078, 1106523, 844179, 819255, 899624, 999..."
2,1753,"[1029743, 883202, 906883, 963686, 891134, 8956..."
3,2120,"[1106523, 8090537, 1044078, 8090521, 1029743, ..."


In [9]:
# Вытягиваем строки кандидатов в столбец
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'
users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)

# Размечаем обучающую выборку на основе данных о покупках data_train_lvl_2
targets_lvl_2 = data_train_lvl_2.copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')
targets_lvl_2['target'].fillna(0, inplace= True)

In [10]:
targets_lvl_2['target'].value_counts()

0.0    103048
1.0      6557
Name: target, dtype: int64

In [11]:
targets_lvl_2['target'].mean()

0.0598239131426486

In [12]:
# добавляем item_features и user_features
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

data_val_lvl_2 = data_val_lvl_2.merge(item_features, on='item_id', how='left')
data_val_lvl_2 = data_val_lvl_2.merge(user_features, on='user_id', how='left')

In [13]:
# создаем и добавляем дополнительные признаки на основе агрегатных функций
data_train = data_train_lvl_2.merge(item_features, on='item_id', how='left')
data_train = data_train.merge(user_features, on='user_id', how='left')

targets_lvl_2 = features_generation(data_train, targets_lvl_2)
data_val_lvl_2 = features_generation(data_val_lvl_2, data_val_lvl_2)

In [14]:
# создаем тестовую и обучающую выборки
X_train = targets_lvl_2
X_test = data_val_lvl_2.drop(columns=['user_id', 'item_id'])

X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101319 entries, 0 to 101318
Data columns (total 30 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   user_id                 101319 non-null  int64  
 1   item_id                 101319 non-null  int64  
 2   basket_id               6557 non-null    float64
 3   day                     6557 non-null    float64
 4   quantity                6557 non-null    float64
 5   sales_value             6557 non-null    float64
 6   store_id                6557 non-null    float64
 7   retail_disc             6557 non-null    float64
 8   trans_time              6557 non-null    float64
 9   week_no                 6557 non-null    float64
 10  coupon_disc             6557 non-null    float64
 11  coupon_match_disc       6557 non-null    float64
 12  target                  101319 non-null  float64
 13  manufacturer            101319 non-null  int64  
 14  department          

In [15]:
# Модель второго уровня light Auto ML

TASK = Task('reg', loss='mse', metric=mean_quantile_error, greater_is_better=False)
TIMEOUT = 300000
N_THREADS = 4
N_FOLDS = 5
RANDOM_STATE = 42
TARGET_NAME = 'target'
TEST_SIZE=0.2

roles = {'target': TARGET_NAME, 'drop': ['user_id', 'item_id']}

automl_model = TabularAutoML(task=TASK,
                            timeout=TIMEOUT,
                            cpu_limit = N_THREADS,
                            gpu_ids='all',
                            reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                             
                            general_params={'use_algos': [ ['lgb_tuned', 'cb_tuned', 'cb', 'lgb'], ['lgb_tuned', 'cb'] ]},
                             
                            tuning_params={'max_tuning_iter': 10},
                      )

In [16]:
# обучаем модель второго уровня - light Auto ML 
preds = automl_model.fit_predict(X_train, roles = roles)

INFO:optuna.storages._in_memory:A new study created in memory with name: no-name-a550da2d-fba2-48a3-b06c-bbed0c63d2f1
INFO:optuna.study.study:Trial 0 finished with value: -1.3723641602936601e-05 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 244}. Best is trial 0 with value: -1.3723641602936601e-05.
INFO:optuna.study.study:Trial 1 finished with value: -1.3723641602936601e-05 and parameters: {'feature_fraction': 0.8659969709057025, 'num_leaves': 159}. Best is trial 0 with value: -1.3723641602936601e-05.
INFO:optuna.study.study:Trial 2 finished with value: -1.3723641602936601e-05 and parameters: {'feature_fraction': 0.5780093202212182, 'num_leaves': 53}. Best is trial 0 with value: -1.3723641602936601e-05.
INFO:optuna.study.study:Trial 3 finished with value: -1.3723641602936601e-05 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 223}. Best is trial 0 with value: -1.3723641602936601e-05.
INFO:optuna.study.study:Trial 4 finished with value: -1

In [17]:
# делаем предсказания с помощью light auto ml
val_preds = automl_model.predict(X_test)

In [18]:
# предсказанные покупки пользователей, отранжированные в порядке убывания вероятности 
preds = data_val_lvl_2[['user_id', 'item_id']].copy()
preds['proba'] = val_preds.data[:, 0]
preds = preds.groupby(['user_id', 'item_id'])['proba'].mean().reset_index()
preds = preds.groupby('user_id').apply(lambda x: x.sort_values('proba', ascending=False)['item_id'].tolist())

In [19]:
# функция дает N самых вероятных покупок пользователя, если значений не хватает добираем их из самых популярных
def get_gbm_rec(user, N=5):
    recs = preds[user][:N]
    
    top = data_val_lvl_2.groupby('item_id')['quantity'].mean().reset_index()
    top.sort_values('quantity', ascending=False, inplace=True)
    top = top['item_id'][top['item_id'] != 999999].tolist()
      
    if len(recs) < N:
        recs.extend(top[:N-len(recs)])
   
    return recs

In [20]:
# реальные покупки пользователей
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']

In [21]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

#Заполняем для каждого пользователя предсказаниями N покупок 
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

# рекомендации двухуровневой модели
result_lvl_2['autoML'] = result_lvl_2['user_id'].apply(lambda x: get_gbm_rec(x, N=50))

In [22]:
result_lvl_2.head(4)

Unnamed: 0,user_id,actual,autoML
0,1,"[931860, 1049998, 15971874, 1002032, 1122428, ...","[1041796, 933913, 1122428, 1132814, 1048918, 1..."
1,3,"[13842214, 851057, 879948, 994891, 7167218, 10...","[835476, 1053690, 9526886, 9526563, 7167249, 7..."
2,6,"[1006718, 7431990, 1108624, 1020683, 1104227, ...","[820165, 6553035, 1075214, 1078346, 1099058, 1..."
3,7,"[10255525, 994994, 1056418, 5571310, 14111027,...","[822049, 6424447, 6034857, 5592931, 5592610, 5..."


In [23]:
# считаем среднее значение точности предсказания двухуровневой модели
result_lvl_2.apply(lambda row: precision_at_k(row['autoML'], row['actual'], k=50), axis=1).mean()

0.6499510284035249