In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

import os, sys

import catboost as catb

module_path = os.path.abspath(os.path.join(os.pardir)) + os.path.sep  + '(src)'

if module_path not in sys.path:
    sys.path.append(module_path)
    
from metrics import recall_at_k_series, evaluate_recommenders

from recommender import RecSys

%matplotlib inline
pd.set_option('display.max_columns', None)

# **PUBLIC TEST**

### Parameters

In [2]:
# TEST / TRAIN Data
df_file_name = 'retail_train.csv.zip'
df_public_test_file_name = 'retail_test1.csv'
df_users_file_name = 'hh_demographic.csv'
df_items_file_name = 'product.csv'

# Model parameters
prefilter_params = {'n_popular_limit':5000,
                    'upper_popularity_limit':1,
                    'lower_popularity_limit':0.005,
                    'lower_price_limit':1,
                    'upper_price_limit':50,
                    'min_dep_assortment':100}

als_params = {'n_factors':50,
              'regularization':0.005,
              'iterations':30}

history_weeks_first_model = 26
history_weeks_second_model = 6

weighting = False
first_model_rec_limit = 200

### Loading data

In [3]:
%%time

df = pd.read_csv(df_file_name, compression='zip')
df_test = pd.read_csv(df_public_test_file_name)

df_users = pd.read_csv(df_users_file_name)
df_users.rename(columns={'household_key':'user_id'}, inplace=True)

df_items = pd.read_csv(df_items_file_name)
df_items.rename(columns={'PRODUCT_ID':'item_id'}, inplace=True)

Wall time: 2.44 s


### Model training (~10 min training)

In [4]:
%%time

rec_sys= RecSys(df=df,
                history_weeks_first_model=history_weeks_first_model,
                history_weeks_second_model=history_weeks_second_model,
                df_items=df_items,
                df_users=df_users,
                weighting=weighting,
                first_model_rec_limit=first_model_rec_limit,
                prefilter_params=prefilter_params,
                als_params=als_params)

HBox(children=(FloatProgress(value=0.0, max=3907.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


Wall time: 10min 11s


### Recommendations

In [5]:
# preprocessing test data
df_test_pr = rec_sys.df_test_preprocess(df_test)
df_test_pr.head(2)

Unnamed: 0,user_id,purchases
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [6]:
# Making recommendations
df_test_pr = rec_sys.predict(df_test=df_test_pr)
df_test_pr.head(2)

Unnamed: 0,user_id,purchases,top_popular,top_purchases,top_purchases_by_user,own_recommender,als_recommender,basic_recommender,catb_recommender,hit
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[1082185, 981760, 995242, 1029743, 840361, 109...","[1029743, 995242, 1106523, 981760, 1133018, 11...","[856942, 995242, 940947, 5577022, 9527290, 108...","[1081177, 1004906, 12810393, 6034857, 1006184,...","[979707, 995242, 965766, 940947, 1024306, 1081...","[819518, 826249, 826695, 829323, 833025, 83448...","[1082185, 995242, 979707, 940947, 6534178, 102...",1
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[1082185, 981760, 995242, 1029743, 840361, 109...","[1029743, 995242, 1106523, 981760, 1133018, 11...","[826784, 1106523, 951590, 8090521, 901062, 980...","[1127831, 1106523, 1133018, 1081177, 5569230, ...","[1106523, 1133018, 5569230, 1081177, 8090521, ...","[825343, 826249, 826784, 827656, 833025, 83448...","[1133018, 1082185, 1106523, 5569230, 6534178, ...",1


### Metrics

In [7]:
# Evaluating recommendations
recommender_col_list = list(df_test_pr.loc[:,'top_popular':].columns.drop('hit'))
evaluate_recommenders(df_test=df_test_pr,
                      bought_col='purchases',
                      recommender_col_list=recommender_col_list,
                      k_precision=5,
                      k_recall=200)

                                    PRECISION     RECALL

top_popular                         0.173050      0.202535
top_purchases                       0.126048      0.151067


  return flags.sum() / len(recommended_list)


top_purchases_by_user               0.303317      0.171033
own_recommender                     0.139493      0.091573
als_recommender                     0.202470      0.194769
basic_recommender                   0.054040      0.208168
catb_recommender                    0.275774      0.237595


### Export public_test recommendations

In [8]:
# Exporting PUBLIC TEST recomendations
df_test_pr[['user_id', 'catb_recommender']].to_csv('public_test_recommendations.csv', index=False)
pd.read_csv('public_test_recommendations.csv')

Unnamed: 0,user_id,catb_recommender
0,1,"[1082185, 995242, 979707, 940947, 6534178, 102..."
1,2,"[1133018, 1082185, 1106523, 5569230, 6534178, ..."
2,3,"[1106523, 1082185, 1133018, 5569230, 844165, 1..."
3,6,"[1082185, 995242, 1029743, 6534178, 900802, 96..."
4,7,"[1082185, 1058997, 6534178, 1126899, 1106523, ..."
...,...,...
1880,2496,"[1082185, 981760, 6534178, 1133018, 1106523, 8..."
1881,2497,"[1082185, 995242, 1029743, 951590, 6534178, 55..."
1882,2498,"[1082185, 914190, 6534178, 951590, 1070820, 95..."
1883,2499,"[1106523, 1082185, 1133018, 981760, 5568378, 5..."


# **FINAL MODEL (training data includes public test data)**

### Parameters

In [9]:
# TEST / TRAIN Data
df_file_name = 'retail_train.csv.zip' 
df_public_test_file_name = 'retail_test1.csv'
df_users_file_name = 'hh_demographic.csv'
df_items_file_name = 'product.csv'

# Model parameters

prefilter_params = {'n_popular_limit':5000,
                    'upper_popularity_limit':1,
                    'lower_popularity_limit':0.005,
                    'lower_price_limit':1,
                    'upper_price_limit':50,
                    'min_dep_assortment':100}

als_params = {'n_factors':50,
              'regularization':0.005,
              'iterations':30}

history_weeks_first_model = 26
history_weeks_second_model = 6

weighting = False
first_model_rec_limit = 200

### Loading data

In [10]:
%%time
# making full dataset
df = pd.concat((pd.read_csv(df_file_name, compression='zip'),
                pd.read_csv(df_public_test_file_name)),
               axis=0)

df_users = pd.read_csv(df_users_file_name)
df_users.rename(columns={'household_key':'user_id'}, inplace=True)

df_items = pd.read_csv(df_items_file_name)
df_items.rename(columns={'PRODUCT_ID':'item_id'}, inplace=True)

Wall time: 2.65 s


### Model training (~10 min training)

In [11]:
%%time

rec_sys= RecSys(df=df,
                history_weeks_first_model=history_weeks_first_model,
                history_weeks_second_model=history_weeks_second_model,
                df_items=df_items,
                df_users=df_users,
                weighting=weighting,
                first_model_rec_limit=first_model_rec_limit,
                prefilter_params=prefilter_params,
                als_params=als_params)

HBox(children=(FloatProgress(value=0.0, max=3914.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


Wall time: 9min 52s


### Recommendations

In [12]:
# Exporting PUBLIC TEST recomendations
rec_sys.df_users[['user_id', 'catb_recommender']].to_csv('final_recommendations.csv', index=False)
pd.read_csv('public_test_recommendations.csv')

Unnamed: 0,user_id,catb_recommender
0,1,"[1082185, 995242, 979707, 940947, 6534178, 102..."
1,2,"[1133018, 1082185, 1106523, 5569230, 6534178, ..."
2,3,"[1106523, 1082185, 1133018, 5569230, 844165, 1..."
3,6,"[1082185, 995242, 1029743, 6534178, 900802, 96..."
4,7,"[1082185, 1058997, 6534178, 1126899, 1106523, ..."
...,...,...
1880,2496,"[1082185, 981760, 6534178, 1133018, 1106523, 8..."
1881,2497,"[1082185, 995242, 1029743, 951590, 6534178, 55..."
1882,2498,"[1082185, 914190, 6534178, 951590, 1070820, 95..."
1883,2499,"[1106523, 1082185, 1133018, 981760, 5568378, 5..."
