### Импорты библиотек

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.sparse import csr_matrix
from implicit import als

from lightfm import LightFM
from lightfm.evaluation import precision_at_k as prec_at_k

import os, sys
# Put the parent of the current working directory on the import path so the
# local `src` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items, popularity_recommendation
from src.recommenders import MainRecommender
from src.preprocessing import new_item_features, new_user_features, train_test_preprocessing
from src.preprocessing import popularity_recommendation, get_important_features, get_final_recomendation

import datetime
import operator

import warnings
warnings.simplefilter('ignore')



### Загрузка данных

In [2]:
# Input CSV locations, relative to the notebook's working directory.
DATA_PATH = '../data/retail_train.csv'
ITEM_FEATURES_PATH = '../data/product.csv'
USER_FEATURES_PATH = '../data/hh_demographic.csv'
TEST_1_PATH = '../data/retail_test1.csv'
TEST_2_PATH = '../data/retail_test2.csv'  # defined but not read in this part of the notebook

# Interaction logs: training period and the first test period.
data = pd.read_csv(DATA_PATH)
test_1 = pd.read_csv(TEST_1_PATH)

# Feature tables: lower-case every column name, then rename the key column
# to the `item_id` / `user_id` name used by the rest of the notebook.
item_features = (
    pd.read_csv(ITEM_FEATURES_PATH)
    .rename(columns=str.lower)
    .rename(columns={'product_id': 'item_id'})
)
user_features = (
    pd.read_csv(USER_FEATURES_PATH)
    .rename(columns=str.lower)
    .rename(columns={'household_key': 'user_id'})
)

In [3]:
# The whole training log is used as the train split.
# NOTE(review): this binds a second name to the same DataFrame object (no copy);
# whether the pre-filter step below modifies `data` through it depends on
# `prefilter_items` -- confirm, or use `data.copy()` if the raw frame must stay intact.
train_1 = data

### Применяем фильтрацию к датасету, оставляем только нужных кандидатов

In [4]:
# Run the same pre-filter, with identical settings, over both splits.
# NOTE(review): each split is filtered on its own data, so the items kept in
# `test_1` are presumably chosen from test-period popularity rather than from
# train -- confirm inside `prefilter_items` that this is intended.
prefilter_kwargs = dict(item_features=item_features, take_n_popular=200)

train_1 = prefilter_items(train_1, **prefilter_kwargs)
test_1 = prefilter_items(test_1, **prefilter_kwargs)

In [5]:
# Build the project's recommender from the pre-filtered training interactions.
# Later cells read `user_item_matrix`, `matrix_index`, `matrix_columns`,
# `userid_to_id`, `itemid_to_id`, `items_emb_df` and `users_emb_df` from it.
recommender = MainRecommender(train_1)



HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=201.0), HTML(value='')))




#### Эмбеддинги товаров и юзеров

In [6]:
# Embedding tables exposed by the recommender; they are handed to the
# feature builders in the next section.
items_emb_df, users_emb_df = recommender.items_emb_df, recommender.users_emb_df

### Добавим фичи

In [7]:
# Extend the item table via `new_item_features` (train interactions + item
# embeddings) and drop two of its columns.
# NOTE: `item_features` is rebound here, so re-running this cell feeds the
# already extended table back into `new_item_features`.
item_features = (
    new_item_features(train_1, item_features, items_emb_df)
    .drop(columns=['curr_size_of_product', 'department'])
)
item_features.head(2)

Unnamed: 0,item_id,manufacturer,brand,commodity_desc,sub_commodity_desc,0,1,2,3,4,...,12,13,14,coupon_disc,quantity_of_sales,quantity_of_sales_per_week,qnt_of_sales_per_dep,qnt_of_sales_per_item_per_dep_per_week,qnt_of_sales_per_sub_commodity_desc,qnt_of_sales_per_item_per_sub_commodity_desc_per_week
0,25671,2,National,FRZN ICE,ICE - CRUSHED/CUBED,,,,,,...,,,,,0,0.0,246664,0.009092,1047,0.010252
1,26081,2,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,,,,,...,,,,,0,0.0,0,0.0,0,0.0


In [8]:
# Extend the user table via `new_user_features` (train interactions + user embeddings).
# NOTE: `user_features` is rebound here, so re-running this cell passes the
# already extended table back into `new_user_features`.
user_features = new_user_features(train_1, user_features, users_emb_df)
user_features.head(2)

Unnamed: 0,marital_status_code,homeowner_desc,hh_comp_desc,household_size_desc,user_id,0,1,2,3,4,...,11,12,13,14,mean_time,age,income,kids,average_basket,sum_per_week
0,A,Homeowner,2 Adults No Kids,2,1,18.44478,13.46265,0.079428,8.146093,15.460316,...,8.576681,28.571972,-5.764569,12.167542,1384.800537,70,45,0,2.492077,41.683263
1,A,Homeowner,2 Adults No Kids,2,7,11.907116,19.949179,2.759631,14.283819,16.474358,...,13.135862,-1.740844,16.685032,2.346057,1689.314331,50,70,0,2.673405,30.167263


### Предобработка обучающего и валидационного датасетов

In [9]:
# Feature tables keyed by the ids taken from the recommender's matrix index
# (users) and matrix columns (items). The left merge keeps every id from the
# recommender; ids without a feature row get NaN feature values.
user_feat = (
    pd.DataFrame(recommender.matrix_index)
    .merge(user_features, on='user_id', how='left')
    .set_index('user_id')
)

item_feat = (
    pd.DataFrame(recommender.matrix_columns)
    .merge(item_features, on='item_id', how='left')
    .set_index('item_id')
)

In [10]:
# One-hot encode every object-dtype column; other columns pass through unchanged.
user_object_cols = user_feat.select_dtypes(include=['object']).columns.tolist()
item_object_cols = item_feat.select_dtypes(include=['object']).columns.tolist()

user_feat_lightfm = pd.get_dummies(user_feat, columns=user_object_cols)
item_feat_lightfm = pd.get_dummies(item_feat, columns=item_object_cols)

In [11]:
# Replace the remaining NaNs with 0 in both encoded feature tables.
user_feat_lightfm, item_feat_lightfm = (
    frame.fillna(0) for frame in (user_feat_lightfm, item_feat_lightfm)
)

In [12]:
%%time
lightfm = LightFM(no_components=16,
#                 loss='bpr', # 'warp'
                loss='warp',
                learning_rate=0.001, 
                item_alpha=0.01, user_alpha=0.01, 
                random_state=42)

lightfm.fit((recommender.user_item_matrix > 0) * 1,  # user-item matrix из 0 и 1
#           sample_weight=recommender.user_item_matrix,
          user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
          item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
          epochs=15, 
          num_threads=4) 

Wall time: 2min 18s


<lightfm.lightfm.LightFM at 0x1f106cd1a08>

In [13]:
# LightFM's built-in precision@5 on the training interactions; the values
# returned by `prec_at_k` are averaged into a single number.
user_features_csr = csr_matrix(user_feat_lightfm.values)
item_features_csr = csr_matrix(item_feat_lightfm.values)

precision_values = prec_at_k(
    lightfm,
    recommender.user_item_matrix,
    user_features=user_features_csr,
    item_features=item_features_csr,
    k=5,
)
train_precision = precision_values.mean()

train_precision

0.42056823

In [14]:
# Precision on train, computed with the project's own `precision_at_k`.

# Unique items per user in the train interactions.
result_train_1 = train_1.groupby('user_id')['item_id'].unique().reset_index()
result_train_1.columns = ['user_id', 'actual']

# Candidate items: every item in the feature table except id 999999
# (presumably the placeholder that `prefilter_items` assigns to filtered-out
# items -- confirm), plus their internal indices in the recommender.
item_ids_ = item_feat_lightfm.index.drop(999999)
item_ids = [recommender.itemid_to_id[x] for x in item_ids_]

# The feature matrices do not depend on the user, so build them once instead
# of re-creating both of them inside every per-user prediction call.
user_features_csr = csr_matrix(user_feat_lightfm.values)
item_features_csr = csr_matrix(item_feat_lightfm.values)

# Scores of all candidate items for each user.
# `num_threads` must be a positive count (same value as used for fitting);
# -1 is not a documented setting for LightFM.
result_train_1['recs'] = result_train_1['user_id'].apply(
    lambda x: lightfm.predict(
        recommender.userid_to_id[x],
        item_ids=item_ids,
        user_features=user_features_csr,
        item_features=item_features_csr,
        num_threads=4,
    )
)

# Top-5 items by score. The (score, item_id) pairs are sorted and sliced
# directly: routing them through a dict keyed by score would merge items with
# equal scores and could return fewer than 5 recommendations.
result_train_1['final_recommendations'] = result_train_1['recs'].apply(
    lambda x: [item_id for _, item_id in sorted(zip(x, item_ids_), reverse=True)[:5]]
)

# Mean per-user precision (last expression, shown as the cell output).
result_train_1.apply(lambda row: precision_at_k(row['final_recommendations'], row['actual']), axis=1).mean()

0.41912765106042454

### Рекомендации для теста

In [15]:
# Keep only test users that also occur in train: the model and the
# `userid_to_id` mapping are built from train users.
test_1 = test_1[test_1['user_id'].isin(train_1['user_id'].unique())]

# NOTE(review): `test_user_item_matrix` is not used in the rest of this cell;
# drop it if no later cell needs it.
test_user_item_matrix = pd.pivot_table(test_1,
                                  index='user_id',
                                  columns='item_id',
                                  values='quantity',
                                  aggfunc='count',
                                  fill_value=0
                                 )

test_user_item_matrix = test_user_item_matrix.astype(float)  # matrix dtype required by implicit

# Unique items per user in the (pre-filtered) test interactions.
result_test_1 = test_1.groupby('user_id')['item_id'].unique().reset_index()
result_test_1.columns = ['user_id', 'actual']

# Candidate items: every item in the feature table except id 999999
# (presumably the placeholder that `prefilter_items` assigns to filtered-out
# items -- confirm), plus their internal indices in the recommender.
item_ids_ = item_feat_lightfm.index.drop(999999)
item_ids = [recommender.itemid_to_id[x] for x in item_ids_]

# The feature matrices do not depend on the user, so build them once instead
# of re-creating both of them inside every per-user prediction call.
user_features_csr = csr_matrix(user_feat_lightfm.values)
item_features_csr = csr_matrix(item_feat_lightfm.values)

# Scores of all candidate items for each test user.
# `num_threads` must be a positive count (same value as used for fitting);
# -1 is not a documented setting for LightFM.
result_test_1['recs'] = result_test_1['user_id'].apply(
    lambda x: lightfm.predict(
        recommender.userid_to_id[x],
        item_ids=item_ids,
        user_features=user_features_csr,
        item_features=item_features_csr,
        num_threads=4,
    )
)

# Top-5 items by score. The (score, item_id) pairs are sorted and sliced
# directly: routing them through a dict keyed by score would merge items with
# equal scores and could return fewer than 5 recommendations.
result_test_1['final_recommendations'] = result_test_1['recs'].apply(
    lambda x: [item_id for _, item_id in sorted(zip(x, item_ids_), reverse=True)[:5]]
)

# Mean per-user precision on test (last expression, shown as the cell output).
result_test_1.apply(lambda row: precision_at_k(row['final_recommendations'], row['actual']), axis=1).mean()

0.10881104033970194