In [1]:
import datetime
import pandas as pd
import catboost as cat
import numpy as np
import typing as tp
from rectools import Columns, InternalIds
from rectools.models.base import Scores
from rectools.dataset import Dataset
from rectools.metrics import Precision
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel, PopularModel, RandomModel
from lightfm import LightFM
from implicit.als import AlternatingLeastSquares

In [2]:
!export OPENBLAS_NUM_THREADS=1

In [3]:
class NoneDataUsersItems(Exception):
    def __init__(self, message: str = 'Отсутствуют данные взаимодействия user_items!'):
        Exception.__init__(self)
        self.message = message


class NoneDataUsers(Exception):
    def __init__(self, message: str = 'Отсутствуют данные по пользователям!'):
        Exception.__init__(self)
        self.message = message


class NoneDataItems(Exception):
    def __init__(self, message: str = 'Отсутствуют данные по продуктам!'):
        Exception.__init__(self)
        self.message = message

In [4]:
data = pd.read_csv('/content/drive/MyDrive/retail_train.csv',
                   names=[Columns.User, 'basket_id', 'day', Columns.Item, Columns.Weight, 'sales_value',
                          'store_id', 'retail_disc', 'trans_time', 'week_no', 'coupon_disc',
                          'coupon_match_disc', Columns.Datetime])
item_features = pd.read_csv('/content/drive/MyDrive/product.csv')
user_features = pd.read_csv('/content/drive/MyDrive/hh_demographic.csv')

In [5]:
class Recommended:
    """
    Модель рекомендации, которая собрала в себе методы, такие как: [ALS,LightFM, KNN, PopularModel, RandomModel]

    Методы
    --------
        prefilter_items: метод подготовки данных и очистки от ненужного 'мусора';

        train_test_split: метод разделения данных на train и test;

        prepare_items_users: подготовка user_df и item_df;

    """

    def __init__(self, data_frame=None, data_items=None,
                 data_users=None):
        try:
            self.data_frame = data_frame
            self.data_items = data_items
            self.data_users = data_users
            if data_frame is None:
                raise NoneDataUsersItems
            elif data_items is None:
                raise NoneDataItems
            elif data_users is None:
                raise NoneDataUsers
        except NoneDataUsersItems as e:
            print(e.message)
        except NoneDataItems as e:
            print(e.message)
        except NoneDataUsers as e:
            print(e.message)

        self.data_items.columns = [col.lower() for col in self.data_items.columns]
        self.data_items.rename(columns={'product_id': 'item_id'}, inplace=True)
        self.data_items.drop(columns=['curr_size_of_product'], inplace=True)

        self.data_users.columns = [col.lower() for col in self.data_users.columns]
        self.data_users.rename(columns={'household_key': 'user_id'}, inplace=True)

        self.data_frame['price'] = self.data_frame['sales_value'] / (np.maximum(self.data_frame[Columns.Weight], 1))
        self.data_frame[Columns.Datetime] = datetime.datetime.now()
        self.data_frame = self.prefilter_items(self.data_frame, data_items=self.data_items)

        self.data_frame_train, self.data_frame_test = self.train_test_split(self.data_frame, k=3)

        self.sparse_features_dataset = self.prepare_sparse_features_dataset(data=self.data_frame_train)

        self.result = self.data_frame_test.groupby('user_id')['item_id'].unique().reset_index()
        self.result.columns = [Columns.User, 'actual']

        self.recommendation_popular = self.popular_items(data_train=self.sparse_features_dataset,
                                                         users=self.data_frame_test[Columns.User].unique())
        self.recommendation_popular_to_result = self.recommendation_popular.groupby('user_id')['item_id'].unique().reset_index()
        self.result['popular'] = self.recommendation_popular_to_result[Columns.Item]

        self.recommendation_random = self.random_items(data_train=self.sparse_features_dataset,
                                                       users=self.data_frame_test[Columns.User].unique())
        self.recommendation_random_to_result = self.recommendation_random.groupby('user_id')['item_id'].unique().reset_index()
        self.result['random'] = self.recommendation_random_to_result[Columns.Item]

        self.recommendation_als = self.recommended_lvl1(data_train=self.sparse_features_dataset,
                                                        users=self.data_frame_test[Columns.User].unique())
        self.recommendation_als_to_result = self.recommendation_als.groupby('user_id')['item_id'].unique().reset_index()
        self.result['ALS'] = self.recommendation_als_to_result[Columns.Item]

        self.recommendation_LightFM = self.recommended_lvl2(data_train=self.sparse_features_dataset,
                                                            users=self.data_frame_test[Columns.User].unique())
        self.recommendation_LightFM_to_result = self.recommendation_LightFM.groupby('user_id')[
            'item_id'].unique().reset_index()
        self.result['LightFM'] = self.recommendation_LightFM_to_result[Columns.Item]

        # self.recommendation_catboost = self.recommended_cat_boost_rank(data_train=self.data_frame_train,
        #                                                                users=self.data_frame_test[
        #                                                                    Columns.User].unique())
        # self.result['catboost'] = self.recommendation_catboost['catboost_rank']

    def prefilter_items(self, data: pd.DataFrame, data_items: pd.DataFrame) -> pd.DataFrame:
        '''
        Преобразование входных данных
        '''
        # Уберем самые популярные товары (их и так купят)
        popularity = data.groupby('item_id')['user_id'].nunique().reset_index() / data['user_id'].nunique()
        popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)

        top_popular = popularity[popularity['share_unique_users'] > 0.5].item_id.tolist()
        data = data[~data['item_id'].isin(top_popular)]

        # Уберем самые НЕ популярные товары (их и так НЕ купят)
        top_non_popular = popularity[popularity['share_unique_users'] < 0.01].item_id.tolist()
        data = data[~data['item_id'].isin(top_non_popular)]

        # Уберем товары, которые не продавались за последние 12 месяцев
        data = data[data['week_no'] < 54]

        # Уберем не интересные для рекоммендаций категории (department)
        data_to_del = data_items['department'].value_counts().reset_index()
        data_to_del_name = list(data_to_del['index'][data_to_del['department'] < 10])
        data = data[~data['item_id'].isin(list(data_items['item_id'][data_items['department'].isin(data_to_del_name)]))]

        # Уберем слишком дешевые товары (на них не заработаем). 1 покупка из рассылок стоит 60 руб.
        data = data[data['price'] > 1]

        # Уберем слишком дорогие товары
        data = data[data['price'] < 100]

        return data.reset_index(drop=True)

    def train_test_split(self, data: pd.DataFrame, k: int = 3):
        '''
        Разделение data

        Returns
        -------
        splitting : list, length=2 * len(arrays)
            data_train, data_test
        '''
        data_train = data[data['week_no'] < data['week_no'].max() - k].reset_index(drop=True)
        data_test = data[data['week_no'] >= data['week_no'].max() - k].reset_index(drop=True)
        return data_train, data_test

    def prepare_items_users(self, data: pd.DataFrame, data_items: pd.DataFrame, data_users: pd.DataFrame):
        '''
        Подготовка users_data и items_data
        Returns
        -------
        splitting : list, length=2 * len(arrays)
            users_data, items_data
        '''
        data_users = data_users.loc[data_users["user_id"].isin(data["user_id"])].copy()
        data_items = data_items.loc[data_items["item_id"].isin(data["item_id"])].copy()

        users_features_frames = []
        for user_feature in list(data_users.columns):
            if user_feature in [Columns.User, Columns.Item, Columns.Weight, Columns.Datetime]:
                continue
            feature_frame = data_users.reindex(columns=["user_id", user_feature])
            feature_frame.columns = ["user_id", "value"]
            feature_frame["feature"] = user_feature
            users_features_frames.append(feature_frame)
        users_features = pd.concat(users_features_frames)
        users_features.reset_index(drop=True, inplace=True)
        users_features.fillna(0, inplace=True)

        items_features_frames = []
        for item_feature in list(data_items.columns):
            if item_feature in [Columns.User, Columns.Item, Columns.Weight, Columns.Datetime]:
                continue
            feature_frame = data_items.reindex(columns=["item_id", item_feature])
            feature_frame.columns = ["item_id", "value"]
            feature_frame["feature"] = item_feature
            items_features_frames.append(feature_frame)
        items_features = pd.concat(items_features_frames)
        items_features.reset_index(drop=True, inplace=True)
        items_features.fillna(0, inplace=True)
        return users_features, items_features

    def prepare_sparse_features_dataset(self, data: pd.DataFrame) -> "Dataset":
        '''
        Подготовка sparse_features_dataset для использования в моделях

        Returns
        -------
        splitting : list, length=2 * len(arrays)
            Возвращает данные в формате разряженной матрицы, где учтены users,items и features

        '''
        data = data.loc[:, [Columns.User, Columns.Item, Columns.Weight, Columns.Datetime]]
        user_features, item_features = self.prepare_items_users(data=data, data_items=self.data_items,
                                                                data_users=self.data_users)
        sparse_features_dataset = Dataset.construct(
            data,
            item_features_df=item_features,
            cat_item_features=['manufacturer', 'department', 'brand', 'commodity_desc',
                               'sub_commodity_desc'],
            user_features_df=user_features,
            cat_user_features=['age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc',
                               'hh_comp_desc', 'household_size_desc', 'kid_category_desc'],
            make_dense_user_features=False
        )
        return sparse_features_dataset

    def popular_items(self, data_train: "Dataset", users: np.ndarray) -> tp.Tuple[InternalIds, InternalIds, Scores]:
        '''
        Построение рекомендаций на основе популярности продуктов
        '''
        model = PopularModel()
        model.fit(data_train)
        recommendation = model.recommend(
            users=users,
            dataset=data_train,
            k=100,
            filter_viewed=True
        )
        return recommendation

    def random_items(self, data_train: "Dataset", users: np.ndarray) -> tp.Tuple[InternalIds, InternalIds, Scores]:
        '''
        Построение рекомендаций на основе выбора рандомных товаров
        '''
        model = RandomModel()
        model.fit(data_train)
        recommendation = model.recommend(
            users=users,
            dataset=data_train,
            k=100,
            filter_viewed=True
        )
        return recommendation

    def recommended_lvl1(self, data_train: "Dataset", users: np.ndarray, k: int = 100) -> tp.Tuple[
        InternalIds, InternalIds, Scores]:
        '''
        Построение рекомендаций на основе ALS
        '''
        model = ImplicitALSWrapperModel(
            model=AlternatingLeastSquares(factors=100, regularization=0.1, iterations=50, random_state=13, use_gpu=True))
        model.fit(data_train)
        recommendation = model.recommend(
            users=users,
            dataset=data_train,
            k=k,
            filter_viewed=True
        )
        return recommendation

    def recommended_lvl2(self, data_train: "Dataset", users: np.ndarray, k: int = 100) -> tp.Tuple[
        InternalIds, InternalIds, Scores]:
        '''
        Построение рекомендаций на основе LightFM
        '''
        model = LightFMWrapperModel(model=LightFM(no_components=30,
                                                  learning_rate=0.05,
                                                  item_alpha=0.1, user_alpha=0.1,
                                                  random_state=13))
        model.fit(data_train)
        recommendation = model.recommend(
            users=users,
            dataset=data_train,
            k=k,
            filter_viewed=True
        )
        return recommendation

    # def recommended_cat_boost_rank(self, data_train: pd.DataFrame, users: np.ndarray):
    #     '''
    #     Построение рекомендаций на основе Catboost
    #     '''
    #     data_users = self.data_users.loc[self.data_users["user_id"].isin(data["user_id"])].copy()
    #     data_items = self.data_items.loc[self.data_items["item_id"].isin(data["item_id"])].copy()

    #     recommendation_als = self.recommended_lvl1(data_train=self.sparse_features_dataset,
    #                                                     users=self.data_frame_train[Columns.User].unique(),k=100)
    #     recommendation_lightfm = self.recommended_lvl2(data_train=self.sparse_features_dataset,
    #                                                         users=self.data_frame_train[Columns.User].unique(),k=100)

    #     data_for_catboost = data_train.merge(data_users, on='user_id', how='left')
    #     data_for_catboost = data_for_catboost.merge(data_items, on='item_id', how='left')


    #     data_for_catboost['target'] = 0
        # data_for_catboost.fillna(0, inplace=True)
        # X_train = data_for_catboost.iloc[:, :-1]
        # y_train = data_for_catboost.iloc[:, -1]

        # model = cat.CatBoostClassifier(random_state=13,
        #                                n_estimators=600,
        #                                max_depth=5,
        #                                l2_leaf_reg=0.3,
        #                                custom_loss='F1',
        #                                loss_function='Logloss', learning_rate=0.1,
        #                                cat_features=list(data_for_catboost.select_dtypes(include=['object'])))
        # model.fit(X_train, y_train)

        # predictions = model.predict_proba(X_train)[:, 1]
        # data_for_catboost['predicted_rank'] = predictions

        # recommendations = data_for_catboost.sort_values(by=['user_id', 'predicted_rank'], ascending=[True, False])
        # recommendations = recommendations.groupby('user_id')['item_id'].unique().reset_index()
        # recommendations.columns = ['user_id', 'catboost_rank']
        # recommendations = recommendations[recommendations['user_id'].isin(users)]

        # return recommendations

In [6]:
        # data_for_catboost = data_for_catboost.merge(self.recommendation_als[['user_id', 'item_id', 'score']],
        #                                             on=['user_id'], how='left')
        # data_for_catboost = data_for_catboost.merge(self.recommendation_als[['user_id', 'item_id', 'score']],
        #                                             on=['user_id'], how='left')

In [7]:
model = Recommended(data_frame=data,data_items=item_features,data_users=user_features)



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
result = model.result

In [9]:
result

Unnamed: 0,user_id,actual,popular,random,ALS,LightFM
0,1,"[821815, 856942, 860720, 889223, 909899, 92834...","[981760, 1106523, 916122, 1127831, 1068719, 95...","[1011809, 884677, 6602369, 842700, 9836393, 92...","[1024306, 878996, 1027569, 883932, 916122, 965...","[972312, 1107760, 9934800, 7409673, 997987, 10..."
1,2,"[831125, 847241, 904236, 911974, 940947, 95701...","[1029743, 981760, 961554, 844179, 1044078, 866...","[974712, 8119245, 1053176, 946562, 957322, 948...","[897954, 1044078, 995242, 5568378, 1037840, 81...","[972312, 1107760, 9934800, 7409673, 997987, 10..."
2,3,"[831063, 837139, 840244, 846823, 854405, 86957...","[1029743, 981760, 916122, 1004906, 1068719, 96...","[2005209, 1349708, 947039, 837518, 948186, 697...","[1068719, 981760, 1106301, 1043590, 8090521, 1...","[972312, 1107760, 9934800, 7409673, 997987, 10..."
3,4,"[883932, 887003, 893018, 962229, 1099446, 1104...","[1082185, 981760, 1106523, 916122, 1127831, 10...","[684649, 928626, 13158312, 873592, 1127817, 94...","[951590, 1056509, 995242, 8090521, 12301109, 8...","[972312, 1107760, 9934800, 7409673, 997987, 10..."
4,5,"[946839, 1018007, 1071939, 1123022, 6773079, 6...","[1082185, 981760, 1106523, 1127831, 1004906, 1...","[6443743, 869755, 1004857, 5572972, 9500809, 1...","[893018, 1101173, 1004906, 987562, 1106301, 10...","[972312, 1107760, 9934800, 7409673, 997987, 10..."
...,...,...,...,...,...,...
1844,2493,"[981760, 995055, 1051323, 827570, 999779, 1004...","[1082185, 916122, 1127831, 1068719, 961554, 95...","[1024903, 1356008, 10355368, 61748, 1083984, 9...","[951590, 893018, 910032, 1029624, 962229, 1118...","[972312, 1107760, 9934800, 7409673, 997987, 10..."
1845,2494,"[827546, 829291, 829928, 853887, 860299, 86227...","[981760, 1106523, 916122, 1127831, 1068719, 95...","[984773, 5996551, 6632695, 5995347, 6875701, 1...","[1127831, 1070820, 981760, 844179, 5569230, 91...","[972312, 1107760, 9934800, 7409673, 997987, 10..."
1846,2497,"[840609, 843450, 853522, 869150, 871756, 88734...","[1106523, 916122, 1068719, 961554, 866211, 995...","[1584771, 1103377, 987234, 778180, 2572127, 69...","[854852, 1096036, 995242, 1068719, 916122, 999...","[972312, 1107760, 9934800, 7409673, 997987, 10..."
1847,2498,"[1110031, 6034956, 994825, 984140, 6391291, 95...","[1082185, 1029743, 981760, 916122, 1127831, 10...","[9337171, 1046968, 42983, 47963, 1080204, 9712...","[1082185, 981760, 1029743, 961554, 1127831, 55...","[972312, 1107760, 9934800, 7409673, 997987, 11..."


In [15]:
print(f'Результат работы модели (Popular):{Precision(5).calc(model.recommendation_popular,model.data_frame_test)*100}')
print(f'Результат работы модели (Random):{Precision(5).calc(model.recommendation_random,model.data_frame_test)*100}')
print(f'Результат работы модели (ALS):{Precision(5).calc(model.recommendation_als,model.data_frame_test)*100}')
print(f'Результат работы модели (LightFM):{Precision(5).calc(model.recommendation_LightFM,model.data_frame_test)*100}')

Результат работы модели (Popular):3.829096809085993
Результат работы модели (Random):0.04326663061114116
Результат работы модели (ALS):3.980530016224986
Результат работы модели (LightFM):0.08653326122228232
