In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


###Курсовой проект по курсу "Рекомендательные системы".

In [2]:
!pip3 install implicit



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender


# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Модель второго уровня
from lightgbm import LGBMClassifier

from content.drive.MyDrive.src.metrics import precision_at_k, recall_at_k
from content.drive.MyDrive.src.utils import prefilter_items
from content.drive.MyDrive.src.recommenders import MainRecommender

Загрузим данные

In [4]:
data = pd.read_csv('/content/drive/MyDrive/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


Посмотрим на пользователей

In [5]:
np.max(data['user_id'])

2500

In [6]:
data['user_id'].value_counts()

2459    6544
718     6490
1111    6306
1609    6192
1453    5915
        ... 
1876       5
1897       5
494        4
2444       3
379        1
Name: user_id, Length: 2499, dtype: int64

In [7]:
users = data['user_id'].unique()
for i in range(1, 2501):
  if i not in users:
    print(i)

2325


Пользователь 2325 не совершал покупок, так же часть пользователей совершали очень мало покупок.

In [8]:
item_features = pd.read_csv('/content/drive/MyDrive/product.csv')
user_features = pd.read_csv('/content/drive/MyDrive/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

Выгрузим нужных юзеров

In [9]:
userid_list = pd.read_csv('/content/drive/MyDrive/test_users.csv')
userid_list

Unnamed: 0,user_id
0,1
1,2
2,3
3,6
4,7
...,...
1703,2494
1704,2496
1705,2498
1706,2499


In [10]:
user_list = list(userid_list['user_id'])

###Предобработка данных

In [11]:
#№Возьмём топ 5000 товаров
#take_n_popular = 5000

#popularity = data.groupby('item_id')['quantity'].sum().reset_index()
#popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

#popularity.index = popularity.index.astype('int')
#top = popularity.sort_values('n_sold', ascending=False).head(int(take_n_popular)).item_id.tolist()
    
# Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
#data.loc[~data['item_id'].isin(top), 'item_id'] = 999999

###Определение модели.

In [12]:
#Разабьем данные на обучающие и тестовые.
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [13]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

In [14]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [15]:
data_test = data_test[data_test['item_id'].isin(data_train['item_id'].unique())]
test_user_item_matrix = pd.pivot_table(data_test, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

test_user_item_matrix = user_item_matrix.astype(float)

In [16]:
test = data_test.groupby('user_id')['item_id'].unique().reset_index()
test.columns=['user_id', 'actual']
test.tail(2)

Unnamed: 0,user_id,actual
2039,2499,"[867188, 877580, 902396, 914190, 951590, 95813..."
2040,2500,"[852182, 856345, 923746, 948670, 1018007, 1044..."


In [17]:
own = ItemItemRecommender(K=1, num_threads=4) # K - кол-во билжайших соседей

own.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

  0%|          | 0/86865 [00:00<?, ?it/s]

In [18]:
def get_recommendations(user, model, N=5):
    res = [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[user], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=N, 
                                    filter_already_liked_items=False, 
                                    filter_items=[999999], 
                                    recalculate_user=True)]
    return res


In [19]:
test['own_rec'] = test['user_id'].apply(lambda x: get_recommendations(x, model=own, N=5))
test.tail(2)

Unnamed: 0,user_id,actual,own_rec
2039,2499,"[867188, 877580, 902396, 914190, 951590, 95813...","[1082185, 6534178, 1070820, 1085604, 883404]"
2040,2500,"[852182, 856345, 923746, 948670, 1018007, 1044...","[1082185, 6534178, 995242, 1029743, 1126899]"


In [20]:
#recommender = MainRecommender(data_train)

In [21]:
#test['sim_rec'] = test['user_id'].apply(lambda x: recommender.get_similar_items_recommendation(x, N=5))
#test.head(2)

In [22]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [23]:
popular_recs = popularity_recommendation(data_train, n=5)

test['pop_rec'] = test['user_id'].apply(lambda x: popular_recs)
test.head(2)

Unnamed: 0,user_id,actual,own_rec,pop_rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1082185, 995242, 1029743, 1005186, 6534178]","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1082185, 1106523, 951590, 1053690, 6534178]","[6534178, 6533889, 1029743, 6534166, 1082185]"


In [24]:
als = AlternatingLeastSquares(factors=20, 
                                regularization=0.001,
                                iterations=3, 
                                calculate_training_loss=True,
                                use_gpu=False, 
                                num_threads=4)

#als.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
#          show_progress=True)



In [25]:
#test['als_rec'] = test['user_id'].apply(lambda x: get_recommendations(x, model=als, N=5))
#test.tail(2)

###Валидация данных

In [26]:
def ap_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(recommended_list, bought_list)
    
    if sum(flags) == 0:
        return 0

    sum_ = 0
    k = len(recommended_list)-1
    for i in range(1, k+1):
        
        if flags[i - 1] == True:
            p_k = precision_at_k(recommended_list, bought_list, k=i)
#             print(p_k)
            
            sum_ += p_k
            
    result = sum_ / min(len(recommended_list), k)
    
    return result

In [27]:
def map_k(recommended_list_list, bought_list_list, k=5):
    
    ap = list()
    for i in range(len(recommended_list_list)):

      ap.append(ap_k(recommended_list_list[i], bought_list_list[i]))
    
    map_metric = np.mean(ap)
    
    return map_metric

In [28]:
map_k(test['pop_rec'], test['actual'])

0.07528989057651478

0.07528989057651478

In [29]:
map_k(test['own_rec'], test['actual'])

0.2598637650389242

0.2598637650389242

In [30]:
#map_k(test['sim_rec'], test['actual'])

0.03419483913114486

In [31]:
#map_k(test['als_rec'], test['actual'])

0.15292748652621263

###2 уровень

In [32]:
test['lvl_2_rec'] = test['user_id'].apply(lambda x: get_recommendations(x, model=own, N=20))
train_lvl_2 = test[['user_id', 'lvl_2_rec']]
train_lvl_2.tail(2)

Unnamed: 0,user_id,lvl_2_rec
2039,2499,"[1082185, 6534178, 1070820, 1085604, 883404, 6..."
2040,2500,"[1082185, 6534178, 995242, 1029743, 1126899, 5..."


In [33]:
train_lvl_2['lvl_2_rec'][1925]

[1082185, 6534178, 5574377, 1126711]

In [34]:
#Добавим наиболее популярные товары пользователям у которых не достаёт в рекомендации
overall_top_purchases = data_train.groupby('item_id')['quantity'].sum().reset_index()
overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
overall_top_purchases = overall_top_purchases[overall_top_purchases['item_id'] != 999999]
overall_top_purchases = overall_top_purchases.item_id.tolist()


In [35]:
def extend_with_top_popular_2(recommendations, N=5):
    """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

    if len(recommendations) < N:
        unique_top_purchases = list(set(overall_top_purchases) - set(recommendations))
        recommendations.extend(unique_top_purchases[:N])
        recommendations = recommendations[:N]

    return recommendations

In [36]:
train_lvl_2['lvl_2_rec'] = train_lvl_2['lvl_2_rec'].apply(lambda x: extend_with_top_popular_2(x, N=20))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [37]:
train_lvl_2['lvl_2_rec'][1925]

[1082185,
 6534178,
 5574377,
 1126711,
 1048583,
 262171,
 2621476,
 1048620,
 1048623,
 1048628,
 1048629,
 12582964,
 786492,
 1048651,
 1048662,
 6553687,
 524374,
 1048668,
 6553694,
 1048672]

In [38]:
s = train_lvl_2.apply(lambda x: pd.Series(x['lvl_2_rec']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

train_lvl_2 = train_lvl_2.drop('lvl_2_rec', axis=1).join(s)
train_lvl_2['flag'] = 1

train_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,1,1082185,1
0,1,995242,1
0,1,1029743,1
0,1,1005186,1


In [39]:
targets_lvl_2 = data_train[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = train_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

In [40]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')


targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(5)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,1,1082185,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
1,1,1082185,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
2,1,1082185,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
3,1,1082185,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
4,1,1082185,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown


In [41]:
targets_lvl_2 = targets_lvl_2.drop_duplicates()
targets_lvl_2.head(5)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,1,1082185,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
37,1,995242,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
69,1,1029743,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
71,1,1005186,1.0,2,SALAD BAR,National,SALAD BAR,SALAD BAR FRESH FRUIT,,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
75,1,6534178,0.0,69,KIOSK-GAS,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown


In [42]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [43]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [44]:
lgb = LGBMClassifier(objective='binary', max_depth=15, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LGBMClassifier(boosting_type='gbdt',
               categorical_column=['manufacturer', 'department', 'brand',
                                   'commodity_desc', 'sub_commodity_desc',
                                   'curr_size_of_product', 'age_desc',
                                   'marital_status_code', 'income_desc',
                                   'homeowner_desc', 'hh_comp_desc',
                                   'household_size_desc', 'kid_category_desc'],
               class_weight=None, colsample_bytree=1.0, importance_type='split',
               learning_rate=0.1, max_depth=15, min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, num_leaves=31, objective='binary', random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [45]:
test_lvl_2 = test[['user_id', 'lvl_2_rec']]
test_lvl_2.tail(2)

Unnamed: 0,user_id,lvl_2_rec
2039,2499,"[1082185, 6534178, 1070820, 1085604, 883404, 6..."
2040,2500,"[1082185, 6534178, 995242, 1029743, 1126899, 5..."


In [46]:
s = test_lvl_2.apply(lambda x: pd.Series(x['lvl_2_rec']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

test_lvl_2 = test_lvl_2.drop('lvl_2_rec', axis=1).join(s)


test_lvl_2.head(4)

Unnamed: 0,user_id,item_id
0,1,1082185.0
0,1,995242.0
0,1,1029743.0
0,1,1005186.0


In [47]:
test_lvl_2 = test_lvl_2.merge(item_features, on='item_id', how='left')


test_lvl_2 = test_lvl_2.merge(user_features, on='user_id', how='left')

test_lvl_2.head(5)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,1,1082185.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
1,1,995242.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
2,1,1029743.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
3,1,1005186.0,2,SALAD BAR,National,SALAD BAR,SALAD BAR FRESH FRUIT,,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
4,1,6534178.0,69,KIOSK-GAS,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown


In [48]:
cat_feats = test_lvl_2.columns[2:].tolist()
test_lvl_2[cat_feats] = test_lvl_2[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [49]:
test_preds_proba = lgb.predict_proba(test_lvl_2)[:, 1]

In [50]:
def get_recomendation(X_test, test_preds_proba, userid_list):
    X_test['predict_proba'] = test_preds_proba
    X_test.sort_values(['user_id', 'predict_proba'], ascending=False, inplace=True)
    result = X_test.groupby('user_id').head(5)
    recs = result.groupby('user_id')['item_id']
    recomendations = []
    for user, preds in recs:
        recomendations.append({'user_id': user, 'lgbm': preds.tolist()})
    recomendations = pd.DataFrame(recomendations)
    result_lvl_2 = userid_list.merge(recomendations)
    
    return result_lvl_2

In [51]:
test_lvl_2 = get_recomendation(test_lvl_2, test_preds_proba, pd.DataFrame(test['user_id']))
test_lvl_2

Unnamed: 0,user_id,lgbm
0,1,"[1104349.0, 1005186.0, 1082185.0, 8090521.0, 9..."
1,3,"[883404.0, 1092026.0, 879755.0, 1133018.0, 809..."
2,6,"[883404.0, 879755.0, 839849.0, 995242.0, 11330..."
3,7,"[1092026.0, 1110572.0, 1082185.0, 8090521.0, 9..."
4,8,"[883404.0, 839849.0, 844165.0, 1082185.0, 1005..."
...,...,...
2036,2496,"[984677.0, 1082185.0, 999971.0, 1110572.0, 883..."
2037,2497,"[883404.0, 879755.0, 844165.0, 8090521.0, 1005..."
2038,2498,"[1092026.0, 1082185.0, 1110572.0, 986912.0, 10..."
2039,2499,"[883404.0, 999971.0, 1082185.0, 1110572.0, 109..."


In [52]:
map_k(test_lvl_2['lgbm'], test['actual'])

0.1015331536828352

Наиболее оптимальным оказался метод Итем итем рекомендации с один соседем.

###Создание рекомендаций

In [54]:
result = data.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[825123, 831447, 840361, 845307, 852014, 85498..."
1,2,"[854852, 930118, 1077555, 1098066, 5567388, 55..."


In [55]:
user_item_matrix = pd.pivot_table(data, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

In [56]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [57]:
final_model = ItemItemRecommender(K=1, num_threads=4) # K - кол-во билжайших соседей

final_model.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

  0%|          | 0/89051 [00:00<?, ?it/s]

In [58]:
result['recomendations'] = result['user_id'].apply(lambda x: get_recommendations(x, model=final_model, N=5))
result.head(2)

Unnamed: 0,user_id,actual,recomendations
0,1,"[825123, 831447, 840361, 845307, 852014, 85498...","[1082185, 995242, 1029743, 1005186, 6534178]"
1,2,"[854852, 930118, 1077555, 1098066, 5567388, 55...","[1082185, 1106523, 1133018, 951590, 5569230]"


In [59]:
user_2325 = {'user_id': 2325, 'actual': [], 'recomendations': []}
result = result.append(user_2325, ignore_index = True)
result = result.sort_values(by=['user_id']).reset_index(drop= True)
result.head(2)

Unnamed: 0,user_id,actual,recomendations
0,1,"[825123, 831447, 840361, 845307, 852014, 85498...","[1082185, 995242, 1029743, 1005186, 6534178]"
1,2,"[854852, 930118, 1077555, 1098066, 5567388, 55...","[1082185, 1106523, 1133018, 951590, 5569230]"


In [60]:
popular_itm = popularity_recommendation(data, n=5)
popular_itm

[6534178, 6533889, 1029743, 6534166, 1082185]

In [61]:
def extend_with_top_popular(recommendations, pop_list, N=5):
    """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

    if len(recommendations) < N:
        unique_top_purchases = list(set(pop_list) - set(recommendations))
        recommendations.extend(unique_top_purchases[:N])
        recommendations = recommendations[:N]

    return recommendations

In [62]:
result['recomendations'] = result['recomendations'].apply(lambda x: extend_with_top_popular(x, popular_itm,  N=5))
result.head(2)

Unnamed: 0,user_id,actual,recomendations
0,1,"[825123, 831447, 840361, 845307, 852014, 85498...","[1082185, 995242, 1029743, 1005186, 6534178]"
1,2,"[854852, 930118, 1077555, 1098066, 5567388, 55...","[1082185, 1106523, 1133018, 951590, 5569230]"


In [63]:
result[result['user_id']== 2325]

Unnamed: 0,user_id,actual,recomendations
2324,2325,[],"[6533889, 6534178, 6534166, 1082185, 1029743]"


Подготовим рекомендации

In [64]:
result = result.drop(columns='actual')

In [65]:
result.rename(columns={'user_id': 'UserId'}, inplace=True)
result.rename(columns={'recomendations': 'Predicted'}, inplace=True)

result


Unnamed: 0,UserId,Predicted
0,1,"[1082185, 995242, 1029743, 1005186, 6534178]"
1,2,"[1082185, 1106523, 1133018, 951590, 5569230]"
2,3,"[1082185, 1106523, 951590, 1053690, 6534178]"
3,4,"[1082185, 1029743, 962229, 883404, 1075368]"
4,5,"[1082185, 1029743, 1126899, 6534178, 1138292]"
...,...,...
2495,2496,"[1082185, 6534178, 1106523, 883404, 5569230]"
2496,2497,"[1082185, 1029743, 6534178, 995242, 951590]"
2497,2498,"[1082185, 1070820, 1106523, 1126899, 1053690]"
2498,2499,"[1082185, 6534178, 1070820, 1085604, 883404]"


In [66]:
for i in result['UserId']:
  if i not in user_list:
    result = result.drop(index=(i-1))

In [67]:
result['Predicted']= result['Predicted'].apply(lambda x: str(x[0])+' '+str(x[1])+' '+str(x[2])+' '+str(x[3])+' '+str(x[4]))
result

Unnamed: 0,UserId,Predicted
0,1,1082185 995242 1029743 1005186 6534178
1,2,1082185 1106523 1133018 951590 5569230
2,3,1082185 1106523 951590 1053690 6534178
5,6,1082185 6534178 1029743 995242 5569230
6,7,1082185 1106523 1029743 1126899 1133018
...,...,...
2493,2494,1082185 6534178 1029743 1070820 1053690
2495,2496,1082185 6534178 1106523 883404 5569230
2497,2498,1082185 1070820 1106523 1126899 1053690
2498,2499,1082185 6534178 1070820 1085604 883404


In [68]:
result.to_csv('/content/drive/MyDrive/result.csv', index=False)