In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами

from scipy.sparse import csr_matrix, coo_matrix
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Матричная факторизация
from implicit import als
from implicit.nearest_neighbours import ItemItemRecommender

# Модель второго уровня
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции

from src.utils import prefilter_items, get_user_matrix, get_item_matrix, get_useritem_matrix
from src.recommenders import MainRecommender
from implicit.als import AlternatingLeastSquares

  from .autonotebook import tqdm as notebook_tqdm
  "LightFM was compiled without OpenMP support. "


In [2]:
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

In [3]:
from src.metrics import prec_at_k, rec_at_k

In [4]:
# param grid to search for better arguments' values for the model
param_grid = {
    'no_components': [10, 20, 30],
    'learning_rate': np.linspace(0.005, 0.03, num=5), 
    'loss': ['bpr', 'warp'], #=, # 
    'item_alpha': np.linspace(0.001, 0.05, num=5), 
    'user_alpha': np.linspace(0.001, 0.05, num=5)
}


In [5]:
# downloading data

data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')
test_data = pd.read_csv('../data/retail_test1.csv')

In [6]:
# same column names through users', items' and transactions' tables
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [7]:
# Важна схема обучения и валидации!
# -- давние покупки --train | -- 6 недель -- val_lvl_1| -- 3 недель --val_lvl_2 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [8]:
# dataframe with purchase results for each user in validation 1 data - data_val_lvl_1
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [9]:
# dataframe with purchase results for each user in validation 2 data -data_val_lvl_2
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


# prefiltering train data, checking number of items left for recommendation

In [10]:

n_items_before = data_train_lvl_1['item_id'].nunique()

prefiltered_train_lvl_1 = prefilter_items(data_train_lvl_1)

n_items_after = prefiltered_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5000


In [11]:
# checking number of unique users in all data
data['user_id'].nunique()

2499

In [12]:
#checking number of users in test data
test_data['user_id'].nunique()

1885

In [13]:
# number of users in test data new to train data
len([us for us in test_data['user_id'].unique() if us in data['user_id']])

1885

In [84]:
#number of users in prefiltered train data
prefiltered_train_lvl_1['user_id'].values

array([2375, 2375, 2375, ...,  856,  856,  856], dtype=int64)

Warm users - valid data users, that are also in prefiltered train data
Cold users - valid data users, that are NOT in prefiltered train data

In [82]:
warm_users_1 = [x for x in result_lvl_1['user_id'].values if x in prefiltered_train_lvl_1['user_id'].unique()]
cold_users_1 = [x for x in result_lvl_1['user_id'].values if x not in prefiltered_train_lvl_1['user_id'].unique()]

In [16]:
print(f'in validation 1 data number of warm users: {len(warm_users_1)}, of cold users {len(cold_users_1)}')

in validation 1 data number of warm users: 1193, of cold users 961


In [17]:
warm_users_2 = [x for x in result_lvl_2['user_id'].values if x in prefiltered_train_lvl_1['user_id']]
cold_users_2 = [x for x in result_lvl_2['user_id'].values if x not in prefiltered_train_lvl_1['user_id']]

In [18]:
print(f'in validation 2 data number of warm users: {len(warm_users_2)}, of cold users {len(cold_users_2)}')

in validation 2 data number of warm users: 1139, of cold users 903


In [19]:
# preparing user_item matrix with number of each purchased item by each user
user_item_matrix = pd.pivot_table(prefiltered_train_lvl_1, index='user_id', columns='item_id', values='quantity',
                                      aggfunc='count', fill_value=0)
user_item_matrix = user_item_matrix.astype(float)
sparse_user_item = csr_matrix(user_item_matrix).tocsr()


In [20]:
user_item_matrix.head(2)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819255,...,15511891,15596279,15596488,15596515,15778533,15926844,15926886,15927403,15927661,15927850
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
als_model = MainRecommender(prefiltered_train_lvl_1)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:03<00:00,  4.07it/s]
100%|███████████████████████████████████████████████████████████████████████████| 5000/5000 [00:00<00:00, 14486.41it/s]


In [22]:
item_factors = als_model.item_factors
user_factors = als_model.user_factors

### predictions  with implicit.predict(recalculate_user=True)  on validation data users including users new for train data

In [23]:
       
def prepare_matrx(data):
        
    user_item_matrix = pd.pivot_table(data, index='user_id', columns='item_id', values='quantity',
                                      aggfunc='count', fill_value=0)
    
    user_item_matrix = user_item_matrix.astype(float)  # необходимый тип матрицы для implicit
    
    return user_item_matrix
         
         
def prepare_dict(user_item_matrix):
    userids = user_item_matrix.index.values
    itemids = user_item_matrix.columns.values
    matrix_userids = np.arange(len(userids))
    matrix_itemids = np.arange(len(itemids))

    id_to_itemid = dict(zip(matrix_itemids, itemids))
    id_to_userid = dict(zip(matrix_userids, userids))
    itemid_to_id = dict(zip(itemids, matrix_itemids))
    userid_to_id = dict(zip(userids, matrix_userids))

    return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id
         
         
def upd_dict(self, user_id):
    if user_id not in userid_to_id.keys():
        max_id = max(list(userid_to_id.values()))
        max_id += 1
        
        userid_to_id.update({user_id: max_id})
        id_to_userid.update({max_id: user_id})
    return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id
         
         
def fit(data, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
    user_item_matrix = prepare_matrx(data)
    user_item_matrix = bm25_weight(user_item_matrix.T).T
    model = AlternatingLeastSquares(factors=n_factors, regularization=regularization,
                                    iterations=iterations, num_threads=num_threads)
    model.fit(csr_matrix(user_item_matrix).T.tocsr())

    return model
         
         
def get_als_recommends(prefiltered_data, user, unfiltered_data, model, N=50):
    if user in prefiltered_data['user_id'].unique():
        prefiltered_matrix = prepare_matrx(prefiltered_data)
        id_to_itemid, id_to_userid, itemid_to_id, userid_to_id = prepare_dict(prefiltered_matrix)
        prefiltered_matrix = bm25_weight(prefiltered_matrix.T).T

        rec = model.recommend(userid=userid_to_id[user], user_items=csr_matrix(prefiltered_matrix).tocsr(),
                              N=N, filter_already_liked_items=False, recalculate_user=True)
        rec_ids = [id_to_itemid[f[0]] for f in rec]
        
    else:
        
        new_df = pd.concat([prefiltered_data, unfiltered_data.loc[unfiltered_data['user_id']==user, :]])
        prefiltered_matrix = prepare_matrx(new_df)
        id_to_itemid, id_to_userid, itemid_to_id, userid_to_id = prepare_dict(prefiltered_matrix)
        prefiltered_matrix = bm25_weight(prefiltered_matrix.T).T
        rec = model.recommend(userid=userid_to_id[user], user_items=csr_matrix(new_df).tocsr(),
                              N=N, filter_already_liked_items=False, recalculate_user=True)
        rec_ids = [id_to_itemid[f[0]] for f in rec]
        
    return np.array(rec_ids)


In [24]:
my_als_model = fit(prefiltered_train_lvl_1)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:03<00:00,  3.91it/s]


In [25]:
result_lvl_1['recalc_500'] = result_lvl_1['user_id'].apply(lambda x: get_als_recommends(prefiltered_train_lvl_1,
                                                                                     x, data_val_lvl_1, my_als_model, N=500))

In [26]:
result_lvl_1

Unnamed: 0,user_id,actual,recalc_500
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1062572, 1028166, 1059930, 1082185, 856942, 9..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[834484, 1082185, 5569230, 1041259, 916122, 90..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[891423, 951590, 902172, 883932, 846550, 80905..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[1082185, 878996, 857006, 965267, 1127831, 102..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[853643, 5588238, 1003188, 5584645, 5587043, 5..."
...,...,...,...
2149,2496,"[831509, 867188, 1013623, 1048851, 5592734, 16...","[1004906, 844179, 831628, 1020581, 12810393, 1..."
2150,2497,"[820291, 824759, 838797, 859010, 859075, 86077...","[1098066, 826249, 981760, 5569230, 904360, 899..."
2151,2498,"[865511, 962991, 1076374, 1102358, 5564901, 15...","[997796, 886787, 1133072, 830503, 892844, 8551..."
2152,2499,"[861282, 921744, 1050968, 13842089, 828837, 86...","[826249, 883404, 1098066, 929668, 822407, 1029..."


In [27]:
result_lvl_1.apply(lambda row: precision_at_k(row['recalc_500'], row['actual'], k=5), axis=1).mean()

0.178644382544104

### predictions on train users with MainRecommender class get_als_recommendations() via multiplication of embedding matrices

In [28]:
df_als_predictions = als_model.recommendations_matrix

In [29]:
df_als_predictions.shape

(2484, 5000)

In [30]:
df_als_predictions.head(2)

Unnamed: 0,202291,397896,420647,480014,545926,707683,731106,818980,819063,819255,...,15511891,15596279,15596488,15596515,15778533,15926844,15926886,15927403,15927661,15927850
1,0.067055,0.433216,-0.033046,-0.065813,-0.025529,0.020166,0.107086,0.752276,0.446219,0.315361,...,0.318527,0.211592,0.272231,0.256608,0.60895,0.232541,0.179535,0.438885,0.192917,0.546211
2,0.026853,0.028229,0.055169,-0.03887,-0.002899,0.005762,0.015915,0.14461,0.079669,0.08578,...,0.17521,0.255054,0.098703,-0.001271,0.056045,0.127381,0.209216,0.12692,-0.070844,0.047088


In [31]:
df_best500 = []
for x in df_als_predictions.index:
    df_best500.append([(df_als_predictions.loc[x].sort_values(ascending=False).index.values[:50])])
    

In [32]:
np.shape(df_best500)

(2484, 1, 50)

In [33]:
df_best50 = pd.DataFrame(df_best500, index=df_als_predictions.index, columns=['rec_50'])

In [177]:
r = result_lvl_1.join(df_best50, on=['user_id'], how='left')

In [178]:
r

Unnamed: 0,user_id,actual,recalc_500,rec_50
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1062572, 1028166, 1059930, 1082185, 856942, 9...","[1104349, 1062572, 982960, 1046816, 1100972, 8..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[834484, 1082185, 5569230, 1041259, 916122, 90...","[5569230, 1082185, 1041259, 854852, 916122, 90..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[891423, 951590, 902172, 883932, 846550, 80905...","[951590, 1119454, 902172, 883932, 846550, 8914..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[1082185, 878996, 857006, 965267, 1127831, 102...","[1082185, 878996, 965267, 930118, 1024306, 112..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[853643, 5588238, 1003188, 5584645, 5587043, 5...","[857390, 853643, 938187, 1052046, 1082185, 112..."
...,...,...,...,...
2149,2496,"[831509, 867188, 1013623, 1048851, 5592734, 16...","[1004906, 844179, 831628, 1020581, 12810393, 1...","[844179, 1004906, 1020581, 12810393, 1044078, ..."
2150,2497,"[820291, 824759, 838797, 859010, 859075, 86077...","[1098066, 826249, 981760, 5569230, 904360, 899...","[1098066, 826249, 981760, 5569230, 899624, 904..."
2151,2498,"[865511, 962991, 1076374, 1102358, 5564901, 15...","[997796, 886787, 1133072, 830503, 892844, 8551...","[886787, 1133072, 1077490, 997796, 916122, 892..."
2152,2499,"[861282, 921744, 1050968, 13842089, 828837, 86...","[826249, 883404, 1098066, 929668, 822407, 1029...","[883404, 826249, 1098066, 893018, 1096036, 907..."


In [190]:
pres_users = []
for x_user in r['user_id']:
    user_pres = precision_at_k(r.loc[r['user_id']== x_user, 'rec_50'].values[0],
                               r.loc[r['user_id']== x_user,'actual'].values[0],
                               k=5)
    pres_users.append(user_pres)

In [191]:
np.mean(pres_users)

0.17493036211699164

### predictions on warm users with MainRecommender class method get_als_recommendations()

In [94]:
preds_500 = []
for x_user in warm_users_1:

    preds_500.append([als_model.get_als_recommendations(x_user, N=500)])

In [102]:
preds_500 = pd.DataFrame(preds_500, index=warm_users_1, columns=['preds_500'])

In [103]:
preds_500.head(2)

Unnamed: 0,preds_500
1,"[1104349, 1062572, 982960, 1046816, 1100972, 8..."
2,"[5569230, 1082185, 1041259, 854852, 916122, 90..."


In [179]:
r = r.join(preds_500, on=['user_id'], how='left')

In [180]:
r.loc[r['user_id']==62,:]

Unnamed: 0,user_id,actual,recalc_500,rec_50,preds_500
49,62,"[42100, 85232, 112613, 328972, 7414659, 741508...","[1005902, 13008223, 949991, 901916, 1067779, 8...",,


In [169]:
from utils import cold_user_recommend

In [181]:
r.loc[r['user_id'].isin(cold_users_1),
      ['rec_50', 'preds_500']
     ] = r['user_id'].apply(lambda x:cold_user_recommend(prefiltered_train_lvl_1, n=50))

In [182]:
r.loc[r['user_id']==62,:]

Unnamed: 0,user_id,actual,recalc_500,rec_50,preds_500
49,62,"[42100, 85232, 112613, 328972, 7414659, 741508...","[1005902, 13008223, 949991, 901916, 1067779, 8...","[1082185, 6534178, 1029743, 995242, 1106523, 9...","[1082185, 6534178, 1029743, 995242, 1106523, 9..."


In [192]:
pred_users = []
for x_user in r['user_id']:
    user_pred = precision_at_k(r.loc[r['user_id']== x_user, 'preds_500'].values[0],
                               r.loc[r['user_id']== x_user,'actual'].values[0],
                               k=5)
    pred_users.append(user_pred)

In [193]:
np.mean(pred_users)

0.1745589600742804

In [194]:
# after adding popular items for cold users on matrix multiplication method for warm users
pres_users = []
for x_user in r['user_id']:
    user_pres = precision_at_k(r.loc[r['user_id']== x_user, 'rec_50'].values[0],
                               r.loc[r['user_id']== x_user,'actual'].values[0],
                               k=5)
    pres_users.append(user_pres)

In [195]:
np.mean(pres_users)

0.17493036211699164

## 3. Prepare user and item features

In [196]:
user_item_lightfm = get_useritem_matrix(user_item_matrix)

In [197]:
user_item_lightfm.shape

(2484, 5000)

In [202]:
user_factors

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,15.581522,-6.529871,-3.994627,12.521516,18.642164,7.418461,-2.855771,-0.299411,-0.326062,7.125142,14.174028,1.928531,9.124504,-6.906746,-5.556875,-6.971813,8.089332,-11.332814,-3.438860,-3.547022
2,2.599997,12.282372,0.262007,2.584871,-1.713634,0.988832,7.818982,4.256641,-1.524778,-1.068416,-3.401285,3.296885,-1.044344,1.309689,-2.665504,2.716805,-0.108924,1.527397,4.022652,2.663080
3,-4.126503,8.684353,14.234260,5.696541,5.172974,0.377494,1.519951,-5.236134,4.108080,4.045520,-7.707074,0.853328,-2.939811,4.524196,-4.672180,11.550902,-5.772165,1.488417,11.340249,1.602969
4,4.595743,12.284838,1.646260,-0.917705,4.230337,0.349982,-6.109206,-4.053372,2.293658,3.455180,7.119060,5.386809,0.925491,-4.690211,-6.920101,3.548552,-4.951610,-1.490997,-0.654127,1.282208
5,4.980502,1.583530,-0.158296,-0.487368,-3.201267,-2.079031,3.446626,-1.602857,0.664324,4.149937,1.772141,1.140545,-2.902359,-3.151944,2.824872,-0.227690,0.340443,3.912955,0.398144,2.485687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,-4.825225,9.833871,-4.655272,8.854790,-0.612474,-2.970947,-1.841635,6.360375,-3.907779,6.087454,-7.599280,3.421873,14.345769,16.142994,-12.813449,-4.029080,-0.449359,5.375606,9.768623,10.730615
2497,-5.425889,12.206191,-8.521992,2.686005,6.384798,-2.610064,-9.652563,7.220903,8.167014,8.603769,3.084367,11.089072,1.237886,-7.121723,-6.773473,1.728604,7.297579,15.092903,0.099864,11.114884
2498,7.873752,1.468086,-3.000242,4.146648,-0.107880,5.077385,8.988994,-4.947285,1.916243,7.245810,9.046992,10.661127,-0.491600,6.086973,-7.389084,-1.889871,-4.206307,0.254177,2.291394,-3.533971
2499,-2.004892,5.198688,8.017133,0.524104,4.902006,7.595265,1.210670,-7.312312,12.948181,2.445407,-4.999979,3.533188,-0.459178,1.066214,-2.695649,-1.214589,-6.902646,7.675630,9.880346,4.597683


In [None]:
user_features_lightfm = get_user_matrix(prefiltered_train_lvl_1, user_features, user_factors)

In [None]:
user_features_lightfm.describe()

In [None]:
user_features_lightfm.shape

In [None]:
item_features_lightfm = get_item_matrix(prefiltered_train_lvl_1, item_features, item_factors)

In [None]:
item_features_lightfm.shape

In [None]:
item_features_lightfm.head(2)

In [None]:
# np.array_equal(coo_matrix(user_item_lightfm).col, coo_matrix(df_test).col)

### Normalizing data

In [None]:
scaler = preprocessing.MinMaxScaler()

In [None]:
def scaling_data(scaler, data_matrix):
    matrix_copy = data_matrix.copy()
    cols = matrix_copy.columns
    ind = matrix_copy.index
    d = scaler.fit_transform(matrix_copy)
    scaled_df = pd.DataFrame(d, index=ind, columns=cols)
    
    return scaled_df

In [None]:
scaled_item_features = scaling_data(scaler, item_features_lightfm)

In [None]:
scaled_user_features = scaling_data(scaler, user_features_lightfm)

In [None]:
scaled_user_item = scaling_data(scaler, user_item_matrix)

In [None]:
item_features

In [None]:
scaled_user_item.info()

In [None]:
scaled_user_item

In [None]:
model = LightFM(no_components=10,
                loss='bpr',
                learning_rate=0.005, 
                item_alpha=0.001,
                user_alpha=0.001, 
                random_state=42)

model.fit(coo_matrix(user_item_lightfm),
          user_features=csr_matrix(scaled_user_features.values).tocsr(),
          item_features=csr_matrix(scaled_item_features.values).tocsr(),
#          sample_weight=coo_matrix(scaled_user_item),
          epochs=10, 
          num_threads=1) 

In [None]:
#%%time
#gbm = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='top_k_accuracy')
#gbm.fit(coo_matrix(user_item_matrix))

#print('Best parameters found by grid search are:', gbm.best_params_)

In [None]:
user_bias, user_prop = model.get_user_representations(features=csr_matrix(scaled_user_features.values).tocsr())

In [None]:
item_bias, item_prop = model.get_item_representations(features=csr_matrix(scaled_item_features.values).tocsr())

In [None]:
item_prop

'''item_reps = np.concatenate((item_prop, np.ones((item_bias.shape[0], 1))), axis=1)
item_reps = np.concatenate((item_reps, item_bias.reshape(-1, 1)), axis=1)

user_reps = np.concatenate((user_prop, user_bias.reshape(-1, 1)), axis=1)
user_reps = np.concatenate((user_reps, np.ones((user_bias.shape[0], 1))), axis=1)
user_item_scores = user_reps.dot(item_reps.T)
'''

In [None]:
user_item_scores = user_prop.dot(item_prop.T)

In [None]:
df_user_item_scores = pd.DataFrame(user_item_scores, index=user_item_matrix.index, columns=user_item_matrix.columns)

In [None]:
df_user_item_scores

In [None]:
df_best50_scores = pd.DataFrame([[list(df_user_item_scores.loc[x].sort_values(ascending=False).index[:50])] for x in df_user_item_scores.index],
                                index=df_user_item_scores.index, columns=['rec_50'])

In [None]:
list(df_user_item_scores.loc[158].sort_values(ascending=False).index[:5])

In [None]:
list(df_user_item_scores.loc[1158].sort_values(ascending=False).index[:5])

In [None]:
list(df_user_item_scores.loc[1792].sort_values(ascending=False).index[:5])

In [None]:
list(df_user_item_scores.loc[2158].sort_values(ascending=False).index[:5])

In [None]:
type(df_best50_scores['rec_50'][1][5])

In [None]:
result_lvl_1_best50 = result_lvl_1.join(df_best50_scores, on=['user_id'], how='left')

In [None]:
result_lvl_1_best50

In [None]:
#result_lvl_1.loc[~result_lvl_1['user_id']==cold_users,:]

In [None]:
result_lvl_1_best50.columns = ['user_id', 'actual', 'recalc_500', 'rec_50']

In [None]:
result_lvl_1_best50

In [None]:
result_lvl_1_best50.apply(lambda row: recall_at_k(row['recalc_500'], row['actual'], k=5), axis=1).mean()

In [None]:
result_lvl_1.apply(lambda row: precision_at_k(row['recalc_500'], row['actual'], k=5), axis=1).mean()

In [None]:
result_lvl_1_best50.apply(lambda row: recall_at_k(row['rec50'], row['actual'], k=5), axis=1).mean()

In [None]:
result_als = pd.read_csv('../hmw/als_test_r.csv', index_col=0)
result_als.head(2)

In [None]:
num_candidates = [20, 50, 100, 200, 500]

In [None]:
for k in num_candidates:
    column_name = f'als{k}'
    for i in range(len(result_als[column_name])):
        result_als[column_name][i] = list(map(int, result_als[column_name][i][1:-1].split(', ')))

In [None]:
for i in range(len(result_als[column_name])):
    result_als['actual'][i] = list(map(int, result_als['actual'][i][1:-1].split(', ')))

In [None]:
items_500 = [i for i in result_als.loc[result_als['user_id']==1, 'als500'][0] if i in user_item_matrix.columns]

In [None]:
len(items_500)

In [None]:
scaled_item_features

In [None]:
predictions = model.predict(user_ids=1, item_ids=items_500,
                            user_features=csr_matrix(scaled_user_features.values).tocsr(),
                            item_features=csr_matrix(scaled_item_features.values).tocsr(),
                            num_threads=1)