In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

from lightfm import LightFM

In [None]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# train test split
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [None]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [None]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [None]:
def prefilter_items(data, take_n_popular=5000):
    # Уберем самые популярные товары (их и так купят)
    popularity = pd.DataFrame(data.groupby('item_id')['user_id'].nunique() / data['user_id'].nunique()).reset_index()
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)

    top_popular = popularity[popularity['share_unique_users'] > 0.5].item_id.tolist()
    data.loc[data['item_id'].isin(top_popular), 'item_id'] = 999999

    # Уберем самые НЕ популярные товары (их и так НЕ купят)
    top_notpopular = popularity[popularity['share_unique_users'] < 0.01].item_id.tolist()
    data.loc[data['item_id'].isin(top_notpopular), 'item_id'] = 999999

    # Уберем товары, которые не продавались за последние 12 месяцев
    day = data.groupby('item_id')['day'].max().reset_index()
    day = day[day['day'] < day['day'].max() - 365].item_id.tolist()
    data.loc[data['item_id'].isin(day), 'item_id'] = 999999

    # Уберем слишком дешевые товары (на них не заработаем). 1 покупка из рассылок стоит 60 руб. 
    data['cost'] = data['sales_value'] / data['quantity']
    np.nan_to_num(data['cost'], copy=False, nan=0.0, posinf=0.0, neginf=0.0)

    costs = pd.DataFrame(data.groupby('item_id')['cost'].mean()).reset_index()
    low_cost = costs[costs['cost'] <= 1].item_id.tolist()

    data.loc[data['item_id'].isin(low_cost), 'item_id'] = 999999

    # Уберем слишком дорогие товары
    high_cost = costs[costs['cost'] > 100].item_id.tolist()
    data.loc[data['item_id'].isin(high_cost), 'item_id'] = 999999
    
    data = data.drop('cost', axis=1)

    # Top-N
    popularity = popularity[popularity['share_unique_users'] <= 0.5].sort_values('share_unique_users', ascending=False)
    top = popularity[popularity['item_id'].isin(data['item_id'])].item_id.head(take_n_popular).values
    data.loc[~data['item_id'].isin(top), 'item_id'] = 999999
    
    return data

In [None]:
n_items_before = data_train['item_id'].nunique()

data_train = prefilter_items(data_train)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['cost'] = data['sales_value'] / data['quantity']


Decreased # items from 86865 to 5001


In [None]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(2)

item_id,818981,819063,819255,819304,819308,819330,819518,819594,819765,819840,...,15925334,15926712,15926775,15926844,15926863,15926885,15926886,15926887,15926927,15972074
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
data_test = data_test[data_test['item_id'].isin(data_train['item_id'].unique())]
test_user_item_matrix = pd.pivot_table(data_test, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

test_user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

In [None]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [None]:
user_feat = pd.DataFrame(user_item_matrix.index)
user_feat = user_feat.merge(user_features, on='user_id', how='left')
user_feat.set_index('user_id', inplace=True)

item_feat = pd.DataFrame(user_item_matrix.columns)
item_feat = item_feat.merge(item_features, on='item_id', how='left')
item_feat.set_index('item_id', inplace=True)

user_feat.head(2)

Unnamed: 0_level_0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,65+,A,35-49K,Homeowner,2 Adults No Kids,2.0,None/Unknown
2,,,,,,,


In [None]:
user_feat_lightfm = pd.get_dummies(user_feat, columns=user_feat.columns.tolist())
item_feat_lightfm = pd.get_dummies(item_feat, columns=item_feat.columns.tolist())

In [None]:
from lightfm.evaluation import precision_at_k, recall_at_k

In [None]:
precision = []
recall = []

for n in ['bpr', 'warp']:
    model = LightFM(no_components=30,
        loss=n,
        learning_rate=0.05,
        item_alpha=0.1, user_alpha=0.1, 
                  random_state=42)

    model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
            sample_weight=coo_matrix(user_item_matrix),
            user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
            item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
            epochs=15, 
            num_threads=4) 
    
    precision.append(precision_at_k(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean())
    
    recall.append(recall_at_k(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean())

In [None]:
precision, recall

([0.22040819, 0.32028812], [0.02128735278830375, 0.009183985107582447])

In [None]:
precision = []
recall = []
n_range = [10, 20, 30, 40]

for n in n_range:
    model = LightFM(no_components=n,
        loss='warp',
        learning_rate=0.05,
        item_alpha=0.1, user_alpha=0.1, 
                  random_state=42)

    model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
            sample_weight=coo_matrix(user_item_matrix),
            user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
            item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
            epochs=15, 
            num_threads=4) 
    
    precision.append(precision_at_k(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean())
    
    recall.append(recall_at_k(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean())

In [None]:
precision, recall

([0.32028812, 0.32028812, 0.32028812, 0.32028812],
 [0.009183985107582447,
  0.009183985107582447,
  0.009183985107582447,
  0.009183985107582447])

In [None]:
precision = []
recall = []
n_range = [1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]

for n in n_range:
    model = LightFM(no_components=30,
        loss='warp',
        learning_rate=n,
        item_alpha=0.1, user_alpha=0.1, 
                  random_state=42)

    model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
            sample_weight=coo_matrix(user_item_matrix),
            user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
            item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
            epochs=15, 
            num_threads=4) 
    
    precision.append(precision_at_k(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean())
    
    recall.append(recall_at_k(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean())

In [None]:
precision, recall

([0.20000003,
  0.22072834,
  0.53533417,
  0.32028812,
  0.25162068,
  0.20744298,
  0.18455383],
 [0.020903499703063197,
  0.021348039738967775,
  0.029390006962210915,
  0.009183985107582447,
  0.006183538407718714,
  0.004652722436229798,
  0.004288731389935574])

In [None]:
precision = []
recall = []
n_range = [10, 1, 0.1, 0.01, 0.001]

for n in n_range:
    model = LightFM(no_components=30,
        loss='warp',
        learning_rate=0.005,
        item_alpha=n, user_alpha=0.1, 
                  random_state=42)

    model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
            sample_weight=coo_matrix(user_item_matrix),
            user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
            item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
            epochs=15, 
            num_threads=4) 
    
    precision.append(precision_at_k(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean())
    
    recall.append(recall_at_k(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean())

In [None]:
precision, recall

([0.16782714, 0.22681074, 0.20744298, 0.2686675, 0.31988797],
 [0.004019803020369343,
  0.005863615675037722,
  0.004652722436229798,
  0.006021603517665426,
  0.007789997874907996])

In [None]:
precision = []
recall = []
n_range = [10, 1, 0.1, 0.01, 0.001]

for n in n_range:
    model = LightFM(no_components=30,
        loss='warp',
        learning_rate=0.05,
        item_alpha=0.001, user_alpha=n, 
                  random_state=42)

    model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
            sample_weight=coo_matrix(user_item_matrix),
            user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
            item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
            epochs=15, 
            num_threads=4) 
    
    precision.append(precision_at_k(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean())
    
    recall.append(recall_at_k(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean())

In [None]:
precision, recall

([0.24561827, 0.15134054, 0.4210484, 0.36478594, 0.32356942],
 [0.007548835010403654,
  0.004584017648440965,
  0.010698577467399004,
  0.00894880545258524,
  0.007563936356914578])

In [None]:
# Best model
model = LightFM(no_components=30,
        loss='warp',
        learning_rate=0.05,
        item_alpha=0.001, user_alpha=0.1, 
              random_state=42)

In [None]:
# Features

In [None]:
u_f = ['income_desc', 'household_size_desc', 'age_desc']
i_f = ['manufacturer', 'department', 'sub_commodity_desc', 'commodity_desc']
user_feat_lightfm = pd.get_dummies(user_feat[u_f], columns=u_f)
item_feat_lightfm = pd.get_dummies(item_feat[i_f], columns=i_f)

In [None]:
model = LightFM(no_components=30,
        loss='warp',
        learning_rate=0.05,
        item_alpha=0.001, user_alpha=n, 
                  random_state=42)

model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
        sample_weight=coo_matrix(user_item_matrix),
        user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
        item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
        epochs=15, 
        num_threads=4) 

precision_at_k(model, sparse_user_item, 
                              user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                              item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                              k=5).mean(), \
recall_at_k(model, sparse_user_item, 
                              user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                              item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                              k=5).mean()

(0.35774308, 0.009097573191484399)

In [None]:
# Оценка метрик

In [None]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

def recall_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list[:k])
    
    flags = np.isin(bought_list, recommended_list)
    
    recall = flags.sum() / len(bought_list)
    
    return recall

In [None]:
test_item_ids = np.array(list(id_to_itemid.keys()))
def get_prediction(user, test_item_ids, N=5):
    predictions = model.predict(user_ids=user, item_ids=test_item_ids,
                                user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                num_threads=4)
    predictions = test_item_ids[np.argsort(predictions)][::-1]
    return predictions[:N]

In [None]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[856942, 865456, 914190, 951954, 958046, 96256..."
1,3,"[835476, 1053690, 1096727, 13842214]"


In [None]:
model = LightFM(no_components=30,
    loss='warp',
    learning_rate=0.05,
    item_alpha=0.001, user_alpha=0.1, 
              random_state=42)

model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
        sample_weight=coo_matrix(user_item_matrix),
        user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
        item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
        epochs=15, 
        num_threads=4) 

result['st'] = result['user_id'].apply(lambda x: get_prediction(int(userid_to_id[x]), test_item_ids, N=5))
result.apply(lambda row: precision_at_k(row['st'], row['actual'], k=5), axis=1).mean(), \
result.apply(lambda row: recall_at_k(row['st'], row['actual'], k=5), axis=1).mean()

(0.0, 0.0)