## LightFM recommendation



In [1]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from itertools import product
from tqdm import tqdm
tqdm.pandas()


In [2]:
train_purchases_df = pd.read_csv("../data/dressipi_recsys2022/train_purchases.csv", parse_dates=['date'])
train_sessions_df = pd.read_csv("../data/dressipi_recsys2022/train_sessions.csv" , parse_dates=['date'])
item_features_df = pd.read_csv("../data/dressipi_recsys2022/item_features.csv")
candidate_items_df = pd.read_csv("../data/dressipi_recsys2022/candidate_items.csv")


# Split the dataset

In [3]:
# Test set is
train_p_df, test_p_df = train_test_split(train_purchases_df, test_size=0.02)
#test_sessions_set = set(test_p_df['session_id'])

#train_s_df = train_sessions_df[~train_sessions_df['session_id'].isin(test_sessions_set)]
#test_s_df = train_sessions_df[train_sessions_df['session_id'].isin(test_sessions_set)]

#print(f"len(test_sessions_set)={len(test_sessions_set)}")
print(f"len(train_p_df)={len(train_p_df)} train purchases")
print(f"len(test_p_df)={len(test_p_df)}   test purchases")
#print(f"len(train_s_df)={len(train_s_df)} train sessions")
#print(f"len(test_s_df)={len(test_s_df)}   test sessions")



len(train_p_df)=980000 train purchases
len(test_p_df)=20000   test purchases


## The task is to submit a csv that has 100 ranked predictions for each query session.
Header and columns as in the example below. Header is required. The order of rows does not matter for the evaluation system but we recommend to sort the file by session_id and rank for easier manual inspection. 
<pre>
session_id,item_id,rank
1,100,1
1,105,2
1,107,3
...
1,101,100
2,108,1
2,107,2
...
</pre>

In [4]:
pip install lightfm

Note: you may need to restart the kernel to use updated packages.


In [5]:
from tqdm import tqdm

# Calculate Mrr (Mean reciprocal rank)
def calc_mrr(result_df, test_df):
    mrr = 0
    # Iterate all sessions
    for sess_id in tqdm(test_df['session_id']):
        # Make view for only this session with all ranked
        ranked = result_df[result_df['session_id']==sess_id]['item_id'].reset_index(drop=True)
        real_item_id = test_df[test_df['session_id']==sess_id]['item_id'].reset_index(drop=True)[0] 
        first_rank = 100
        found_t = ranked[ranked == real_item_id]
        if len(found_t)!=0 :
            first_rank = found_t.index[0]+1
        #    print(f'sess_id={sess_id},real_item_id={real_item_id},first_rank={first_rank}')
        #else:
        #    print(f'sess_id={sess_id},real_item_id={real_item_id}, not found')
            
        mrr =mrr+ 1/first_rank
        
    mrr = mrr / test_df['session_id'].nunique()
    return mrr
    
    
    

In [6]:
# Test of method is commented
#calc_mrr(pd.DataFrame.from_dict({'session_id':[1,1,2,2,2,3,3,3], 'item_id':[1,2,3,4,5,6,7,8]}),\
#                     pd.DataFrame.from_dict({'session_id':[1,2,3], 'item_id':[2,5,8]}))
#(1/2 + 1/3 + 1/100)/3

In [7]:

from lightfm.data import Dataset
import numpy as np
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

## Start building the feature for light FM
User features are of the form tuple iterable tuple (session_id, [month, hour])

In [8]:
train_sessions_df['month']='m' + train_sessions_df['date'].dt.month.apply(str)
train_sessions_df['hour']='h' + train_sessions_df['date'].dt.hour.apply(str)

# Aggregate by sessionId and add hour and month as list of features
user_features = train_sessions_df.groupby('session_id').agg(
    session_id=pd.NamedAgg(column="session_id", aggfunc="min"),
    month=pd.NamedAgg(column="month", aggfunc=set),
    hour=pd.NamedAgg(column="hour", aggfunc=set),
).reset_index(drop=True).apply( \
        lambda row: (row['session_id'], []), axis = 1)
        #lambda row: (row['session_id'], list(row['month']) + (list(row['hour']))), axis = 1)

user_features.head(2)

0     (3, [])
1    (13, [])
dtype: object

Item features are tuples of this form: iterable tuple (item_id, [category1-value1, category2-value2])

In [9]:


item_features_df['cat_val'] = item_features_df['feature_category_id'].apply(str)+ '-' + item_features_df['feature_value_id'].apply(str)
#item_features_df['cat_val'] = item_features_df['feature_category_id'].apply(str)

#top_most_valuable_item_features = item_features_df['cat_val'].value_counts()[0:10]
#top_most_valuable_item_features = set(top_most_valuable_item_features.index)
#def most_valuable(features):
#    return list(set(top_most_valuable_item_features) & set(features))

# Aggregate by item_id and add category and value as list of features
item_features = item_features_df.groupby('item_id').agg(
    item_id=pd.NamedAgg(column="item_id", aggfunc="min"),
    cat_val=pd.NamedAgg(column="cat_val", aggfunc=list),
).reset_index(drop=True).progress_apply( \
#    lambda row: (row['item_id'], most_valuable(row['cat_val'])), axis = 1) 
    lambda row: (row['item_id'], []), axis = 1) 


item_features.head(5)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 23691/23691 [00:00<00:00, 138896.90it/s]


0    (2, [])
1    (3, [])
2    (4, [])
3    (7, [])
4    (8, [])
dtype: object

Building the interactions of the form: iterable tuple (session_id, item_id, weight)

In [22]:
train_sessions_df['score'] = train_sessions_df.groupby(['session_id']).cumcount()+1
train_sessions_df['weight'] = train_sessions_df['score'].apply(lambda x: np.tanh(x/9))

#train_sessions_df['views'] = train_sessions_df.groupby(['session_id']).count();
train_sessions_df[(train_sessions_df['item_id']==2447)&(train_sessions_df['session_id']==4254727)]

Unnamed: 0,session_id,item_id,date,month,hour,score,weight
4544540,4254727,2447,2021-03-11 14:11:29.645,m3,h14,3,0.321513


In [23]:
# Give tanh((cumcount()+1)/5)) for just view in the session
# First view has less weight, second has more, and so on
# and 1.0 for a purchase
train_interactions1 = train_sessions_df.progress_apply(
    lambda row: (row['session_id'], row['item_id'], row['weight']), axis = 1)  
train_interactions2 = train_p_df.progress_apply(
    lambda row: (row['session_id'], row['item_id'], 1), axis = 1) 

train_interactions = pd.concat([train_interactions1, train_interactions2])

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 4743820/4743820 [01:06<00:00, 70919.96it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 980000/980000 [00:13<00:00, 72910.07it/s]


In [24]:
#train_interactions.reset_index(inplace=True)
#print(f"train_interactions.info() : {train_interactions.info()}")
train_interactions.head(2)

0    (3, 9655, 0.11065611052473798)
1    (3, 9655, 0.21863508368712128)
dtype: object

## Build Light FM Dataset

In [25]:
from collections import Counter
item_features_counter = Counter()
item_features.map(lambda x: item_features_counter.update(x[1]))
item_features_vals = list(item_features_counter.keys())

user_features_counter = Counter()
user_features.map(lambda x: user_features_counter.update(x[1]))
user_features_vals = list(user_features_counter.keys())

print(f"User feature values: {user_features_vals[:30]}...")
print(f"Item feature values: {item_features_vals[:30]}...")

User feature values: []...
Item feature values: []...


In [26]:
import lightfm
from lightfm import cross_validation


ds = Dataset()
ds.fit( \
       train_sessions_df['session_id'].unique(), \
       item_features_df['item_id'].unique(), \
       item_features=item_features_vals, \
       user_features=user_features_vals)
item_features_ds                  = ds.build_item_features(item_features)
train_interactions_ds, weights_ds = ds.build_interactions(train_interactions)
user_features_ds                  = ds.build_user_features(user_features)

(trn_interactions_ds, tst_interactions_ds) = \
    lightfm.cross_validation.random_train_test_split(train_interactions_ds, test_percentage=0.2, random_state=None)

## Train the model

In [27]:
model = LightFM(
    no_components=10,
    learning_rate=0.02,
    loss='warp',
    random_state=42)

model.fit(
    trn_interactions_ds,
    item_features=item_features_ds,
    user_features=user_features_ds, 
    epochs=20, num_threads=4, verbose=True)

(user_map, feature_map, item_map, item_feature_map) = ds.mapping()

Epoch: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:40<00:00,  2.04s/it]


In [28]:
def predict_ratings(model, sessions_or_purchase_df, items_to_predict) :
    session_ids = pd.Series(sessions_or_purchase_df['session_id'].unique())
    all_users = session_ids \
        .apply(lambda x: user_map[x]).to_numpy()
    all_available_items = items_to_predict.apply(lambda x: item_map[x]).to_numpy()
    users =[]
    items =[]
    for user_item_tuple in product(all_users, all_available_items):
        users.append(user_item_tuple[0])
        items.append(user_item_tuple[1])
    preds = model.predict(np.array(users), np.array(items))
    
    session_ids_expanded = []
    item_ids_expanded = []
    for tup in product(session_ids, items_to_predict):
        session_ids_expanded.append(tup[0])
        item_ids_expanded.append(tup[1])
    
    
    df_score = pd.DataFrame({'session_id': np.array(session_ids_expanded), \
                      'item_id':np.array(item_ids_expanded) , \
                      'score':np.array(preds)})
    return df_score

In [29]:
# Determine which items to predict
# For the submission we load from "candidate_items.csv" but for
# initial testing we must take them from train_sessions.

#items_to_predict_array = train_sessions_df['item_id'].unique()

# Recommend from most sold items
items_to_predict_array = \
     list(train_sessions_df['item_id'].value_counts()[0:2500].keys())

print('Items to predict: ', len(items_to_predict_array))

Items to predict:  2500


In [30]:
t = predict_ratings(model, test_p_df[0:20000], \
                    pd.Series(items_to_predict_array))
t1 = t.sort_values(['score'],ascending=False)

In [31]:
# DataFrameGroupBy: The function passed to apply must take a dataframe 
# as its first argument and return a DataFrame
# From the other side "cumcount()" is used to count in the group
t1['rank'] = t1.groupby('session_id').cumcount()+1


result_df = t1[t1['rank']<=100]#.drop('score', axis=1)
result_df.reset_index(inplace=True)
result_df.drop(['index'], inplace=True, axis=1)

result_df = result_df.sort_values( \
        by=['session_id', 'rank'],ascending=True)
#result_df.reset_index(inplace=True)
#result_df.to_csv("result_df.csv")
result_df



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,session_id,item_id,score,rank
13816,13,8060,2.769647,1
102067,13,17089,2.360272,2
125013,13,1644,2.313198,3
184278,13,27613,2.220414,4
198855,13,19882,2.201473,5
...,...,...,...,...
1738807,4439974,14529,1.426061,96
1739814,4439974,26440,1.425485,97
1744279,4439974,20629,1.422963,98
1751267,4439974,4407,1.419000,99


In [None]:
#test_p_df[test_p_df['session_id'].isin({2692943,1211445})]

calc_mrr(result_df, test_p_df)

 56%|███████████████████████████████████████████████████████▊                                            | 11161/20000 [00:32<00:25, 344.28it/s]

In [21]:
test_p_df.to_csv("test_p_df.csv")