## LightFM recommendation



In [17]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from itertools import product

In [18]:
train_purchases_df = pd.read_csv("../data/dressipi_recsys2022/train_purchases.csv", parse_dates=['date'])
train_sessions_df = pd.read_csv("../data/dressipi_recsys2022/train_sessions.csv" , parse_dates=['date'])
item_features_df = pd.read_csv("../data/dressipi_recsys2022/item_features.csv")
candidate_items_df = pd.read_csv("../data/dressipi_recsys2022/candidate_items.csv")


# Split the dataset

In [30]:
# Test set is
train_p_df, test_p_df = train_test_split(train_purchases_df, test_size=0.02)
test_sessions_set = set(test_p_df['session_id'])

train_s_df = train_sessions_df[~train_sessions_df['session_id'].isin(test_sessions_set)]
test_s_df = train_sessions_df[train_sessions_df['session_id'].isin(test_sessions_set)]

print(f"len(test_sessions_set)={len(test_sessions_set)}")
print(f"len(train_p_df)={len(train_p_df)} train purchases")
print(f"len(test_p_df)={len(test_p_df)}   test purchases")
print(f"len(train_s_df)={len(train_s_df)} train sessions")
print(f"len(test_s_df)={len(test_s_df)}   test sessions")



len(test_sessions_set)=20000
len(train_p_df)=980000 train purchases
len(test_p_df)=20000   test purchases
len(train_s_df)=4649917 train sessions
len(test_s_df)=93903   test sessions


## The task is to submit a csv that has 100 ranked predictions for each query session.
Header and columns as in the example below. Header is required. The order of rows does not matter for the evaluation system but we recommend to sort the file by session_id and rank for easier manual inspection. 
<pre>
session_id,item_id,rank
1,100,1
1,105,2
1,107,3
...
1,101,100
2,108,1
2,107,2
...
</pre>

In [20]:
pip install lightfm

Note: you may need to restart the kernel to use updated packages.


In [21]:
# Calculate Mrr (Mean reciprocal rank)
def calc_mrr(result_df, test_df):
    mrr = 0
    # Iterate all sessions
    for sess_id in test_df['session_id']:
        # Make view for only this session with all ranked
        ranked = result_df[result_df['session_id']==sess_id]['item_id'].reset_index(drop=True)
        real_item_id = test_df[test_df['session_id']==sess_id]['item_id'].reset_index(drop=True)[0] 
        first_rank = 100
        found_t = ranked[ranked == real_item_id]
        if len(found_t)!=0 :
            first_rank = found_t.index[0]+1
        mrr =mrr+ 1/first_rank
        
    mrr = mrr / test_df['session_id'].nunique()
    return mrr
    
    
    

In [22]:
# Test of method is commented
#calc_mrr(pd.DataFrame.from_dict({'session_id':[1,1,2,2,2,3,3,3], 'item_id':[1,2,3,4,5,6,7,8]}),\
#                     pd.DataFrame.from_dict({'session_id':[1,2,3], 'item_id':[2,5,8]}))
#(1/2 + 1/3 + 1/100)/3

In [23]:

from lightfm.data import Dataset
import numpy as np
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

## Start building the feature for light FM
User features are of the form tuple iterable tuple (session_id, [month, hour])

In [56]:
train_s_df['month']='m' + train_s_df['date'].dt.month.apply(str)
train_s_df['hour']='h' + train_s_df['date'].dt.hour.apply(str)

# Aggregate by sessionId and add hour and month as list of features
user_features = list(train_s_df.groupby('session_id').agg(
    session_id=pd.NamedAgg(column="session_id", aggfunc="min"),
    month=pd.NamedAgg(column="month", aggfunc=set),
    hour=pd.NamedAgg(column="hour", aggfunc=set),
).reset_index(drop=True).apply(lambda row: (row['session_id'], list(row['month']) + (list(row['hour']))), axis = 1) )

user_features.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_s_df['month']='m' + train_s_df['date'].dt.month.apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_s_df['hour']='h' + train_s_df['date'].dt.hour.apply(str)


AttributeError: 'list' object has no attribute 'head'

Item features are tuples of this form: iterable tuple (item_id, [category1-value1, category2-value2])

In [25]:
item_features_df['cat_val'] = item_features_df['feature_category_id'].apply(str)+ '-' + item_features_df['feature_value_id'].apply(str)

# Aggregate by item_id and add category and value as list of features
item_features = item_features_df.groupby('item_id').agg(
    item_id=pd.NamedAgg(column="item_id", aggfunc="min"),
    cat_val=pd.NamedAgg(column="cat_val", aggfunc=list),
).reset_index(drop=True).apply(lambda row: (row['item_id'], row['cat_val']), axis = 1) 
item_features.head(2)

0    (2, [56-365, 62-801, 68-351, 33-802, 72-75, 29...
1    (3, [56-365, 69-592, 68-14, 17-378, 32-902, 11...
dtype: object

Building the interactions of the form: iterable tuple (session_id, item_id, weight)

In [34]:
# Give 0.3 weight for just view in the session
# and 1.0 for a purchase
train_interactions1 = train_s_df.apply(
    lambda row: (row['session_id'], row['item_id'], 0.3), axis = 1)  
train_interactions2 = train_p_df.apply(
    lambda row: (row['session_id'], row['item_id'], 1), axis = 1) 

train_interactions = pd.concat([train_interactions1, train_interactions2])

In [37]:
#train_interactions.reset_index(inplace=True)
#print(f"train_interactions.info() : {train_interactions.info()}")
train_interactions.head(2)

0    (3, 9655, 0.3)
1    (3, 9655, 0.3)
dtype: object

## Build Light FM Dataset

In [38]:
from collections import Counter
item_features_counter = Counter()
item_features.map(lambda x: item_features_counter.update(x[1]))
item_features_vals = list(item_features_counter.keys())

user_features_counter = Counter()
user_features.map(lambda x: user_features_counter.update(x[1]))
user_features_vals = list(user_features_counter.keys())

print(f"User feature values: {user_features_vals[:30]}...")
print(f"Item feature values: {item_features_vals[:30]}...")

User feature values: ['m10', 'h20', 'm4', 'h9', 'm3', 'h6', 'h11', 'm5', 'h23', 'h7', 'h8', 'm12', 'h19', 'h18', 'h13', 'm8', 'm11', 'h22', 'm2', 'h21', 'm6', 'h5', 'h14', 'h17', 'm7', 'h15', 'h12', 'h10', 'm1', 'h16']...
Item feature values: ['56-365', '62-801', '68-351', '33-802', '72-75', '29-123', '16-38', '50-76', '61-462', '53-6', '7-394', '69-885', '47-123', '69-592', '68-14', '17-378', '32-902', '11-859', '45-559', '7-452', '19-254', '46-825', '61-706', '73-544', '55-129', '63-861', '50-240', '59-180', '4-618', '5-605']...


In [58]:
2506 in train_s_df['session_id']


True

In [57]:
import lightfm
from lightfm import cross_validation


ds = Dataset()
ds.fit(train_p_df['session_id'],item_features_df['item_id'].unique(), item_features=item_features_vals, user_features=user_features_vals)
item_features_ds                  = ds.build_item_features(item_features)
train_interactions_ds, weights_ds = ds.build_interactions(train_interactions)
user_features_ds                  = ds.build_user_features(user_features)

(trn_interactions_ds, tst_interactions_ds) = lightfm.cross_validation.random_train_test_split(train_interactions_ds, test_percentage=0.2, random_state=None)

## Train the model

In [59]:
model = LightFM(
    no_components=10,
    learning_rate=0.02,
    loss='warp',
    random_state=42)

model.fit(
    trn_interactions_ds,
    item_features=item_features_ds,
    user_features=user_features_ds, 
    epochs=35, num_threads=4, verbose=True)

(user_map, feature_map, item_map, item_feature_map) = ds.mapping()

Epoch: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [12:35<00:00, 21.58s/it]


In [79]:
def predict_ratings(model, sessions_or_purchase_df) :
    all_users = pd.Series( \
        sessions_or_purchase_df['session_id'].unique()) \
        .apply(lambda x: user_map[x]).to_numpy()
    all_available_items = candidate_items_df['item_id'].apply(lambda x: item_map[x]).to_numpy()
    users =[]
    items =[]
    for user_item_tuple in product(all_users, all_available_items):
        users.append(user_item_tuple[0])
        items.append(user_item_tuple[1])
    preds = model.predict(np.array(users), np.array(items))
    return pd.Series(preds)

In [80]:
predict_ratings(model, train_p_df[0:2])

0      -0.122347
1      -0.738998
2      -1.847721
3      -0.495834
4      -0.912241
          ...   
9975   -0.888461
9976   -0.910609
9977   -0.388725
9978    2.862002
9979   -0.292852
Length: 9980, dtype: float32