## LightFM recommendation



In [5]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from itertools import product

In [6]:
# Load the four challenge CSV files. The two session-level files carry a
# 'date' column that is parsed into datetimes while reading.
train_purchases_df = pd.read_csv(
    "../data/dressipi_recsys2022/train_purchases.csv",
    parse_dates=['date'],
)
train_sessions_df = pd.read_csv(
    "../data/dressipi_recsys2022/train_sessions.csv",
    parse_dates=['date'],
)

# Item metadata and the candidate item list are read with pandas' defaults.
item_features_df = pd.read_csv(
    "../data/dressipi_recsys2022/item_features.csv"
)
candidate_items_df = pd.read_csv(
    "../data/dressipi_recsys2022/candidate_items.csv"
)


## The task is to submit a csv that has 100 ranked predictions for each query session.
Header and columns as in the example below. Header is required. The order of rows does not matter for the evaluation system but we recommend to sort the file by session_id and rank for easier manual inspection. 
<pre>
session_id,item_id,rank
1,100,1
1,105,2
1,107,3
...
1,101,100
2,108,1
2,107,2
...
</pre>

In [7]:
pip install lightfm

Note: you may need to restart the kernel to use updated packages.


In [8]:
# Calculate MRR (mean reciprocal rank)
def calc_mrr(result_df, test_df, missing_rank=100):
    """Compute the mean reciprocal rank of the true items in a ranked result.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Ranked predictions with columns ``session_id`` and ``item_id``.
        The rank of an item is its 1-based position among the rows of its
        session, in row order.
    test_df : pandas.DataFrame
        Ground truth with columns ``session_id`` and ``item_id``. If a session
        appears on several rows, only its first row is used and the session is
        counted once.
    missing_rank : int or float, default 100
        Rank assumed for a session whose true item does not appear in
        ``result_df``; such a session contributes ``1 / missing_rank``.

    Returns
    -------
    float
        Average of ``1 / rank`` over the distinct sessions of ``test_df``,
        or ``0.0`` when ``test_df`` has no rows.

    Raises
    ------
    ValueError
        If ``missing_rank`` is not strictly positive.
    """
    if missing_rank <= 0:
        raise ValueError("missing_rank must be a positive number")

    # First ground-truth item per session (first row wins).
    target_by_session = {}
    for sess_id, item_id in zip(test_df['session_id'], test_df['item_id']):
        target_by_session.setdefault(sess_id, item_id)

    if not target_by_session:
        return 0.0

    # Single pass over the predictions: track the running position inside each
    # session and remember the first position at which the target shows up.
    position_in_session = {}
    first_hit_rank = {}
    for sess_id, item_id in zip(result_df['session_id'], result_df['item_id']):
        if sess_id not in target_by_session:
            continue
        position = position_in_session.get(sess_id, 0) + 1
        position_in_session[sess_id] = position
        if sess_id not in first_hit_rank and item_id == target_by_session[sess_id]:
            first_hit_rank[sess_id] = position

    reciprocal_sum = sum(
        1 / first_hit_rank.get(sess_id, missing_rank)
        for sess_id in target_by_session
    )
    return reciprocal_sum / len(target_by_session)
    
    
    

In [9]:
# Sanity check of calc_mrr on a tiny hand-made example:
#   session 1: target 2 is ranked 2nd          -> 1/2
#   session 2: target 5 is ranked 3rd          -> 1/3
#   session 3: target 9 is not in the ranking  -> 1/100 (fallback rank)
_mrr_example = calc_mrr(
    pd.DataFrame.from_dict({'session_id': [1, 1, 2, 2, 2, 3, 3, 3], 'item_id': [1, 2, 3, 4, 5, 6, 7, 8]}),
    pd.DataFrame.from_dict({'session_id': [1, 2, 3], 'item_id': [2, 5, 9]}),
)
assert abs(_mrr_example - (1/2 + 1/3 + 1/100)/3) < 1e-12

In [10]:

from lightfm.data import Dataset
import numpy as np
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

## Start building the features for LightFM
User features are an iterable of tuples of the form (session_id, [month, hour])

In [88]:
# Tag every session event with its calendar month ('m<month>') and hour of day
# ('h<hour>'). Note: this adds the 'month' and 'hour' columns to
# train_sessions_df in place.
train_sessions_df['month'] = 'm' + train_sessions_df['date'].dt.month.apply(str)
train_sessions_df['hour'] = 'h' + train_sessions_df['date'].dt.hour.apply(str)


def _distinct_sorted(tokens):
    """Return the distinct values of ``tokens`` as a lexicographically sorted list.

    Sorting makes the token order reproducible; converting a ``set`` of strings
    straight to a list gives an order that can differ between kernel runs.
    """
    return sorted(set(tokens))


# Aggregate by session_id and collect the distinct month / hour tokens of each
# session; the result is a Series of (session_id, [month tokens..., hour tokens...]).
user_features = train_sessions_df.groupby('session_id').agg(
    session_id=pd.NamedAgg(column="session_id", aggfunc="min"),
    month=pd.NamedAgg(column="month", aggfunc=_distinct_sorted),
    hour=pd.NamedAgg(column="hour", aggfunc=_distinct_sorted),
).reset_index(drop=True).apply(
    lambda row: (row['session_id'], row['month'] + row['hour']),
    axis=1,
)

user_features.head(2)

0    (3, [m12, h21])
1    (13, [m3, h19])
dtype: object

Item features are an iterable of tuples of the form (item_id, [category1-value1, category2-value2, ...])

In [78]:
# Fuse each (feature_category_id, feature_value_id) pair into one
# "<category>-<value>" token; the token is stored on item_features_df itself.
item_features_df['cat_val'] = (
    item_features_df['feature_category_id'].apply(str)
    + '-'
    + item_features_df['feature_value_id'].apply(str)
)


def _to_item_feature_tuple(row):
    """Turn one aggregated row into an (item_id, [tokens...]) tuple."""
    return (row['item_id'], row['cat_val'])


# One row per item, holding the item id and the list of all of its tokens.
tokens_per_item = item_features_df.groupby('item_id').agg(
    item_id=("item_id", "min"),
    cat_val=("cat_val", list),
)
item_features = tokens_per_item.reset_index(drop=True).apply(_to_item_feature_tuple, axis=1)
item_features.head(2)

0    (2, [56-365, 62-801, 68-351, 33-802, 72-75, 29...
1    (3, [56-365, 69-592, 68-14, 17-378, 32-902, 11...
dtype: object