## Baseline (most popular items)



In [1]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from itertools import product

In [2]:
# Read the four challenge CSV files; the 'date' column of the purchase and
# session files is parsed into datetimes while loading.
train_purchases_df = pd.read_csv(
    "../data/dressipi_recsys2022/train_purchases.csv",
    parse_dates=['date'],
)
train_sessions_df = pd.read_csv(
    "../data/dressipi_recsys2022/train_sessions.csv",
    parse_dates=['date'],
)
item_features_df = pd.read_csv(
    "../data/dressipi_recsys2022/item_features.csv"
)
candidate_items_df = pd.read_csv(
    "../data/dressipi_recsys2022/candidate_items.csv"
)


In [3]:


## train_purchases_df is meant for training, but we hold out 5% of it to
## evaluate the baseline.
## random_state is fixed so that Restart & Run All always produces the same
## hold-out set (and therefore the same scores further down).
_, test_df = train_test_split(train_purchases_df, test_size=0.05, random_state=42)

## The task is to submit a csv that has 100 ranked predictions for each query session.
The header and columns must match the example below; the header is required. The order of rows does not matter to the evaluation system, but we recommend sorting the file by session_id and rank for easier manual inspection.
<pre>
session_id,item_id,rank
1,100,1
1,105,2
1,107,3
...
1,101,100
2,108,1
2,107,2
...
</pre>

In [4]:
# Number of distinct sessions in the hold-out set (NaN, if present, counts as one value)
test_df['session_id'].nunique(dropna=False)

50000

In [5]:
# value_counts() sorts by frequency (descending), so the first 100 index
# entries are the 100 item ids that occur most often in train_sessions_df.
item_frequencies = train_sessions_df['item_id'].value_counts()
top_100_item_ids = list(item_frequencies.head(100).index)

In [11]:
# Generate the resulting dataframe in the wanted format
# (columns: session_id, item_id, rank): every test session receives the same
# ranked list, with ranks running from 1 to len(top_100_item_ids).

## A cross join pairs every session with every (item_id, rank) row in a single
## vectorized step. This replaces building placeholder rows with
## itertools.product and then filling item_id through a row-wise apply(),
## which is very slow on millions of rows.
result_df = test_df[['session_id']].merge(
    pd.DataFrame({
        'item_id': top_100_item_ids,
        # rank is 1-based and follows the order of top_100_item_ids
        'rank': range(1, len(top_100_item_ids) + 1),
    }),
    how='cross',
)
result_df

Unnamed: 0,session_id,item_id,rank
0,2457896,8060,1
1,2457896,26853,2
2,2457896,2447,3
3,2457896,1644,4
4,2457896,19882,5
...,...,...,...
4999995,1681569,16218,96
4999996,1681569,434,97
4999997,1681569,15777,98
4999998,1681569,1148,99


In [12]:
# Calculate MRR (mean reciprocal rank)
def calc_mrr(result_df, test_df, missing_rank=100):
    """Return the mean reciprocal rank of the purchased item over test sessions.

    Parameters
    ----------
    result_df : DataFrame with 'session_id' and 'item_id' columns. The rank of
        an item is its 1-based position among the rows of its session, in the
        order the rows appear in the frame (a 'rank' column is not read).
    test_df : DataFrame with 'session_id' and 'item_id' columns holding the
        item to look for. If a session occurs more than once, only its first
        row is used, so each session is counted exactly once.
    missing_rank : rank assumed when the item is not among the session's
        predictions. The default of 100 keeps this notebook's existing scoring,
        in which a miss still contributes 1/100. Pass None for the conventional
        MRR definition, in which a miss contributes 0.

    Returns
    -------
    float : the mean reciprocal rank, or 0.0 when test_df has no rows.
    """
    # One ground-truth row per session keeps the numerator consistent with the
    # number of sessions we divide by.
    truth = test_df.drop_duplicates(subset='session_id', keep='first')
    n_sessions = len(truth)
    if n_sessions == 0:
        # Nothing to score; avoids a division by zero.
        return 0.0

    # 1-based position of every prediction inside its own session.
    ranked = pd.DataFrame({
        'session_id': result_df['session_id'].to_numpy(),
        'item_id': result_df['item_id'].to_numpy(),
        'position': result_df.groupby('session_id').cumcount().to_numpy() + 1,
    })

    # Keep only predictions that match the session's true item; a single merge
    # replaces filtering the whole result frame once per session.
    hits = ranked.merge(truth[['session_id', 'item_id']], on=['session_id', 'item_id'])
    # If the item was predicted several times, the best (smallest) position counts.
    first_hit = hits.groupby('session_id')['position'].min()

    miss_value = 0.0 if missing_rank is None else 1.0 / missing_rank
    # Sessions without a hit map to NaN and receive the miss value instead.
    reciprocal = (1.0 / truth['session_id'].map(first_hit)).fillna(miss_value)
    return float(reciprocal.sum() / n_sessions)
    
    
    

In [13]:
# Sanity check for calc_mrr, left commented out (the last line is the value it was compared against)
#calc_mrr(pd.DataFrame.from_dict({'session_id':[1,1,2,2,2,3,3,3], 'item_id':[1,2,3,4,5,6,7,8]}),\
#                     pd.DataFrame.from_dict({'session_id':[1,2,3], 'item_id':[2,5,8]}))
#(1/2 + 1/3 + 1/100)/3

In [14]:
# Scoring the complete hold-out set takes too long, so only its
# first 5000 rows (by position) are evaluated here.
calc_mrr(result_df, test_df.iloc[:5000])

0.024578504530265898

In [15]:
# Same evaluation on the rows from position 45000 to the end of the hold-out set
calc_mrr(result_df, test_df.iloc[45000:])

0.026521128221365303