### The item-item recommender below uses log-likelihood ratios, following the paper http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.5962

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import re
import itertools
import time

In [2]:
# Load the raw session log (first CSV column becomes the row index) and
# normalise the spelling of the session-id column.
sessions = (
    pd.read_csv('sessions.csv', index_col=0)
    .rename(columns={'SessionID': 'SessionId'})
)
sessions.shape

(112256, 6)

**From the documentation**: *TimeStamp – long integer value specifying the UTC date and time of the arrival of a request, coded as the number of 100-nanosecond intervals that have elapsed since 00:00:00 UTC on 1st January, 1 A.D.*

In [3]:
# Distance between the tick epoch described above (0001-01-01) and the Unix
# epoch (1970-01-01), in seconds.
seconds = (datetime(1970,1,1)-datetime(1,1,1)).total_seconds()
# Keep the offset an integer. Tick values for 2020 are around 6.4e17, past the
# 2**53 limit up to which float64 holds every integer exactly, so subtracting
# a float offset would promote an integer column to float and round the raw
# timestamps. (Assumes the raw TimeStamp column was read as integers -- TODO confirm.)
ticks = int(seconds) * 10**7
# One tick is 100 ns; pd.to_datetime reads plain numbers as nanoseconds since
# the Unix epoch by default.
sessions['TimeStamp'] = pd.to_datetime((sessions['TimeStamp']-ticks)*100)
sessions.head()

Unnamed: 0,SessionId,IpId,TimeStamp,Event,Action,Product
0,0,100095PL,2020-01-13 21:35:36,Leave,,
1,1,100095PL,2020-04-18 16:14:40,Arrive,,
2,1,100095PL,2020-04-18 16:15:26,,add_to_cart,p-9967
3,1,100095PL,2020-04-18 16:33:07,,order,
4,1,100095PL,2020-04-18 16:34:25,Leave,,


In [4]:
# Chronological hold-out: rows stamped up to 2020-03-31 23:59:59 form the
# training set, rows stamped after that instant form the test set.
split_point = datetime(2020, 3, 31, 23, 59, 59)
train = sessions[sessions['TimeStamp'] <= split_point]
test = sessions[sessions['TimeStamp'] > split_point]

In [5]:
# Distinct product ids present in the training rows (missing products ignored).
train_products = train['Product'].dropna()
items = train_products.unique()
print(items.shape)

(4038,)


First we create a session_item table. It is our assumption that the most useful recommendations stem from looking at products purchased together in the same session, rather than simply by the same user. We will do user_item later.

In [6]:
# Keep only the rows that reference a product, then collapse each session
# into the set of distinct products it contains.
has_product = train['Product'].notnull()
withProduct = train[has_product]
session_item = withProduct.groupby('SessionId')['Product'].apply(set).to_frame()
print(session_item.shape)

(8401, 1)


In [7]:
def non_overlap_pair(kind, s1, s2, included=None):
    """Lazily yield ``(kind, a, b, 1)`` for ordered pairs drawn from ``s1`` x ``s2``.

    Pairs whose two members are equal are skipped.

    Parameters
    ----------
    kind
        Label copied unchanged into the first slot of every tuple.
    s1, s2
        Iterables supplying the first and second member of each pair.
    included
        Optional container of ``(a, b)`` tuples. ``None`` (the default) turns
        filtering off; any other value, including an empty container,
        restricts the output to pairs found in it.

    Returns
    -------
    generator
        4-tuples ``(kind, a, b, 1)``; the trailing ``1`` is a unit count.
    """
    # Test against None explicitly: a truthiness check would treat an empty
    # whitelist as "no whitelist" and let every pair through.
    unrestricted = included is None
    return (
        (kind, a, b, 1)
        for a, b in itertools.product(s1, s2)
        if a != b and (unrestricted or (a, b) in included)
    )

In [8]:
start = time.time()
lst = session_item["Product"].to_list()
num_sessions = session_item.shape[0]
all_item = set(items)

# chain.from_iterable pulls one session's pair generator at a time, whereas
# chain(*...) would build the generators for every session before the first
# pair is consumed.

# kind 0: item1 and item2 both occur in the session.
both = itertools.chain.from_iterable(
    non_overlap_pair(0, u_item, u_item)
    for u_item in lst
)

# Ordered pairs that co-occur in at least one session; kinds 1 and 2 are only
# counted for these pairs.
included = set(itertools.chain.from_iterable(
    (key for key in itertools.product(u_item, u_item) if key[0]!=key[1])
    for u_item in lst
))

# kind 1: item1 occurs in the session, item2 does not.
first = itertools.chain.from_iterable(
    non_overlap_pair(1, u_item, all_item-u_item, included)
    for u_item in lst
)

# kind 2: item2 occurs in the session, item1 does not.
second = itertools.chain.from_iterable(
    non_overlap_pair(2, all_item-u_item, u_item, included)
    for u_item in lst
)

itemCooc = pd.DataFrame(
    itertools.chain(first, second, both), columns=["kind", "item1", "item2", "freq"])
itemCooc = itemCooc.groupby(["kind", "item1", "item2"]).sum()
# A kind that was never observed for a pair (e.g. item1 never appears without
# item2) has no row to unstack and would show up as NaN. Its true count is 0,
# so fill it in; otherwise the NaN spreads into every later computation for
# that pair.
itemCooc = itemCooc.unstack(0).fillna(0)
print( time.time() - start)
itemCooc.head()

23.707616329193115


Unnamed: 0_level_0,Unnamed: 1_level_0,freq,freq,freq
Unnamed: 0_level_1,kind,0,1,2
item1,item2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
c-21,c-24_205,1.0,53.0,10.0
c-21,c-24_209,1.0,53.0,14.0
c-21,c-24_211,1.0,53.0,1.0
c-21,c-261,2.0,52.0,21.0
c-21,c-272_312,2.0,52.0,54.0


In [9]:
# kind 3 is whatever remains of the session total once kinds 0, 1 and 2
# have been subtracted.
counted = itemCooc[('freq', 0)] + itemCooc[('freq', 1)] + itemCooc[('freq', 2)]
itemCooc[('freq', 3)] = num_sessions - counted
itemCooc.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,freq,freq,freq,freq
Unnamed: 0_level_1,kind,0,1,2,3
item1,item2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
c-21,c-24_205,1.0,53.0,10.0,8337.0
c-21,c-24_209,1.0,53.0,14.0,8333.0
c-21,c-24_211,1.0,53.0,1.0,8346.0
c-21,c-261,2.0,52.0,21.0,8326.0
c-21,c-272_312,2.0,52.0,54.0,8293.0


In [10]:
# itemCooc[('freq',0)].value_counts()

In [11]:
def shannon(v):
    """Return ``sum(p * log(p))`` for the distribution ``p = v / sum(v)``.

    Uses the natural logarithm. The value is the negative of the Shannon
    entropy of ``p``. Entries equal to zero contribute nothing to the sum.

    Parameters
    ----------
    v : array-like of non-negative counts (a list or a NumPy array).

    Returns
    -------
    A NumPy float.
    """
    # Coerce first so that ``== 0`` below is an elementwise mask. On a plain
    # list it would be the single bool False, leaving zeros unguarded and
    # producing 0 * log(0) = NaN.
    counts = np.asarray(v, dtype=float)
    p = counts / np.sum(counts)
    # Adding the mask turns each zero probability into 1 inside the log, so
    # its term becomes 0 * log(1) = 0.
    return np.dot(p, np.log(p + (counts == 0)))

In [12]:
def llr(x):
    """Log-likelihood ratio score for one row of the co-occurrence table.

    ``x`` is indexed with the keys ``('freq', 0)`` .. ``('freq', 3)``. The four
    values are read as the cells of a 2x2 table in row-major order, and the
    result is ``2 * N * (H(cells) - H(row totals) - H(column totals))`` where
    ``N`` is the sum of the cells and ``H`` is ``shannon``.
    """
    v = np.array([x[('freq', k)] for k in range(4)])
    table = v.reshape(2, 2)
    # Pass the marginals as arrays, not Python lists: ``[a, b] == 0`` is one
    # bool rather than an elementwise mask, so a zero marginal handed over as
    # a list could reach log(0) and turn the score into NaN.
    row_totals = table.sum(axis=1)   # [v0 + v1, v2 + v3]
    col_totals = table.sum(axis=0)   # [v0 + v2, v1 + v3]
    return 2*np.sum(v)*(shannon(v) - shannon(row_totals) - shannon(col_totals))

In [13]:
# Score every (item1, item2) row of the co-occurrence table.
itemCooc_llr = itemCooc.apply(llr, axis=1)
itemCooc_llr.head()

item1  item2    
c-21   c-24_205     3.537431
       c-24_209     2.942233
       c-24_211     7.352830
       c-261        6.933982
       c-272_312    3.679396
dtype: float64

In [14]:
# Flatten the scores into an (item1, item2, Llr) table, discarding rows whose score is missing.
llrDf = itemCooc_llr.to_frame('Llr').dropna().reset_index()

In [15]:
# The two highest-scoring partners of product 'c-21'.
c21_rows = llrDf[llrDf['item1'] == 'c-21']
c21_rows.sort_values('Llr', ascending=False)['item2'].head(2)

21      c-61
51    p-3631
Name: item2, dtype: object