## This notebook assembles the item-item and user-item recommenders, together with the technique known as dithering, into a single multimodal recommendation engine. The results are displayed as a streaming `BeautifulTable`.

In [169]:
import numpy as np
import pandas as pd
from datetime import datetime
import re
import itertools
import time
import implicit
from scipy import sparse
from scipy.sparse import csr_matrix
from beautifultable import BeautifulTable
from beautifultable import BTColumnCollection
from IPython.display import clear_output
import random
from termcolor import colored

In [2]:
# Load the raw session log (first CSV column is the index) and normalise
# the session-key column name to 'SessionId'.
sessions = pd.read_csv('sessions.csv', index_col=0).rename(
    columns={'SessionID': 'SessionId'}
)

In [3]:
# Convert tick counts (100 ns units counted from 0001-01-01) to pandas datetimes:
# subtract the offset to the Unix epoch, then scale to nanoseconds.
# NOTE: this overwrites the column it reads, so re-running the cell converts
# already-converted values again.
epoch_gap = datetime(1970, 1, 1) - datetime(1, 1, 1)
seconds = epoch_gap.total_seconds()
ticks = seconds * 10**7
sessions['TimeStamp'] = pd.to_datetime(
    (sessions['TimeStamp'] - ticks) * 100
)

In [4]:
# Chronological split: rows stamped up to and including 2020-03-31 23:59:59
# form the training set, everything later is the test set.
split_point = datetime(2020, 3, 31, 23, 59, 59)
train = sessions.loc[sessions['TimeStamp'] <= split_point]
test = sessions.loc[sessions['TimeStamp'] > split_point]

In [5]:
# Distinct products seen in the training data (missing values excluded).
items = pd.unique(train['Product'].dropna())

# Item-item recommendations

In [18]:
# Keep only training rows that reference a product, then collect the set of
# products touched in each session (one row per SessionId).
withProduct = train[train['Product'].notna()]
session_item = (
    withProduct
    .groupby('SessionId')['Product']
    .apply(set)
    .to_frame()
)

In [19]:
def non_overlap_pair(kind, s1, s2, included=None):
    """Yield ``(kind, a, b, 1)`` for every ordered pair of distinct items.

    Parameters
    ----------
    kind : hashable
        Label copied into the first slot of every emitted tuple.
    s1, s2 : iterable
        Item collections; the Cartesian product ``s1 x s2`` is enumerated.
    included : container of (a, b) tuples, optional
        When given, only pairs contained in it are emitted. ``None`` (the
        default) disables the filter. An *empty* container is a valid filter
        and yields nothing.

    Returns
    -------
    generator of (kind, a, b, 1) tuples
    """
    # Pairs of an item with itself carry no co-occurrence information.
    pairs = (
        (a, b)
        for a, b in itertools.product(s1, s2)
        if a != b
    )
    # Test against None explicitly: an empty filter set must filter
    # everything out rather than be mistaken for "no filter".
    if included is not None:
        pairs = (pair for pair in pairs if pair in included)
    return ((kind, a, b, 1) for a, b in pairs)

In [20]:
# Build the per-pair contingency counts used by the LLR score.
#   Kind 0: sessions containing both Item1 and Item2
#   Kind 1: sessions containing Item1 but not Item2
#   Kind 2: sessions containing Item2 but not Item1
# Kinds 1 and 2 are only counted for pairs that co-occur at least once
# (the `included` set); other pairs never reach the table.
lst = session_item["Product"].to_list()
num_sessions = session_item.shape[0]
all_item = set(items)

# chain.from_iterable keeps the per-session generators lazy, so only one
# `all_item - u_item` difference is alive at a time instead of one per session.
both = itertools.chain.from_iterable(
    non_overlap_pair(0, u_item, u_item)
    for u_item in lst
)

# Every ordered pair of distinct items that shares at least one session.
included = {
    key
    for u_item in lst
    for key in itertools.product(u_item, u_item)
    if key[0] != key[1]
}

first = itertools.chain.from_iterable(
    non_overlap_pair(1, u_item, all_item - u_item, included)
    for u_item in lst
)

second = itertools.chain.from_iterable(
    non_overlap_pair(2, all_item - u_item, u_item, included)
    for u_item in lst
)

itemCooc = pd.DataFrame(
    itertools.chain(first, second, both), columns=["Kind", "Item1", "Item2", "Freq"])
itemCooc = itemCooc.groupby(["Kind", "Item1", "Item2"]).sum()
# A kind that never occurs for a pair is a genuine count of zero, not a
# missing value. Without fill_value these pairs (e.g. Item1 never seen
# without Item2) get NaN counts and are discarded by the later dropna().
itemCooc = itemCooc.unstack(0, fill_value=0)

In [21]:
# Kind 3: sessions containing neither item, i.e. whatever remains of the
# session total after the three observed kinds are removed.
itemCooc[('Freq', 3)] = num_sessions - (
    itemCooc[('Freq', 0)] + itemCooc[('Freq', 1)] + itemCooc[('Freq', 2)]
)

In [22]:
def shannon(v):
    """Return ``sum(p * log(p))`` for the distribution obtained by normalising ``v``.

    This is the *negative* of the Shannon entropy (natural log).

    Parameters
    ----------
    v : array-like of non-negative counts
        May be a list or a NumPy array.

    Returns
    -------
    float
        ``sum_i p_i * log(p_i)`` with ``p = v / sum(v)``; zero counts
        contribute 0. Returns 0.0 when all counts are zero.
    """
    # Convert first: on a plain list `v == 0` is a single False, so zero
    # entries would not be masked and 0 * log(0) would produce NaN.
    counts = np.asarray(v, dtype=float)
    total = counts.sum()
    if total == 0:
        return 0.0
    p = counts / total
    # Where a count is zero, add 1 inside the log so the term is 0 * log(1) = 0.
    return np.dot(p, np.log(p + (counts == 0)))

In [23]:
def llr(x):
    """Score one row of the contingency table.

    ``x`` is indexed by ``('Freq', k)`` for k = 0..3. The result is
    ``2 * N * (shannon(cells) - shannon(row sums) - shannon(column sums))``
    where N is the sum of the four cells.
    """
    v = np.array([x[('Freq', kind)] for kind in range(4)])
    # Marginal totals of the 2x2 table laid out as [[v0, v1], [v2, v3]].
    row_totals = [v[0] + v[1], v[2] + v[3]]
    col_totals = [v[0] + v[2], v[1] + v[3]]
    cell_term = shannon(v)
    row_term = shannon(row_totals)
    col_term = shannon(col_totals)
    return 2 * np.sum(v) * (cell_term - row_term - col_term)

In [24]:
# Score every (Item1, Item2) row of the contingency table.
itemCooc_llr = itemCooc.apply(llr, axis=1)

In [25]:
# Flatten to a table with columns Item1, Item2, Llr; rows without a score are discarded.
llrDf = (
    itemCooc_llr
    .to_frame(name='Llr')
    .dropna()
    .reset_index()
)

In [63]:
def ditheredLlr(item, top_n=3, epsilon=1.5):
    """Return ``top_n`` item-item recommendations for ``item``, with dithering.

    Candidates are the ``Item2`` values of ``llrDf`` rows whose ``Item1``
    equals ``item``, ranked by descending ``Llr``. Each rank r (0-based) is
    replaced by ``log(r + 1) + N(0, sqrt(log(epsilon)))`` and the candidates
    are re-sorted by that noisy score, so the order varies between calls.

    Parameters
    ----------
    item : product identifier to look up in ``llrDf['Item1']``.
    top_n : int, default 3
        Length of the returned list.
    epsilon : float, default 1.5
        Dithering strength; the noise standard deviation is ``sqrt(log(epsilon))``.

    Returns
    -------
    list
        Exactly ``top_n`` entries, padded with ``''`` when fewer candidates
        exist (all ``''`` when ``item`` is unknown).
    """
    # Filter once instead of building a set of the whole column on every call.
    candidates = llrDf[llrDf['Item1'] == item]
    if candidates.empty:
        return [''] * top_n

    ranked = candidates.sort_values('Llr', ascending=False).reset_index(drop=True)
    noise = np.random.normal(0, np.sqrt(np.log(epsilon)), len(ranked))
    ranked['Rank'] = ranked.index
    ranked['ditheredRank'] = np.log(ranked['Rank'] + 1) + noise

    picks = ranked.sort_values('ditheredRank').head(top_n)['Item2'].to_list()
    return picks + [''] * (top_n - len(picks))

In [75]:
# One dithered item-item recommendation list per distinct product in the test set.
test_products = test['Product'].dropna().unique()
productRecs = pd.DataFrame({'Product': test_products}).apply(
    lambda row: ditheredLlr(row['Product']),
    axis=1,
)

# User-item recommendations

In [77]:
# One boolean column per action type: True when the session contains at
# least one event with that action.
sessionFlags = pd.DataFrame()
for flag_name, action_name in [
    ('Add', 'add_to_cart'),
    ('Remove', 'remove_from_cart'),
    ('Delete', 'delete_cart'),
    ('Order', 'order'),
    ('Change', 'change_order'),
]:
    # Bind action_name as a default so each lambda tests its own action.
    sessionFlags[flag_name] = train.groupby('SessionId')['Action'].apply(
        lambda actions, wanted=action_name: wanted in set(actions)
    )

Since we are going to use the `implicit` package, we need to create a sparse item-user matrix. Each matrix entry is the confidence we have in the user's preference for that item. Preferences are binary and are inferred from implicit feedback, which in our case is the 'Action' information.

In [78]:
def userRating(x):
    """Map a session's action flags to an ordinal rating from 0 to 5.

    Parameters
    ----------
    x : mapping with boolean entries 'Add', 'Order', 'Remove', 'Delete', 'Change'.

    Returns
    -------
    int
        0  nothing was added to the cart
        1  added, but not ordered
        2  added and ordered, and a cart was deleted
        3  added and ordered, with a removal (no cart deletion)
        4  added and ordered, with an order change (no removal or deletion)
        5  added and ordered with no removal, deletion or change
    """
    # Return an explicit 0 rather than falling through to None, so the
    # Rating column stays numeric instead of holding missing values.
    if not x['Add']:
        return 0
    if not x['Order']:
        return 1
    if x['Delete']:
        return 2
    if x['Remove']:
        return 3
    if x['Change']:
        return 4
    return 5

In [79]:
# Distinct product rows per session (timestamp/event/action detail dropped),
# joined with that session's action flags and scored with userRating.
user_session_item = (
    train[train['Product'].notna()]
    .drop(columns=['TimeStamp', 'Event', 'Action'])
    .drop_duplicates()
    .merge(sessionFlags, how='left', on='SessionId')
)
user_session_item['Rating'] = user_session_item.apply(userRating, axis=1)
user_session_item = user_session_item.drop(
    columns=['Add', 'Remove', 'Delete', 'Order', 'Change']
)

In [80]:
# Sum ratings over sessions for each (IpId, Product) pair, then turn the
# total into a confidence weight: 1 + 40 * rating.
user_item = (
    user_session_item
    .drop(columns='SessionId')
    .groupby(['IpId', 'Product'])
    .sum()
    .assign(Confidence=lambda totals: 1 + 40 * totals['Rating'])
    .drop(columns='Rating')
)

In [81]:
# Pivot IpId into the columns (rows stay indexed by Product), then convert
# the dense frame to a SciPy CSR matrix via pandas' sparse accessor.
item_user = user_item.unstack(level=0)
dtype = pd.SparseDtype()
item_user_sparse = (
    item_user
    .astype(dtype)
    .sparse.to_coo()
    .tocsr()
)

In [82]:
# Fit an alternating-least-squares factorisation with 10 latent factors on
# the sparse confidence matrix (rows = products, columns = users).
# NOTE(review): which orientation fit() expects (item-user vs user-item)
# depends on the installed `implicit` version; confirm against the version in use.
# NOTE(review): no random seed is passed, so the fitted factors presumably
# differ between runs. Verify if reproducibility matters.
model = implicit.als.AlternatingLeastSquares(factors=10)
model.fit(item_user_sparse)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [92]:
# Positional lookups between matrix indices and identifiers:
#   user_ids[i]    -> IpId of column i of item_user
#   product_ids[j] -> Product of row j of item_user
# user_item_ is the transposed (users x products) CSR matrix.
user_ids = pd.Series(item_user.columns.get_level_values('IpId'), name='IpId')
user_item_ = item_user_sparse.transpose().tocsr()
product_ids = pd.Series(item_user.index)

In [94]:
def ditheredImplicit(user, top_n=3, epsilon=1.5):
    """Return ``top_n`` user-item recommendations for ``user``, with dithering.

    The user's positional index in ``user_ids`` is passed to
    ``model.recommend`` together with ``user_item_``. The returned items are
    mapped back to product identifiers through ``product_ids``. Each rank r
    (0-based) becomes ``log(r + 1) + N(0, sqrt(log(epsilon)))`` and the items
    are re-sorted by that noisy score.

    Parameters
    ----------
    user : identifier to look up in ``user_ids``.
    top_n : int, default 3
        Length of the returned list.
    epsilon : float, default 1.5
        Dithering strength; the noise standard deviation is ``sqrt(log(epsilon))``.

    Returns
    -------
    list
        Exactly ``top_n`` entries, padded with ``''`` when fewer are
        available (all ``''`` for an unknown user or when the model returns
        nothing).
    """
    # Locate the user directly instead of building a set of all ids per call.
    positions = user_ids.index[user_ids == user]
    if len(positions) == 0:
        return [''] * top_n
    user_id = positions[0]

    # NOTE(review): assumes recommend() yields a sequence of (item_index, score)
    # pairs; confirm against the installed `implicit` version.
    raw_recs = model.recommend(user_id, user_item_)
    if len(raw_recs) == 0:
        # Nothing to rank; the noise draw below would fail on an empty frame.
        return [''] * top_n

    scored = pd.DataFrame(
        [(product_ids.iloc[rec[0]], rec[1]) for rec in raw_recs],
        columns=['Product', 'Score'],
    )
    noise = np.random.normal(0, np.sqrt(np.log(epsilon)), len(scored))
    scored['Rank'] = scored.index
    scored['ditheredRank'] = np.log(scored['Rank'] + 1) + noise

    picks = scored.sort_values('ditheredRank').head(top_n)['Product'].to_list()
    return picks + [''] * (top_n - len(picks))

# Recommendation engine

In [124]:
def recommendations(x):
    """Combine both recommenders for one event row.

    ``x`` must provide 'Product' and 'IpId'. The item-item list for the
    product and the user-item list for the user are concatenated and
    shuffled in place, so the two sources are interleaved at random.
    """
    item_based = ditheredLlr(x['Product'])
    user_based = ditheredImplicit(x['IpId'])
    mixed = [*item_based, *user_based]
    random.shuffle(mixed)
    return mixed

In [127]:
# First full day of the test period. The bound is the start of the next day
# so events stamped 23:59:59 (or any fraction after it) on 2020-04-01 are
# kept, matching the inclusive 23:59:59 cutoff used for the train/test split.
test_1d = test[test['TimeStamp'] < datetime(2020, 4, 2)]

In [128]:
# Replay the day's events in chronological order and attach a list of
# recommendations to each one. The recommenders draw random noise, so the
# lists differ from run to run.
ordered_events = test_1d.sort_values(by='TimeStamp').drop(columns='SessionId')
inData = ordered_events.assign(
    Recs=ordered_events.apply(recommendations, axis=1)
)

Unnamed: 0,IpId,TimeStamp,Event,Action,Product,Recs
39839,202064PL,2020-04-01 00:00:11,,add_to_cart,p-8734,"[, c-78, c-66_71, , p-7463, ]"
39840,202064PL,2020-04-01 00:01:12,,add_to_cart,p-7592,"[p-7463, p-8808, p-4603, p-8731, p-5131, c-78]"
39841,202064PL,2020-04-01 00:04:48,,add_to_cart,p-9890,"[p-9891, p-6690, p-7463, c-66_71, p-9954, p-7380]"
39842,202064PL,2020-04-01 00:10:08,,remove_from_cart,,"[c-78, , , p-7214, , p-7463]"
39843,202064PL,2020-04-01 00:10:37,,remove_from_cart,,"[, , p-9110, p-7463, c-78, ]"


In [135]:
# Persist the events together with their recommendation lists to
# 'inData.csv' (path is relative to the current working directory).
inData.to_csv('inData.csv')

In [190]:
# Preview the terminal styling on the first event's recommendation list.
first_recs = inData.iloc[0]['Recs']
styled_preview = colored(first_recs, 'magenta', attrs=['bold'])
print(styled_preview)

[1m[35m['', 'c-78', 'c-66_71', '', 'p-7463', ''][0m


In [193]:
# Stream the events as a sliding window: each frame clears the cell output
# and prints a table of WINDOW_ROWS consecutive events with their
# recommendations highlighted, then advances the window by one row.
WINDOW_ROWS = 8      # events visible per frame
N_REC_COLS = 6       # recommendation columns (3 item-item + 3 user-item)
FRAME_DELAY = .35    # seconds between frames

# Loop-invariant pieces are computed once rather than on every frame.
event_cols = inData.drop('Recs', axis=1)
header = event_cols.columns.to_list() + ['Rec%d' % (i + 1) for i in range(N_REC_COLS)]
n_rows = inData.shape[0]
# Include the final window (start index n_rows - WINDOW_ROWS) so the last
# event is displayed, and still show one frame when fewer than WINDOW_ROWS
# events exist.
n_frames = max(n_rows - WINDOW_ROWS + 1, 1) if n_rows else 0

for k in range(n_frames):
    clear_output(wait=True)
    table = BeautifulTable()
    table.columns.width = [8, 13, 8, 18, 10, 9, 9, 9, 9, 9, 9]
    table.set_style(BeautifulTable.STYLE_BOX_ROUNDED)
    table.columns.header = header
    for j in range(k, min(k + WINDOW_ROWS, n_rows)):
        # Move real recommendations to the front and pad with blanks.
        recs = [rec for rec in inData.iloc[j]['Recs'] if rec != '']
        recs = (recs + [''] * N_REC_COLS)[:N_REC_COLS]
        # Blank out missing values by testing for them, not by deleting the
        # substring 'nan' (which would also mangle legitimate values).
        cells = ['' if pd.isna(cell) else str(cell) for cell in event_cols.iloc[j]]
        table.rows.append(cells + [colored(rec, 'magenta', attrs=['bold']) for rec in recs])
    print(table)
    time.sleep(FRAME_DELAY)

╭────────┬─────────────┬────────┬──────────────────┬──────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────╮
│  IpId  │  TimeStamp  │ Event  │      Action      │ Product  │  Rec1   │  Rec2   │  Rec3   │  Rec4   │  Rec5   │  Rec6   │
├────────┼─────────────┼────────┼──────────────────┼──────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤
│ 8153PL │ 2020-04-01  │ Leave  │                  │          │ [1m[35mc-66_71[0m │ [1m[35mp-10150[0m │ [1m[35mc-66_27[0m │         │         │         │
│        │  19:42:53   │        │                  │          │         │         │    [1m[35m4[0m    │         │         │         │
├────────┼─────────────┼────────┼──────────────────┼──────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤
│ 27750P │ 2020-04-01  │  Flit  │                  │          │  [1m[35mc-65[0m   │ [1m[35mp-4128[0m  │ [1m[35mp-10018[0m │         │         │         │
│   L    │  19:43:09   │        │        

KeyboardInterrupt: 