In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import re

In [2]:
# Load the raw click-stream; the first CSV column is used as the row index.
# The id column is renamed in the same expression so that the frame is never
# visible under its original 'SessionID' spelling.
sessions = (
    pd.read_csv('sessions.csv', index_col=0)
      .rename(columns={'SessionID': 'SessionId'})
)
sessions.shape

(112256, 6)

**From the documentation**: *TimeStamp – long integer value specifying the UTC date and time of the arrival of a request, coded as the number of 100-nanosecond intervals that have elapsed since 00:00:00 UTC on 1st January, 1 A.D.*

In [3]:
# TimeStamp is documented as a count of 100-nanosecond ticks since
# 0001-01-01 00:00:00 UTC. pandas counts nanoseconds since 1970-01-01, so
# shift the origin and rescale.
#
# The offset is kept as an *integer*: a float offset would promote the whole
# int64 column to float64, which cannot represent values of this magnitude
# exactly and would silently round each timestamp by a few microseconds.
seconds = int((datetime(1970, 1, 1) - datetime(1, 1, 1)).total_seconds())
ticks = seconds * 10**7  # 10**7 ticks of 100 ns per second

# Guard so the cell can be re-run safely: once the column is already a
# datetime, subtracting `ticks` again would raise.
if not pd.api.types.is_datetime64_any_dtype(sessions['TimeStamp']):
    # 1 tick = 100 ns; to_datetime interprets bare numbers as nanoseconds.
    sessions['TimeStamp'] = pd.to_datetime((sessions['TimeStamp'] - ticks) * 100)
sessions.head()

Unnamed: 0,SessionId,IpId,TimeStamp,Event,Action,Product
0,0,100095PL,2020-01-13 21:35:36,Leave,,
1,1,100095PL,2020-04-18 16:14:40,Arrive,,
2,1,100095PL,2020-04-18 16:15:26,,add_to_cart,p-9967
3,1,100095PL,2020-04-18 16:33:07,,order,
4,1,100095PL,2020-04-18 16:34:25,Leave,,


In [4]:
# Chronological hold-out: rows stamped up to and including the cutoff go to
# `train`, rows stamped after it go to `test`.
split_cutoff = datetime(2020, 3, 31, 23, 59, 59)
stamps = sessions['TimeStamp']
train = sessions.loc[stamps <= split_cutoff]
test = sessions.loc[stamps > split_cutoff]

In [5]:
# Number of distinct sessions in each partition (train first, then test).
for partition in (train, test):
    print(partition['SessionId'].nunique())

25877
14423


In [6]:
# Share of all distinct sessions that appear in each partition. A session
# whose rows fall on both sides of the cutoff is counted in both partitions,
# so the two fractions may add up to slightly more than 1.
total_sessions = sessions['SessionId'].nunique()
for partition in (train, test):
    print(partition['SessionId'].nunique() / total_sessions)

0.6421410491835823
0.3579085810710209


In [9]:
# Distinct product identifiers seen in the training rows (missing values
# removed first), in order of first appearance.
known_products = train['Product'].dropna()
items = known_products.unique()
print(items.shape)
items

(4038,)


array(['p-7414', 'p-6769', 'p-4466', ..., 'p-10522', 'p-2788', 'p-8202'],
      dtype=object)

In [11]:
# Persist the product vocabulary, one identifier per line.
np.savetxt(fname='shopItems.txt', X=items, fmt='%s')

First we create a session_item table. It is our assumption that the most useful recommendations stem from looking at products purchased together in the same session, rather than simply by the same user. We will do user_item later.

In [8]:
# Keep only the training rows that reference a product, then collapse each
# session into the set of distinct products it touched.
has_product = train['Product'].notnull()
withProduct = train[has_product]
session_item = (
    withProduct
    .groupby('SessionId')['Product']
    .apply(set)
    .to_frame()
)
print(session_item.shape)
session_item

(8401, 1)


Unnamed: 0_level_0,Product
SessionId,Unnamed: 1_level_1
12,{p-7414}
19,"{p-6769, p-8068, p-4466}"
20,"{c-34_110, c-34_307, c-34_356}"
21,"{c-34_113, c-34_356, p-1775, c-34_118}"
43,"{c-39_42, c-39_41, c-39_40}"
...,...
40274,{p-2769}
40275,{p-9967}
40276,{p-3618}
40281,"{p-8990, c-65, p-8755, p-6637}"


In [12]:
# Write one basket per line: the distinct products of a session, tab-separated.
# The products are sorted before joining because the iteration order of a set
# of strings is not stable between interpreter runs; without the sort the
# file contents would change every time the notebook is re-executed.
(
    withProduct
    .groupby('SessionId')['Product']
    .apply(lambda products: '\t'.join(sorted(set(products))))
    .to_csv('sessionBaskets.txt', header=False, index=False)
)

In [9]:
# Frequency of each Action value in the training rows, most common first.
action_counts = train['Action'].value_counts()
action_counts

add_to_cart         20226
order                3484
remove_from_cart     1900
change_order          244
delete_cart            86
Name: Action, dtype: int64

In [10]:
# Mean number of 'order' actions per calendar day of the training window.
# Both figures are derived from `train` instead of being typed in by hand, so
# the ratio stays consistent with whatever the split cell produced.
# NOTE(review): the previous hard-coded denominator was 120 days, while the
# training rows are cut off at 2020-03-31 -- confirm which period was intended.
n_orders = train['Action'].eq('order').sum()
first_day = train['TimeStamp'].min().normalize()
last_day = train['TimeStamp'].max().normalize()
n_days = (last_day - first_day).days + 1  # inclusive of both end days
n_orders / n_days

29.033333333333335

Is it possible the mean number of orders per day could be as few as 30?

In [11]:
# One boolean column per action type: True when the session contains at least
# one row with that Action value. Rows with a missing Action never match.
flag_actions = {
    'Add': 'add_to_cart',
    'Remove': 'remove_from_cart',
    'Delete': 'delete_cart',
    'Order': 'order',
    'Change': 'change_order',
}
# Vectorised equivalent of "action in set(group)": compare the whole column
# once, then ask per session whether any row matched.
sessionFlags = pd.DataFrame({
    flag: train['Action'].eq(action).groupby(train['SessionId']).any()
    for flag, action in flag_actions.items()
})
sessionFlags.head(20)

Unnamed: 0_level_0,Add,Remove,Delete,Order,Change
SessionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,False
11,False,False,False,False,False
12,True,False,False,False,False
13,False,False,False,False,False
14,False,False,False,False,False
15,False,False,False,False,False


In [12]:
# Boolean masks are used directly (and negated with ~) rather than compared
# against True/False, and each mask is built once and reused.
added = sessionFlags['Add']
ordered = sessionFlags['Order']
nothing_removed = ~sessionFlags['Remove'] & ~sessionFlags['Delete']
order_unchanged = ~sessionFlags['Change']

# Sessions that ordered without any add_to_cart event.
print(sessionFlags[~added & ordered].shape)
# Sessions that both added to the cart and ordered.
print(sessionFlags[added & ordered].shape)
# ... and never removed an item or deleted the cart.
print(sessionFlags[added & ordered & nothing_removed].shape)
# ... and additionally never changed the order.
print(sessionFlags[added & ordered & nothing_removed & order_unchanged].shape)
# All sessions with at least one add_to_cart event.
print(sessionFlags[added].shape)

(28, 5)
(3283, 5)
(2824, 5)
(2660, 5)
(8436, 5)


### The following code was abandoned in favor of an approach that uses a more efficient data structure together with itertools; the replacement can be found in item_item_iter.

In [None]:
# This did not run despite being left overnight. The notebook crashed.
# item_dict = {}
# for item in items:
#     item_dict[item] = {}
#     for jtem in items:
#         if jtem != item:
#             k = np.sum(session_item['Product'].apply(lambda y: jtem in y and item in y))
#             if k != 0:
#                 item_dict[item][jtem] = np.array([
#                     k,
#                     np.sum(session_item['Product'].apply(lambda y: jtem not in y and item in y)),
#                     np.sum(session_item['Product'].apply(lambda y: jtem in y and item not in y)),
#                     np.sum(session_item['Product'].apply(lambda y: jtem not in y and item not in y))
#                 ])                
# item_dict

In [None]:
def shannon(v):
    """Return ``sum(p * ln(p))`` for the distribution obtained by normalising ``v``.

    This is the *negative* of the Shannon entropy (natural logarithm) of
    ``p = v / sum(v)``.

    Parameters
    ----------
    v : array-like of numbers
        Counts or weights. Lists, tuples and NumPy arrays are all accepted.

    Returns
    -------
    float
        ``sum(p_i * ln(p_i))``, where zero entries contribute 0. Returns 0.0
        when the entries sum to zero, since there is no distribution to
        normalise.
    """
    # Coerce first: with a plain list, `v == 0` is a single False rather than
    # an element-wise mask, which would disable the zero guard below.
    counts = np.asarray(v, dtype=float)
    total = np.sum(counts)
    if total == 0:
        return 0.0
    p = counts / total
    # Replace zeros by 1 inside the log so that 0 * ln(0) evaluates to 0
    # instead of NaN.
    return np.dot(p, np.log(np.where(p == 0, 1.0, p)))

In [None]:
def llr(v):
    """Log-likelihood-ratio score computed from four counts.

    ``v`` is read as four cells ``v[0] .. v[3]``. The score is

        2 * N * (shannon(cells) - shannon(first marginal) - shannon(second marginal))

    where ``N = sum(v)``, the first marginal is ``(v[0]+v[1], v[2]+v[3])`` and
    the second marginal is ``(v[0]+v[2], v[1]+v[3])``.

    Parameters
    ----------
    v : array-like with at least four numeric entries
        Presumably the flattened 2x2 co-occurrence table of two items --
        confirm the cell order against the code that builds it.

    Returns
    -------
    float
        The score described above.
    """
    cells = np.asarray(v, dtype=float)
    # The marginals are passed as arrays, not lists: `shannon` relies on an
    # element-wise `== 0` test to neutralise empty cells, and that test is
    # only element-wise for arrays. With lists a zero marginal produced NaN.
    first_marginal = np.array([cells[0] + cells[1], cells[2] + cells[3]])
    second_marginal = np.array([cells[0] + cells[2], cells[1] + cells[3]])
    return 2 * np.sum(cells) * (
        shannon(cells) - shannon(first_marginal) - shannon(second_marginal)
    )