## This is the preprocessing for the 1-day data. For the full dataset, these techniques were imitated using a SQLite database.

In [21]:
import numpy as np
import pandas as pd
from datetime import datetime
import re
import csv

In [22]:
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling for the notebook: start from seaborn's default theme, then
# set the plotting context and the colour palette explicitly.
sns.set_theme()
sns.set_context('notebook')
sns.set_palette('viridis')

The use of 'usecols' below is to avoid bringing in 'UserId' which does not contain any information.

In [23]:
# Columns to load from the 1-day log; 'UserId' is deliberately not listed.
usecols = [
    'IpId',
    'TimeStamp',
    'HttpMethod',
    'Uri',
    'ResponseCode',
    'Bytes',
    'Referrer',
    'UserAgent',
]
eclog_1d = pd.read_csv('eclog_1day.csv', usecols=usecols)

First we remove any rows with an Ip that has at least once self-identified as a bot.

In [24]:
# Flag every request whose UserAgent contains 'bot' (case-insensitive).
# na=False counts a missing UserAgent as "not a bot" instead of raising.
mask = eclog_1d['UserAgent'].str.contains('bot', case=False, na=False)
# Every Ip that self-identified as a bot at least once ...
botIps = eclog_1d.loc[mask, 'IpId'].unique()
# ... has all of its rows removed; isin replaces a per-row scan of botIps.
eclog_1d = eclog_1d[~eclog_1d['IpId'].isin(botIps)]

Next we remove any rows with an Ip that is associated with at least one Uri signifying use of the 'management' system.

In [25]:
# Flag requests whose Uri starts with '/zarzadzanie/' (case-insensitive).
# str.match anchors at the start of the string, like re.match; na=False
# counts a missing Uri as "not management" instead of raising.
mask = eclog_1d['Uri'].str.match('/zarzadzanie/', case=False, na=False)
# Every Ip seen on such a Uri at least once ...
managementIps = eclog_1d.loc[mask, 'IpId'].unique()
# ... has all of its rows removed; isin replaces a per-row scan of the array.
eclog_1d = eclog_1d[~eclog_1d['IpId'].isin(managementIps)]

**From the documentation**: *TimeStamp – long integer value specifying the UTC date and time of the arrival of a request, coded as the number of 100-nanosecond intervals that have elapsed since 00:00:00 UTC on 1st January, 1 A.D.*

In [26]:
# Offset between 0001-01-01 (the origin of the raw TimeStamp) and the Unix epoch.
epoch_offset = datetime(1970, 1, 1) - datetime(1, 1, 1)
seconds = epoch_offset.total_seconds()
# Number of 100-nanosecond ticks in that offset, built from the whole-day
# count so it stays an exact Python int. A float offset would promote the
# tick column to float64, which cannot represent values of this magnitude
# exactly and silently rounds each timestamp by several microseconds.
ticks = epoch_offset.days * 86_400 * 10**7
# Ticks since the Unix epoch * 100 = nanoseconds since the Unix epoch, which
# is what pd.to_datetime expects for integer input.
# assumes the raw TimeStamp column was parsed as int64 - TODO confirm
# assign() builds a new frame rather than writing into a filtered slice.
eclog_1d = eclog_1d.assign(
    TimeStamp=pd.to_datetime((eclog_1d['TimeStamp'] - ticks) * 100)
)

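Next we keep only rows with an Ip that has made at least one add-to-cart request, i.e. a POST whose Uri matches 'do_koszyka'.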

In [27]:
# An add-to-cart request is a POST whose Uri contains 'do_koszyka' followed by
# a character other than a lowercase letter, '_' or '-'.
# Vectorised instead of a row-wise apply; na=False lets a missing Uri count
# as "no match" rather than raising.
is_post = eclog_1d['HttpMethod'] == 'POST'
hits_cart_uri = eclog_1d['Uri'].str.contains('do_koszyka[^a-z_-]', na=False)
mask = is_post & hits_cart_uri
# Ips with at least one such request are treated as customers ...
customerIps = eclog_1d.loc[mask, 'IpId'].unique()
# ... and only their rows are kept; isin replaces a per-row scan of the array.
eclog_1d = eclog_1d[eclog_1d['IpId'].isin(customerIps)]

Sort the remaining data into customer sessions and add helper columns that let us identify the events 'Arrive' and 'Leave' (plus 'Flit' for a session that consists of a single request). The end of a session is determined either by the Ip changing or by more than 30 minutes of inactivity.

In [28]:
# Order requests by Ip and then by time, so that neighbouring rows are
# consecutive requests from the same Ip.
sessions = eclog_1d.sort_values(by=['IpId', 'TimeStamp'])
# Attach the previous (Lag1) and next (Lead1) row's Ip and timestamp to
# every row; the first/last row gets a missing value on the open side.
sessions = sessions.assign(
    Lag1_IpId=sessions['IpId'].shift(1),
    Lag1_TimeStamp=sessions['TimeStamp'].shift(1),
    Lead1_IpId=sessions['IpId'].shift(-1),
    Lead1_TimeStamp=sessions['TimeStamp'].shift(-1),
)

In [29]:
def minutes(x, y):
    """Return the absolute time difference between x and y, in minutes.

    x and y must support subtraction yielding an object with
    total_seconds() (e.g. datetime or pandas Timestamp values).
    """
    gap_seconds = (x - y).total_seconds()
    return abs(gap_seconds) / 60

In [30]:
def aux0(x):
    """Classify a request row as a session boundary event.

    x is a row carrying 'IpId', 'TimeStamp' and the Lag1_/Lead1_ copies of
    both from the neighbouring rows.

    Returns 'Flit' when the row both starts and ends a session, 'Arrive'
    when it only starts one, 'Leave' when it only ends one, and None for a
    row in the middle of a session.
    """
    # A session starts here if the previous row belongs to another Ip or
    # lies more than 30 minutes away.
    starts_session = (
        x['IpId'] != x['Lag1_IpId']
        or minutes(x['TimeStamp'], x['Lag1_TimeStamp']) > 30
    )
    # A session ends here if the next row belongs to another Ip or lies
    # more than 30 minutes away.
    ends_session = (
        x['IpId'] != x['Lead1_IpId']
        or minutes(x['TimeStamp'], x['Lead1_TimeStamp']) > 30
    )

    if starts_session and ends_session:
        return 'Flit'
    if starts_session:
        return 'Arrive'
    if ends_session:
        return 'Leave'
    return None

In [31]:
# Label each row with its boundary event (None for mid-session rows).
sessions['Event'] = sessions.apply(aux0, axis=1)
# The lag/lead helper columns were only needed to compute the event.
sessions = sessions.drop(
    columns=['Lag1_IpId', 'Lag1_TimeStamp', 'Lead1_IpId', 'Lead1_TimeStamp']
)

At this point we need to identify some product information. Product codes can be found in both the Uri and Referrer columns.

In [32]:
def aux1(y):
    """Extract the first product code from a URL-like string.

    A product code is one of the prefixes 'p-', 'r-', 'kartapdf-', 'c-' or
    'm-' followed by one or more digits/underscores.

    Returns the matched code, or None when there is no match or when y is
    not a string (e.g. a missing value read as NaN), so that a missing
    Uri/Referrer does not raise a TypeError.
    """
    if not isinstance(y, str):
        return None
    product = re.search('p-[0-9_]+|r-[0-9_]+|kartapdf-[0-9_]+|c-[0-9_]+|m-[0-9_]+', y)
    return product.group(0) if product is not None else None

In [33]:
# Product code found in the requested Uri and in the Referrer, respectively.
sessions['UriProduct'] = sessions['Uri'].apply(aux1)
sessions['RefProduct'] = sessions['Referrer'].apply(aux1)

It would be nice if there were only one product column. Occasionally both UriProduct and RefProduct are non-null and do not agree. This typically seems to happen when the customer is moving from one product (in the Referrer URL) to the next (in the Uri). Since the Uri is then the latest information, we let it take precedence. This should have little to no effect because it happens surprisingly infrequently and never with method POST.

In [34]:
def makeproduct(x):
    """Pick a single product code for a row, preferring the Uri's.

    x is a row (or mapping) with 'UriProduct' and 'RefProduct'.

    Returns 'UriProduct' when it is present, otherwise 'RefProduct' when
    that is present, otherwise None. "Present" means not None and not NaN:
    a plain `!= None` test would treat NaN as a real product code and let a
    missing UriProduct hide a valid RefProduct.
    """
    for column in ('UriProduct', 'RefProduct'):
        if pd.notna(x[column]):
            return x[column]
    return None

In [35]:
# One product column per row: the Uri's code if any, else the Referrer's.
sessions['CurrentProduct'] = sessions.apply(makeproduct, axis=1)

In [36]:
# The two source columns are no longer needed once CurrentProduct exists.
sessions = sessions.drop(columns=['UriProduct', 'RefProduct'])

Occasionally there will be a 'cart' or 'recommend' action without a product id alongside. If we decide to take the approach of using the last identified product in that session, we will need to have that information.

In [37]:
# Start LastProduct as a copy of CurrentProduct.
sessions = sessions.assign(LastProduct=sessions['CurrentProduct'])

In [38]:
def identbegin(y):
    """Return 1 if event y opens a session ('Arrive' or 'Flit'), else 0."""
    return 1 if y in ('Arrive', 'Flit') else 0

In [39]:
# 1 on every row that opens a session, 0 elsewhere ...
forcumsum = sessions['Event'].apply(identbegin)
# ... so the running total numbers the sessions 1, 2, 3, ... in row order.
sessions['SessionId'] = forcumsum.cumsum()

In [40]:
# Within each session, carry the most recent product code forward onto rows
# that have none; rows before a session's first product stay missing.
# GroupBy.ffill() replaces fillna(method='ffill'), which is deprecated.
sessions['LastProduct'] = sessions.groupby('SessionId')['LastProduct'].ffill()

We now reduce the data to the important events and actions.

In [41]:
def important(x):
    """Tell whether a row is a session event or a cart/order action.

    x is a row (or mapping) with 'Event', 'HttpMethod' and 'Uri'.

    Returns True when the row carries an event, or when it is a POST whose
    Uri matches one of the cart/order patterns; False otherwise.
    """
    # pd.notna treats both None and NaN as "no event"; a plain `!= None`
    # test is True for NaN and would mark every such row as important.
    if pd.notna(x['Event']):
        return True
    # Every action rule requires a POST, so check the method once.
    if x['HttpMethod'] != 'POST':
        return False
    # One alternation is equivalent to OR-ing the individual searches.
    return re.search(
        'do_koszyka[^a-z_-]'
        '|koszyk_usun'
        '|usun_z_koszyka'
        '|zamowienie_realizacja'
        '|do_zmiany_danych_zamowienie',
        x['Uri'],
    ) is not None

In [42]:
# Keep only the rows that are session events or cart/order actions.
mask = sessions.apply(important, axis=1)
sessions = sessions.loc[mask]

In [44]:
def makeaction(y):
    """Map a Uri to the name of the cart/order action it represents.

    The patterns are tried in order and the first one found in y wins.
    Returns None when no pattern matches.
    """
    rules = (
        ('do_koszyka[^a-z_-]', 'add_to_cart'),
        ('koszyk_usun', 'delete_cart'),
        ('usun_z_koszyka', 'remove_from_cart'),
        ('zamowienie_realizacja', 'order'),
        ('do_zmiany_danych_zamowienie', 'change_order'),
    )
    for pattern, action in rules:
        if re.search(pattern, y) is not None:
            return action
    return None

In [45]:
# Name the action behind each row's Uri, but only for POST requests.
# Rows kept solely because they are session events may be non-POST requests
# whose Uri happens to match an action pattern. Every other rule in this
# notebook requires POST for an action, so those rows get a missing Action
# rather than a spurious one.
# NOTE(review): confirm that non-POST boundary rows should carry no Action.
uri_action = sessions['Uri'].apply(makeaction)
# assign() builds a new frame rather than writing into a filtered slice.
sessions = sessions.assign(
    Action=uri_action.where(sessions['HttpMethod'] == 'POST')
)

In [46]:
def makeproduct2(x):
    """Return the product attached to an 'add_to_cart' row.

    x is a row (or mapping) with 'Action', 'CurrentProduct' and
    'LastProduct'.

    For an 'add_to_cart' action, returns 'CurrentProduct' when it is present
    and falls back to 'LastProduct' otherwise. For any other action (or no
    action) returns None.
    """
    if x['Action'] != 'add_to_cart':
        return None
    # pd.notna treats both None and NaN as missing; a plain `!= None` test
    # would return a NaN CurrentProduct instead of falling back.
    if pd.notna(x['CurrentProduct']):
        return x['CurrentProduct']
    return x['LastProduct']

In [47]:
# Product involved in each add-to-cart row (missing for all other rows).
sessions['Product'] = sessions.apply(makeproduct2, axis=1)

In [50]:
# Reduce the frame to the columns that make up the final session table.
sessions = sessions.loc[
    :,
    ['SessionId', 'IpId', 'TimeStamp', 'Event', 'Action', 'Product'],
]

In [52]:
# Number of distinct sessions left after filtering (missing ids not counted).
sessions['SessionId'].nunique(dropna=True)

208

In [54]:
# Persist the session table; the frame's index is written as the first column.
sessions.to_csv(path_or_buf='sessions.csv')