In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): this switches the working directory to an absolute, machine-specific
# location, so the notebook only runs where this exact path exists.
%cd E:\Study\Coding Tests\Inkredo events_log.csv

E:\Study\Coding Tests\Inkredo events_log.csv


In [4]:
#First we import the relevant libraries, then read the data into a DataFrame.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns

In [5]:
# Load the event log from the current working directory and preview its first rows.
events_csv = "events_log.csv"
raw_data = pd.read_csv(events_csv, compression="infer")
raw_data.head()

Unnamed: 0,uuid,timestamp,session_id,group,action,checkin,page_id,n_results,result_position
0,00000736167c507e8ec225bd9e71f9e5,20160300000000.0,78245c2c3fba013a,b,searchResultPage,,cbeb66d1bc1f1bc2,5.0,
1,00000c69fe345268935463abbfa5d5b3,20160300000000.0,c559c3be98dca8a4,a,searchResultPage,,eb658e8722aad674,10.0,
2,00003bfdab715ee59077a3670331b787,20160300000000.0,760bf89817ce4b08,a,checkin,30.0,f99a9fc1f7fdd21e,,
3,0000465cd7c35ad2bdeafec953e08c1a,20160300000000.0,fb905603d31b2071,a,checkin,60.0,e5626962a6939a75,,10.0
4,000050cbb4ef5b42b16c4d2cf69e6358,20160300000000.0,c2bf5e5172a892dc,a,checkin,30.0,787dd6a4c371cbf9,,


In [12]:
# Preview the first five timestamp values together with the column's dtype.
data["timestamp"].head(n=5)

0    20160300000000
1    20160300000000
2    20160300000000
3    20160300000000
4    20160300000000
Name: timestamp, dtype: int64

In [13]:
# Cast the timestamp column to integers with a vectorized conversion instead of
# calling int() on every element. The width is spelled out as "int64" because values
# such as 20160300000000 do not fit in a 32-bit integer.
data['timestamp'] = data['timestamp'].astype('int64')

In [17]:
# Look at the timestamp column again to confirm its values and integer dtype.
data["timestamp"].head(n=5)

0    20160300000000
1    20160300000000
2    20160300000000
3    20160300000000
4    20160300000000
Name: timestamp, dtype: int64

In [18]:
#Next we verify that the values in the categorical columns match the ones listed in the
#column definitions, starting with the distinct values of 'group'.

pd.unique(data['group'])


array(['b', 'a'], dtype=object)

In [19]:
# Distinct values of the 'action' column, in order of first appearance.
pd.unique(data['action'])

array(['searchResultPage', 'checkin', 'visitPage'], dtype=object)

In [20]:
#Next, we check that every event either has a checkin time, or isn't a checkin event.
#The condition is evaluated row by row. Comparing row counts instead could still come out
#True if checkin rows without a time were offset by non-checkin rows that carry one.
#Note: this asserts only the stated direction; it does not require non-checkin rows
#to have a null checkin value.

is_not_checkin = data['action'] != 'checkin'
has_checkin_time = data['checkin'].notna()
# bool(...) keeps the displayed result a plain True/False.
bool((is_not_checkin | has_checkin_time).all())

True

In [21]:
#Next, we check that every searchResultPage event has a non-null value for n_results.
#The condition is evaluated row by row. Comparing the number of search events with the
#number of non-null n_results values could still match if search rows without n_results
#were offset by non-search rows that carry one.

is_not_search = data['action'] != 'searchResultPage'
has_n_results = data['n_results'].notna()
# bool(...) keeps the displayed result a plain True/False.
bool((is_not_search | has_n_results).all())

True

In [22]:
#Next, we check that every non-searchResultPage event has a result position. Equivalently,
#we are checking that every non-search event is coming from a search event.
#The condition is evaluated row by row, so the answer reflects the rows themselves rather
#than whether two row counts happen to be equal.

is_search = data['action'] == 'searchResultPage'
has_result_position = data['result_position'].notna()
# bool(...) keeps the displayed result a plain True/False.
bool((is_search | has_result_position).all())

False

In [23]:
#It turns out that this isn't the case.
#To see what is happening, look at the first few non-search events that have no result position:

is_non_search = data['action'] != 'searchResultPage'
lacks_position = data['result_position'].isna()
data.loc[is_non_search & lacks_position, :].head()

Unnamed: 0,uuid,timestamp,session_id,group,action,checkin,page_id,n_results,result_position
2,00003bfdab715ee59077a3670331b787,20160300000000,760bf89817ce4b08,a,checkin,30.0,f99a9fc1f7fdd21e,,
4,000050cbb4ef5b42b16c4d2cf69e6358,20160300000000,c2bf5e5172a892dc,a,checkin,30.0,787dd6a4c371cbf9,,
5,0000a6af2baa5af1be2431e84cb01da1,20160300000000,f6840a9614c527ad,a,checkin,180.0,6fb7b9ea87012975,,
6,0000cd61e11d5371adf974703cd4f7e7,20160300000000,51f4d3b6a8688e56,a,checkin,240.0,8ad97e7c85c58e80,,
15,00025ad0bcbd552b8b0e7c73454ca872,20160300000000,056145b0bb0d8b0a,a,checkin,10.0,e5509b31fb1687f8,,


In [24]:
# Size (rows, columns) of the selection of non-search events without a result position.
data[(data['action'] != 'searchResultPage') & data['result_position'].isna()].shape

(33449, 9)

In [25]:
# Pull every event belonging to one session and order the rows by timestamp.
session_events = data[data['session_id'] == '760bf89817ce4b08']
session_events.sort_values('timestamp')

Unnamed: 0,uuid,timestamp,session_id,group,action,checkin,page_id,n_results,result_position
2,00003bfdab715ee59077a3670331b787,20160300000000,760bf89817ce4b08,a,checkin,30.0,f99a9fc1f7fdd21e,,
324347,cf3f95e7327e537499927e0488ae38be,20160300000000,760bf89817ce4b08,a,visitPage,,ea7f2e3f09201d2e,,3.0
323921,cf011a4644495fffa2e3b3c0c8c60244,20160300000000,760bf89817ce4b08,a,visitPage,,c466f424fcdb3629,,
312322,c791a922ebfe57329a8ea6c694c203ab,20160300000000,760bf89817ce4b08,a,checkin,20.0,b25a59573344207a,,1.0
286145,b6bfbc2c31895463a640a0f7fe34dbc9,20160300000000,760bf89817ce4b08,a,checkin,90.0,4d306d4827f4fade,,1.0
264878,a934b730fc705b8dabb147c777cb98a2,20160300000000,760bf89817ce4b08,a,searchResultPage,,7d9fbc3f887e9ad9,20.0,
263572,a860b6da92be503684476d955eb133b5,20160300000000,760bf89817ce4b08,a,checkin,50.0,b25a59573344207a,,1.0
258001,a4d0c84fea1d51368b2ec2a2587e1f4b,20160300000000,760bf89817ce4b08,a,checkin,40.0,f99a9fc1f7fdd21e,,
241951,9a91dd074ce15bfba436ab4ddceaf817,20160300000000,760bf89817ce4b08,a,checkin,60.0,b25a59573344207a,,1.0
235023,962aa581f1f4570b81e71f6faa52d70e,20160300000000,760bf89817ce4b08,a,visitPage,,b25a59573344207a,,1.0
