In [1]:
import pandas as pd

In [11]:
df = pd.read_csv('web-event-data.csv')

In [12]:
df.shape

(50000, 6)

In [13]:
df.head()

Unnamed: 0,browser_id,country_code,day,event,referrer,time
0,14679f48c247f-02e4d6a07-4e4c052f-1aeaa0-14679f...,CA,2014-06-08,browse_page,http://www.reddit.com/r/Corvette/?count=25&aft...,1402190996
1,14679f48c247f-02e4d6a07-4e4c052f-1aeaa0-14679f...,CA,2014-06-08,browse_page,http://www.tailpipe.co/model/corvette?year=196...,1402191053
2,1462a54454f178-02687cf0f-18154453-fa000-1462a5...,US,2014-06-08,home,http://www.tailpipe.dev:8000/,1402191192
3,1462a54454f178-02687cf0f-18154453-fa000-1462a5...,US,2014-06-08,browse_page,http://www.tailpipe.dev:8000/price_pulse,1402191199
4,1462a54454f178-02687cf0f-18154453-fa000-1462a5...,US,2014-06-08,browse_page,http://www.tailpipe.dev:8000/price_pulse,1402191206


In [33]:
GROUP_KEY = 'browser_id'
TIME_FIELD = 'time'
EVENT_FIELD = 'event'
FUNNEL_STEPS = ['search_page', 'product_page', 'share_product']


def funnelize(group, funnel_steps):
    curr = 0
    funnel_cts = [0 for s in funnel_steps]

    for i, row in group.iterrows():
        evt = row[EVENT_FIELD]
        if evt in funnel_steps:
            idx = funnel_steps.index(evt)
            if idx <= curr + 1:
                funnel_cts[idx] += 1
                curr = idx
        
    return funnel_cts


# sort event dataset by time
df.sort_values(by=TIME_FIELD, ascending=True, inplace=True)

# step through events by group_key (i.e., browser, visit), 
# and increment event counts by funnel step if prior funnel steps occurred
funnel_cts = df.groupby(GROUP_KEY).apply(funnelize, FUNNEL_STEPS)

# format results as a dataframe
funnel = pd.DataFrame(list(funnel_cts.values), index=funnel_cts.index, columns=FUNNEL_STEPS)

funnel.head()

Unnamed: 0_level_0,search_page,product_page,share_product
browser_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1462a54454f178-02687cf0f-18154453-fa000-1462a5445503b5,73,11,6
1462a834943150-073d49c93-18154453-232800-1462a8349444a5,0,0,0
1462a88c0ea7e-01a2ddb98-18154453-fa000-1462a88c0ed84,5,2,0
1462a9527e825-0a21ebb94-18154453-232800-1462a9527e93de,0,0,0
1463debdc1d28-01850ff9c-19154453-13c680-1463debdc1e3fe,0,0,0


In [34]:
# funnel summary (aggregate)

print funnel[FUNNEL_STEPS].sum()

search_page      20658
product_page     10508
share_product     5029
dtype: int64


In [35]:
# funnel summary, deduped across group key 
# (i.e., event occurrences become 1/0 indicators instead of counts)

funnel_norm = funnel.copy()
funnel_norm[funnel_norm != 0] = 1
print funnel_norm[FUNNEL_STEPS].sum()

search_page      4259
product_page     2560
share_product    1699
dtype: int64
