# Modules

In [1]:
import datetime
import random
import re
from itertools import groupby

import numpy as np
import pandas as pd
import plotly.offline

# Clickstream Generation

## Initial parameters

In [2]:
# Seed both random number generators used below (the stdlib `random` module and
# NumPy's global generator) so that re-running the notebook from a fresh kernel
# regenerates the same clickstream.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Sizes of the synthetic universe: distinct pages, sessions and customers.
N_PAGES = 5
N_SESSIONS = 50
N_CUSTOMERS = 30

# Identifier vectors of the form 'page_1', 'sess_1', 'cust_1', ... (1-based suffix).
page_vec = np.array(['page_{}'.format(k) for k in range(1, N_PAGES + 1)])
sess_vec = np.array(['sess_{}'.format(k) for k in range(1, N_SESSIONS + 1)])
cust_vec = np.array(['cust_{}'.format(k) for k in range(1, N_CUSTOMERS + 1)])

## SessionId

In [3]:
# Number of click rows per session: a log-normal draw rounded to the nearest
# integer, with a floor of one so that every session has at least one row.
# The counts are cast to int because np.repeat expects integer repeat counts;
# np.round alone leaves them as float64.
sess_rep = np.round(np.exp(np.random.randn(len(sess_vec)))).astype(int)
sess_rep = np.where(sess_rep == 0, 1, sess_rep)

# Repeat each session id once per click row of that session.
SessionId = np.repeat(sess_vec, sess_rep)

## CustId

In [4]:
# Run-length encode SessionId: one (session id, row count) pair per run of equal ids.
cust_draw = [(session, len(list(run))) for session, run in groupby(SessionId)]
cust_rep = [n_rows for _, n_rows in cust_draw]

# Draw one customer per session (with replacement), then repeat that customer
# for every row of the session so CustId lines up with SessionId.
session_customers = np.random.choice(cust_vec, size=len(sess_vec), replace=True)
CustId = np.repeat(session_customers, cust_rep)

## Page

In [5]:
# One randomly chosen page (with replacement) for every click row.
Page = np.random.choice(page_vec, size=len(SessionId), replace=True)

## Timestamp

In [6]:
# Window from which session start times are sampled: 2018-07-07, midnight to 23:59:59.
begin_date = datetime.datetime(year=2018, month=7, day=7)
end_date = begin_date.replace(hour=23, minute=59, second=59)

In [7]:
def sample_datetime(begin, end):
    """Return a random datetime in ``[begin, end)`` at whole-second resolution.

    Parameters
    ----------
    begin, end : datetime.datetime
        Bounds of the sampling window. ``end`` must lie at least one full
        second after ``begin``.

    Returns
    -------
    datetime.datetime
        ``begin`` plus a randomly drawn whole number of seconds that is
        strictly smaller than the window length.

    Raises
    ------
    ValueError
        If the window is shorter than one second (raised by
        ``random.randrange`` for an empty range).
    """
    # total_seconds() returns a float, while randrange() is only defined for
    # integers (float arguments are rejected on recent Python versions), so
    # truncate the window length to whole seconds first.
    window_seconds = int((end - begin).total_seconds())
    offset_seconds = random.randrange(window_seconds)

    return begin + datetime.timedelta(seconds=offset_seconds)

In [8]:
def generate_ts():
    """Generate one timestamp per click row, session by session.

    Reads the notebook globals ``sess_vec``, ``cust_rep``, ``begin_date`` and
    ``end_date``. For each session a start time is sampled between
    ``begin_date`` and ``end_date``; every click of that session is then
    sampled from the 1800 seconds following that start.

    Returns
    -------
    numpy.ndarray
        The sampled datetimes, concatenated in session order.
    """
    session_window = datetime.timedelta(seconds=1800)
    stamps = []

    for session_idx, _session in enumerate(sess_vec):
        session_start = sample_datetime(begin_date, end_date)
        n_clicks = cust_rep[session_idx]
        session_deadline = session_start + session_window

        # Grow the list in place rather than rebuilding it for every session.
        stamps.extend(
            sample_datetime(session_start, session_deadline)
            for _ in range(n_clicks)
        )

    return np.array(stamps)

In [9]:
# Sample the click timestamps; generate_ts() reads sess_vec, cust_rep,
# begin_date and end_date from the notebook namespace.
Timestamp = generate_ts()

## Pandas DF

Create the DataFrame from the generated columns.

In [10]:
# Column name -> generated array, in the order the columns should appear.
d = dict(SessionId=SessionId, CustId=CustId, Page=Page, Timestamp=Timestamp)

In [11]:
# One row per click, one column per entry of `d`.
clickstream = pd.DataFrame(data=d)

Sort the rows by session, customer and timestamp.

In [12]:
# Order the clicks chronologically within each session. drop=True discards the
# previous row index instead of materialising it as a stray 'index' column.
# Note: SessionId is a string, so sessions sort lexicographically
# (sess_1, sess_10, sess_11, ...), not numerically.
clickstream = (
    clickstream
    .sort_values(['SessionId', 'CustId', 'Timestamp'])
    .reset_index(drop=True)
)

Keep the four base columns in a fixed order.

In [13]:
# Keep exactly these columns, in this order; any other column is dropped.
column_order = ['SessionId', 'CustId', 'Page', 'Timestamp']
clickstream = clickstream[column_order]

In [67]:
# Per-session features, broadcast back onto every click row of the session.
by_session = clickstream.groupby('SessionId')

# Built-in group-by kernels replace the Python lambdas; the values are the same.
clickstream['PageCnt'] = by_session['Page'].transform('size')      # clicks in the session
clickstream['PageUnq'] = by_session['Page'].transform('nunique')   # distinct pages in the session
clickstream['Path'] = by_session['Page'].transform(lambda pages: '[%s]' % ','.join(pages))

# Time on page = next click of the same session minus this click
# (NaT on the last click of each session).
clickstream['Duration'] = by_session['Timestamp'].shift(-1) - clickstream['Timestamp']
clickstream['SessionDuration'] = (
    by_session['Timestamp'].transform('max') - by_session['Timestamp'].transform('min')
)

# Quantile bucket of the session's page count. PageCnt is heavily tied, so
# several of the 5 requested quantile edges can coincide; duplicates='drop'
# then leaves fewer bins, and how many survive depends on the random data.
# The labels are therefore derived from the surviving bin count instead of
# being hard-coded to four (a mismatch makes qcut raise ValueError).
# With four bins this yields 0.25, 0.5, 0.75, 1.0.
_, page_cnt_edges = pd.qcut(clickstream['PageCnt'], 5, duplicates='drop', retbins=True)
n_bins = len(page_cnt_edges) - 1
page_cnt_labels = [(k + 1) / float(n_bins) for k in range(n_bins)]
clickstream['PageCntDist'] = pd.qcut(
    clickstream['PageCnt'], 5, duplicates='drop', labels=page_cnt_labels
)

In [68]:
# Preview the first rows of the clickstream, including the derived columns.
clickstream.head()

Unnamed: 0,SessionId,CustId,Page,Timestamp,PageCnt,PageUnq,Path,Duration,SessionDuration,PageCntDist
0,sess_1,cust_18,page_3,2018-07-07 10:15:51,1,1,[page_3],NaT,00:00:00,0.25
1,sess_10,cust_21,page_1,2018-07-07 23:10:17,3,3,"[page_1,page_4,page_3]",00:00:44,00:04:56,0.5
2,sess_10,cust_21,page_4,2018-07-07 23:11:01,3,3,"[page_1,page_4,page_3]",00:04:12,00:04:56,0.5
3,sess_10,cust_21,page_3,2018-07-07 23:15:13,3,3,"[page_1,page_4,page_3]",NaT,00:04:56,0.5
4,sess_11,cust_2,page_3,2018-07-07 16:45:46,1,1,[page_3],NaT,00:00:00,0.25


## Visualization

In [100]:
# NOTE(review): `py` is not referenced in any cell visible here; newer plotly
# releases reportedly moved `plotly.plotly` into the separate chart-studio
# package, so confirm whether this import is still needed.
import plotly.plotly as py
import plotly.graph_objs as go

In [101]:
# Switch plotly to offline notebook rendering before any figure is displayed.
plotly.offline.init_notebook_mode(connected=True)

In [102]:
# Marker-only scatter: clicks per session (x) against distinct pages per
# session (y), one point per click row.
trace = go.Scatter(
    mode='markers',
    x=np.array(clickstream['PageCnt']),
    y=np.array(clickstream['PageUnq']),
)

In [107]:
# Both axis titles share the same font; define it once and copy it per axis.
axis_title_font = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f',
)

layout = go.Layout(
    title='test correlation',
    xaxis=dict(title='PageCnt', titlefont=dict(axis_title_font)),
    yaxis=dict(title='PageUnq', titlefont=dict(axis_title_font)),
)

In [108]:
# Combine the scatter trace with the layout. The trace list is built here so
# the cell does not depend on a `data` variable that no visible cell defines.
fig = go.Figure(data=[trace], layout=layout)

In [109]:
# Render the figure inline in the notebook.
plotly.offline.iplot(fig)