In [1]:
# NOTE(review): machine-specific absolute path; point it at the folder that holds browsing_train.csv.
%cd "/Users/gxav/Documents/SIGIR2021/workspace"

/Users/gxav/Documents/SIGIR2021/workspace


In [2]:
import pandas as pd
import datetime
import numpy as np

### Set simulation parameters

In [23]:
# Number of simulated customers.
number_customers = 10000
# Window (start, end) in which each customer's first session is drawn.
first_sessions_simu_period = (datetime.datetime(2015, 1, 1), datetime.datetime(2025, 1, 1))
# Parameters (n, p) of the negative binomial distributions from which we draw, per customer:
#   - the number of sessions
#   - the average number of days between sessions
NB_params_number_of_sessions = (4, 0.1)
NB_params_avg_days_between_sessions = (1.5, 0.1)
# Seed NumPy's global generator so the random draws made further down
# are reproducible when the notebook is re-run from this cell.
RANDOM_SEED = 2021
np.random.seed(RANDOM_SEED)

In [24]:
# Same (start, end) window as first_sessions_simu_period, expressed in epoch milliseconds.
first_sessions_simu_period_ms = tuple(
    int(round(bound.timestamp() * 1000)) for bound in first_sessions_simu_period
)

### Use SIGIR2021 data as browsing events distribution for our simulation
#### https://github.com/coveooss/SIGIR-ecom-data-challenge
#### The file browsing_train.csv contains almost 5M anonymized shopping sessions
#### Each row corresponds to a browsing event in a session and contains session and timestamp information, as well as (hashed) details on the interaction (was it a purchase or a detail event? Was it a simple pageview or a specific product action?)

In [5]:
# Load the raw browsing events; the path is relative to the current working directory.
browsing = pd.read_csv("browsing_train.csv")

In [6]:
# Summarise the columns, dtypes and memory usage of the loaded events.
browsing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36079307 entries, 0 to 36079306
Data columns (total 6 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   session_id_hash            object
 1   event_type                 object
 2   product_action             object
 3   product_sku_hash           object
 4   server_timestamp_epoch_ms  int64 
 5   hashed_url                 object
dtypes: int64(1), object(5)
memory usage: 1.6+ GB


In [7]:
# Earliest and latest event time in the browsing log, shown as local datetimes.
print(*(
    datetime.datetime.fromtimestamp(epoch_ms / 1000)
    for epoch_ms in (
        browsing['server_timestamp_epoch_ms'].min(),
        browsing['server_timestamp_epoch_ms'].max(),
    )
))

2019-01-15 13:02:44.513000 2019-04-15 11:59:58.560000


In [8]:
# Number of distinct sessions present in the browsing log.
browsing['session_id_hash'].nunique()

4934699

In [9]:
# Distinct session ids, in order of first appearance; sampled from later on.
browsing_session_ids = browsing['session_id_hash'].unique()

In [10]:
# Earliest event timestamp (epoch ms) of every session, indexed by session id.
browsing_session_first_timestamp = (
    browsing.groupby('session_id_hash')['server_timestamp_epoch_ms'].min()
)

### Randomly create customer info
#### First session time, number of sessions, average days between sessions and session times for each customer

In [25]:
# One row per simulated customer:
#   - first_session: epoch-ms timestamp drawn uniformly inside the simulation window
#   - number_sessions: negative binomial draw shifted by 1, so it is always >= 1
#   - avg_days_between_sessions: negative binomial draw shifted by 0.5, so it is always > 0
cust_simu = pd.DataFrame({
    'cust_id': range(number_customers),
    'first_session': np.random.uniform(
        first_sessions_simu_period_ms[0],
        first_sessions_simu_period_ms[1],
        number_customers
    # Explicit 64-bit width: epoch milliseconds do not fit in 32 bits, and a plain
    # `int` is only 32 bits wide on some platforms (e.g. Windows with NumPy < 2).
    ).astype(np.int64),
    'number_sessions': np.random.negative_binomial(
        NB_params_number_of_sessions[0],
        NB_params_number_of_sessions[1],
        number_customers
    ) + 1,
    'avg_days_between_sessions': np.random.negative_binomial(
        NB_params_avg_days_between_sessions[0],
        NB_params_avg_days_between_sessions[1],
        number_customers
    ) + 0.5,
})

In [26]:
def session_timestamps(X):
    """Build the list of session start timestamps (epoch ms) for one customer.

    X exposes ``first_session`` (epoch ms), ``number_sessions`` and
    ``avg_days_between_sessions`` as attributes. The first timestamp is
    ``first_session``; every following one adds a gap drawn from an
    exponential distribution whose scale is ``avg_days_between_sessions``
    days, truncated to whole milliseconds.

    Returns a list of ``number_sessions`` Python ints.
    """
    start_ms = int(X.first_session)
    remaining = int(X.number_sessions) - 1
    if remaining == 0:
        # A single session: no gap needs to be drawn.
        return [start_ms]
    gaps_ms = np.random.exponential(
        X.avg_days_between_sessions, remaining) * 24 * 60 * 60 * 1000
    stamps = [start_ms]
    for gap in gaps_ms:
        # Each session starts one (truncated) gap after the previous one.
        stamps.append(stamps[-1] + int(gap))
    return stamps

In [27]:
# Row-wise: attach to every customer the list of its session start timestamps.
cust_simu['session_timestamps'] = cust_simu.apply(session_timestamps, axis='columns')

### Discard info that is no longer needed and randomly draw session ids from the browsing example

In [28]:
# The per-customer draws are now encoded in session_timestamps; keep only cust_id and that list.
cust_simu = cust_simu.drop(
    columns=['first_session', 'number_sessions', 'avg_days_between_sessions']
)

In [29]:
# Turn each customer's list of timestamps into one row per session.
cust_simu = cust_simu.explode(column='session_timestamps')

In [30]:
# Cast to an explicit 64-bit integer: epoch milliseconds overflow 32 bits, and a plain
# `int` maps to a 32-bit type on some platforms (e.g. Windows with NumPy < 2).
cust_simu['session_timestamps'] = cust_simu['session_timestamps'].astype('int64')

In [31]:
# Assign to every simulated session a real session id, sampled with replacement.
cust_simu['session_id_hash'] = np.random.choice(browsing_session_ids, size=len(cust_simu))

#### Compute adjustments we will need to apply to the browsing timestamps 

In [32]:
# Offset (ms) between the simulated session start and the sampled session's original start.
# The lookup is converted to a plain array so the subtraction is positional.
cust_simu['timestamp_adjustment'] = (
    cust_simu['session_timestamps']
    - browsing_session_first_timestamp.loc[cust_simu['session_id_hash']].to_numpy()
)

### Create final log data
#### by merging sessions info available in the browsing example (and adjusting the timestamps to our simu)

In [33]:
# Bring in every browsing event of the sampled sessions (one row per event).
cust_simu = cust_simu.merge(browsing, how='left', on='session_id_hash')

In [34]:
# Shift every event by its session's offset so the session starts at the simulated time.
cust_simu['server_timestamp_epoch_ms'] = (
    cust_simu['server_timestamp_epoch_ms'] + cust_simu['timestamp_adjustment']
)

In [35]:
# Helper columns have served their purpose once the event timestamps are shifted.
cust_simu = cust_simu.drop(columns=['session_timestamps', 'timestamp_adjustment'])

In [40]:
# Keep only events that fall strictly before the end of the simulation window.
cust_simu = cust_simu.loc[
    cust_simu['server_timestamp_epoch_ms'].lt(first_sessions_simu_period_ms[1])
]

In [41]:
# Summarise the columns, dtypes and memory usage of the simulated log.
cust_simu.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2480611 entries, 0 to 2723598
Data columns (total 7 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   cust_id                    int64 
 1   session_id_hash            object
 2   event_type                 object
 3   product_action             object
 4   product_sku_hash           object
 5   server_timestamp_epoch_ms  int64 
 6   hashed_url                 object
dtypes: int64(2), object(5)
memory usage: 151.4+ MB


In [37]:
# Display the simulated log.
# NOTE(review): this cell's execution count (37) is lower than that of the filter cell (40),
# so the saved output presumably predates the filtering; re-run to refresh it.
cust_simu

Unnamed: 0,cust_id,session_id_hash,event_type,product_action,product_sku_hash,server_timestamp_epoch_ms,hashed_url
0,0,76df2656881518ce27054143af14eb2beb42ee3dfc94a9...,pageview,,,1702428212186,b0525e1207a512f1d3dd9de2303685ad6cfea0701d8560...
1,0,900992ead958a465a355e7188f203e69ce9cfb60e82ecb...,event_product,detail,f48105e564f1077da3a33e178e850453652eded75cd402...,1703212534358,d86fe175b47d2860a8f549fd14b664e89dc70e94bc6a70...
2,0,900992ead958a465a355e7188f203e69ce9cfb60e82ecb...,pageview,,,1703212534358,d86fe175b47d2860a8f549fd14b664e89dc70e94bc6a70...
3,0,4fd6afba89ae40bad5cc1ce8ca968648486f5a685da811...,pageview,,,1704180073258,f633546b2a55c2b857fa70b5bb29355d7ad9c5d280818a...
4,0,4fd6afba89ae40bad5cc1ce8ca968648486f5a685da811...,event_product,detail,059b2dc661874dd7d2fe86a33b75d673855eea53cc0b54...,1704180073258,f633546b2a55c2b857fa70b5bb29355d7ad9c5d280818a...
...,...,...,...,...,...,...,...
2723594,9999,a5bb728d128743ccdb557a0fabd5c35428bcab77de1973...,pageview,,,1705355677760,195e34e6c0b8f196ab2e677d851b8394a2b335bab4cbd0...
2723595,9999,a5bb728d128743ccdb557a0fabd5c35428bcab77de1973...,pageview,,,1705355681411,d1700644233a33681ddba8fe2074e5679b3d2f4d8699bb...
2723596,9999,a5bb728d128743ccdb557a0fabd5c35428bcab77de1973...,pageview,,,1705355691710,24835d7ce63d738e7d3fba26b102a769608f971091b558...
2723597,9999,a5bb728d128743ccdb557a0fabd5c35428bcab77de1973...,pageview,,,1705355718818,3e0d264ba8758bf5ba78e3ed04debdd280c5b24cf997af...


In [42]:
# Earliest and latest event time in the simulated log, shown as local datetimes.
print(*(
    datetime.datetime.fromtimestamp(epoch_ms / 1000)
    for epoch_ms in (
        cust_simu['server_timestamp_epoch_ms'].min(),
        cust_simu['server_timestamp_epoch_ms'].max(),
    )
))

2015-01-01 04:54:59.654000 2024-12-31 23:56:06.349000
