## The purpose of this notebook is to output a processed dataset covering the same time span as the one generated from the 1-day data, and to check that the two are essentially the same.

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import re

In [2]:
# Load the session event log; the first CSV column is used as the row index.
sessions = pd.read_csv('sessions.csv', index_col=0)
# NOTE(review): a `sessions.rename(columns={'SessionID':'SessionId'})` call used to sit
# here, but DataFrame.rename returns a new frame and the result was discarded, so the
# column was never renamed. The dead call is removed rather than assigned back because
# a later cell still reads the 'SessionID' column.
sessions.head()

Unnamed: 0,SessionID,IpId,TimeStamp,Event,Action,Product
0,0,100095PL,637145481360000000,Leave,,
1,1,100095PL,637228232800000000,Arrive,,
2,1,100095PL,637228233260000000,,add_to_cart,p-9967
3,1,100095PL,637228243870000000,,order,
4,1,100095PL,637228244650000000,Leave,,


**From the documentation**: *TimeStamp – long integer value specifying the UTC date and time of the arrival of a request, coded as the number of 100-nanosecond intervals that have elapsed since 00:00:00 UTC on 1st January, 1 A.D.*

In [3]:
# Convert TimeStamp from 100-nanosecond ticks counted from 0001-01-01 00:00:00
# into pandas datetimes (which count nanoseconds from 1970-01-01 00:00:00).
#
# All arithmetic is kept in integers: the raw tick values (~6.4e17) need more than
# the 53 bits of precision a float64 offers, so subtracting a float offset would
# promote the column to float64 and could round sub-second ticks.
seconds = int((datetime(1970,1,1)-datetime(1,1,1)).total_seconds())  # whole seconds between the two epochs
ticks = seconds * 10**7  # the same offset expressed in 100-ns ticks (exact int)
# (ticks since 1970) * 100 = nanoseconds since 1970, which pd.to_datetime reads by default.
sessions['TimeStamp'] = pd.to_datetime((sessions['TimeStamp']-ticks)*100)
sessions.head()

Unnamed: 0,SessionID,IpId,TimeStamp,Event,Action,Product
0,0,100095PL,2020-01-13 21:35:36,Leave,,
1,1,100095PL,2020-04-18 16:14:40,Arrive,,
2,1,100095PL,2020-04-18 16:15:26,,add_to_cart,p-9967
3,1,100095PL,2020-04-18 16:33:07,,order,
4,1,100095PL,2020-04-18 16:34:25,Leave,,


In [4]:
# Peek at the chronologically earliest events to see where the data set starts.
sessions.sort_values(by='TimeStamp', ascending=True).head(5)

Unnamed: 0,SessionID,IpId,TimeStamp,Event,Action,Product
102414,36591,7PL,2019-11-30 23:00:09,Arrive,,
79103,27342,39PL,2019-11-30 23:02:05,Flit,,
100256,35771,74PL,2019-11-30 23:07:38,Arrive,,
103354,36946,81PL,2019-11-30 23:10:05,Arrive,,
100257,35771,74PL,2019-11-30 23:23:55,,add_to_cart,c-35_313


In [5]:
# Allow up to 200 rows to be rendered so the whole preview below is visible.
pd.set_option('display.max_rows', 200)
# Order events per IpId, then by time within each IpId, and show the first 200.
sessions.sort_values(by=['IpId', 'TimeStamp']).head(n=200)

Unnamed: 0,SessionID,IpId,TimeStamp,Event,Action,Product
0,0,100095PL,2020-01-13 21:35:36,Leave,,
1,1,100095PL,2020-04-18 16:14:40,Arrive,,
2,1,100095PL,2020-04-18 16:15:26,,add_to_cart,p-9967
3,1,100095PL,2020-04-18 16:33:07,,order,
4,1,100095PL,2020-04-18 16:34:25,Leave,,
5,2,1000PL,2019-12-01 10:37:01,Flit,,
6,3,1000PL,2020-02-27 14:31:11,Arrive,,
7,3,1000PL,2020-02-27 14:31:22,Leave,,
8,4,1000PL,2020-03-13 13:14:06,Arrive,,
9,4,1000PL,2020-03-13 13:17:16,Leave,,


In [6]:
# Report the (rows, columns) shape of the frame and the largest SessionID value.
frame_shape = sessions.shape
largest_session_id = sessions['SessionID'].max()
print(frame_shape, largest_session_id, sep='\n')

(112256, 6)
40297


In [7]:
# Keep only the events whose TimeStamp lies inside the one-day comparison window.
# Series.between includes both endpoints by default.
# NOTE(review): an event stamped after 22:59:59 but before 23:00:00 would be left out;
# this only matters if the data carries sub-second timestamps — confirm.
window_start = datetime(2019, 12, 10, 23, 0, 0)
window_end = datetime(2019, 12, 11, 22, 59, 59)
in_window = sessions['TimeStamp'].between(window_start, window_end)
sessions_1d = sessions[in_window]
sessions_1d.head()

Unnamed: 0,SessionID,IpId,TimeStamp,Event,Action,Product
654,207,10201PL,2019-12-11 13:50:36,Arrive,,
655,207,10201PL,2019-12-11 13:50:53,,add_to_cart,p-9623
656,207,10201PL,2019-12-11 14:01:02,Leave,,
1178,385,10318PL,2019-12-11 12:20:24,Arrive,,
1179,385,10318PL,2019-12-11 12:22:45,Leave,,


In [8]:
# Write the one-day subset (index included) to CSV for comparison with the 1-day data.
output_csv = '1dfromsqlite.csv'
sessions_1d.to_csv(output_csv)