In [147]:
import pandas as pd
import numpy as np
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
from bokeh.plotting import figure
import odo

In [None]:
# Configure Bokeh so that subsequent show() calls render inline in this notebook.
output_notebook()

In [132]:
# Number of rows/elements generated for each synthetic array and frame in this notebook.
frame_len = 3_000_000

# Seed NumPy's global generator so that a fresh "Restart & Run All" regenerates
# the same synthetic data instead of a different random sample on every run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [16]:
# Frame of random integers drawn from [0, 5): frame_len rows, columns A-D.
df = pd.DataFrame(
    data=np.random.randint(low=0, high=5, size=(frame_len, 4)),
    columns=['A', 'B', 'C', 'D'],
)

In [18]:
# Mixed-dtype frame: a string label, a date, a standard-normal float and a small int.
# Each random column is drawn in a single vectorised call (size=frame_len) rather than
# one np.random.choice call per row, and the candidate dates are built once.
# pd.date_range parses the ISO strings directly, avoiding the removed pd.datetime alias.
dft = pd.DataFrame({
    'A': np.random.choice(['one', 'two', 'three', 'four'], size=frame_len),
    'B': np.random.choice(pd.date_range('2013-01-01', '2013-01-03'), size=frame_len),
    'C': np.random.randn(frame_len),
    'D': np.random.randint(0, 6, frame_len),
})

In [53]:
def plot(title, data):
    """Show a 900x400 Bokeh line chart of `data` plotted against its positional index.

    Parameters
    ----------
    title : text used as the figure title.
    data : sized sequence of y-values; the x-values are 0 .. len(data) - 1.
    """
    chart = figure(title=title, plot_width=900, plot_height=400)
    positions = range(len(data))
    chart.line(x=positions, y=data, line_width=1)
    show(chart)

In [31]:
# Parameters of the simulated traffic.
user_count = 1_000_000            # total users of the system
session_count = 3                 # average number of sessions per user per day
day_length_sec = 24 * 60 * 60     # full 24 hours in seconds
session_mean_duration = 20 * 60   # 20 minutes, in seconds
session_variance = 10 * 60        # 10 minutes, in seconds

# Seconds in a day divided by the total number of sessions per day,
# i.e. the average gap between consecutive session starts.
session_mean_delay = day_length_sec / (session_count * user_count)
session_mean_delay

0.0288

In [133]:
# Gaps between consecutive sessions: exponential draws whose scale is the mean delay.
session_delays = np.random.exponential(session_mean_delay, frame_len)
# Chart only the first 10,000 draws.
plot('Delays', session_delays[:10_000])

In [134]:
# Session lengths in seconds: a log-normal factor scaled by the configured duration
# constant instead of a repeated 20*60 literal.
# With mean=0 the factor has median 1, so session_mean_duration is the *median*
# duration; the arithmetic mean is exp(0.5**2 / 2), about 1.13 times larger.
session_durations = session_mean_duration * np.random.lognormal(mean=0, sigma=0.5, size=frame_len)
# Chart only the first 10,000 draws.
plot('Durations', session_durations[:10000])

In [135]:
# Collect the per-session delay and duration arrays into one frame (one row per session).
session_frame = pd.DataFrame(dict(delay=session_delays, duration=session_durations))

In [136]:
# Each invite time is the start of 2016-01-01 plus the running total of delays (seconds).
# A scalar pd.Timestamp broadcasts over the timedelta Series, so there is no need to
# build a frame_len-long list of identical datetimes, and the removed pd.datetime
# alias is avoided.
session_frame['invite_time'] = (
    pd.Timestamp(2016, 1, 1)
    + pd.to_timedelta(session_frame['delay'].cumsum(), unit='s')
)

In [138]:
# Two independent draws of user ids, uniform over [0, user_count), one pair per session.
user1 = np.random.randint(low=0, high=user_count, size=frame_len)
user2 = np.random.randint(low=0, high=user_count, size=frame_len)

In [140]:
# Attach both user-id arrays to the session frame as columns.
session_frame['user1'], session_frame['user2'] = user1, user2

In [142]:
# A session ends `duration` seconds after its invite time.
session_frame['end_time'] = (
    session_frame['invite_time']
    + pd.to_timedelta(session_frame['duration'], unit='s')
)

In [144]:
# Response code per session: 200 with probability 0.95, 400 with probability 0.05.
session_frame['response_code'] = np.random.choice(
    a=[200, 400],
    size=frame_len,
    p=[0.95, 0.05],
)

In [145]:
# Preview only the first rows; the frame has frame_len rows, so displaying it
# whole adds nothing for the reader.
session_frame.head(10)

Unnamed: 0,delay,duration,invite_time,user1,user2,end_time,response_code
0,0.014574,869.905022,2016-01-01 00:00:00.014574000,703002,39327,2016-01-01 00:14:29.919596000,200
1,0.014763,2004.460109,2016-01-01 00:00:00.029337000,971004,309391,2016-01-01 00:33:24.489446000,200
2,0.008410,1031.466424,2016-01-01 00:00:00.037747000,780239,327716,2016-01-01 00:17:11.504171000,200
3,0.029324,807.353420,2016-01-01 00:00:00.067071000,829066,583691,2016-01-01 00:13:27.420491000,200
4,0.003053,2315.062994,2016-01-01 00:00:00.070124000,213022,156978,2016-01-01 00:38:35.133117999,400
5,0.001668,1710.933445,2016-01-01 00:00:00.071792000,178360,986105,2016-01-01 00:28:31.005237000,200
6,0.021719,1228.231290,2016-01-01 00:00:00.093511000,12467,379156,2016-01-01 00:20:28.324801000,200
7,0.000495,1015.574692,2016-01-01 00:00:00.094006000,849035,39050,2016-01-01 00:16:55.668698000,400
8,0.059241,291.208860,2016-01-01 00:00:00.153247000,723085,552691,2016-01-01 00:04:51.362107000,200
9,0.000708,800.939577,2016-01-01 00:00:00.153955000,247944,420215,2016-01-01 00:13:21.093532000,200


In [154]:
# Write session_frame into the `session` table of the SQLite database at
# local-data/budge.sqlite (path is relative to the notebook's working directory).
# NOTE(review): presumably the local-data/ directory must already exist, and re-running
# this cell may append to an existing table rather than replace it — confirm against odo.
odo.odo(session_frame, 'sqlite:///local-data/budge.sqlite::session')

Table('session', MetaData(bind=Engine(sqlite:///local-data/budge.sqlite)), Column('delay', FLOAT(), table=<session>, nullable=False), Column('duration', FLOAT(), table=<session>, nullable=False), Column('invite_time', DateTime(), table=<session>), Column('user1', BigInteger(), table=<session>, nullable=False), Column('user2', BigInteger(), table=<session>, nullable=False), Column('end_time', DateTime(), table=<session>), Column('response_code', BigInteger(), table=<session>, nullable=False), schema=None)