In [75]:
import random
import datetime
import math

In [72]:
# Example configuration consumed by RandomSessionGenerator / generate_random_sessions.
# Every list below is indexed by page position, in the order given by 'page_names'.
example_page_transition_probabilities = {
    'page_names' : ['home','buy', 'thanks'],
    # matrix[i][j] is the probability of moving from page i to page j.
    # A row may sum to less than 1.0: the generator stops the session when its
    # random draw lands in the uncovered remainder (e.g. the 'home' row sums to
    # 0.965, so a session on 'home' ends with probability 0.035).
    'matrix' : [
        [0.000, 0.265, 0.700],
        [0.400, 0.000, 0.432],
        [0.910, 0.000, 0.000]
    ],
    # Mean / standard deviation, in seconds, of the Gaussian time step that is
    # added to the session clock when a visit to the page at that index is generated.
    'duration_means' : [21.2, 59.3, 154.3],
    'duration_stds'  : [5.3, 20.2, 51.2]
}

In [51]:
def test_transition_probs(transition_probs):
    page_names = transition_probs['page_names']
    matrix = transition_probs['matrix']
    duration_means = transition_probs['duration_means']
    duration_stds = transition_probs['duration_stds']
    assert len(page_names) == len(matrix)
    assert len(matrix) == len(matrix[0])
    assert len(duration_stds) == len(duration_stds)
    for row in matrix:
        assert(sum(row) <= 1.0)

In [67]:
def generate_random_datetime(start_date, end_date):
    """Return a random datetime, at whole-second resolution, in [start_date, end_date).

    The day is drawn uniformly from the whole days starting at ``start_date``
    and ending the day before ``end_date``; hour, minute and second are then
    drawn independently and uniformly.

    Args:
        start_date: first eligible day (inclusive).
        end_date: day after the last eligible day (exclusive).

    Returns:
        datetime.datetime combining the chosen day with the chosen time.

    Raises:
        ValueError: if fewer than one whole day separates the two dates, so
            there is no day to choose from.
    """
    n_days = (end_date - start_date).days
    if n_days < 1:
        raise ValueError(
            "end_date must be at least one day after start_date "
            "(got start_date=%r, end_date=%r)" % (start_date, end_date)
        )
    # Pick a day offset directly instead of materialising every date in the range.
    day = start_date + datetime.timedelta(days=random.randrange(n_days))
    time_of_day = datetime.time(
        random.randrange(24),
        random.randrange(60),
        random.randrange(60),
    )
    return datetime.datetime.combine(day, time_of_day)

In [68]:
class RandomSessionGenerator(object):
    """Iterator producing one random browsing session as a sequence of page visits.

    Starting from ``start_page_name`` at ``start_datetime``, each step picks the
    next page from the current page's row of the transition matrix, advances
    the session clock by a Gaussian-distributed number of seconds, and returns
    a visit record. Rows may sum to less than 1; when the random draw falls in
    the uncovered remainder, iteration stops.

    The starting page itself is never returned as a visit: the first record is
    the page reached by the first transition.

    Attributes:
        name: name of the page the walk is currently on.
        at: timestamp of the most recently generated visit (initially
            ``start_datetime``).
    """

    def __init__(self, start_page_name, page_transition_probabilities, start_datetime):
        """Store the starting state and unpack the transition specification.

        Args:
            start_page_name: page the session starts on; must be listed in
                ``page_transition_probabilities['page_names']`` for ``next``
                to succeed.
            page_transition_probabilities: dict with the keys 'page_names',
                'matrix', 'duration_means' and 'duration_stds'.
            start_datetime: initial clock value; it is advanced with
                ``datetime.timedelta`` and formatted with ``strftime``.
        """
        self.name = start_page_name
        self.at = start_datetime
        self._page_names = page_transition_probabilities['page_names']
        self._duration_means = page_transition_probabilities['duration_means']
        self._duration_stds = page_transition_probabilities['duration_stds']
        self._matrix = page_transition_probabilities['matrix']
        # IDs already handed out by this instance (uniqueness is per instance).
        self._ids_used = set()

    def __iter__(self):
        return self

    def _generate_random_id(self, lower=1000, upper=10000):
        """Return a random integer ID not yet issued by this instance.

        The candidate range is ``[lower, upper]``. While the number of IDs
        already issued exceeds a tenth of ``upper - lower``, ``upper`` is
        multiplied by ten so collisions stay unlikely.
        """
        while len(self._ids_used) > (upper - lower) / 10:
            upper *= 10
        while True:
            r_id = random.randint(lower, upper)
            # On a collision simply draw again.
            if r_id not in self._ids_used:
                self._ids_used.add(r_id)
                return r_id

    def next(self):
        """Move to the next page and return the resulting visit record.

        Returns:
            dict with 'name' (the page moved to), 'at' (timestamp formatted as
            ``%Y-%m-%dT%H:%M:%S``) and 'id' (integer unique within this
            session).

        Raises:
            StopIteration: when the random draw is not covered by any
                transition out of the current page, i.e. the session ends.
            ValueError: if the current page name is not in the page list.
        """
        row_num = self._page_names.index(self.name)
        transition_probabilities = self._matrix[row_num]
        r = random.random()
        cumulative_p = 0
        for i, p in enumerate(transition_probabilities):
            cumulative_p += p
            if r < cumulative_p:
                self.name = self._page_names[i]
                # The time step uses the destination page's duration
                # parameters and is floored at one second.
                duration = max(random.gauss(self._duration_means[i], self._duration_stds[i]), 1.0)
                self.at += datetime.timedelta(seconds=duration)
                return {'name': self.name,
                        'at': self.at.strftime("%Y-%m-%dT%H:%M:%S"),
                        'id': self._generate_random_id()}
        raise StopIteration()

    # Python 3 spells the iterator protocol method __next__; keep both names.
    __next__ = next

In [78]:
def generate_random_sessions(N, start_page_name, page_transition_probabilities, start_date, end_date, amount_generator):
    """Simulate ``N`` independent sessions and return them as a list of dicts.

    For each session a random start datetime is drawn with
    ``generate_random_datetime(start_date, end_date)``, a
    ``RandomSessionGenerator`` starting on ``start_page_name`` is run to
    exhaustion, and ``amount_generator`` is called with the resulting visits.

    Returns:
        list of ``{'page_visits': [...], 'amount_spent': ...}`` dicts, one per
        session, in generation order.
    """
    def _simulate_one():
        # Order matters for the random stream: start time, then the walk,
        # then the amount.
        began_at = generate_random_datetime(start_date, end_date)
        walker = RandomSessionGenerator(start_page_name, page_transition_probabilities, began_at)
        page_visits = list(walker)
        return {'page_visits': page_visits, 'amount_spent': amount_generator(page_visits)}

    return [_simulate_one() for _ in range(N)]
        
    

In [79]:
def uniform_amount_generator(visits):
    """Return the amount spent in a session, based on whether 'thanks' was reached.

    Sessions without a visit named 'thanks' spend 0.0. Otherwise the amount is
    drawn from a normal distribution (mean 60.0, standard deviation 20.0 --
    note: Gaussian, despite the function's name) and floored at 0.99.

    Args:
        visits: iterable of visit dicts, each with a 'name' key.
    """
    reached_thanks = any(visit['name'] == 'thanks' for visit in visits)
    # The random draw only happens for purchasing sessions.
    return max(0.99, random.gauss(60.0, 20.0)) if reached_thanks else 0.0

In [80]:
# Simulate 10 sessions starting on 'home', with start days drawn from
# 2015-01-01 up to (but not including) 2015-10-10.
# NOTE(review): no random seed is set in the visible cells, so the sessions
# differ on every run.
data = generate_random_sessions(
    10,
    'home',
    example_page_transition_probabilities,
    datetime.date(2015, 1, 1),
    datetime.date(2015, 10, 10),
    uniform_amount_generator,
)

In [81]:
data

[{'amount_spent': 54.01274506934433,
  'page_visits': [{'at': '2015-02-22T17:53:44', 'id': 2362, 'name': 'thanks'},
   {'at': '2015-02-22T17:54:01', 'id': 5824, 'name': 'home'},
   {'at': '2015-02-22T17:54:39', 'id': 6419, 'name': 'buy'},
   {'at': '2015-02-22T17:54:40', 'id': 6178, 'name': 'home'},
   {'at': '2015-02-22T17:56:02', 'id': 6450, 'name': 'buy'},
   {'at': '2015-02-22T17:56:21', 'id': 4509, 'name': 'home'},
   {'at': '2015-02-22T17:57:56', 'id': 5283, 'name': 'thanks'},
   {'at': '2015-02-22T17:58:26', 'id': 7210, 'name': 'home'},
   {'at': '2015-02-22T18:01:42', 'id': 1232, 'name': 'thanks'},
   {'at': '2015-02-22T18:02:00', 'id': 4256, 'name': 'home'},
   {'at': '2015-02-22T18:02:09', 'id': 1275, 'name': 'buy'},
   {'at': '2015-02-22T18:02:26', 'id': 9977, 'name': 'home'},
   {'at': '2015-02-22T18:04:21', 'id': 7305, 'name': 'thanks'},
   {'at': '2015-02-22T18:04:35', 'id': 3577, 'name': 'home'},
   {'at': '2015-02-22T18:05:10', 'id': 2363, 'name': 'buy'},
   {'at': '201

In [64]:
# NOTE(review): `rvs` is not defined in any cell visible in this notebook; this
# cell appears to rely on leftover kernel state and would raise NameError on a
# fresh kernel unless `rvs` is defined elsewhere -- confirm or remove.
visits = [rv for rv in rvs]

In [65]:
visits

[{'at': '2015-10-21T10:55:51', 'id': 4733, 'name': 'home'},
 {'at': '2015-10-21T10:58:42', 'id': 5354, 'name': 'buy'},
 {'at': '2015-10-21T10:58:59', 'id': 6080, 'name': 'home'},
 {'at': '2015-10-21T11:01:30', 'id': 7914, 'name': 'buy'},
 {'at': '2015-10-21T11:01:40', 'id': 3676, 'name': 'home'},
 {'at': '2015-10-21T11:04:25', 'id': 2078, 'name': 'buy'},
 {'at': '2015-10-21T11:04:44', 'id': 4006, 'name': 'home'},
 {'at': '2015-10-21T11:05:40', 'id': 8017, 'name': 'search'},
 {'at': '2015-10-21T11:08:37', 'id': 1697, 'name': 'buy'}]

In [None]:
# NOTE(review): this re-defines RandomSessionGenerator with a different
# constructor signature from the earlier cell; running this cell replaces the
# earlier class in the kernel namespace.
class RandomSessionGenerator(object):
    """Work-in-progress session generator driven by a start-datetime generator."""

    def __init__(self, page_transition_probabilities, start_datetime_generator):
        """Store the object used to produce session start times.

        Args:
            page_transition_probabilities: accepted but not used by the code
                visible in this cell.
            start_datetime_generator: object exposing a ``generate()`` method.
        """
        self._start_datetime_generator = start_datetime_generator

    def get_start_time(self):
        """Return a start time obtained from the stored generator's ``generate()``."""
        # Read the same (underscore-prefixed) attribute that __init__ sets.
        return self._start_datetime_generator.generate()

    def __next__(self):
        # NOTE(review): as visible here the loop has no return or break, so it
        # never terminates; the cell looks unfinished.
        while True:
            start_time = self.get_start_time()