In [4]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [17]:
from datetime import datetime, timedelta
from math import ceil

In [198]:
class CinemaDataset():
    class Session():
        def __init__(self, start_datetime, min_duration_hours=1, max_duration_hours=3):
            self.start_datetime = start_datetime
            self.min_duration_hours = min_duration_hours
            self.max_duration_hours = max_duration_hours
            self.set_duration()
        
        def set_duration(self):
            self.duration = timedelta(hours=np.random.uniform(self.min_duration_hours, self.max_duration_hours))
            self.end_datetime = self.start_datetime + self.duration
            
    
    def __init__(self, num_rows=30, num_places=10, seed=0):
        np.random.seed(seed)
        self.seed = seed
        self.num_rows = num_rows
        self.num_filled_rows = 0
        self.num_places = num_places
               
        # time of openning and closing of cinema
        self.start_hour = 8
        self.end_hour = 23
        # interval between films
        self.interval = timedelta(minutes=30)       
        
        # emulation start time
        self.start_datetime = datetime(year=2019, month=9, day=1, hour=self.start_hour)
    
    def get_ages(self, size):
        return np.random.normal(30, 7, size=size).astype(int).reshape(-1, 1)
    
    def get_sexes(self, size):
        sex_bin = np.random.binomial(1, 0.5, size=size).reshape(-1, 1)
        return np.apply_along_axis(lambda x: 'male' if x else 'female', 1, sex_bin).reshape(-1, 1)
    
    def get_datetimes(self, session, num_seats):
        max_early =  - timedelta(minutes=15).total_seconds()
        max_late = timedelta(minutes=15).total_seconds()
        
        # for beginning of the film
        delays_start = np.random.uniform(low=max_early, high=max_late, size=num_seats).reshape((-1, 1))
        delays_start = np.apply_along_axis(lambda x: timedelta(seconds=x[0]), 1, delays_start).reshape((-1, 1))
        start_datetimes = delays_start + session.start_datetime
        
        # for ending of the film
        delays_end = np.random.uniform(low=max_early, high=max_late, size=num_seats).reshape((-1, 1))
        delays_end = np.apply_along_axis(lambda x: timedelta(seconds=x[0]), 1, delays_end).reshape((-1, 1))
        end_datetimes = delays_end + session.end_datetime
        return start_datetimes, end_datetimes
    
    def get_seats(self):
        num_seats = np.random.randint(low=int(self.num_places * 0.7), high=self.num_places)
        seats = np.random.choice(self.num_places, size=num_seats, replace=False).reshape((-1, 1))
        return seats
    
    def generate(self):
        self.num_sessions = ceil(self.num_rows / self.num_places)
        self.current_datetime = self.start_datetime
        
        day_start_datetime = self.start_datetime
        day_end_datetime = day_start_datetime + timedelta(hours=self.end_hour - self.start_hour)
        
        total_seats = []
        total_ages = []
        total_sexes = []
        total_start_times = []
        total_end_times = []
        for i in range(self.num_sessions):
            # set the next day if cinema is closing
            if self.current_datetime >= day_end_datetime:
                day_start_datetime = day_start_datetime + timedelta(days=1)
                day_end_datetime = day_end_datetime + timedelta(days=1)
            seats = self.get_seats()
            ages = self.get_ages(len(seats))
            sexes = self.get_sexes(len(seats))
            session = CinemaDataset.Session(self.current_datetime)
            start_datetimes, end_datetimes = self.get_datetimes(session=session, num_seats=len(seats))
            
            # collect everything into lists
            total_seats.append(seats)
            total_ages.append(ages)
            total_sexes.append(sexes)
            total_start_times.append(start_datetimes)
            total_end_times.append(end_datetimes)
            
            self.current_datetime = self.interval + session.end_datetime
            
        table = np.hstack([
            np.vstack(total_seats), 
            np.vstack(total_ages), 
            np.vstack(total_sexes), 
            np.vstack(total_start_times), 
            np.vstack(total_end_times)])
        columns = ['seat', 'age', 'sex', 'start_time', 'end_time']
        data = pd.DataFrame(table, columns=columns)
        return data

data_generator = CinemaDataset(num_rows=1000, num_places=5, seed=1)

data = data_generator.generate()

In [199]:
data

Unnamed: 0,seat,age,sex,start_time,end_time
0,2,26,male,2019-09-01 07:51:53.238985,2019-09-01 09:51:08.118256
1,1,20,male,2019-09-01 08:01:01.945036,2019-09-01 10:06:23.290856
2,4,41,fema,2019-09-01 08:12:25.131644,2019-09-01 10:01:33.961460
3,0,22,male,2019-09-01 07:58:42.968654,2019-09-01 09:59:41.607764
4,3,32,male,2019-09-01 10:16:54.194188,2019-09-01 11:32:56.735660
5,2,32,male,2019-09-01 10:12:28.677079,2019-09-01 11:50:38.481276
6,0,28,male,2019-09-01 10:31:42.826886,2019-09-01 11:51:44.371614
7,4,29,fema,2019-09-01 10:20:35.430750,2019-09-01 11:40:52.518831
8,3,28,female,2019-09-01 12:16:40.058987,2019-09-01 13:38:11.274682
9,4,33,male,2019-09-01 12:03:31.279881,2019-09-01 13:51:27.913213
