# Dataset Generator

## Docter Availability

In [2]:
import pandas as pd
import numpy as np
import uuid
import datetime
import random

In [130]:
columns = [
    'doc_id',
    'date', # For everyday or every doctors working day
    'time_start', # Worktime start
    'time_end', # Worktime end
    'time_slot_start', # Start of his time slot
    'time_slot_end', # End of the time slot
    # 'time_slot_active_duration', # Excluding breaks
    'time_slot_duration', # Total time slot duration
    'appointments_count', # No of patients seen
    'overtime_delay', # Extra time for the slot
    # Realtime
    'event_type', # Kafka event type, like delay in arrival, delay in examining a patient X, cannot make it etc
    'event_time_stamp', # Time the event was sent
    'delay_caused_due_to_event', # If any delay was caused due to the event
    ### Target
    # 'available', #Was the doctor available at that slot
]

start_times = ['9:00:00', '18:00:00', '20:00:00']
time_slot_duration = 15
end_times = ['11:00:00', '20:00:00', '23:00:00']
event_types = ['cannot make it', 'delayed due to patient', 'delay in arrival']

In [131]:
# Random date picker
def get_random_date(only_date=True):
    start_date = datetime.date(2024, 1, 1)
    end_date = datetime.date(2024, 3, 31)

    random_date = start_date + (end_date - start_date) * random.random()
    random_date = datetime.datetime.combine(random_date, datetime.datetime.min.time())
    
    if only_date:
        return random_date.date().__str__() 
    return random_date.__str__()

get_random_date()

# Split time and add minutes
def add_minutes(time: str, minutes: int) -> str:
    (h, m, _) = time.split(':')
    d = datetime.timedelta(hours=int(h), minutes=int(m))
    return (d + datetime.timedelta(minutes=minutes)).__str__()

In [132]:
add_minutes(start_times[0], 50)

'9:50:00'

In [133]:
dataset = pd.DataFrame([], columns=columns)

dataset    

Unnamed: 0,doc_id,date,time_start,time_end,time_slot_start,time_slot_end,time_slot_duration,appointments_count,overtime_delay,event_type,event_time_stamp,delay_caused_due_to_event


In [134]:
entries = 1000

for i in range(entries):
    new_entry = pd.DataFrame({
        'doc_id': [str(uuid.uuid4())]
    }, columns=columns)
    new_entry['date'] = get_random_date()
    randomTimeIndex = random.randint(0, len(start_times) - 1)
    new_entry['time_start'] = start_times[randomTimeIndex]
    new_entry['time_end'] = end_times[randomTimeIndex]

    (h_start, _, _) = start_times[randomTimeIndex].split(':')
    (h_end, _, _) = end_times[randomTimeIndex].split(':')
    difference = int(h_end) - int(h_start)
    random_time_factor = random.randint(0, difference * 2)
    time_slot_factor = (random_time_factor * 15)
    
    # print(f"Start time: {start_times[randomTimeIndex]}, TimeSlotFactor: {time_slot_factor}")
    start_t = add_minutes(start_times[randomTimeIndex], time_slot_factor)
    new_entry['time_slot_start'] = start_t
    new_entry['time_slot_end'] = add_minutes(start_t, time_slot_duration)
    new_entry['time_slot_duration'] = time_slot_duration
    new_entry['appointments_count'] = difference * 2
    
    new_entry['overtime_delay'] = random.randint(0, time_slot_duration)
    
    event = random.choices([0, 1], weights=[2, 1], k=1)[0]
    
    if event == 1:
        random_event_index = random.randint(0, len(event_types) - 1)
        new_entry['event_type'] = event_types[random_event_index]
        new_entry['event_time_stamp'] = get_random_date(only_date=False)
        new_entry['delay_caused_due_to_event'] = random.randint(0, time_slot_duration)
    
    dataset = pd.concat([dataset, new_entry], ignore_index=True)

dataset

Unnamed: 0,doc_id,date,time_start,time_end,time_slot_start,time_slot_end,time_slot_duration,appointments_count,overtime_delay,event_type,event_time_stamp,delay_caused_due_to_event
0,d0611be3-30af-46f7-8dad-09cdee7a8d02,2024-02-07,9:00:00,11:00:00,9:00:00,9:15:00,15,4,6,,,
1,7d9bdfee-82bd-465d-92f2-fa7517784bac,2024-01-22,9:00:00,11:00:00,9:15:00,9:30:00,15,4,9,,,
2,b1077fe8-00dd-4f18-8cd2-5136ebb244d4,2024-01-30,9:00:00,11:00:00,9:00:00,9:15:00,15,4,13,,,
3,0f5c3f46-c657-42eb-a85f-d9d588587d46,2024-02-05,18:00:00,20:00:00,18:45:00,19:00:00,15,4,4,,,
4,cfdd52b1-09e6-426a-a289-5ea02ad7c2fd,2024-03-18,9:00:00,11:00:00,9:00:00,9:15:00,15,4,13,cannot make it,2024-03-18 00:00:00,4
...,...,...,...,...,...,...,...,...,...,...,...,...
995,f8629e26-8080-4971-bc75-97066dc07a72,2024-03-13,9:00:00,11:00:00,9:00:00,9:15:00,15,4,10,,,
996,6579926b-3b36-4f73-89af-abac9ed6d060,2024-02-18,20:00:00,23:00:00,20:00:00,20:15:00,15,6,8,cannot make it,2024-01-14 00:00:00,6
997,87188e10-5bd7-4395-9c66-985972e8d5a4,2024-03-17,20:00:00,23:00:00,20:30:00,20:45:00,15,6,7,,,
998,b01e299f-e0a4-4e5c-a479-a56316fca581,2024-02-22,18:00:00,20:00:00,18:15:00,18:30:00,15,4,5,,,


In [135]:
dataset['event_type'].isna().sum()

678

In [1]:
# Save the dataset
from pathlib import Path

BASE_DATASET_PATH = Path('./datasets/synt')
BASE_DATASET_PATH.mkdir(parents=True, exist_ok=True)

DATASET_PATH = BASE_DATASET_PATH / 'doctor_availability.csv'

dataset.to_csv(DATASET_PATH)

NameError: name 'dataset' is not defined

In [144]:
pat_df = pd.read_csv('./datasets/patient_data.csv')

pat_df['Delayed_by'] = pat_df['Delayed_by'].apply(lambda _: random.choices([0, 5, 10, 15], weights=[5, 2, 1, 1])[0])

pat_df.to_csv(BASE_DATASET_PATH / 'patient_data.csv', index=False)

In [4]:
import ast

pat_df = pd.read_csv(BASE_DATASET_PATH / 'patient_data.csv')

pat_df['Cause_of_Visit'] = pat_df['Cause_of_Visit'].apply(lambda symptom: ast.literal_eval('\',\''.join(symptom.split('\' \''))))

pat_df.to_csv(BASE_DATASET_PATH / 'patient_data.csv', index=False)