In [1]:
import pandas as pd
from faker import Faker
import random
import numpy as np

# Initialize Faker
fake = Faker()

# Seed every random source used below. Faker draws from its own generator,
# which seeding `random` / `numpy` does not affect, so it has to be seeded
# explicitly for the generated UUIDs, names and passwords to be repeatable.
# NOTE: the date_time_this_* providers are relative to the current date, so
# timestamp columns may still differ between runs made on different days.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Define ranges for the number of records
num_medical_groups = random.randint(50, 100)
num_users = random.randint(500, 1000)
num_patients = random.randint(1000, 2000)
num_access_logs = random.randint(10000, 20000)

# Generate mock data for Medical_Group_List (one row per medical group)
medical_group_list = pd.DataFrame(
    {
        "medical_group_id": [fake.uuid4() for _ in range(num_medical_groups)],
        "groupname": [fake.company() for _ in range(num_medical_groups)],
        "state": [fake.state() for _ in range(num_medical_groups)],
        "date_group_created": [fake.date_time_this_decade() for _ in range(num_medical_groups)],
        "number_of_users": [random.randint(5, 20) for _ in range(num_medical_groups)],
        "number_of_patients": [random.randint(50, 200) for _ in range(num_medical_groups)],
    }
)

# Generate unique usernames.
# `usernames` (a set) is used for the duplicate check; `ordered_usernames`
# records them in generation order. Iterating a set of strings directly would
# give an order that can change from one Python process to the next, which
# would defeat the seeding above.
usernames = set()
ordered_usernames = []
while len(ordered_usernames) < num_users:
    candidate = fake.user_name()
    if candidate not in usernames:
        usernames.add(candidate)
        ordered_usernames.append(candidate)

# Materialize the foreign-key pool once as a plain list: random.choice picks
# the same elements as it would from the Series, without a pandas lookup on
# every draw.
medical_group_ids = medical_group_list["medical_group_id"].tolist()

# Generate mock data for Medical_Group_User (one row per user, each assigned
# to a randomly chosen existing medical group)
medical_group_user = pd.DataFrame(
    {
        "user_id": [fake.uuid4() for _ in range(num_users)],
        "medical_group_id": [random.choice(medical_group_ids) for _ in range(num_users)],
        "username": ordered_usernames,
        "password": [fake.password() for _ in range(num_users)],
        "role": [random.choice(["admin", "physician", "staff"]) for _ in range(num_users)],
        "date_user_created": [fake.date_time_this_decade() for _ in range(num_users)],
        "last_login_date": [fake.date_time_this_year() for _ in range(num_users)],
    }
)

# Generate mock data for Patient_Demographics (one row per patient)
patient_demographics = pd.DataFrame(
    {
        "patient_id": [fake.uuid4() for _ in range(num_patients)],
        "age": [random.randint(0, 100) for _ in range(num_patients)],
        "weight": [round(random.uniform(50, 100), 1) for _ in range(num_patients)],  # weight in kg
        "last_visit_date": [fake.date_time_this_year() for _ in range(num_patients)],
    }
)

# Foreign-key pools for the access log, again as plain lists.
patient_ids = patient_demographics["patient_id"].tolist()
user_ids = medical_group_user["user_id"].tolist()

# Generate mock data for Patient_Access_Log (each row pairs a random existing
# patient with a random existing user)
patient_access_log = pd.DataFrame(
    {
        "access_id": [fake.uuid4() for _ in range(num_access_logs)],
        "patient_id": [random.choice(patient_ids) for _ in range(num_access_logs)],
        "user_id": [random.choice(user_ids) for _ in range(num_access_logs)],
        "date_accessed": [fake.date_time_this_year() for _ in range(num_access_logs)],
    }
)

# Display Head of each base dataframe
for dataframe in [medical_group_list, medical_group_user, patient_demographics, patient_access_log]:
    print(dataframe.head())

                       medical_group_id                     groupname  \
0  3e6e5147-5ce5-49b9-bc82-8e9d839f5d01                      Reed PLC   
1  d2553ebc-1d4c-4843-bd9b-a23cc5765655  Ramsey, Gonzalez and Stewart   
2  c78c85c2-a0b0-4bf9-af39-2be2fe4f9046              Castillo-Jenkins   
3  721e2356-dcb1-4331-b326-b594dce2b6c0       Byrd, Aguirre and Lewis   
4  dc2ec521-4e7f-4589-a49e-78d270abd7e2    Craig, Davis and Rodriguez   

          state         date_group_created  number_of_users  \
0     Louisiana 2024-03-10 06:32:02.651300               12   
1  Rhode Island 2022-04-28 06:46:16.227920               12   
2    New Mexico 2023-05-04 15:06:47.312763                9   
3       Indiana 2020-02-22 06:26:02.546875                8   
4     Minnesota 2020-05-01 09:10:24.666450                7   

   number_of_patients  
0                  62  
1                  78  
2                  89  
3                  90  
4                 158  
                                user_i

In [2]:
import duckdb

# Open (creating it if necessary) the on-disk DuckDB database file.
con = duckdb.connect(database="stellar_health.db")

# Ensure the 'stellar_health_application' schema exists.
con.execute("CREATE SCHEMA IF NOT EXISTS stellar_health_application")

# Tables are addressed as <catalog>.<schema>.<table>; the catalog
# 'stellar_health' corresponds to the database file opened above.
target_schema = "stellar_health.stellar_health_application"
table_names = [
    "medical_group_list",
    "medical_group_user",
    "patient_demographics",
    "patient_access_log",
]

# Load each DataFrame into a table of the same name. The unqualified name
# after FROM refers to the pandas DataFrame variable defined earlier in the
# notebook, which DuckDB resolves from the Python scope.
for table_name in table_names:
    con.execute(f"CREATE OR REPLACE TABLE {target_schema}.{table_name} AS SELECT * FROM {table_name}")

# Verify the data is loaded by printing the first rows of every table.
for table_name in table_names:
    print(con.execute(f"SELECT * FROM {target_schema}.{table_name} LIMIT 5").fetchdf())

                       medical_group_id                     groupname  \
0  3e6e5147-5ce5-49b9-bc82-8e9d839f5d01                      Reed PLC   
1  d2553ebc-1d4c-4843-bd9b-a23cc5765655  Ramsey, Gonzalez and Stewart   
2  c78c85c2-a0b0-4bf9-af39-2be2fe4f9046              Castillo-Jenkins   
3  721e2356-dcb1-4331-b326-b594dce2b6c0       Byrd, Aguirre and Lewis   
4  dc2ec521-4e7f-4589-a49e-78d270abd7e2    Craig, Davis and Rodriguez   

          state         date_group_created  number_of_users  \
0     Louisiana 2024-03-10 06:32:02.651300               12   
1  Rhode Island 2022-04-28 06:46:16.227920               12   
2    New Mexico 2023-05-04 15:06:47.312763                9   
3       Indiana 2020-02-22 06:26:02.546875                8   
4     Minnesota 2020-05-01 09:10:24.666450                7   

   number_of_patients  
0                  62  
1                  78  
2                  89  
3                  90  
4                 158  
                                user_i

In [3]:
# NOTE(review): disabled query. The table it reads
# (main.raw_stellar_health_application__medical_group_list) is not created by
# any cell visible in this notebook — presumably it is produced by a separate
# step; confirm, or remove this cell.
# print(con.execute("SELECT * FROM main.raw_stellar_health_application__medical_group_list  LIMIT 5").fetchdf())

In [4]:
# Close the DuckDB connection (`con`) that was opened on stellar_health.db
# earlier in the notebook; `con` cannot be used for queries after this cell.
con.close()