Generating synthetic patient data

In [12]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker

In [13]:
# Shared Faker generator used for the identity fields (name, address, phone).
fake = Faker(locale="en_us")

In [14]:
def generate_patient_data(num_patients, seed=None, faker_instance=None):
    """Build a DataFrame of synthetic patient records.

    Identity fields (name, address, phone number) come from a Faker-style
    generator. Age is a uniform integer in [1, 100]. Every categorical field
    is drawn independently with ``random.choices`` using the relative weights
    listed in the body; ``random.choices`` treats weights as relative, so they
    do not have to sum to 1.

    Args:
        num_patients: Number of records to generate. Zero (or a negative
            value) yields an empty frame that still carries every column.
        seed: Optional seed for reproducible output. When given, the age and
            categorical draws use a private ``random.Random(seed)`` and the
            generator is re-seeded with ``seed_instance(seed)``. When ``None``
            (the default) the global ``random`` module state is used.
        faker_instance: Optional object exposing ``name()``, ``address()`` and
            ``phone_number()`` (plus ``seed_instance()`` when ``seed`` is
            used). Defaults to the notebook-level ``fake``.

    Returns:
        pandas.DataFrame with one row per patient and the columns
        ``patient_id``, ``name``, ``address``, ``phone_number``, ``age``,
        ``race``, ``gender``, ``insurance``, ``smoking``,
        ``physical_activity``, ``alcohol`` and ``support_system``.
    """
    # field name -> (options, relative weights). Insertion order is also the
    # column order and the order in which the random draws are made.
    categorical_fields = {
        "race": (
            ['White', 'Black', 'Asian', 'Hispanic',
             'Native American or Alaskan Native',
             'Native Hawaiian or Other Pacific Islander', 'Biracial', 'Other'],
            # 'Asian' was previously 0.60, which made it about as likely as
            # 'White'; 0.060 matches the scale of the neighbouring entries.
            [0.616, 0.124, 0.060, 0.189, 0.029, 0.002, 0.102, 0.084],
        ),
        "gender": (
            ['Male', 'Female', 'Non-Binary', 'Other'],
            [0.45, 0.40, 0.10, 0.05],
        ),
        "insurance": (
            ['Commercial', 'Private', 'Government', 'Uninsured', 'Other'],
            [0.2, 0.05, 0.3, 0.4, 0.05],
        ),
        "smoking": (
            ['Current Smoker', 'Former Smoker', 'Never Smoked'],
            [0.4, 0.3, 0.3],
        ),
        "physical_activity": (
            ['Sedentary', 'Slightly Active', 'Moderately Active', 'Very Active'],
            [0.6, 0.25, 0.1, 0.05],
        ),
        "alcohol": (
            ['None', 'Occasional', 'Moderate', 'Heavy'],
            [0.1, 0.3, 0.4, 0.2],
        ),
        "support_system": (
            ['Poor', 'Fair', 'Good', 'Excellent'],
            [0.3, 0.5, 0.1, 0.1],
        ),
    }

    source = fake if faker_instance is None else faker_instance
    if seed is None:
        # Keep using the module-level generator so callers that seeded
        # `random` themselves still see that seed respected.
        rng = random
    else:
        rng = random.Random(seed)
        source.seed_instance(seed)

    columns = ["patient_id", "name", "address", "phone_number", "age",
               *categorical_fields]

    records = []
    for patient_id in range(num_patients):
        record = {
            "patient_id": patient_id,
            "name": source.name(),
            "address": source.address(),
            "phone_number": source.phone_number(),
            "age": rng.randint(1, 100),
        }
        for field, (options, weights) in categorical_fields.items():
            record[field] = rng.choices(options, weights=weights)[0]
        records.append(record)

    # Passing `columns` keeps the schema intact even when no rows were built.
    return pd.DataFrame(records, columns=columns)


In [15]:
# Generate 1,000 records and replace the newlines inside each address with
# ", " so every record fits on a single line.
patient_data = generate_patient_data(1000).assign(
    address=lambda frame: frame['address'].str.replace('\n', ', ', regex=True)
)

# Plain-text preview of the first ten rows, without the index column.
print(patient_data.head(10).to_string(index=False))

 patient_id             name                                                   address          phone_number  age     race gender  insurance        smoking physical_activity    alcohol support_system
          0   Michele Fisher         54673 John Wells Suite 867, Matthewbury, ID 50179     (278)378-0800x367   35    Asian Female Government   Never Smoked Moderately Active      Heavy           Poor
          1    Kathryn Allen  011 Daniel Circle Suite 528, Port Matthewmouth, ND 86875  001-232-670-5841x764   70    White Female Government  Former Smoker   Slightly Active      Heavy           Fair
          2   Jeffrey Vaughn                66239 Travis Hollow, Gravesmouth, GA 84666 +1-410-489-6448x68788   27    White Female Commercial   Never Smoked Moderately Active   Moderate           Poor
          3         Mary Lee                          Unit 1126 Box 7374, DPO AE 73761          311-403-8673   30    White   Male      Other Current Smoker       Very Active   Moderate           Poor


In [17]:
# Write the generated records to disk. The target directory is created first
# because `to_csv` raises OSError when the parent directory does not exist,
# which would break a fresh Restart & Run All.
output_path = Path("data/patient_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
patient_data.to_csv(output_path, index=False)