# Personally Identifiable Information (PII) Data Generation

A helper script for the PII Data Anonymization project.

Created by Aussie Frost on 4/15/2024

## Preliminary imports

In [1]:
import csv
import os
import random

## Defining random case narrative data generation script

This section defines a script that will generate a complex faux dataset containing PII that can then be anonymized. The data will also contain terms that may appear in crisis response case narratives such that these terms can be analyzed.

In [2]:
def random_call_transcription():
    """Generate one synthetic call transcription containing fake PII.

    A template is picked at random and its placeholders ({name}, {email},
    {phone}, {address}, {fax}, {website}, {ip}, {date}, {zipcode}) are filled
    with randomly chosen values. Templates only use a subset of the
    placeholders; the unused values are ignored by ``str.format``.

    Returns:
        str: the filled-in transcription text.
    """

    # define our templates
    templates = [
        "Hi, this is {name}. I need to discuss the charges on my last bill from my visit on {date}. My insurance should have covered my behavioral therapy sessions. Please call me back at {phone} or email me at {email}. Also, the new address is {address}",
        "Good afternoon, I'm {name}. I am calling to update my address to {address}. Also, could you send a confirmation to {email}? I also need to talk about my recent diagnosis and treatment plan.",
        "Hello, my name is {name}. I am calling to confirm my appointment on {date} for my annual physical examination. Is your zip still {zipcode}? Please tell us. Also, please note my new phone number is {phone}, and my current address is {address}.",
        "This is {name}. I need to order a new batch of insulin pens. My doctor, Dr. Harris, recommended changing the dosage. You can reach me at {email} or {phone}.",
        "I'm {name} calling because I have questions regarding the side effects of the new medication prescribed to me last week. Please call me at {phone} or send details to {email}.",
        "Hi, this is {name}. I'm following up on my recent test results for my heart condition discussed on {date}. I've moved to {address}. My new contact number is {phone}, and my email is {email}.",
        "My name is {name}, and I am contacting you regarding my therapy sessions. We needed to report this case. I faxed to this address: {fax}, and plan to call on {phone}. You can contact me at {email} or {phone}.",
        "Hello, it's {name} here. I need to discuss the billing error reported at {ip} related to my knee surgery. I found more info on {website}. Please send the corrected invoice to my address at {address}, or email me at {email}.",
        "We went on a case today. {name} was requesting help, experiencing some medical issues. I contacted {email} to give the address, and they said it was {address}. Perhaps we will ask people around {zipcode}.",
    ]

    # generate some spoofy names
    # NOTE: every entry needs its own trailing comma; adjacent string literals
    # are silently concatenated by Python (this previously produced the bogus
    # first name "Mary-SueJessica").
    first_names = [
        "John", "Jane", "David", "Emily", "Michael", "Sarah", "William", "Jennifer", "Robert", "Mary-Sue",
        "Jessica", "Andrew", "Dr. Ashley", "James", "Amanda", "Sir Matthew", "Christina", "Daniel", "Elizabeth",
        "Joseph", "Nicole", "Anthony", "Margaret", "Kevin", "Laura", "Bryan", "Miss Alexis", "Nicholas", "Katherine",
        "Aiden", "Bella", "Carter", "Delilah", "Ethan", "Fiona", "Grayson", "Hazel", "Isaiah", "Juliet",
        "Kai", "Luna", "Mason", "Nora", "Oliver", "Penelope", "Quentin", "Riley", "Sebastian", "Tessa",
        "Uriah", "Violet", "Winston", "Xena", "Yosef", "Zoe", "Abram", "Beatrice", "Cedric", "Daphne",
        "Eliot", "Freya", "Gideon", "Harriet", "Idris", "Juno", "Knox", "Leia", "Milo", "Naomi",
        "Oscar", "Phoebe", "Reece", "Sienna", "Theo", "Uma", "Victor", "Willow", "Xavier", "Yasmin",
        "Zane", "Adele", "Blaine", "Cora", "Dexter", "Elsie", "Flynn", "Gracie", "Holden", "Iris",
        "Jasper", "Keira", "Levi", "Matilda", "Nolan", "Olive", "Paxton", "Quinn", "Rowan", "Scarlett",
        "Tristan", "Ursula", "Vaughn", "Whitney", "Xander", "Yara", "Zack", "Alma", "Barrett", "Celeste",
        "Drake", "Esme", "Finn", "Gemma", "Hugo", "Ivy", "Jonah", "Kyla", "Lionel", "Mira", "Nico",
        "Octavia", "Pierce", "Rosalie", "Soren", "Thalia", "Ulysses", "Verity", "Wesley", "Yvette",
    ]
    last_names = [
        "Abbott", "Black", "Chapman", "Duffy", "Ellison", "Finch", "Griffith", "Harlow", "Ingram", "Jennings",
        "Knight", "Lowe", "Maxwell", "Norris", "Osborne", "Pike", "Quinn", "Rhodes", "Sherwood", "Tate",
        "Underwood", "Vance", "Whitfield", "York", "Adler", "Barron", "Connor", "Donahue", "Easton", "Field",
        "Golden", "Hartford", "Ivy", "Joyner", "Kessler", "Lawton", "Merritt", "North", "Overton", "Pritchard",
        "Quick", "Rivers", "Sterling", "Thorne", "Upton", "Vaughn", "Waverly", "Xavier", "Youngblood", "Ziegler",
        "Archer", "Beck", "Colby", "Dalton", "Everett", "Frost", "Garrison", "Hammond", "Irvine", "Justice",
        "Kipling", "Langston", "Morrow", "Noble", "Oakley", "Parson", "Quest", "Reilly", "Sawyer", "Thrasher",
        "Ulrich", "Valentine", "Warner", "Xiong", "Yost", "Zimmerman", "Ashby", "Birch", "Crowley", "Davenport",
        "Emerson", "Forrester", "Gilmore", "Hale", "Irwin", "Jasper", "Kent", "Law", "Meadow", "Northwood",
        "O'Donnell", "Pace", "Quill", "Riddle", "Sloan", "Tanner", "Upshaw", "Vincent", "Welles", "Younger",
    ]

    # Generate a random number of names, from 1 to 3
    num_names = random.randint(1, 3)
    names = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(num_names)]

    # Format the names into a natural sounding list ("A", "A and B", "A, B and C")
    if num_names > 1:
        name_text = ", ".join(names[:-1]) + " and " + names[-1]
    else:
        name_text = names[0]

    area_codes = ["212", "312", "404", "617", "703", "818", "904", "206", "303", "415"]
    phone_formats = ["({area_code})-234-1423", "{area_code}-832-1437", "+1 {area_code}-802-1437", "{area_code} 812 1497"]
    # Each address is a separate entry (a missing comma used to fuse the two
    # Eugene addresses below into a single malformed string).
    address_formats = [
        "123 Main St, San Francisco, CA 12345",
        "567 Elm St, Eugene, Ore 54321",
        "42 Park Way",
        "81915 East 24th Ave, Eugene",
        "15161 Thornberry Lane, Eugene",
        "51 West 13th St, Eugene, OR 98412",
    ]
    fax_formats = ["+1-907-555-1234"]
    email_formats = ["jeff@example.net"]
    zipcode_formats = ["97401", "04215", "24321-4215", "14512"]
    website_formats = ["www.example.com", "example.com", "http://this.net", "https://www.instagram.com/aoc"]
    # one dotted-decimal (IPv4-style) and one colon-separated hex (IPv6-style)
    # candidate, both freshly randomized on every call
    ip_formats = [
        '.'.join(str(random.randint(0, 255)) for _ in range(4)),
        ':'.join('{:x}'.format(random.randint(0, 2**16 - 1)) for _ in range(8)),
    ]
    date_formats = ["2023-10-26", "12/04/2022", "August 15, 1999", "03.11.2018", "2010-02-02", "January 31", "2 Feb", "June 14", "September 22", "Mar 21st", "Apr 1"]

    # Random selection of template and filling it
    template = random.choice(templates)
    return template.format(
        name=name_text,
        email=random.choice(email_formats),
        phone=random.choice(phone_formats).format(area_code=random.choice(area_codes)),
        # the address always gets a (separately chosen) zip code appended
        address=f"{random.choice(address_formats)}, {random.choice(zipcode_formats)}",
        fax=random.choice(fax_formats),
        website=random.choice(website_formats),
        ip=random.choice(ip_formats),
        date=random.choice(date_formats),
        zipcode=random.choice(zipcode_formats),
    )

def generate_dataset(size, datapath):
    """Create a synthetic call dataset and save it as a CSV file.

    Each row has a random 2024 timestamp (``time_of_call``), a random
    ``call_type`` and a ``call_transcription`` produced by
    ``random_call_transcription()``.

    Args:
        size: number of rows to generate.
        datapath: path of the CSV file to write; missing parent directories
            are created, and an existing file is overwritten.

    Returns:
        None. The data is written to ``datapath`` and a confirmation message
        is printed.
    """

    call_types = ["medical", "behavioral", "outreach", "checkin"]
    fieldnames = ["time_of_call", "call_type", "call_transcription"]

    data = []
    for _ in range(size):
        # day is capped at 28 so the date is valid for every month
        call_time = f"2024-{random.randint(1, 12):02d}-{random.randint(1, 28):02d} {random.randint(0, 23):02d}:{random.randint(0, 59):02d}:{random.randint(0, 59):02d}"
        call_type = random.choice(call_types)
        call_transcription = random_call_transcription()
        data.append({
            "time_of_call": call_time,
            "call_type": call_type,
            "call_transcription": call_transcription
        })

    # Make sure the target directory exists so open() does not fail with
    # FileNotFoundError on a fresh checkout.
    parent_dir = os.path.dirname(datapath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write data to CSV file (newline="" lets the csv module control line
    # endings; explicit encoding avoids platform-dependent defaults)
    with open(datapath, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
    print(f"Call data CSV file generated successfully at {datapath}")

## Generating a test dataset

Deploying the data generation script is as easy as one line of code.

In [3]:
# build a 1000-row synthetic call dataset and save it as a CSV file
generate_dataset(size=1000, datapath="data/call_data.csv")

Call data CSV file generated successfully at data/call_data.csv
