# Generate Data for Searchable Enterprise Directory Application

In [1]:
# import necessary packages

import csv
import random

### Set up dicts and lists to create data

In [2]:
# Pools of first and last names; each generated employee name is one random pick from each list.
# Every entry appears exactly once so random.choice() draws all names with equal probability
# (a duplicated entry would silently double that name's weight).

first_names = ["Ava", "Noah", "Jacob", "Liam", "Oliver", "James", "Elijah", "William", "Henry", "Lucas", "Benjamin", "Theodore",
               "Olivia", "Emma", "Charlotte", "Amelia", "Sophia", "Isabella", "Mia", "Evelyn", "Luna"]

last_names = ["Downey", "Ramdial", "Lettick", "Smith", "Johnson", "Williams", "Jones", "Brown", "Garcia", "Miller", "Davis",
              "Rodriguez", "Martinez", "Hernandez", "Wilson", "Lopez", "Anderson", "Thomas", "Moore", "Jackson"]

In [3]:
# Lookup tables that make the generated salaries correlate with location and role.

# Salary multiplier applied per work location.
work_location_mult = {
    "Hartford, CT": 1.217,
    "Saint Paul, MN": 1.135,
    "Hunt Valley, MD": 1.117,
    "New York, NY": 1.670,
}

# Salary multiplier applied per job role.
job_role_mult = {
    # software engineering
    "Junior Software Engineer": 1.3,
    "Senior Software Engineer": 2.12,
    "Software Engineer Manager": 2.66,
    # data engineering
    "Junior Data Engineer": 1.5,
    "Senior Data Engineer": 2.30,
    "Data Engineer Manager": 2.60,
    # human resources
    "HR Representative": 0.73,
    "HR Specialist": 1,
    "HR Manager": 1.66,
}

# Tier-1 roles are the managers; the remaining lists hold the roles available
# to the people reporting to each kind of manager.
job_roles_t1 = [
    "Software Engineer Manager",
    "Data Engineer Manager",
    "HR Manager",
]
job_roles_swe = [
    "Senior Software Engineer",
    "Junior Software Engineer",
]
job_roles_de = [
    "Senior Data Engineer",
    "Junior Data Engineer",
]
job_roles_hr = [
    "HR Specialist",
    "HR Representative",
]

In [4]:
# constants among all rows

# Placeholder password shared by every generated employee row. It is stored as-is in each
# data row, so it ends up in plain text in data.csv when the rows are written out.
password = "Password123!"

### Generate the data

In [5]:
# generate phone number
# code modified from https://stackoverflow.com/questions/26226801/making-random-phone-number-xxx-xxx-xxxx

def gen_phone_num():
    """Return a random phone number string formatted as ``XXX-XXX-XXXX``.

    The first group is drawn from 600-999, the second from 001-888 and the
    last from 0001-9998 (both zero-padded). A last group made of one repeated
    digit between 1 and 8 ('1111' ... '8888') is rejected and redrawn.
    """
    repeated_digit_groups = {str(digit) * 4 for digit in range(1, 9)}

    area_code = random.randint(600, 999)
    exchange = random.randint(1, 888)

    line_number = random.randint(1, 9998)
    while "{:04d}".format(line_number) in repeated_digit_groups:
        line_number = random.randint(1, 9998)

    return "{}-{:03d}-{:04d}".format(area_code, exchange, line_number)

In [6]:
# generate salary with noise multiplier

def gen_salary(job_role, work_location, base_salary=60000):
    """Return a randomised salary for the given role and location.

    The salary is ``base_salary`` scaled by the role multiplier from
    ``job_role_mult`` and the location multiplier from ``work_location_mult``,
    then multiplied by a uniform noise factor in [0.93, 1.08] and rounded to
    two decimal places.

    Args:
        job_role: key of ``job_role_mult``.
        work_location: key of ``work_location_mult``.
        base_salary: amount the multipliers are applied to (default 60000).

    Returns:
        The salary as a float rounded to 2 decimals.

    Raises:
        ValueError: if ``job_role`` or ``work_location`` has no multiplier.
    """
    # Fail with a clear message instead of the "NoneType" TypeError that
    # multiplying by a missing .get() result would produce.
    if job_role not in job_role_mult:
        raise ValueError("unknown job role: {!r}".format(job_role))
    if work_location not in work_location_mult:
        raise ValueError("unknown work location: {!r}".format(work_location))

    salary = base_salary * job_role_mult[job_role] * work_location_mult[work_location]
    salary = salary * random.uniform(0.93, 1.08)  # noise
    return round(salary, 2)

In [7]:
NUM_ROWS = 999            # generated employees, not counting the CEO
TEAM_SIZE = 20            # each group of TEAM_SIZE employees = 1 manager + (TEAM_SIZE - 1) reports
CEO_ID = 10000            # employee IDs count up from here
RANDOM_SEED = 42          # fixed seed so re-running this cell reproduces the same dataset

# Positions of the fields read back from an already generated row.
# Row layout: [name, password, employee_id, phone_number, job_role, work_location, salary, manager_id]
COL_EMPLOYEE_ID = 2
COL_JOB_ROLE = 4

# Roles a direct report may be given, keyed by the role of their manager.
reports_by_manager_role = {
    "Software Engineer Manager": job_roles_swe,
    "Data Engineer Manager": job_roles_de,
    "HR Manager": job_roles_hr,
}

# Hoisted out of the loop: the set of locations does not change between rows.
work_locations = list(work_location_mult.keys())

# Every random draw for the dataset happens below this line, so seeding here
# makes the whole cell deterministic and safe to re-run.
random.seed(RANDOM_SEED)

# make CEO employee (the CEO is their own manager and always sits at data_rows[0])
data_rows = [[
    "Alan Schnitzer",
    password,
    CEO_ID,
    gen_phone_num(),
    "CEO",
    "Hartford, CT",
    16778800,
    CEO_ID,
]]

for x in range(NUM_ROWS):
    name = "{} {}".format(random.choice(first_names), random.choice(last_names))
    employee_id = CEO_ID + 1 + x
    phone_number = gen_phone_num()
    work_location = random.choice(work_locations)

    # determine manager and job role
    if x % TEAM_SIZE == 0:
        # first employee of each group is a manager reporting to the CEO
        job_role = random.choice(job_roles_t1)
        manager_id = CEO_ID
    else:
        # The group's manager was generated at loop index x - (x % TEAM_SIZE);
        # the +1 skips the CEO row stored at data_rows[0].
        manager_row = data_rows[x - (x % TEAM_SIZE) + 1]
        # An unexpected manager role raises KeyError here rather than silently
        # reusing the previous row's job_role / manager_id.
        job_role = random.choice(reports_by_manager_role[manager_row[COL_JOB_ROLE]])
        manager_id = manager_row[COL_EMPLOYEE_ID]

    salary = gen_salary(job_role, work_location)

    # Add the data row to the list
    data_rows.append([
        name,
        password,
        employee_id,
        phone_number,
        job_role,
        work_location,
        salary,
        manager_id,
    ])

# Sanity check: one CEO row plus NUM_ROWS generated employees.
assert len(data_rows) == NUM_ROWS + 1

In [8]:
# test row outputs

# Spot-check the CEO row, the first manager, two of that manager's reports and the final row.
# Index -1 replaces the hard-coded 999 so the last check still works if NUM_ROWS changes.
for row_index in (0, 1, 10, 11, -1):
    print(data_rows[row_index])

['Alan Schnitzer', 'Password123!', 10000, '933-399-1433', 'CEO', 'Hartford, CT', 16778800, 10000]
['Isabella Downey', 'Password123!', 10001, '883-209-0229', 'HR Manager', 'Hartford, CT', 113191.4, 10000]
['Lucas Garcia', 'Password123!', 10010, '622-745-9651', 'HR Representative', 'Hartford, CT', 52171.96, 10001]
['Noah Miller', 'Password123!', 10011, '749-457-2724', 'HR Representative', 'Hartford, CT', 55605.43, 10001]
['Isabella Lopez', 'Password123!', 10999, '677-660-2207', 'HR Specialist', 'Saint Paul, MN', 73188.06, 10981]


In [9]:
import pandas as pd

# Load the generated rows into a DataFrame for inspection. No column names are
# passed, so the columns keep their default integer labels 0-7.
df = pd.DataFrame(data=data_rows)
df.head(n=45)

Unnamed: 0,0,1,2,3,4,5,6,7
0,Alan Schnitzer,Password123!,10000,933-399-1433,CEO,"Hartford, CT",16778800.0,10000
1,Isabella Downey,Password123!,10001,883-209-0229,HR Manager,"Hartford, CT",113191.4,10000
2,Olivia Lopez,Password123!,10002,716-476-7649,HR Specialist,"Hunt Valley, MD",70760.78,10001
3,Henry Williams,Password123!,10003,924-293-6309,HR Specialist,"New York, NY",104130.84,10001
4,Amelia Hernandez,Password123!,10004,660-452-4687,HR Representative,"New York, NY",74193.31,10001
5,Oliver Thomas,Password123!,10005,971-644-9735,HR Representative,"New York, NY",71202.54,10001
6,Ava Martinez,Password123!,10006,713-358-7962,HR Representative,"Hunt Valley, MD",49749.27,10001
7,Evelyn Jones,Password123!,10007,760-044-4968,HR Specialist,"Hartford, CT",74467.91,10001
8,William Davis,Password123!,10008,983-166-9434,HR Representative,"Hartford, CT",52288.27,10001
9,Benjamin Miller,Password123!,10009,711-710-0376,HR Representative,"New York, NY",74374.25,10001


In [10]:
# Column 7 holds manager_id; list its distinct values to see who has direct reports.
manager_ids = df[7].unique()
print(manager_ids)

[10000 10001 10021 10041 10061 10081 10101 10121 10141 10161 10181 10201
 10221 10241 10261 10281 10301 10321 10341 10361 10381 10401 10421 10441
 10461 10481 10501 10521 10541 10561 10581 10601 10621 10641 10661 10681
 10701 10721 10741 10761 10781 10801 10821 10841 10861 10881 10901 10921
 10941 10961 10981]


In [11]:
# Write the data to the CSV file: one header row followed by every generated row.
# Column order must match the layout of the lists stored in data_rows.
csv_header = ["name", "password", "employee_id", "phone_number", "job_role", "work_location", "salary",
              "manager_id"]

# newline="" lets the csv module control line endings; the explicit UTF-8 encoding keeps the
# output independent of the platform's locale default.
with open('data.csv', "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(csv_header)
    writer.writerows(data_rows)

print("Data generation complete.")

Data generation complete.
