In [1]:
import pandas as pd
from openai import OpenAI

# Load the OpenAI API key from a local .env file so the secret never appears
# in the notebook itself. Expected line format: OPENAI_API_KEY=<value>
api_key = None

with open(".env", "r") as f:
    for line in f:
        # Split on the FIRST "=" only: the value itself may contain "=".
        name, sep, value = line.partition("=")
        # Exact name match, so a variable such as OPENAI_API_KEY_OLD cannot
        # overwrite the real key; lines without "=" are skipped instead of crashing.
        if sep and name.strip() == "OPENAI_API_KEY":
            # Drop surrounding whitespace/newline and optional quotes (KEY="value").
            api_key = value.strip().strip("\"'")

# api_key is still None here if the file had no matching entry.
client = OpenAI(api_key=api_key)

In [10]:
def generate_names(description, model="gpt-4o", temperature=0.7):
    """Ask the chat model for a list of names matching an ethnic description.

    Args:
        description: Free-text description of the group; sent verbatim as the
            user message.
        model: Chat-completions model name. Defaults to "gpt-4o".
        temperature: Sampling temperature forwarded to the API. Defaults to 0.7.

    Returns:
        The raw text content of the first completion choice. The system prompt
        asks for 200 names separated by a newline, with the model's reasoning
        confined to the first line; the reply is returned unparsed.
    """
    # Not a raw string on purpose: the "\n" below is a real newline character
    # inside the prompt, exactly as in the original single-line literal.
    system_prompt = (
        "You are an expert in linguistics and anthropology. "
        "You are helping a team of scientists by generating curated lists of names. "
        "You are provided an ethnic description of individuals. "
        "Come up with a list of 200 names including both first name and last name "
        "(add middle name if required), separated by \n. "
        "Think about the generation process in the first line only."
    )

    completion = client.chat.completions.create(
        model=model,
        temperature=temperature,
        ## no limit on max tokens.
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": description},
        ],
    )

    return completion.choices[0].message.content

In [11]:
## Category descriptions for the US name-generation run.
## Sourced from: https://www.doi.gov/pmb/eeo/directives/race-data#:~:text=The%20standards%20have%20five%20categories,%22Not%20Hispanic%20or%20Latino.%22
##
## Each string is sent verbatim (including its surrounding whitespace) as the
## user message to generate_names() and is also used as the key in
## us_ethnic_dictionary; the list index becomes the dump file number
## (dump/us/names_<idx>.txt). Editing a string therefore changes the prompt.
## NOTE(review): only four descriptions are listed here -- confirm that leaving
## out the source's remaining categories is intentional.

us_ethnic_description = [
    """
        -- Asian. A person having origins in any of the original peoples of the Far East, Southeast Asia, or the Indian subcontinent including, for example, Cambodia, China, India, Japan, Korea, Malaysia, Pakistan, the Philippine Islands, Thailand, and Vietnam.
    """,
    """
        -- Black or African American. A person having origins in any of the black racial groups of Africa. Terms such as "Haitian" can be used in addition to "Black or African American."
    """,
    """
       -- Hispanic or Latino. A person of Cuban, Mexican, Puerto Rican, South or Central American, or other Spanish culture or origin, regardless of race. The term, "Spanish origin," can be used in addition to "Hispanic or Latino." 
    """,
    """
        -- White. A person having origins in any of the original peoples of Europe, the Middle East, or North Africa.
    """
]

In [12]:
# Query the model five times for every US description and keep each raw reply.
# Result: {description string -> list of five reply strings}.
us_ethnic_dictionary = {}
for desc in us_ethnic_description:
    print("Processing for: ", desc)
    replies = []
    for _ in range(5):
        reply = generate_names(desc)
        replies.append(reply)
        # Number of newline-separated lines in the reply (newline count + 1).
        line_total = reply.count("\n") + 1
        print("Outputted these many values: ", line_total)
    us_ethnic_dictionary[desc] = replies
    print("Processed\n\n")

Processing for:  
        -- Asian. A person having origins in any of the original peoples of the Far East, Southeast Asia, or the Indian subcontinent including, for example, Cambodia, China, India, Japan, Korea, Malaysia, Pakistan, the Philippine Islands, Thailand, and Vietnam.
    
Outputted these many values:  200
Outputted these many values:  202
Outputted these many values:  200
Outputted these many values:  200
Outputted these many values:  202
Processed


Processing for:  
        -- Black or African American. A person having origins in any of the black racial groups of Africa. Terms such as "Haitian" can be used in addition to "Black or African American."
    
Outputted these many values:  200
Outputted these many values:  202
Outputted these many values:  202
Outputted these many values:  202
Outputted these many values:  202
Processed


Processing for:  
       -- Hispanic or Latino. A person of Cuban, Mexican, Puerto Rican, South or Central American, or other Spanish culture

In [14]:
## save the results in dump for now.
## One file per description: dump/us/names_<idx>.txt, where idx is the position
## of the description in us_ethnic_description.
import os

os.makedirs("dump/us/", exist_ok=True)
for idx, desc in enumerate(us_ethnic_description):
    # Explicit utf-8: generated names can contain non-ASCII characters, and the
    # platform default encoding may not be able to represent them.
    with open(f"dump/us/names_{idx}.txt", 'w', encoding="utf-8") as f:
        for reply in us_ethnic_dictionary[desc]:
            # The stored replies carry no trailing newline, so write a blank-line
            # separator after each one; otherwise the last name of one reply is
            # fused with the first line of the next.
            f.write(reply)
            f.write("\n\n")

In [18]:
## Regional descriptions for the India name-generation run.
## Picked from here: https://en.wikipedia.org/wiki/Administrative_divisions_of_India
##
## Each string is sent verbatim (including its surrounding whitespace) as the
## user message to generate_names() and is also used as the key in
## indian_ethnic_dictionary; the list index becomes the dump file number
## (dump/india/names_<idx>.txt). Editing a string therefore changes the prompt.
## NOTE(review): "Pudduchery" in the South Indian entry looks like a misspelling
## of "Puducherry"; left untouched here because the text is the prompt itself.
indian_ethnic_descriptions = [
    """
        -- North Indian. A person having roots from Chandigarh, Delhi, Haryana, Himachal Pradesh, Jammu and Kashmir, Ladakh, Punjab and Rajasthan.
    """,
    """
        -- North-east Indian. A person having roots from Arunachal Pradesh, Assam, Manipur, Meghalaya, Mizoram, Nagaland, Sikkim and Tripura.
    """,
    """
       -- Central Indian. A person having roots from Chhattisgarh, Madhya Pradesh, Uttarakhand and Uttar Pradesh. 
    """,
    """
        -- East Indian. A person having roots from Bihar, Jharkhand, Odisha and West Bengal.
    """,
    """
        -- South Indian. A person having roots from Andhra Pradesh, Karnataka, Kerala, Tamil Nadu, Pudduchery and Telangana.
    """,
    """
        -- West Indian. A person having roots from Dadra and Nagar Haveli and Daman and Diu, Goa, Gujarat and Maharashtra.
    """
]

In [19]:
# Query the model five times for every Indian regional description.
# Each stored reply gets a trailing blank line so the replies stay separated
# when they are later written back-to-back to a file.
indian_ethnic_dictionary = {}
for desc in indian_ethnic_descriptions:
    print("Processing for: ", desc)
    replies = []
    for _ in range(5):
        reply = generate_names(desc)
        replies.append(f"{reply}\n\n")
        # Count lines of the raw reply (before the separator is appended).
        line_total = reply.count("\n") + 1
        print("Outputted these many values: ", line_total)

    indian_ethnic_dictionary[desc] = replies
    print("Processed\n\n")

Processing for:  
        -- North Indian. A person having roots from Chandigarh, Delhi, Haryana, Himachal Pradesh, Jammu and Kashmir, Ladakh, Punjab and Rajasthan.
    
Outputted these many values:  202
Outputted these many values:  202
Outputted these many values:  200
Outputted these many values:  202
Outputted these many values:  200
Processed


Processing for:  
        -- North-east Indian. A person having roots from Arunachal Pradesh, Assam, Manipur, Meghalaya, Mizoram, Nagaland, Sikkim and Tripura.
    
Outputted these many values:  202
Outputted these many values:  202
Outputted these many values:  200
Outputted these many values:  200
Outputted these many values:  202
Processed


Processing for:  
       -- Central Indian. A person having roots from Chhattisgarh, Madhya Pradesh, Uttarakhand and Uttar Pradesh. 
    
Outputted these many values:  202
Outputted these many values:  200
Outputted these many values:  202
Outputted these many values:  200
Outputted these many values

In [20]:
# Save the India results: one file per description, dump/india/names_<idx>.txt,
# where idx is the position of the description in indian_ethnic_descriptions.
import os  # imported here so this cell does not rely on an earlier cell having run

os.makedirs("dump/india/", exist_ok=True)
for idx, desc in enumerate(indian_ethnic_descriptions):
    # Explicit utf-8: generated names can contain non-ASCII characters, and the
    # platform default encoding may not be able to represent them.
    with open(f"dump/india/names_{idx}.txt", 'w', encoding="utf-8") as f:
        # Each stored reply already ends with "\n\n", so writelines keeps them apart.
        f.writelines(indian_ethnic_dictionary[desc])