# In [7]:
import random
import string
import pandas as pd
import json

# --- Attribute value lists (transcribed from a screenshot) ---

# Allowed education values, listed from least to most schooling.
# The position of each entry matters: HIGH_EDU_LEVELS below is a slice of
# this list.
EDUCATION_LEVELS = [
    "No formal education",
    "Equivalent to completing fourth grade in the U.S.",
    "Equivalent to completing eighth grade in the U.S.",
    "Equivalent to completing high school in the U.S.",
    "Equivalent to completing two years at college in the U.S.",
    "Equivalent to completing a college degree in the U.S.",
    "Equivalent to completing a graduate degree in the U.S.",
]

# Education levels that unlock high-skill professions: the final three
# entries (two years of college, college degree, graduate degree).
HIGH_EDU_LEVELS = EDUCATION_LEVELS[-3:]

GENDERS = [
    "Female",
    "Male",
]

# Reasons an applicant may give for applying.
APPLICATION_REASONS = [
    "Reunite with family members already in U.S.",
    "Seek better job in U.S.",
    "Escape political/religious persecution",
]

# Every country of origin that can appear in a profile.
COUNTRIES_ALL = [
    "Germany", "France", "Mexico", "Philippines", "Poland",
    "India", "China", "Sudan", "Somalia", "Iraq",
]

# Countries eligible when the application reason is
# "Escape political/religious persecution".
PERSECUTION_COUNTRIES = [
    "Iraq",
    "Sudan",
    "Somalia",
]

# Observed English ability during the admission interview.
LANGUAGE_OPTIONS = [
    "During admission interview, this applicant spoke fluent English",
    "During admission interview, this applicant spoke broken English",
    "During admission interview, this applicant tried to speak English but was unable",
    "During admission interview, this applicant spoke through an interpreter",
]

# Professions that are only drawn for applicants whose education is in
# HIGH_EDU_LEVELS.
HIGH_SKILL_PROFESSIONS = [
    "Financial analyst",
    "Research scientist",
    "Doctor",
    "Computer programmer",
]

# Full profession pool; the list order is the sampling order.
PROFESSIONS_ALL = [
    "Gardener",
    "Waiter",
    "Nurse",
    "Teacher",
    "Child care provider",
    "Janitor",
    "Construction worker",
    "Financial analyst",
    "Research scientist",
    "Doctor",
    "Computer programmer",
]

# Everything in PROFESSIONS_ALL that is not high-skill, in the same order.
LOW_SKILL_PROFESSIONS = list(
    filter(lambda job: job not in HIGH_SKILL_PROFESSIONS, PROFESSIONS_ALL)
)

# Amount of job training / experience in the applicant's profession.
JOB_EXPERIENCE = [
    "No job training or prior experience",
    "One to two years",
    "Three to five years",
    "More than five years",
]

# What the applicant intends to do about employment in the U.S.
EMPLOYMENT_PLANS = [
    "Has a contract with a U.S. employer",
    "Does not have a contract with a U.S. employer, but has done job interviews",
    "Will look for work after arriving in the U.S.",
    "Has no plans to look for work at this time",
]

# The applicant's history of earlier visits to the U.S.
PRIOR_TRIPS = [
    "Never been to the U.S.",
    "Entered the U.S. once before on a tourist visa",
    "Entered the U.S. once before without legal authorization",
    "Has visited the U.S. many times before on tourist visas",
    "Spent six months with family members in the U.S.",
]

def profile_to_text(profile: dict) -> str:
    """
    Render one immigrant profile (dict) as newline-separated "Label: value" lines.

    Entries are written in the dict's own iteration order. Known ``Feat_*``
    keys are replaced by a readable label (e.g. 'Feat_Education' becomes
    'Education Level'); any other key is printed unchanged. An empty profile
    yields an empty string.
    """
    # internal key -> plain label
    readable = {
        "Feat_Education": "Education Level",
        "Feat_Gender": "Gender",
        "Feat_Country": "Country of Origin",
        "Feat_ApplicationReason": "Reason for Application",
        "Feat_JobProfession": "Profession",
        "Feat_JobExperience": "Job Experience",
        "Feat_JobPlans": "Employment Plans",
        "Feat_PriorTrips": "Prior Trips to the U.S.",
        "Feat_Language": "Language Ability",
    }

    return "\n".join(
        f"{readable.get(field, field)}: {content}"
        for field, content in profile.items()
    )

def _draw_attribute_order(rng, other_keys: list[str], job_keys: list[str]) -> list[str]:
    """Return a shuffled attribute order in which the job attributes stay adjacent."""
    other_shuffled = other_keys[:]
    rng.shuffle(other_shuffled)

    job_block = job_keys[:]
    rng.shuffle(job_block)

    # randint is inclusive on both ends, so the job block may also land
    # before the first or after the last of the other attributes.
    insert_pos = rng.randint(0, len(other_shuffled))
    return other_shuffled[:insert_pos] + job_block + other_shuffled[insert_pos:]


def _draw_profile(rng, attribute_order: list[str]) -> dict:
    """Draw one constrained profile keyed in `attribute_order`, plus a 'text' rendering."""
    # NOTE: the order of the rng calls below is part of the seeded output;
    # reordering them changes which profiles a given random_state produces.
    education = rng.choice(EDUCATION_LEVELS)
    gender = rng.choice(GENDERS)

    reason = rng.choice(APPLICATION_REASONS)
    if reason == "Escape political/religious persecution":
        country = rng.choice(PERSECUTION_COUNTRIES)
    else:
        country = rng.choice(COUNTRIES_ALL)

    # High-skill professions are only reachable with enough education.
    if education in HIGH_EDU_LEVELS:
        profession = rng.choice(PROFESSIONS_ALL)
    else:
        profession = rng.choice(LOW_SKILL_PROFESSIONS)

    value_map = {
        "Feat_Education": education,
        "Feat_Gender": gender,
        "Feat_Country": country,
        "Feat_ApplicationReason": reason,
        "Feat_JobProfession": profession,
        "Feat_JobExperience": rng.choice(JOB_EXPERIENCE),
        "Feat_JobPlans": rng.choice(EMPLOYMENT_PLANS),
        "Feat_PriorTrips": rng.choice(PRIOR_TRIPS),
        "Feat_Language": rng.choice(LANGUAGE_OPTIONS),
    }

    # Re-key in the respondent's attribute order; dicts preserve insertion
    # order, so this is also the order used by the text rendering and JSON.
    profile = {key: value_map[key] for key in attribute_order}
    # Rendered before 'text' is inserted, so the text never contains itself.
    profile["text"] = profile_to_text(profile)
    return profile


def simulate_immigrant_survey(
    num_respondents: int,
    n_profiles: int = 5,
    random_state: int | None = None,
) -> pd.DataFrame:
    """
    Simulate randomized immigrant-profile treatments, one row per respondent.

    For every respondent, `n_profiles` pairs of profiles are drawn, i.e.
    2 * n_profiles profiles labelled immigrant_a_1, immigrant_b_1,
    immigrant_a_2, immigrant_b_2, ... (presumably the two profiles shown
    side by side in each comparison task -- confirm against the survey design).

    Parameters
    ----------
    num_respondents:
        Number of rows (respondents) to generate.
    n_profiles:
        Number of a/b profile pairs per respondent.
    random_state:
        Optional seed. When given, draws come from a private
        ``random.Random(random_state)``, which yields the same stream as
        seeding the module-level generator but leaves the global RNG state
        untouched. When None, the module-level ``random`` functions are used.

    Returns
    -------
    pd.DataFrame
        Index named "Treatment_ID" (0 .. num_respondents - 1) and a single
        column labelled 0. Each cell is a JSON string encoding an object
        that maps every profile label to a profile object with the keys
        Feat_Education, Feat_Gender, Feat_Country, Feat_ApplicationReason,
        Feat_JobProfession, Feat_JobExperience, Feat_JobPlans,
        Feat_PriorTrips, Feat_Language and "text" (the plain-text rendering
        produced by `profile_to_text`).

    Notes
    -----
    - Attribute order is randomized *once per respondent* and then used
      consistently for all of that respondent's profiles.
    - Feat_JobProfession, Feat_JobExperience, Feat_JobPlans appear
      consecutively, in a randomized internal order.
    - If Feat_ApplicationReason == "Escape political/religious persecution",
      Feat_Country is restricted to Iraq, Sudan, or Somalia.
    - High-skill professions are only allowed for 2+ years of college.
    """
    # A private generator avoids reseeding the process-wide RNG as a side
    # effect; the `random` module itself exposes the same method names.
    rng = random.Random(random_state) if random_state is not None else random

    # profile labels: immigrant_a_1, immigrant_b_1, immigrant_a_2, ...
    labels = [
        f"immigrant_{letter}_{task}"
        for task in range(1, n_profiles + 1)
        for letter in string.ascii_lowercase[:2]
    ]

    # keys for the attributes
    job_keys = ["Feat_JobProfession", "Feat_JobExperience", "Feat_JobPlans"]
    other_keys = [
        "Feat_Education",
        "Feat_Gender",
        "Feat_Country",
        "Feat_ApplicationReason",
        "Feat_PriorTrips",
        "Feat_Language",
    ]

    rows = []
    for _ in range(num_respondents):
        # ---- randomize attribute order ONCE per respondent ----
        attribute_order = _draw_attribute_order(rng, other_keys, job_keys)
        respondent_profiles = {
            label: _draw_profile(rng, attribute_order) for label in labels
        }
        # One JSON document per respondent keeps the whole treatment in a
        # single spreadsheet cell.
        rows.append(json.dumps(respondent_profiles))

    df = pd.DataFrame(rows)
    df.index.name = "Treatment_ID"
    return df

# Generate treatments for 1217 respondents using the default n_profiles.
# NOTE: no random_state is passed, so the function does not seed the RNG and
# the generated data will generally differ from run to run.
df = simulate_immigrant_survey(1217)

# Export the table to an Excel workbook at the relative path below.
df.to_excel("treatments.xlsx")
