In [None]:
# Mount Google Drive so that files under /content/drive can be read and written
# by the later cells (the data paths in this notebook live under
# /content/drive/MyDrive).
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime, so this cell would fail elsewhere — confirm before running locally.
from google.colab import drive
drive.mount('/content/drive')

Block 1: Imports and API Configuration
This block now imports the openai library. You must set your OPENAI_API_KEY as an environment variable for this to work.

In [None]:
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
import os
import json
import re # For cleaning model output, just in case

# Paste a key between the quotes ONLY for a throwaway session. Prefer setting
# OPENAI_API_KEY in the runtime environment so the secret never gets saved
# inside the notebook.
PASTED_OPENAI_API_KEY = ''

# Write to the environment only when a key was actually pasted. Assigning the
# empty placeholder unconditionally would clobber a key that is already
# configured in the runtime.
if PASTED_OPENAI_API_KEY.strip():
    os.environ['OPENAI_API_KEY'] = PASTED_OPENAI_API_KEY.strip()

# An empty or whitespace-only value is treated exactly like a missing key, so
# the success message is only printed when a usable key is present.
if os.environ.get('OPENAI_API_KEY', '').strip():
    print("OpenAI API Key found and set successfully.")
else:
    print("WARNING: 'OPENAI_API_KEY' environment variable not set.")
    print("Please paste your key into 'PASTED_OPENAI_API_KEY' above, or set it in the runtime environment.")

# Define a helper function to generate lists of grid variables
def generate_var_list(prefix, count):
    """Build the 1-based grid variable names ``<prefix>_1`` ... ``<prefix>_<count>``.

    For example, ``generate_var_list('CCP2_1', 2)`` gives
    ``['CCP2_1_1', 'CCP2_1_2']``. A ``count`` below 1 yields an empty list.
    """
    names = []
    for item_number in range(1, count + 1):
        names.append("{}_{}".format(prefix, item_number))
    return names

Block 2: Define File Paths and Load Data
This block sets the file paths and loads the initial dataset into a pandas DataFrame.


In [None]:
# --- Configuration ---
# Input CSV read into `df` below
FILE_PATH = "/content/drive/MyDrive/CYON_Analysis_Materials/integrated_simul_generation_Oct18_PROCESSED.csv"

# Destination CSV for the dataset once simulated responses are filled in
OUTPUT_PATH = "/content/drive/MyDrive/CYON_Analysis_Materials/simulated_responses.csv"

# --- Load Data ---
# Any load failure leaves `df` as an empty DataFrame, so later cells can test
# `df.empty` instead of crashing on an undefined name.
try:
    df = pd.read_csv(FILE_PATH)
except FileNotFoundError:
    print(f"ERROR: File not found at {FILE_PATH}.")
    print("Please check the file path and try again.")
    df = pd.DataFrame()
except Exception as e:
    print(f"An error occurred while loading the file: {e}")
    df = pd.DataFrame()
else:
    print(f"Successfully loaded {FILE_PATH}.")
    print(f"Data shape: {df.shape}")

# Preview the first rows only when something was actually loaded
if not df.empty:
    print("Data head:")
    print(df.head())

Block 3: Define Variable Lists and Question Maps
This block defines which variables are human-answered and which need to be simulated. It also creates dictionaries (maps) that store the full question text for each variable, which is essential for the AI prompt.

In [None]:
# Columns already answered by the human participants; together they form the
# "persona" handed to the model.
pre_survey_human_vars = [
    'DEM1.1',
    'DEM2.1',
    'DEM3.1',
    'DEM4.1',
    'DEM5.1',
    'DEM7.1',
    'DEM8.1',
    'VOT2.1',
    'CCP1_1.1',
]

# Column holding the treatment text shown to the participant
treatment_var = 'ed_generatedBody'

def get_question_maps():
    """
    Returns two dictionaries mapping variable names to their
    full question text and scale.

    Returns:
        tuple: ``(pre_survey_map, post_survey_map)``. Each dictionary maps a
        variable name such as ``'CCP2_1_1'`` to ``"<scale header>: <item text>"``.
        Keys are inserted in questionnaire order.
    """

    def _grid(prefix, header, items):
        """Map '<prefix>_<n>' to '<header>: <item text>' for every item, in order."""
        # The item count is derived from the list itself. A hand-typed count
        # that disagreed with the list would make zip() drop entries silently.
        return {
            var: f"{header}: {text}"
            for var, text in zip(generate_var_list(prefix, len(items)), items)
        }

    # --- Climate batteries asked in BOTH waves ---
    # The wording is defined once so the pre (_1) and post (_2) versions of
    # each battery cannot drift apart.
    ccp2_items = [
        "1. I believe that climate change is real.",
        "2. Climate change is NOT occurring.",
        "3. Human activities are a major cause of climate change.",
        "4. Climate change is mostly caused by human activity.",
        "5. The main causes of climate change are human activities.",
        "6. Overall, climate change will bring more negative than positive consequences to the world.",
        "7. Climate change will bring about serious negative consequences.",
        "8. The consequences of climate change will be very serious."
    ]

    ccp3_items = [
        "1. I think climate change is a serious problem",
        "2. I believe that most of the concerns about climate change have been exaggerated",
        "3. I am concerned about the consequences of climate change",
        "4. I am hesitant to believe climate change scientists tell the whole story",
        "5. I believe that most claims about climate change are true",
        "6. I am not sure that climate change is actually occurring",
        "7. The climate change we are observing is just a natural process",
        "8. Humans are largely responsible for global warming",
        "9. I doubt that human activities cause global warming",
        "10. There is not much we can do that will help solve environmental problems",
        "11. Trying to solve environmental problems is a waste of time",
        "12. Human behavior has little effect on global warming"
    ]

    ccp4_items = [
        "1. I support government subsidies for renewable energy sources like solar and wind power.",
        "2. Investing in renewable energy should be a priority for our country.",
        "3. I am in favor of strict regulations to limit carbon emissions from factories and vehicles.",
        "4. I am willing to pay more for products that are environmentally friendly.",
        "5. I support local initiatives to reduce waste and promote recycling.",
        "6. Our country should adhere to international agreements aimed at reducing climate change.",
        "7. It is important for our government to participate in global efforts to combat climate change.",
        "8. The government should provide financial assistance to communities affected by climate change.",
        "9. Policies that protect the environment are worth the economic cost."
    ]

    # --- 1. Pre-Survey Questions to be Simulated ---
    pre_survey_map = {}

    # CCP2_1 (8 items), CCP3_1 (12 items), CCP4_1 (9 items)
    pre_survey_map.update(_grid('CCP2_1', "CCP2_1 (1=SD, 7=SA)", ccp2_items))
    pre_survey_map.update(_grid('CCP3_1', "CCP3_1 (1=SD, 7=SA)", ccp3_items))
    pre_survey_map.update(_grid('CCP4_1', "CCP4_1 (1=SD, 7=SA)", ccp4_items))

    # PMA1 (3 items)
    pre_survey_map.update(_grid('PMA1', "PMA1 (1=Definitely no, 7=Definitely yes)", [
        "1. Are you comfortable with using AI tools?",
        "2. Are you confident using AI tools?",
        "3. Are you proficient in using AI tools?"
    ]))

    # AICOP (13 items)
    pre_survey_map.update(_grid('AICOP', "AICOP (1=Not at All, 7=Greatest Extent)", [
        "1. AI assists me tackle a given task effectively.",
        "2. AI offers me collaborative results when I'm working alone.",
        "3. AI offers helpful examples that I can utilize to solve my problems.",
        "4. AI allows me to avoid wasting mental efforts on tedious tasks.",
        "5. AI makes a given task seem less complicated.",
        "6. AI’s solutions are easily comprehensible.",
        "7. AI’s solutions are readily applicable to tasks I’m working on.",
        "8. AI guides me to understand essential elements of tasks.",
        "9. AI improves my understanding of tasks’ procedures.",
        "10. AI provides me assistance by defining unfamiliar terms.",
        "11. AI helps me understand complex information.",
        "12. AI guides me through the simpler parts of the tasks first.",
        "13. AI provides summaries when I’m overwhelmed with information."
    ]))

    # PAA (4 items)
    pre_survey_map.update(_grid('PAA', "PAA (1=SD, 7=SA)", [
        "1. AI is precise.", "2. AI is error free.", "3. AI is accurate.", "4. AI is objective."
    ]))

    # PAH (4 items)
    pre_survey_map.update(_grid('PAH', "PAH (1=SD, 7=SA)", [
        "1. AI can express human emotion.",
        "2. AI can make human-like subjective judgements.",
        "3. AI can provide contextual background.",
        "4. AI can show human intuition."
    ]))

    # PTA (7 items)
    pre_survey_map.update(_grid('PTA', "PTA (1=Completely untrue, 7=Completely true)", [
        "1. AI is competent.",
        "2. AI can satisfy its users.",
        "3. One can expect good advice from AI.",
        "4. AI puts users’ interests first.",
        "5. If problems arise, one can expect to be treated fairly by AI.",
        "6. AI operates carefully and thoroughly.",
        "7. You can believe the statements produced by AI."
    ]))

    # NFC (6 items)
    pre_survey_map.update(_grid('NFC', "NFC (1=Not characteristic, 7=Extremely characteristic)", [
        "1. I would prefer complex to simple problems.",
        "2. I like to have the responsibility of handling a situation that requires a lot of thinking.",
        "3. Thinking is not my idea of fun.",
        "4. I would rather do something that requires little thought than something that is sure to challenge my thinking abilities.",
        "5. I really enjoy a task that involves coming up with new solutions to problems.",
        "6. I would prefer a task that is intellectual, difficult, and important to one that is somewhat important but does not require much thought."
    ]))

    # MT (5 items)
    pre_survey_map.update(_grid('MT', "MT (News media are...; 1=SD, 7=SA)", [
        "1. Fair", "2. Unbiased", "3. Tell the whole story", "4. Accurate",
        "5. Separate fact and opinion in their news coverage"
    ]))

    # CT (3 items)
    pre_survey_map.update(_grid('CT', "CT (Corporations...; 1=SD, 7=SA)", [
        "1. Corporations in the United States are truthful to us",
        "2. Corporations in the United States treat people like me justly and fairly.",
        "3. Corporations in the United States keep their promises."
    ]))

    # PT (3 items)
    pre_survey_map.update(_grid('PT', "PT (Politicians...; 1=SD, 7=SA)", [
        "1. I trust politicians to tell the truth",
        "2. I trust politicians to deal with the things that matter",
        "3. I trust the government to do what is right"
    ]))

    # GT (3 items)
    pre_survey_map.update(_grid('GT', "GT (US Gov't...; 1=SD, 7=SA)", [
        "1. The United States government is truthful to us",
        "2. The United States government treats people like me justly and fairly",
        "3. The United States government keeps its promises"
    ]))

    # PEF1 (8 items)
    pre_survey_map.update(_grid('PEF1', "PEF1 (1=SD, 7=SA)", [
        "1. I consider myself to be well qualified to participate in politics.",
        "2. I think that I am better informed about politics than most people.",
        "3. Other people seem to have an easier time understanding complicated issues than I do.",
        "4. I feel that I have a pretty good understanding of the important political issues facing our country.",
        "5. Voting gives people an effective way to influence what the government does.",
        "6. I can make a difference if I participate in the election process.",
        "7. My vote makes a difference.",
        "8. I have a real say in what the government does."
    ]))

    # PI1 (4 items)
    pre_survey_map.update(_grid('PI1', "PI1 (1=Not interested, 7=Extremely interested)", [
        "1. Politics", "2. Election Campaigns", "3. Social issues", "4. News"
    ]))

    # --- 2. Post-Survey Questions to be Simulated ---
    post_survey_map = {}

    # SA
    post_survey_map['SA'] = "SA: How the CYON news described the Trump Admin’s climate policy. (-2=Very Neg, 2=Very Pos)"

    # PAPC1 (6 items)
    post_survey_map.update(_grid('PAPC1', "PAPC1 (1=Not at All, 7=Very Much)", [
        "1. How fair do you think CYON is in generating political news?",
        "2. How politically biased do you think CYON is in prompting political discussions online...?",
        "3. How much trust would you have in the decision of CYON to delete the kind of content that needs to be deleted?",
        "4. How much do you agree with the statement, “The decision of CYON to delete problematic content is legitimate.”?",
        "5. How fair do you think CYON is to users when providing political news?",
        "6. How much do you agree with the statement, “CYON’s way of organizing political content is legitimate”?"
    ]))

    # PAPC2 (4 items)
    post_survey_map.update(_grid('PAPC2', "PAPC2 (1=SD, 7=SA)", [
        "1. In CYON news, news and information is being wrongly removed.",
        "2. In CYON news, political viewpoints are being censored.",
        "3. CYON will make easy to find trustworthy information",
        "4. CYON will allow online users to have more meaningful conversations"
    ]))

    # PAPC3 (4 items)
    post_survey_map.update(_grid('PAPC3', "PAPC3 (Scales vary)", [
        "1. I felt my view was understood by CYON. (1-7)",
        "2. I took viewpoints reflected on the CYON news with respect. (1-7)",
        "3. CYON was disrespectful to my viewpoint. (1-7)",
        "4. I was able to see my values and beliefs from the CYON news. (1-7)"
    ]))

    # PAPC3a
    post_survey_map['PAPC3a'] = "PAPC3a: How would you grade the CYON news you just read (0=Worst, 10=Best)."

    # DR (4 items)
    post_survey_map.update(_grid('DR', "DR (1=SD, 7=SA)", [
        "1. I find it difficult to see things from the point of view of people who disagree with me on climate issues.",
        "2. It is important to understand people who disagree with me on climate issues by imagining how things look from their perspective.",
        "3. Even if I don’t agree with them, I understand people have good reasons for voting for candidates who disagree with me on climate issues.",
        "4. I respect the opinions of people who disagree with me on climate issues."
    ]))

    # CCP1_2
    post_survey_map['CCP1_2'] = "CCP1_2: Do you support or oppose the U.S. withdrawal from the Paris Agreement? (1=Strongly Oppose, 7=Strongly Support)"

    # CCP2_2 (8 items), CCP3_2 (12 items), CCP4_2 (9 items): same wording as the pre-survey
    post_survey_map.update(_grid('CCP2_2', "CCP2_2 (1=SD, 7=SA)", ccp2_items))
    post_survey_map.update(_grid('CCP3_2', "CCP3_2 (1=SD, 7=SA)", ccp3_items))
    post_survey_map.update(_grid('CCP4_2', "CCP4_2 (1=SD, 7=SA)", ccp4_items))

    return pre_survey_map, post_survey_map

# --- Get the maps and create a full list of simulated variables ---
pre_survey_q_map, post_survey_q_map = get_question_maps()

all_simulated_vars = list(pre_survey_q_map.keys()) + list(post_survey_q_map.keys())

# Add the simulated-variable columns to the DataFrame, initialized with NA,
# so they exist to be filled later.
# They are attached in ONE concat instead of one `df[col] = ...` insert per
# column: inserting this many columns one at a time fragments the frame and
# makes pandas emit a PerformanceWarning.
missing_cols = [col for col in all_simulated_vars if col not in df.columns]
if missing_cols:
    # A scalar pd.NA fill yields object-dtype columns, the same dtype the
    # per-column assignment produced.
    na_block = pd.DataFrame(pd.NA, index=df.index, columns=missing_cols)
    df = pd.concat([df, na_block], axis=1)

print(f"Defined {len(pre_survey_q_map)} pre-survey variables to simulate.")
print(f"Defined {len(post_survey_q_map)} post-survey variables to simulate.")
print(f"Total simulated variables per participant: {len(all_simulated_vars)}")

Block 4: Define Simulation Function (with Persona Decoding)

In [None]:
# --- ADDED: Decoding Maps for Human-Answered Variables ---
# These maps will translate numeric codes from the CSV into
# human-readable text for the AI's persona.
# NOTE(review): the code-to-label assignments below are presumably taken from
# the survey codebook — verify against it before relying on them.

DEM1_MAP = {
    1: "Female",
    2: "Male",
    3: "Non-binary / third gender",
    4: "Prefer not to say",
    5: "Other (Specify)" # Assuming 'Other' is coded as 5
}

# DEM2.1 is handled as an age in years by format_persona, so it has no
# code-to-label entries.
DEM2_MAP = {}

DEM3_MAP = {
    1: "8th grade or less",
    2: "Some high school, no diploma",
    3: "High school graduate or GED",
    4: "Some college, no degree",
    5: "Associate’s degree",
    6: "Bachelor’s degree",
    7: "Graduate or professional degree"
}

DEM4_MAP = {
    1: "Under $10,000",
    2: "$10,000 to $14,999",
    3: "$15,000 to $24,999",
    4: "$25,000 to $34,999",
    5: "$35,000 to $49,999",
    6: "$50,000 to $74,999",
    7: "$75,000 to $99,999",
    8: "$100,000 to $124,999",
    # Upper bound corrected from "$149,000": every other bracket ends in ,999
    # and the next one starts at $150,000, so the old label left a gap.
    9: "$125,000 to $149,999",
    10: "$150,000 to $199,999",
    11: "$200,000 or more"
}

DEM5_MAP = {
    1: "Caucasian/White",
    2: "African American/Black",
    3: "Hispanic/Latino",
    4: "Asian",
    5: "American Indian/Alaskan Native",
    6: "Native Hawaiian/Pacific Islander",
    7: "Other"
}

DEM7_MAP = {
    1: "Very liberal",
    2: "Liberal",
    3: "Somewhat liberal",
    4: "Moderate",
    5: "Somewhat conservative",
    6: "Conservative",
    7: "Very conservative"
}

DEM8_MAP = {
    1: "Democrat",
    2: "Republican",
    3: "Independent",
    4: "Something else"
}

VOT2_MAP = {
    1: "The Democratic candidate (Kamala Harris)",
    2: "The Republican candidate (Donald Trump)",
    3: "Another candidate"
}

# Only the end points and the midpoint of the 7-point scale carry a verbal label
CCP1_1_MAP = {
    1: "1 (Strongly Oppose)",
    2: "2",
    3: "3",
    4: "4 (Neutral)",
    5: "5",
    6: "6",
    7: "7 (Strongly Support)"
}

# --- This dictionary maps variable names to their decoding map ---
DECODING_MASTER_MAP = {
    'DEM1.1': DEM1_MAP,
    'DEM2.1': DEM2_MAP,
    'DEM3.1': DEM3_MAP,
    'DEM4.1': DEM4_MAP,
    'DEM5.1': DEM5_MAP,
    'DEM7.1': DEM7_MAP,
    'DEM8.1': DEM8_MAP,
    'VOT2.1': VOT2_MAP,
    'CCP1_1.1': CCP1_1_MAP
}

def format_persona(row, human_vars):
    """
    Formats the human-answered data into a descriptive string for the prompt,
    using the decoding maps.

    Args:
        row: One participant's record; must support ``var in row`` and
            ``row[var]`` (e.g. a pandas Series or a dict).
        human_vars: Names of the human-answered variables to describe.

    Returns:
        str: A header line followed by one ``- <var>: <label>`` line per
        variable that is present in ``row`` and has a non-missing value.

    NOTE(review): a later cell in this notebook defines another function
    named ``format_persona``; whichever cell ran last is the one in effect.
    """
    persona = ["Here is the information about the participant you are simulating:"]

    for var in human_vars:
        if var not in row:
            continue
        value = row[var]

        # Leave missing answers out instead of rendering them as
        # "nan years old" / "Unknown code: nan".
        if pd.isna(value) or str(value).strip() == "":
            continue

        # A CSV column that contains NaN is read as float, so a code of 2
        # arrives as 2.0; convert whole-number floats back to int so the text
        # reads "34 years old" rather than "34.0 years old".
        if isinstance(value, float) and value.is_integer():
            value = int(value)

        # If variable has a decoding map
        if var in DECODING_MASTER_MAP:
            # Special handling for DEM2.1 (Age)
            if var == 'DEM2.1':
                label = f"{value} years old"
            else:
                label = DECODING_MASTER_MAP[var].get(value, f"Unknown code: {value}")
            persona.append(f"- {var}: {label}")

        # Special handling for CCP1_1 (if it appears without .1)
        elif var == 'CCP1_1':
            label = CCP1_1_MAP.get(value, f"{value}")
            persona.append(f"- {var} (Support U.S. withdrawal from Paris Agreement): {label}")

        # For variables with no map
        else:
            persona.append(f"- {var}: {value}")

    return "\n".join(persona)

In [None]:
# --- Helper Functions (Your provided structure) ---

def format_persona(row, human_vars, decoding_maps=None):
    """
    Formats the human-answered data into a string for the prompt.
    Skips any variables that have missing (NaN/None) data for this row.
    (This implements the "pairwise" data handling)

    Numeric codes are translated into their text labels when a decoding map
    is available for the variable, so the model sees e.g. "Male" rather than
    "2". Without that, this definition (which replaces the earlier
    ``format_persona`` once the cell runs) would feed raw codes to the model.

    Args:
        row: One participant's record; must support ``row.get(var)``
            (e.g. a pandas Series or a dict).
        human_vars: Names of the human-answered variables to describe.
        decoding_maps: Optional ``{variable: {code: label}}`` dictionary.
            Defaults to the notebook-level ``DECODING_MASTER_MAP``.

    Returns:
        str: A header line, one ``- <var>: <label>`` line per answered
        variable (or a "no data" line), and the CCP1_1 scale note.
    """
    if decoding_maps is None:
        decoding_maps = DECODING_MASTER_MAP

    persona = ["Here is the information about the participant you are simulating:"]
    has_data = False
    for var in human_vars:
        value = row.get(var)

        # Only describe the variable if it's not NaN/None and not an empty string
        if pd.isna(value) or str(value).strip() == "":
            continue

        # A CSV column that contains NaN is read as float, so a code of 2
        # arrives as 2.0; convert whole-number floats back to int for display.
        if isinstance(value, float) and value.is_integer():
            value = int(value)

        if var == 'DEM2.1':
            # DEM2.1 is the participant's age, not a coded category
            label = f"{value} years old"
        elif var in decoding_maps:
            # Codes without a label fall back to the raw value
            label = decoding_maps[var].get(value, value)
        else:
            label = value

        persona.append(f"- {var}: {label}")
        has_data = True

    if not has_data:
        persona.append("- No pre-survey data was provided for this participant.")

    # Add context for the key variable, as in your original function
    persona.append("  (Note for CCP1_1: 1=Strongly Oppose, 7=Strongly Support U.S. withdrawal from Paris Agreement)")
    return "\n".join(persona)

def format_questions(question_map):
    """Render a question map as one '- <variable>: <question text>' bullet per line.

    Bullets follow the dictionary's iteration order; an empty map gives an
    empty string.
    """
    bullet_lines = []
    for var_name, question_text in question_map.items():
        bullet_lines.append("- {}: {}".format(var_name, question_text))
    return "\n".join(bullet_lines)

def clean_json_response(text):
    """
    Pull the JSON portion out of a model reply.

    Returns the span from the first '{' to the last '}' when such a span
    exists. Otherwise returns the stripped content of a ```json fenced block,
    and failing that prints a warning and returns ``text`` unchanged.
    """
    first_brace = text.find('{')
    last_brace = text.rfind('}')
    # find()/rfind() give -1 when the character is absent, so this single
    # comparison also covers the "not found" cases.
    if 0 <= first_brace < last_brace:
        return text[first_brace:last_brace + 1]

    # Fallback for triple-backtick markdown
    fenced = re.search(r'```json\n(.*?)\n```', text, re.DOTALL)
    if fenced is not None:
        return fenced.group(1).strip()

    print("Warning: Clean-up function could not find valid JSON.")
    return text

def simulate_single_participant(row, client, model_name, human_vars, pre_map, post_map, treatment_var):
    """
    Build the full survey prompt for one participant, send it to the model,
    and return the parsed JSON answers.

    Args:
        row: The participant's record (indexed by column name).
        client: Object exposing ``chat.completions.create(...)``.
        model_name: Model identifier passed to the API call.
        human_vars: Human-answered variables used to describe the persona.
        pre_map: ``{variable: question text}`` for the pre-survey.
        post_map: ``{variable: question text}`` for the post-survey.
        treatment_var: Column holding the article text.

    Returns:
        The object decoded from the model's JSON reply, or ``None`` when the
        API call fails or the reply cannot be parsed even after clean-up.
    """

    # Prompt ingredients, in the order they appear in the prompt
    persona_block = format_persona(row, human_vars)
    pre_questions_block = format_questions(pre_map)

    article_text = row[treatment_var]
    if pd.isna(article_text):
        # Missing article: tell the model so explicitly
        article_text = "No article text was provided."

    post_questions_block = format_questions(post_map)

    # Every key the returned JSON object is expected to contain
    expected_keys = [*pre_map.keys(), *post_map.keys()]

    prompt = f"""
    You are a survey participant simulator. Your task is to realistically complete a survey experiment based on a given persona. Please follow all steps.

    ---
    **PART 1: YOUR PERSONA**
    You are simulating a person with the following characteristics and views:
    {persona_block}

    ---
    **PART 2: PRE-SURVEY**
    Based *only* on the persona from Part 1, please provide the most plausible answers for the following questions.

    {pre_questions_block}

    ---
    **PART 3: TREATMENT MATERIAL**
    Now, please carefully read the following AI-generated news article ("CYON news"). Read it from the perspective of your persona.

    **Article:**
    {article_text}

    ---
    **PART 4: POST-SURVEY**
    Having just read the article in Part 3, and keeping your persona (Part 1) and pre-survey answers (Part 2) in mind, please provide the most plausible answers for the following questions. Your answers to the climate change questions (CCP_2) may or may not change based on the article.

    {post_questions_block}

    ---
    **INSTRUCTIONS**
    Return *only* a single, valid JSON object. The keys of the object must be the variable names (e.g., "CCP2_1_1", "SA", "PAPC1_1") and the values must be the single integer or float response. Do not include any other text, explanations, or markdown.

    The JSON object must contain exactly these {len(expected_keys)} keys:
    {expected_keys}
    """

    request_kwargs = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
        "top_p": 1.0,
        "max_tokens": 8192,
        "response_format": {"type": "json_object"},  # Force JSON output
    }

    raw_reply = ""  # stays empty if the API call itself fails
    try:
        completion = client.chat.completions.create(**request_kwargs)
        raw_reply = completion.choices[0].message.content
        return json.loads(raw_reply)
    except json.JSONDecodeError:
        # Not swallowed: the clean-up fallback below handles this case
        pass
    except Exception as e:
        print(f"An unexpected error occurred during API call or parsing: {e}")
        return None

    # Fallback: the reply was not valid JSON as-is, so try to salvage it
    print("Warning: Model response was not valid JSON. Attempting to clean...")
    cleaned_text = clean_json_response(raw_reply)
    try:
        return json.loads(cleaned_text)
    except json.JSONDecodeError as e:
        # The cleaned text is logged because it is easier to debug
        print(f"Error: Could not parse JSON even after cleaning. Cleaned Response: {cleaned_text}. Error: {e}")
        return None  # Will be handled in the main loop

print("All simulation functions defined.")

Block 5: Run the Simulation Loop
This block is modified to initialize the OpenAI() client and pass your specific MODEL_NAME to the simulation function.

In [None]:
# --- Main Simulation Loop (with Error Tracking) ---

# Both accumulators are created up front so the save cells can always read
# them. They used to be created only once the client existed, so a failed
# client initialization left `results` undefined and the next cell raised
# a NameError.
results = []
# --- List to track skipped cases ---
skipped_cases = []

if 'df' in locals() and not df.empty:

    # --- Define your specific fine-tuned model ---
    MODEL_NAME = "ft:gpt-4o-2024-08-06:personal::A8vV3mNd"

    # --- Initialize the Model ---
    try:
        # An empty string counts as "not set": the key cell assigns '' as a
        # placeholder, and a blank key would only fail later, once per row.
        if not os.environ.get("OPENAI_API_KEY", "").strip():
            raise EnvironmentError("OPENAI_API_KEY environment variable not set.")

        client = OpenAI()
        print(f"OpenAI client initialized. Using model: {MODEL_NAME}")

    except Exception as e:
        print(f"FATAL ERROR: Could not initialize OpenAI client: {e}")
        client = None

    if client:
        print(f"Starting simulation for {len(df)} participants...")

        # Use tqdm for a progress bar
        for index, row in tqdm(df.iterrows(), total=len(df), desc="Simulating Participants"):
            try:
                # Call your simulation function
                sim_data = simulate_single_participant(
                    row,
                    client,
                    MODEL_NAME,
                    pre_survey_human_vars,
                    pre_survey_q_map,
                    post_survey_q_map,
                    treatment_var
                )

                if sim_data:
                    # Remember which row of `df` these answers belong to
                    sim_data['index_col'] = index
                    results.append(sim_data)
                else:
                    # Log cases where the simulation function returned None
                    print(f"Skipping row {index} due to simulation error (function returned None).")
                    skipped_cases.append({'index': index, 'error': 'Simulation function returned None.'})

            except Exception as e:
                # Deliberately best-effort: one failing row is logged and the
                # loop moves on to the next participant.
                print(f"An unexpected error occurred while processing row {index}: {e}")
                skipped_cases.append({'index': index, 'error': str(e)})

        print("\n" + "="*30)
        print("Simulation loop complete.")
        print(f"Successfully generated responses for {len(results)}/{len(df)} participants.")
        print("="*30)

        # --- Final Report of Skipped Cases ---
        if skipped_cases:
            print(f"\n--- Skipped Case Report ({len(skipped_cases)} total) ---")
            for case in skipped_cases:
                print(f"  - Index: {case['index']}, Error: {case['error']}")
        else:
            print("\n--- All participants simulated successfully! ---")

else:
    print("DataFrame 'df' is empty or not loaded. Skipping simulation loop.")

In [None]:
# Merge the simulated answers back into the original data and write the CSV.
if not results:
    # Nothing to merge: say why
    if 'df' in locals() and df.empty:
        print("Simulation was skipped because the initial data file was not loaded.")
    else:
        print("No results were generated from the simulation. Nothing to save.")
else:
    # One row per simulated participant, indexed by the saved 'index_col' so
    # the rows line up with the original `df`
    sim_df = pd.DataFrame(results).set_index('index_col')

    # Work on a copy so the original dataframe stays untouched
    final_df = df.copy()

    # .update() fills in the simulated values, overwriting the NA
    # placeholders created in Block 3
    final_df.update(sim_df)

    # --- Save the final DataFrame to a new CSV file ---
    try:
        final_df.to_csv(OUTPUT_PATH, index=False)
        print(f"\nSuccessfully saved completed dataset to: {OUTPUT_PATH}")
        print("Final data shape:", final_df.shape)

        # Show the head of a few simulated columns as a sanity check
        print("\nVerification (head of first 5 simulated columns):")
        preview_cols = all_simulated_vars[:5]
        print(final_df[preview_cols].head())

    except Exception as e:
        print(f"\nAn error occurred while saving the file: {e}")
        print("Please check file permissions and the path.")

The cell below repeats the save step and additionally writes a report of the skipped cases.

In [None]:
# Merge the simulated answers into a copy of the original data and save it.
if not results:
    if 'df' in locals() and df.empty:
        print("Simulation was skipped because the initial data file was not loaded.")
    else:
        print("No results were generated from the simulation. Nothing to save to OUTPUT_PATH.")
else:
    # Index the simulated rows by the saved 'index_col' so they align with `df`
    sim_df = pd.DataFrame(results).set_index('index_col')

    # The original dataframe is left untouched; results go into a copy
    final_df = df.copy()

    # Overwrite the NA placeholders created in Block 3 with simulated values
    final_df.update(sim_df)

    # --- Save the final DataFrame to a new CSV file ---
    try:
        final_df.to_csv(OUTPUT_PATH, index=False)
        print(f"\nSuccessfully saved completed dataset to: {OUTPUT_PATH}")
        print("Final data shape:", final_df.shape)

        # Show the head of a few simulated columns as a sanity check
        print("\nVerification (head of first 5 simulated columns):")
        # Assumes 'all_simulated_vars' is defined from a previous block
        preview_cols = all_simulated_vars[:5]
        print(final_df[preview_cols].head())

    except Exception as e:
        print(f"\nAn error occurred while saving the file: {e}")
        print("Please check file permissions and the path.")

# --- Save skipped cases report ---
# Runs regardless of `results`, so the skipped-case log is written even when
# every simulation failed.

# --- Define the specific path for the skipped cases file ---
SKIPPED_CASES_PATH = "/content/drive/MyDrive/CYON_Analysis_Materials/skipped_cases.csv"

if 'skipped_cases' in locals():
    if skipped_cases:
        print(f"\nSaving {len(skipped_cases)} skipped cases to '{SKIPPED_CASES_PATH}'...")
        try:
            skipped_df = pd.DataFrame(skipped_cases)
            skipped_df.to_csv(SKIPPED_CASES_PATH, index=False)
            print(f"Successfully saved '{SKIPPED_CASES_PATH}'.")
        except Exception as e:
            print(f"An error occurred while saving '{SKIPPED_CASES_PATH}': {e}")
    else:
        # The simulation ran, but no cases were skipped
        print(f"\nNo cases were skipped during the simulation. '{SKIPPED_CASES_PATH}' not created.")

I want to retry the skipped cases. Load "/content/drive/MyDrive/CYON_Analysis_Materials/simulated_responses.csv"; for every row that has no value for the variable "CCP2_1_1", run all of the steps again (persona simulation -> pre-survey -> post-survey) and merge the results into "simulated_responses_re.csv", so that the new file contains the simulated survey data for all participants.

Block 2 (New): Load Data and Identify Skipped Cases
This block replaces your old Block 2. It loads the file you already created and splits it into two parts: df_complete (good rows) and df_retry (skipped rows).

In [None]:
# --- Configuration for Retry ---
# Input file is the one you already created
INPUT_PATH_RETRY = "/content/drive/MyDrive/CYON_Analysis_Materials/simulated_responses.csv"

# New output file as requested
OUTPUT_PATH_RETRY = "/content/drive/MyDrive/CYON_Analysis_Materials/simulated_responses_re.csv"

# --- Load Existing Data ---
try:
    df_full = pd.read_csv(INPUT_PATH_RETRY)
    print(f"Successfully loaded {INPUT_PATH_RETRY}.")
    print(f"Total cases in file: {len(df_full)}")
except FileNotFoundError:
    print(f"ERROR: File not found at {INPUT_PATH_RETRY}.")
    print("Please check the file path. Cannot proceed.")
    df_full = pd.DataFrame()
except Exception as e:
    print(f"An error occurred while loading the file: {e}")
    df_full = pd.DataFrame()

# Both frames always exist, even when nothing could be loaded, so later cells
# can test `df_retry.empty` instead of hitting a NameError.
df_retry = pd.DataFrame()
df_complete = pd.DataFrame()

if df_full.empty:
    print("DataFrame is empty. Halting operation.")
elif 'CCP2_1_1' not in df_full.columns:
    # Without the marker column, skipped rows cannot be told apart
    print(f"ERROR: Column 'CCP2_1_1' not found in {INPUT_PATH_RETRY}. Cannot identify skipped cases.")
else:
    # --- Identify Skipped vs. Complete Cases ---

    # If the original index wasn't saved, the current row position is kept
    # as an 'index' column. This is crucial for re-combining later.
    needs_index_col = 'index' not in df_full.columns
    if needs_index_col:
        df_full = df_full.reset_index()

    # The condition for a "skipped" case is that 'CCP2_1_1' is null/NA
    skipped_mask = df_full['CCP2_1_1'].isna()

    # Split once, after the 'index' column is in place
    df_retry = df_full[skipped_mask].copy()
    df_complete = df_full[~skipped_mask].copy()

    print(f"Found {len(df_complete)} already completed cases.")
    print(f"Found {len(df_retry)} skipped cases to retry.")

    if needs_index_col:
        print("Using DataFrame index for merging.")

Re-define the persona variables, the treatment variable, and the question maps used by the retry run.

In [None]:
# Columns already answered by the human participants; together they form the
# "persona" handed to the model.
pre_survey_human_vars = [
    'DEM1.1',
    'DEM2.1',
    'DEM3.1',
    'DEM4.1',
    'DEM5.1',
    'DEM7.1',
    'DEM8.1',
    'VOT2.1',
    'CCP1_1.1',
]

# Column holding the treatment text shown to the participant
treatment_var = 'ed_generatedBody'

def get_question_maps():
    """
    Build the question-text lookup tables used in the simulation prompt.

    Returns:
        tuple: ``(pre_survey_map, post_survey_map)``. Each dictionary maps a
        variable name (e.g. 'CCP2_1_1', 'SA') to one string made of the scale
        label, its response anchors and the item wording. Entries keep the
        order in which the scales are registered below.
    """

    def _register(target, prefix, heading, statements):
        # Pair every statement with its generated variable name and store
        # "<heading>: <statement>" under that name.
        names = generate_var_list(prefix, len(statements))
        for name, statement in zip(names, statements):
            target[name] = f"{heading}: {statement}"

    # Item wordings that are asked both before (…_1) and after (…_2) the treatment.
    ccp2_items = (
        "1. I believe that climate change is real.",
        "2. Climate change is NOT occurring.",
        "3. Human activities are a major cause of climate change.",
        "4. Climate change is mostly caused by human activity.",
        "5. The main causes of climate change are human activities.",
        "6. Overall, climate change will bring more negative than positive consequences to the world.",
        "7. Climate change will bring about serious negative consequences.",
        "8. The consequences of climate change will be very serious.",
    )
    ccp3_items = (
        "1. I think climate change is a serious problem",
        "2. I believe that most of the concerns about climate change have been exaggerated",
        "3. I am concerned about the consequences of climate change",
        "4. I am hesitant to believe climate change scientists tell the whole story",
        "5. I believe that most claims about climate change are true",
        "6. I am not sure that climate change is actually occurring",
        "7. The climate change we are observing is just a natural process",
        "8. Humans are largely responsible for global warming",
        "9. I doubt that human activities cause global warming",
        "10. There is not much we can do that will help solve environmental problems",
        "11. Trying to solve environmental problems is a waste of time",
        "12. Human behavior has little effect on global warming",
    )
    ccp4_items = (
        "1. I support government subsidies for renewable energy sources like solar and wind power.",
        "2. Investing in renewable energy should be a priority for our country.",
        "3. I am in favor of strict regulations to limit carbon emissions from factories and vehicles.",
        "4. I am willing to pay more for products that are environmentally friendly.",
        "5. I support local initiatives to reduce waste and promote recycling.",
        "6. Our country should adhere to international agreements aimed at reducing climate change.",
        "7. It is important for our government to participate in global efforts to combat climate change.",
        "8. The government should provide financial assistance to communities affected by climate change.",
        "9. Policies that protect the environment are worth the economic cost.",
    )

    # --- 1. Pre-survey questions to be simulated ---
    pre_survey_map = {}

    _register(pre_survey_map, 'CCP2_1', "CCP2_1 (1=SD, 7=SA)", ccp2_items)      # 8 items
    _register(pre_survey_map, 'CCP3_1', "CCP3_1 (1=SD, 7=SA)", ccp3_items)      # 12 items
    _register(pre_survey_map, 'CCP4_1', "CCP4_1 (1=SD, 7=SA)", ccp4_items)      # 9 items

    # PMA1 (3 items)
    _register(pre_survey_map, 'PMA1', "PMA1 (1=Definitely no, 7=Definitely yes)", (
        "1. Are you comfortable with using AI tools?",
        "2. Are you confident using AI tools?",
        "3. Are you proficient in using AI tools?",
    ))

    # AICOP (13 items)
    _register(pre_survey_map, 'AICOP', "AICOP (1=Not at All, 7=Greatest Extent)", (
        "1. AI assists me tackle a given task effectively.",
        "2. AI offers me collaborative results when I'm working alone.",
        "3. AI offers helpful examples that I can utilize to solve my problems.",
        "4. AI allows me to avoid wasting mental efforts on tedious tasks.",
        "5. AI makes a given task seem less complicated.",
        "6. AI’s solutions are easily comprehensible.",
        "7. AI’s solutions are readily applicable to tasks I’m working on.",
        "8. AI guides me to understand essential elements of tasks.",
        "9. AI improves my understanding of tasks’ procedures.",
        "10. AI provides me assistance by defining unfamiliar terms.",
        "11. AI helps me understand complex information.",
        "12. AI guides me through the simpler parts of the tasks first.",
        "13. AI provides summaries when I’m overwhelmed with information.",
    ))

    # PAA (4 items)
    _register(pre_survey_map, 'PAA', "PAA (1=SD, 7=SA)", (
        "1. AI is precise.",
        "2. AI is error free.",
        "3. AI is accurate.",
        "4. AI is objective.",
    ))

    # PAH (4 items)
    _register(pre_survey_map, 'PAH', "PAH (1=SD, 7=SA)", (
        "1. AI can express human emotion.",
        "2. AI can make human-like subjective judgements.",
        "3. AI can provide contextual background.",
        "4. AI can show human intuition.",
    ))

    # PTA (7 items)
    _register(pre_survey_map, 'PTA', "PTA (1=Completely untrue, 7=Completely true)", (
        "1. AI is competent.",
        "2. AI can satisfy its users.",
        "3. One can expect good advice from AI.",
        "4. AI puts users’ interests first.",
        "5. If problems arise, one can expect to be treated fairly by AI.",
        "6. AI operates carefully and thoroughly.",
        "7. You can believe the statements produced by AI.",
    ))

    # NFC (6 items)
    _register(pre_survey_map, 'NFC', "NFC (1=Not characteristic, 7=Extremely characteristic)", (
        "1. I would prefer complex to simple problems.",
        "2. I like to have the responsibility of handling a situation that requires a lot of thinking.",
        "3. Thinking is not my idea of fun.",
        "4. I would rather do something that requires little thought than something that is sure to challenge my thinking abilities.",
        "5. I really enjoy a task that involves coming up with new solutions to problems.",
        "6. I would prefer a task that is intellectual, difficult, and important to one that is somewhat important but does not require much thought.",
    ))

    # MT (5 items)
    _register(pre_survey_map, 'MT', "MT (News media are...; 1=SD, 7=SA)", (
        "1. Fair",
        "2. Unbiased",
        "3. Tell the whole story",
        "4. Accurate",
        "5. Separate fact and opinion in their news coverage",
    ))

    # CT (3 items)
    _register(pre_survey_map, 'CT', "CT (Corporations...; 1=SD, 7=SA)", (
        "1. Corporations in the United States are truthful to us",
        "2. Corporations in the United States treat people like me justly and fairly.",
        "3. Corporations in the United States keep their promises.",
    ))

    # PT (3 items)
    _register(pre_survey_map, 'PT', "PT (Politicians...; 1=SD, 7=SA)", (
        "1. I trust politicians to tell the truth",
        "2. I trust politicians to deal with the things that matter",
        "3. I trust the government to do what is right",
    ))

    # GT (3 items)
    _register(pre_survey_map, 'GT', "GT (US Gov't...; 1=SD, 7=SA)", (
        "1. The United States government is truthful to us",
        "2. The United States government treats people like me justly and fairly",
        "3. The United States government keeps its promises",
    ))

    # PEF1 (8 items)
    _register(pre_survey_map, 'PEF1', "PEF1 (1=SD, 7=SA)", (
        "1. I consider myself to be well qualified to participate in politics.",
        "2. I think that I am better informed about politics than most people.",
        "3. Other people seem to have an easier time understanding complicated issues than I do.",
        "4. I feel that I have a pretty good understanding of the important political issues facing our country.",
        "5. Voting gives people an effective way to influence what the government does.",
        "6. I can make a difference if I participate in the election process.",
        "7. My vote makes a difference.",
        "8. I have a real say in what the government does.",
    ))

    # PI1 (4 items)
    _register(pre_survey_map, 'PI1', "PI1 (1=Not interested, 7=Extremely interested)", (
        "1. Politics",
        "2. Election Campaigns",
        "3. Social issues",
        "4. News",
    ))

    # --- 2. Post-survey questions to be simulated ---
    post_survey_map = {}

    # SA (single item)
    post_survey_map['SA'] = "SA: How the CYON news described the Trump Admin’s climate policy. (-2=Very Neg, 2=Very Pos)"

    # PAPC1 (6 items)
    _register(post_survey_map, 'PAPC1', "PAPC1 (1=Not at All, 7=Very Much)", (
        "1. How fair do you think CYON is in generating political news?",
        "2. How politically biased do you think CYON is in prompting political discussions online...?",
        "3. How much trust would you have in the decision of CYON to delete the kind of content that needs to be deleted?",
        "4. How much do you agree with the statement, “The decision of CYON to delete problematic content is legitimate.”?",
        "5. How fair do you think CYON is to users when providing political news?",
        "6. How much do you agree with the statement, “CYON’s way of organizing political content is legitimate”?",
    ))

    # PAPC2 (4 items)
    _register(post_survey_map, 'PAPC2', "PAPC2 (1=SD, 7=SA)", (
        "1. In CYON news, news and information is being wrongly removed.",
        "2. In CYON news, political viewpoints are being censored.",
        "3. CYON will make easy to find trustworthy information",
        "4. CYON will allow online users to have more meaningful conversations",
    ))

    # PAPC3 (4 items)
    _register(post_survey_map, 'PAPC3', "PAPC3 (Scales vary)", (
        "1. I felt my view was understood by CYON. (1-7)",
        "2. I took viewpoints reflected on the CYON news with respect. (1-7)",
        "3. CYON was disrespectful to my viewpoint. (1-7)",
        "4. I was able to see my values and beliefs from the CYON news. (1-7)",
    ))

    # PAPC3a (single item)
    post_survey_map['PAPC3a'] = "PAPC3a: How would you grade the CYON news you just read (0=Worst, 10=Best)."

    # DR (4 items)
    _register(post_survey_map, 'DR', "DR (1=SD, 7=SA)", (
        "1. I find it difficult to see things from the point of view of people who disagree with me on climate issues.",
        "2. It is important to understand people who disagree with me on climate issues by imagining how things look from their perspective.",
        "3. Even if I don’t agree with them, I understand people have good reasons for voting for candidates who disagree with me on climate issues.",
        "4. I respect the opinions of people who disagree with me on climate issues.",
    ))

    # CCP1_2 (single item)
    post_survey_map['CCP1_2'] = "CCP1_2: Do you support or oppose the U.S. withdrawal from the Paris Agreement? (1=Strongly Oppose, 7=Strongly Support)"

    # Post-treatment repeats of the three climate scales.
    _register(post_survey_map, 'CCP2_2', "CCP2_2 (1=SD, 7=SA)", ccp2_items)     # 8 items
    _register(post_survey_map, 'CCP3_2', "CCP3_2 (1=SD, 7=SA)", ccp3_items)     # 12 items
    _register(post_survey_map, 'CCP4_2', "CCP4_2 (1=SD, 7=SA)", ccp4_items)     # 9 items

    return pre_survey_map, post_survey_map

# --- Get the maps and create a full list of simulated variables ---
pre_survey_q_map, post_survey_q_map = get_question_maps()

all_simulated_vars = list(pre_survey_q_map.keys()) + list(post_survey_q_map.keys())

# Make sure every simulated variable exists as a column, initialised with NA.
# The columns must be added to the frame that is actually being retried (df_full),
# and to the df_retry / df_complete copies that were split off earlier:
# DataFrame.update() only fills columns that already exist, so a column missing
# from df_retry would silently drop the newly simulated answers.
frames_to_prepare = [df_full]
frames_to_prepare += [
    globals()[frame_name]
    for frame_name in ("df_retry", "df_complete")
    if frame_name in globals()
]
for frame in frames_to_prepare:
    for col in all_simulated_vars:
        if col not in frame.columns:
            frame[col] = pd.NA

print(f"Defined {len(pre_survey_q_map)} pre-survey variables to simulate.")
print(f"Defined {len(post_survey_q_map)} post-survey variables to simulate.")
print(f"Total simulated variables per participant: {len(all_simulated_vars)}")

In [None]:
# --- Decoding maps for the human-answered variables ---
# Each map turns the numeric code stored in the CSV into the label text that is
# shown in the persona. Codes are consecutive integers starting at 1, so every
# map is built by enumerating its labels in code order.

DEM1_MAP = dict(enumerate([
    "Female",
    "Male",
    "Non-binary / third gender",
    "Prefer not to say",
    "Other (Specify)",  # Assuming 'Other' is coded as 5
], start=1))

# DEM2.1 has no code table; format_persona renders it as "<value> years old".
DEM2_MAP = {}

DEM3_MAP = dict(enumerate([
    "8th grade or less",
    "Some high school, no diploma",
    "High school graduate or GED",
    "Some college, no degree",
    "Associate’s degree",
    "Bachelor’s degree",
    "Graduate or professional degree",
], start=1))

# NOTE(review): bracket 9 ends at $149,000 while every other bracket ends in
# ...,999 -- confirm against the questionnaire before changing the label.
DEM4_MAP = dict(enumerate([
    "Under $10,000",
    "$10,000 to $14,999",
    "$15,000 to $24,999",
    "$25,000 to $34,999",
    "$35,000 to $49,999",
    "$50,000 to $74,999",
    "$75,000 to $99,999",
    "$100,000 to $124,999",
    "$125,000 to $149,000",
    "$150,000 to $199,999",
    "$200,000 or more",
], start=1))

DEM5_MAP = dict(enumerate([
    "Caucasian/White",
    "African American/Black",
    "Hispanic/Latino",
    "Asian",
    "American Indian/Alaskan Native",
    "Native Hawaiian/Pacific Islander",
    "Other",
], start=1))

DEM7_MAP = dict(enumerate([
    "Very liberal",
    "Liberal",
    "Somewhat liberal",
    "Moderate",
    "Somewhat conservative",
    "Conservative",
    "Very conservative",
], start=1))

DEM8_MAP = dict(enumerate([
    "Democrat",
    "Republican",
    "Independent",
    "Something else",
], start=1))

VOT2_MAP = dict(enumerate([
    "The Democratic candidate (Kamala Harris)",
    "The Republican candidate (Donald Trump)",
    "Another candidate",
], start=1))

CCP1_1_MAP = dict(enumerate([
    "1 (Strongly Oppose)",
    "2",
    "3",
    "4 (Neutral)",
    "5",
    "6",
    "7 (Strongly Support)",
], start=1))

# --- Lookup from variable name to the map that decodes it ---
DECODING_MASTER_MAP = dict([
    ('DEM1.1', DEM1_MAP),
    ('DEM2.1', DEM2_MAP),
    ('DEM3.1', DEM3_MAP),
    ('DEM4.1', DEM4_MAP),
    ('DEM5.1', DEM5_MAP),
    ('DEM7.1', DEM7_MAP),
    ('DEM8.1', DEM8_MAP),
    ('VOT2.1', VOT2_MAP),
    ('CCP1_1.1', CCP1_1_MAP),
])

def format_persona(row, human_vars):
    """
    Formats the human-answered data into a descriptive string for the prompt,
    using the decoding maps.

    NOTE(review): a later cell defines another ``format_persona``; when the
    notebook is run top to bottom that later definition replaces this one.

    Args:
        row: Mapping-like record for one participant (supports ``var in row``
            and ``row[var]``, e.g. a pandas Series or a dict).
        human_vars: Iterable of variable names to include, in output order.

    Returns:
        str: A header line followed by one "- <var>: <label>" line for every
        listed variable that is present in ``row`` and has a usable value.
    """
    persona = ["Here is the information about the participant you are simulating:"]

    for var in human_vars:
        if var not in row:
            continue
        value = row[var]

        # Unanswered items (NaN/None/blank) carry no information; skipping them
        # avoids lines such as "nan years old" or "Unknown code: nan".
        if pd.isna(value) or str(value).strip() == "":
            continue

        # Columns that contain blanks are read from CSV as floats; show whole
        # numbers as integers ("25 years old" rather than "25.0 years old").
        if isinstance(value, float) and value.is_integer():
            value = int(value)

        # If variable has a decoding map
        if var in DECODING_MASTER_MAP:
            # Special handling for DEM2.1 (Age): no code table, the value is shown directly
            if var == 'DEM2.1':
                label = f"{value} years old"
            else:
                label = DECODING_MASTER_MAP[var].get(value, f"Unknown code: {value}")
            persona.append(f"- {var}: {label}")

        # Special handling for CCP1_1 (if it appears without .1)
        elif var == 'CCP1_1':
            label = CCP1_1_MAP.get(value, f"{value}")
            persona.append(f"- {var} (Support U.S. withdrawal from Paris Agreement): {label}")

        # For variables with no map
        else:
            persona.append(f"- {var}: {value}")

    return "\n".join(persona)

In [None]:
# --- Helper Functions (Your provided structure) ---

def format_persona(row, human_vars):
    """
    Build the persona section of the prompt from the human-answered variables.

    Each variable in ``human_vars`` is looked up with ``row.get``; values that
    are NaN/None or blank after stripping are left out, so every participant
    contributes whatever answers they actually have ("pairwise" handling).
    If nothing usable remains, a single placeholder line is emitted instead.
    A fixed note explaining the CCP1_1 scale is always appended.

    NOTE(review): this definition replaces the ``format_persona`` from the
    decoding-maps cell when the notebook runs in order, so answers are written
    into the prompt as raw values rather than decoded labels.
    """
    header = "Here is the information about the participant you are simulating:"
    scale_note = "  (Note for CCP1_1: 1=Strongly Oppose, 7=Strongly Support U.S. withdrawal from Paris Agreement)"

    answer_lines = []
    for var in human_vars:
        answer = row.get(var)
        # Drop missing answers and empty / whitespace-only strings.
        if pd.isna(answer) or str(answer).strip() == "":
            continue
        answer_lines.append(f"- {var}: {answer}")

    if not answer_lines:
        answer_lines.append("- No pre-survey data was provided for this participant.")

    return "\n".join([header, *answer_lines, scale_note])

def format_questions(question_map):
    """Render the question map as one "- <variable>: <question text>" line per entry, in map order."""
    bullet_lines = []
    for var_name, question_text in question_map.items():
        bullet_lines.append(f"- {var_name}: {question_text}")
    return "\n".join(bullet_lines)

def clean_json_response(text):
    """
    Pull the JSON portion out of a raw model reply.

    Strategy, in order:
      1. Return everything from the first '{' through the last '}'.
      2. Otherwise return the stripped contents of a ```json fenced block.
      3. Otherwise print a warning and return ``text`` unchanged.
    """
    open_pos = text.find('{')
    close_pos = text.rfind('}')
    # A closing brace positioned after an existing opening brace implies both were found.
    if open_pos >= 0 and close_pos > open_pos:
        return text[open_pos:close_pos + 1]

    # Fallback for triple-backtick markdown
    fenced = re.search(r'```json\n(.*?)\n```', text, re.DOTALL)
    if fenced is not None:
        return fenced.group(1).strip()

    print("Warning: Clean-up function could not find valid JSON.")
    return text

def simulate_single_participant(row, client, model_name, human_vars, pre_map, post_map, treatment_var):
    """
    Build the full survey prompt for one participant and ask the model to answer it.

    Args:
        row: The participant's record; read via ``format_persona`` and ``row[treatment_var]``.
        client: Object exposing ``chat.completions.create`` (the OpenAI client in this notebook).
        model_name: Model identifier passed to the API call.
        human_vars: Variable names used to build the persona section.
        pre_map: Variable name -> question text for the pre-survey items.
        post_map: Variable name -> question text for the post-survey items.
        treatment_var: Name of the field in ``row`` that holds the article text.

    Returns:
        The parsed JSON reply, or ``None`` when the API call fails or the reply
        cannot be parsed as JSON even after clean-up.
    """
    # Prompt ingredients: persona, both question lists, and the article text.
    persona_block = format_persona(row, human_vars)
    pre_questions_block = format_questions(pre_map)

    article_text = row[treatment_var]
    if pd.isna(article_text):
        # Keep the prompt well-formed when the participant has no article.
        article_text = "No article text was provided."

    post_questions_block = format_questions(post_map)

    # Every key the model is asked to return: pre-survey first, then post-survey.
    expected_keys = [*pre_map.keys(), *post_map.keys()]

    prompt = f"""
    You are a survey participant simulator. Your task is to realistically complete a survey experiment based on a given persona. Please follow all steps.

    ---
    **PART 1: YOUR PERSONA**
    You are simulating a person with the following characteristics and views:
    {persona_block}

    ---
    **PART 2: PRE-SURVEY**
    Based *only* on the persona from Part 1, please provide the most plausible answers for the following questions.

    {pre_questions_block}

    ---
    **PART 3: TREATMENT MATERIAL**
    Now, please carefully read the following AI-generated news article ("CYON news"). Read it from the perspective of your persona.

    **Article:**
    {article_text}

    ---
    **PART 4: POST-SURVEY**
    Having just read the article in Part 3, and keeping your persona (Part 1) and pre-survey answers (Part 2) in mind, please provide the most plausible answers for the following questions. Your answers to the climate change questions (CCP_2) may or may not change based on the article.

    {post_questions_block}

    ---
    **INSTRUCTIONS**
    Return *only* a single, valid JSON object. The keys of the object must be the variable names (e.g., "CCP2_1_1", "SA", "PAPC1_1") and the values must be the single integer or float response. Do not include any other text, explanations, or markdown.

    The JSON object must contain exactly these {len(expected_keys)} keys:
    {expected_keys}
    """

    # Defined before the try so the JSON fallback below can always read it.
    raw_reply = ""
    try:
        completion = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            top_p=1.0,
            max_tokens=8192,
            response_format={"type": "json_object"},  # ask the API for a JSON object
        )
        raw_reply = completion.choices[0].message.content
        return json.loads(raw_reply)

    except json.JSONDecodeError:
        # The reply was not parseable as-is: trim it down to the JSON part and retry once.
        print(f"Warning: Model response was not valid JSON. Attempting to clean...")
        salvaged = clean_json_response(raw_reply)
        try:
            return json.loads(salvaged)
        except json.JSONDecodeError as parse_err:
            # Report the cleaned text, which is what actually failed to parse.
            print(f"Error: Could not parse JSON even after cleaning. Cleaned Response: {salvaged}. Error: {parse_err}")
            return None  # the calling loop treats None as "skip this row"
    except Exception as err:
        print(f"An unexpected error occurred during API call or parsing: {err}")
        return None

print("All simulation functions defined.")

Block 5 (New): Run the Simulation Loop (for Skipped Cases)
This block replaces your old Block 5. It initializes the OpenAI client but only loops over the df_retry DataFrame.

In [None]:
# Block 5: re-run the simulation for the rows that were skipped in the first run.
#
# results_retry is reset on every execution so that Block 6 can never pick up
# results left over from an earlier run of this cell (for example when the
# client cannot be initialised this time).
results_retry = []

if 'df_retry' not in globals():
    print("DataFrame 'df_retry' was not created. Skipping simulation loop.")
elif df_retry.empty:
    print("No skipped cases found to retry. Proceed to Block 6 to save.")
else:
    # --- Define your specific fine-tuned model ---
    MODEL_NAME = "ft:gpt-4o-2024-08-06:personal::A8vV3mNd"

    # --- Initialize the Model ---
    try:
        # Block 1 assigns os.environ['OPENAI_API_KEY'] = '' as a placeholder, so a
        # plain membership test always passes; an empty or blank key must count
        # as "not set", otherwise every request would fail one by one in the loop.
        if not os.environ.get("OPENAI_API_KEY", "").strip():
            raise EnvironmentError("OPENAI_API_KEY environment variable not set. Please re-run Block 1.")

        client = OpenAI()
        print(f"OpenAI client initialized. Using model: {MODEL_NAME}")

    except Exception as e:
        print(f"FATAL ERROR: Could not initialize OpenAI client: {e}")
        client = None

    if client is not None:
        print(f"Starting simulation retry for {len(df_retry)} participants...")

        # Only the skipped rows are sent to the model.
        for index, row in tqdm(df_retry.iterrows(), total=len(df_retry), desc="Retrying Skipped Participants"):
            try:
                # Call the same simulation function from Block 4
                sim_data = simulate_single_participant(
                    row,
                    client,
                    MODEL_NAME,
                    pre_survey_human_vars,
                    pre_survey_q_map,
                    post_survey_q_map,
                    treatment_var
                )

                if sim_data:
                    # Remember which df_retry row this answer set belongs to;
                    # Block 6 uses 'index_col' to align the update.
                    sim_data['index_col'] = index
                    results_retry.append(sim_data)
                else:
                    print(f"Skipping row {index} again due to simulation error.")

            except Exception as e:
                # Best effort: one bad row must not abort the remaining retries.
                print(f"An error occurred while processing row {index}: {e}")

        print("\nRetry simulation loop complete.")
        print(f"Successfully generated new responses for {len(results_retry)}/{len(df_retry)} participants.")

Block 6 (New): Process, Combine, and Save Results
This block replaces your old Block 6. It takes the new results, updates the df_retry DataFrame, and then sticks it back together with df_complete before saving.

In [None]:
# Block 6: merge the newly simulated answers into the skipped rows, re-attach the
# rows that were already complete, and write everything to OUTPUT_PATH_RETRY.
if 'results_retry' in locals() and results_retry:
    # --- Process the NEW simulation results ---

    # One row per successfully retried participant, indexed by the df_retry row
    # label that Block 5 stored under 'index_col'.
    sim_df_retry = pd.DataFrame(results_retry).set_index('index_col')

    # --- Update the df_retry DataFrame ---
    # DataFrame.update aligns on row and column labels and only writes non-NA
    # values, so rows that failed again simply keep their NA entries.
    print(f"Updating {len(sim_df_retry)} rows in the 'retry' dataframe...")
    df_retry.update(sim_df_retry)

    # --- Combine and Save ---
    # Concatenate the original good rows with the newly filled retry rows and
    # sort by the original row labels to restore the file's order.
    final_df_re = pd.concat([df_complete, df_retry]).sort_index()

    # If we added an 'index' column in Block 2, drop it before saving
    final_df_re = final_df_re.drop(columns=['index'], errors='ignore')

    try:
        final_df_re.to_csv(OUTPUT_PATH_RETRY, index=False)
        print(f"\nSuccessfully saved combined dataset to: {OUTPUT_PATH_RETRY}")
        print(f"Final data shape: {final_df_re.shape}")

        # Verify that the 'CCP2_1_1' column now has fewer (or zero) NAs
        print(f"Total NAs in 'CCP2_1_1' before retry: {df_full['CCP2_1_1'].isna().sum()}")
        print(f"Total NAs in 'CCP2_1_1' after retry: {final_df_re['CCP2_1_1'].isna().sum()}")

    except Exception as e:
        print(f"\nAn error occurred while saving the file: {e}")

elif 'df_complete' in locals() and 'results_retry' in locals() and not results_retry:
    print("No new results were generated (or no retries were needed).")
    # If no retries were needed, just re-save the complete data to the new file name
    if 'df_retry' in locals() and df_retry.empty:
        try:
            # Drop the helper 'index' column here as well, so this path writes the
            # same columns as the combined path above.
            df_complete.drop(columns=['index'], errors='ignore').to_csv(OUTPUT_PATH_RETRY, index=False)
            print(f"Saved the original {len(df_complete)} complete cases to {OUTPUT_PATH_RETRY}.")
        except Exception as e:
            print(f"\nAn error occurred while saving the file: {e}")
    else:
        print("Retry loop ran but produced no new data. Original file not re-saved.")
else:
    print("No results were generated. Nothing to save.")

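Final Retry Pass: Complete Any Remaining Skipped Cases
This block reloads simulated_responses_re.csv, retries the rows that are still missing simulated answers, and overwrites the same file with the result.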

In [None]:
# --- Configuration for Final Retry ---
# We will read and overwrite the same file to finalize it.
FILE_PATH_FINAL = "/content/drive/MyDrive/CYON_Analysis_Materials/simulated_responses_re.csv"

print(f"--- Starting Final Retry ---")
print(f"Loading data from: {FILE_PATH_FINAL}")

# --- 1. Load Existing Data ---
try:
    df_full = pd.read_csv(FILE_PATH_FINAL)
    print(f"Successfully loaded. Total cases in file: {len(df_full)}")
except FileNotFoundError:
    print(f"ERROR: File not found at {FILE_PATH_FINAL}.")
    df_full = pd.DataFrame()
except Exception as e:
    print(f"An error occurred while loading the file: {e}")
    df_full = pd.DataFrame()

if df_full.empty:
    print("Could not proceed because the input file was not loaded.")
else:
    # --- 2. Identify Skipped vs. Complete Cases ---

    # Keep a copy of the row labels in an 'index' column (dropped again before saving).
    if 'index' not in df_full.columns:
        df_full.reset_index(inplace=True)

    # A row counts as "skipped" when 'CCP2_1_1' is missing.
    skipped_mask = df_full['CCP2_1_1'].isna()
    df_retry = df_full[skipped_mask].copy()
    df_complete = df_full[~skipped_mask].copy()

    initial_na_count = df_retry.shape[0]
    print(f"Found {len(df_complete)} already completed cases.")
    print(f"Found {len(df_retry)} skipped cases to retry.")

    # --- 3. Run Simulation Loop (Only if needed) ---
    results_retry = []
    if df_retry.empty:
        print("No cases to retry. File is already complete.")
    else:
        MODEL_NAME = "ft:gpt-4o-2024-08-06:personal::A8vV3mNd"
        client = None

        try:
            # An empty or blank key (Block 1 sets '' as a placeholder) counts as
            # "not set"; otherwise every request would fail one by one.
            if not os.environ.get("OPENAI_API_KEY", "").strip():
                raise EnvironmentError("OPENAI_API_KEY environment variable not set. Please re-run Block 1.")
            client = OpenAI()
            print(f"OpenAI client initialized. Using model: {MODEL_NAME}")
        except Exception as e:
            print(f"FATAL ERROR: Could not initialize OpenAI client: {e}")

        if client is not None:
            print(f"Starting simulation retry for {len(df_retry)} participants...")
            for index, row in tqdm(df_retry.iterrows(), total=len(df_retry), desc="Retrying Final Cases"):
                try:
                    sim_data = simulate_single_participant(
                        row, client, MODEL_NAME,
                        pre_survey_human_vars, pre_survey_q_map,
                        post_survey_q_map, treatment_var
                    )
                    if sim_data:
                        # Key the result by the row's DataFrame label.
                        sim_data['index_col'] = index
                        results_retry.append(sim_data)
                    else:
                        print(f"Skipping row {index} again due to simulation error.")
                except Exception as e:
                    # Best effort: one bad row must not abort the remaining retries.
                    print(f"An error occurred while processing row {index}: {e}")

            print(f"\nRetry loop complete. Successfully generated new responses for {len(results_retry)}/{len(df_retry)} cases.")

            # --- 4. Process New Results ---
            if results_retry:
                sim_df_retry = pd.DataFrame(results_retry).set_index('index_col')

                # Results are keyed by the same row labels that df_retry carries, so
                # update() can align them directly. Merging on the 'index' column
                # instead would mis-assign answers whenever the file already had an
                # 'index' column whose values differ from the row labels.
                print(f"Updating {len(sim_df_retry)} rows in the 'retry' dataframe...")
                df_retry.update(sim_df_retry)

    # --- 5. Combine and Save Final Data ---
    try:
        # Combine the original good rows with the newly filled retry rows and
        # sort by row label to restore the file's order.
        final_df_re = pd.concat([df_complete, df_retry]).sort_index()

        # Drop the helper columns: 'index' (added above or left over from an
        # earlier save) and 'index_col' from the very first run, if present.
        final_df_re = final_df_re.drop(columns=['index', 'index_col'], errors='ignore')

        final_df_re.to_csv(FILE_PATH_FINAL, index=False)

        print(f"\n--- Operation Complete ---")
        print(f"Successfully saved fully completed dataset to: {FILE_PATH_FINAL}")
        print(f"Final data shape: {final_df_re.shape}")

        final_na_count = final_df_re['CCP2_1_1'].isna().sum()
        print(f"Total NAs in 'CCP2_1_1' before retry: {initial_na_count}")
        print(f"Total NAs in 'CCP2_1_1' after retry:  {final_na_count}")

    except Exception as e:
        print(f"\nAn error occurred while saving the file: {e}")