<a href="https://colab.research.google.com/github/rahulamatapu/Community-Perspectives/blob/master/Unbiased_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


from langchain_openai import ChatOpenAI

import os
os.environ["OPENAI_API_KEY"] = "Your key"

from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.pydantic_v1 import BaseModel
from langchain_experimental.tabular_synthetic_data.base import SyntheticDataGenerator
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator, OPENAI_TEMPLATE
from langchain_experimental.tabular_synthetic_data.prompts import SYNTHETIC_FEW_SHOT_SUFFIX, SYNTHETIC_FEW_SHOT_PREFIX

In [None]:
import random
import itertools
from pydantic import BaseModel, Field
from pydantic import ConfigDict
from typing import Optional
from typing import List
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage



In [None]:
from typing import Dict, Tuple
import pandas as pd
import ast
import json

In [None]:
# Fixed seed for Python's `random` module so any random draws are repeatable
# between runs of the notebook.
SEED = 24
random.seed(SEED)

In [None]:

class PersonaInformation(BaseModel):
    """Structured record describing one synthetic persona.

    Used as the output schema of the persona generator and to rebuild personas
    that were previously saved as JSON.

    Fields annotated ``Optional[...]`` without a default may be ``None`` but,
    under pydantic v2, must still be supplied explicitly.
    """

    # pydantic v2 style configuration (replaces the deprecated inner
    # ``class Config``); the setting itself is unchanged.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Identity and demographics
    persona_id: str
    age: int
    gender: str
    race_ethnicity: str
    marital_status: str
    citizenship_status: str
    birth_place: str
    year_of_immigration: Optional[int] = Field(default=None)
    primary_language: str
    english_proficiency: str

    # Education and employment
    education_level: str
    school_enrolment: Optional[str]
    employment_status: str
    occupation: str
    industry_of_employment: Optional[str]
    class_of_worker: Optional[str]
    work_hours_per_week: int
    income_bracket: str
    poverty_status: str
    government_assistance: Optional[str]

    # Household
    household_relationship: str
    household_size: int
    number_of_children: int
    responsible_for_grandchildren: bool = Field(default=False)

    # Housing and transportation
    home_ownership: str
    housing_type: str
    utilities_access: str
    utilities_housing_costs: str
    transportation_mode: str
    commute_duration: Optional[int] = Field(default=None)
    work_from_home: Optional[bool] = Field(default=False)

    # Health, disability and military service
    health_insurance: Optional[str]
    disability_status: Optional[str]
    disability_type: Optional[str] = Field(default=None)
    military_service: str
    military_service_period: Optional[int] = Field(default=None)
    disability_rating: Optional[str] = Field(default=None)

    # Connectivity and recent life events
    internet_access: Optional[str]
    device_access: str
    recent_birth: Optional[bool] = Field(default=False)
    moved_in_last_year: bool
    previous_residence: str
    disaster_experience: str

    # Community context; community_description is overwritten with the
    # combined community description text when personas are generated.
    community_description: str
    geographical_class: Optional[str] = Field(default=None)


class ResponseData(BaseModel):
    """One persona's answer to a single pairwise repair-priority question."""

    response_id: str
    scenario_id: str
    persona_id: str
    question_id: str
    # The two repair options the persona was asked to choose between.
    option1: str
    option2: str
    # The option selected by the persona; validated elsewhere to be one of
    # option1 / option2.
    priority_choice: str
    reasoning: str
    # Both options as a list; validated elsewhere to contain exactly
    # option1 and option2.
    ordered_pair: List[str]

# Scenario Context
'''
class ScenarioContext(BaseModel):
    scenario_id: str
    description: str
    model_config = ConfigDict(arbitrary_types_allowed=True)
'''
# Community Context
class CommunityContext(BaseModel):
    """Static descriptive attributes of one community in the scenario."""

    community_id: str
    # Rendered as "<score>/10" by generate_community_description.
    social_vulnerability_score: float
    #access_to_resources: str
    community_geographical_class: str

# Question: instances are generated separately by a script further below.
class Question(BaseModel):
    """A pairwise comparison between two repair options."""

    question_id: str
    option1: str
    option2: str
    # Full question wording presented to the persona.
    text: str


In [None]:
#Input to Model.
# Context for each community, keyed by the community's display name. The same
# keys are used by `infrastructure_status`.
community_contexts = {
    "Community 1": CommunityContext(
        community_id="C001",
        social_vulnerability_score=2,
        community_geographical_class="Rural"
    ),
    "Community 2": CommunityContext(
        community_id="C002",
        social_vulnerability_score=3,
        community_geographical_class="Suburban"
    ),
    "Community 3": CommunityContext(
        community_id="C003",
        social_vulnerability_score=7,
        community_geographical_class="Urban"
    )
}


# Infrastructure state per community, keyed by the community's display name.
# A value of 1 is treated as working and 0 as damaged by the functions that
# read this mapping.
infrastructure_status = {
    "Community 1": {
        "Power School": 1, "Water School": 0, "Power Hospital": 1, "Water Hospital": 1,
        "Power Residential": 0, "Water Residential": 0, "Power Commercial": 1, "Water Commercial": 1
    },
    "Community 2": {
        "Power School": 1, "Water School": 1, "Power Hospital": 1, "Water Hospital": 1,
        "Power Residential": 0, "Water Residential": 0, "Power Commercial": 1, "Water Commercial": 0
    },
    "Community 3": {
        "Power School": 1, "Water School": 0, "Power Hospital": 1, "Water Hospital": 1,
        "Power Residential": 0, "Water Residential": 0, "Power Commercial": 1, "Water Commercial": 1
    }
}

In [None]:
# ['Repair Water Residential in Community 2',
#  'Repair Power Residential in Community 2',
#  'Repair Water School in Community 1',
#  'Repair Power Residential in Community 1',
#  'Repair Water Residential in Community 3',
#  'Repair Water School in Community 3',
#  'Repair Water Commercial in Community 2',
#  'Repair Power Residential in Community 3',
#  'Repair Water Residential in Community 1']

In [None]:
def generate_community_description(community, status, context):
    """Render a one-line summary of a community's infrastructure and context.

    Args:
        community: Display name of the community.
        status: Mapping of infrastructure name to a flag; entries equal to 1
            are listed as working, entries equal to 0 as damaged, and any
            other value is left out.
        context: Object exposing ``community_geographical_class`` and
            ``social_vulnerability_score``.

    Returns:
        A single string made of "; "-terminated segments (the final segment
        keeps its trailing "; " as well).
    """
    operational = [name for name, flag in status.items() if flag == 1]
    broken = [name for name, flag in status.items() if flag == 0]

    segments = (
        f"In {community}: ",
        f"- Working infrastructure: {', '.join(operational)}; ",
        f"- Damaged infrastructure: {', '.join(broken)}; ",
        f"- Community geographical class: {context.community_geographical_class}; ",
        f"- Social vulnerability score: {context.social_vulnerability_score}/10; ",
        # An "Access to resources" segment existed previously but is disabled.
    )
    return "".join(segments)


def generate_all_possible_questions():
    """Build one pairwise question for every pair of damaged infrastructure items.

    Reads the module-level ``infrastructure_status`` mapping, collects every
    (community, infrastructure) entry whose flag is 0, and creates a
    ``Question`` for each unordered pair, numbered from ``Q001`` upwards in
    the order produced by ``itertools.combinations``.

    Returns:
        List of ``Question`` objects.
    """
    damaged_items = [
        (community_name, infra_name)
        for community_name, infra_flags in infrastructure_status.items()
        for infra_name, is_working in infra_flags.items()
        if is_working == 0
    ]

    generated = []
    pairings = itertools.combinations(damaged_items, 2)
    for number, (first, second) in enumerate(pairings, start=1):
        first_community, first_infra = first
        second_community, second_infra = second
        generated.append(
            Question(
                question_id=f"Q{number:03d}",
                option1=f"Repair {first_infra} in {first_community}",
                option2=f"Repair {second_infra} in {second_community}",
                text=f"Which should be repaired first: {first_infra} in {first_community} or {second_infra} in {second_community}?",
            )
        )

    return generated

def validate_response(response: ResponseData, persona: PersonaInformation, question: Question) -> Tuple[bool, str]:
    """Check that a parsed response is structurally consistent with its question.

    Args:
        response: The response to validate.
        persona: The persona that produced the response (currently unused by
            the active checks).
        question: The question the response answers.

    Returns:
        ``(True, "Valid response")`` when every check passes, otherwise
        ``(False, <reason>)`` for the first failing check.
    """
    allowed_options = [question.option1, question.option2]

    # The chosen option must be one of the two offered.
    if response.priority_choice not in allowed_options:
        return False, f"Invalid priority choice: {response.priority_choice}"

    # The ordered pair must consist of exactly the two offered options.
    if set(response.ordered_pair) != set(allowed_options):
        return False, f"Invalid ordered pair: {response.ordered_pair}"

    # Require at least 50 characters of reasoning.
    if len(response.reasoning) < 50:
        return False, "Reasoning is too short"

    # Content checks on the reasoning text (mention of a community, of the
    # persona's occupation, or of an infrastructure type) are intentionally
    # disabled.
    return True, "Valid response"

In [None]:
# Show the JSON schema of both models. `model_json_schema()` is the pydantic v2
# replacement for the deprecated `.schema()` classmethod.
print(PersonaInformation.model_json_schema())
print(" ")
print(ResponseData.model_json_schema())


In [None]:


# Define the persona template without predefined attributes.
# NOTE: this rebinds OPENAI_TEMPLATE, shadowing the object of the same name
# imported from langchain_experimental.tabular_synthetic_data.openai.
# Each few-shot example is rendered verbatim from its "example" key.
OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")

# Few-shot examples demonstrating persona diversity.
# Each entry has a single "example" key, matching the input variable of
# OPENAI_TEMPLATE.
persona_examples = [
    {
        "example": """Persona ID: P001, Age: 35, Gender: Male, Race: Hispanic,
        Marital Status: Married, Citizenship: Naturalized Citizen,
        Birth Place: Mexico, Year of Immigration: 1998,
        Primary Language: Spanish, English Proficiency: Well,
        Education Level: Master's Degree, School Enrollment: No,
        Employment Status: Employed, Occupation: Civil Engineer,
        Industry: Construction, Class of Worker: Private Sector,
        Work Hours per Week: 45, Income Bracket: $75K-$100K,
        Poverty Status: Above Poverty Line, Government Assistance: None,
        Household Relationship: Head, Household Size: 4, Number of Children: 2,
        Home Ownership: Own, Housing Type: Single-Family Home,
        Utilities Access: Has all utilities, Utilities & Housing Costs: $1,500 per month,
        Transportation Mode: Drives, Commute Duration: 30 minutes,
        Work From Home: No, Health Insurance: Private,
        Disability Status: No, Military Service: None, Internet Access: Broadband,
        Device Access: Laptop, Smartphone, Recent Birth: No,
        Moved Last Year: No, Previous Residence: Same Residence,
        Disaster Experience: Experienced a hurricane in 2020,
        Geographical Class: Suburban"""
    },
    {
        "example": """Persona ID: P002, Age: 52, Gender: Female, Race: Black,
        Marital Status: Divorced, Citizenship: U.S. Citizen,
        Birth Place: United States, Year of Immigration: N/A,
        Primary Language: English, English Proficiency: Very well,
        Education Level: High School, School Enrollment: No,
        Employment Status: Retired, Occupation: N/A,
        Industry: N/A, Class of Worker: N/A,
        Work Hours per Week: 0, Income Bracket: $25K-$50K,
        Poverty Status: Below Poverty Line, Government Assistance: SNAP,
        Household Relationship: Parent, Household Size: 2, Number of Children: 1,
        Home Ownership: Rent, Housing Type: Apartment,
        Utilities Access: Limited access to utilities, Utilities & Housing Costs: $800 per month,
        Transportation Mode: Public Transit, Commute Duration: 45 minutes,
        Work From Home: N/A, Health Insurance: Medicaid,
        Disability Status: Yes, Disability Type: Mobility,
        Military Service: None, Internet Access: Mobile Data,
        Device Access: Smartphone, Recent Birth: No,
        Moved Last Year: Yes, Previous Residence: Different city,
        Disaster Experience: No major disaster experience,
        Geographical Class: Urban"""
    }
]
# Few-shot examples of responses, used by response_prompt_template.
# Each entry has a single "example" key, matching the input variable of
# OPENAI_TEMPLATE.
# NOTE(review): some example reasonings cite figures (e.g. population sizes)
# that do not appear in the community data defined in this file — confirm
# they are intentional.
response_examples = [
    {
        "example": """Response ID: R001, Scenario ID: S001, Persona ID: P001, Question ID: Q001,
        Option 1: Repair Water School in Community 1, Option 2: Repair Power Residential in Community 2,
        Priority Choice: Repair Water School in Community 1,
        Reasoning: As a teacher in my community, I believe access to clean water in schools is crucial for maintaining hygiene and preventing the spread of diseases, especially after a disaster. Community 1 has limited resources and a high social vulnerability score, so ensuring schools are operational is a top priority for education continuity and community resilience.
        Ordered Pair: ["Repair Water School in Community 1", "Repair Power Residential in Community 2"]"""
    },
    {
        "example": """Response ID: R002, Scenario ID: S001, Persona ID: P002, Question ID: Q002,
        Option 1: Repair Power Residential in Community 2, Option 2: Repair Water Residential in Community 3,
        Priority Choice: Repair Power Residential in Community 2,
        Reasoning: As a nurse in my community , I understand the critical importance of electrical power for medical equipment and maintaining communication during emergencies.  Community 2 has moderate access to resources, and restoring power to residential areas will help more people in the larger population of 12,000 to manage their health needs at home, potentially reducing strain on medical facilities.
        Ordered Pair: ["Repair Power Residential in Community 2", "Repair Water Residential in Community 3"]"""
    },
    {
        "example": """Response ID: R003, Scenario ID: S001, Persona ID: P003, Question ID: Q003,
        Option 1: Repair Water Residential in Community 1, Option 2: Repair Power School in Community 3,
        Priority Choice: Repair Water Residential in Community 1,
        Reasoning: As a retired person living in my community, I recognize the dire need for water in residential areas of Community 1. I believe we should prioritize the basic needs of the most vulnerable. Community 1 has a higher social vulnerability score and limited access to resources, making clean water for their homes a critical priority for health and sanitation.
        Ordered Pair: ["Repair Water Residential in Community 1", "Repair Power School in Community 3"]"""
    },
    {
        "example": """Response ID: R004, Scenario ID: S001, Persona ID: P004, Question ID: Q004,
        Option 1: Repair Power Commercial in Community 2, Option 2: Repair Water School in Community 3,
        Priority Choice: Repair Power Commercial in Community 2,
        Reasoning: As a small business owner in my community, I believe restoring power to commercial areas is crucial for economic recovery after a disaster.Community 2 has a moderate population of 12,000 and restoring commercial power will help businesses reopen, provide essential services, and support the local economy. This can indirectly benefit the entire region by maintaining supply chains and employment.
        Ordered Pair: ["Repair Power Commercial in Community 2", "Repair Water School in Community 3"]"""
    },
    {
        "example": """Response ID: R005, Scenario ID: S001, Persona ID: P005, Question ID: Q005,
        Option 1: Repair Water Commercial in Community 1, Option 2: Repair Power Residential in Community 3,
        Priority Choice: Repair Water Commercial in Community 1,
        Reasoning: As a public health official in my community, I prioritize repairing the water supply to commercial areas in community 1. Given the high social vulnerability score of 7/10 and limited access to resources, restoring water to commercial areas will allow essential businesses like pharmacies and food stores to operate, supporting the basic needs of 5,000 residents. This is critical for preventing secondary health crises in the vulnerable population.
        Ordered Pair: ["Repair Water Commercial in Community 1", "Repair Power Residential in Community 3"]"""
    }
]

# Enhanced prompt to allow dynamic persona generation.
# The prefix lists the consistency rules for a persona, followed by the
# few-shot examples and the final generation instruction.
# NOTE(review): "subject" and "extra" are declared as input variables, but
# neither {subject} nor {extra} appears in the prefix or suffix, so values
# passed for them do not seem to reach the rendered prompt — confirm whether
# the suffix should reference them.
persona_prompt_template = FewShotPromptTemplate(
    # `\\n` keeps a literal backslash-n in the instruction text; a bare `\n`
    # in this non-raw string would insert an actual line break in the middle
    # of the sentence.
    prefix="""
    Generate **realistic and diverse personas** for a disaster scenario study. 
    Ensure variety using **logical attribute generation** instead of predefined lists.
    
    **Personas should include:**
    - You must generate a valid JSON output
    - All JSON fields are correctly formatted and enclosed in double quotes.
    - No newlines (`\\n`) inside values
    - Age: Ranges from **18 to 80+**
    - Gender: Mix of **Male, Female**
    - Race/Ethnicity: Represent **a variety of racial and ethnic backgrounds**
    - Citizenship: Mix of **U.S. Citizens and Naturalized Immigrants**
    - Birthplace: Can be **U.S. or foreign-born**
    - Primary Language: Based on **birthplace & ethnicity**
    - English Proficiency: Should be logically consistent
    - Education Level: Ranging from **no formal education to PhD**
    - Employment Status: Mix of **employed, unemployed, self-employed, and retired**
    - Occupation: Should be **realistically assigned based on education & age**
    - It is illogical for a High school Graduate to be a teacher or former teacher.
    - Industry: Relevant to occupation (e.g., Tech for Software Engineers, Healthcare for Nurses)
    - Class of Worker: Vary between **Private Sector, Government, Self-Employed, Non-Profit**
    - Work Hours: Ranges from **part-time to full-time, 0 if unemployed or retired**
    - Income Bracket: Determined based on **occupation, industry, and experience**
    - Poverty Status: Should logically correspond to income level
    - Household Structure: Single, Married, Extended Family, etc.
    - Number of Children: Based on **age & marital status**
    - If income bracket is less than $25K and household size is 1, then poverty status must be "Below Poverty Line".
    - If poverty status is "Above Poverty Line", then income bracket must be at least '$25K-$50K` or higher.
    - If household size is 1, then `responsible_for_grandchildren` must be False.
    - If `responsible_for_grandchildren is True, then household size` must be at least 2.
    - Housing Type: **Apartment, Single-Family Home, Mobile Home, Townhouse**
    - Home Ownership: **Own, Rent, Living with Family**
    - Disability rating should only be assigned to those with military service
    - Utilities & Housing Costs: Should vary based on **income and location**
    - Transportation Mode: **Drives, Public Transit, Walks, Works from Home**
    - Disaster Experience: Some with **hurricane, earthquake, flood experience** while others have none
    - Geographical Class: You must include one of  (Urban, Suburban, or Rural). THIS MUST ALWAYS BE PRESENT!!!
    

    **DO NOT OMIT ANY REQUIRED FIELDS.** If a field does not apply, set a logical default.
    **DO NOT USE PREDEFINED LISTS.** Instead, generate attributes logically.
    
    """,
    examples=persona_examples,
    suffix="\nGenerate a persona:",
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,
)

# Response prompt template.
# Uses the imported SYNTHETIC_FEW_SHOT_PREFIX / SYNTHETIC_FEW_SHOT_SUFFIX
# constants. Previously their *names* were embedded as literal text, so the
# rendered prompt started with the words "SYNTHETIC_FEW_SHOT_PREFIX" instead
# of the library-provided instructions.
response_prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=response_examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX + "\n\nPersona: {persona}\nQuestion: {question}\nGenerate a response:",
    input_variables=["subject", "extra", "persona", "question"],
    example_prompt=OPENAI_TEMPLATE,
)

# Create LLM Generators
# Generator producing PersonaInformation objects from the persona prompt.
persona_generator = create_openai_data_generator(
    output_schema=PersonaInformation,
    llm=ChatOpenAI(model_name="gpt-3.5-turbo",temperature=0.9),  
    #llm=ChatOpenAI(temperature=0.9),
    prompt=persona_prompt_template,
)

# Generator producing ResponseData objects from the response prompt; it uses
# a lower temperature than the persona generator.
response_generator = create_openai_data_generator(
    output_schema=ResponseData,
    llm=ChatOpenAI(model_name="gpt-3.5-turbo",temperature=0.7), 
    #llm=ChatOpenAI(temperature=0.7),
    prompt=response_prompt_template,
)





In [None]:
# def generate_personas(num_personas: int) -> List[PersonaInformation]:
#     communities = list(community_contexts.keys())

#     # Generate descriptions for all communities
#     community_descriptions = {
#         community: generate_community_description(community, infrastructure_status[community], community_contexts[community])
#         for community in communities
#     }

#     # Used for combining all community descriptions
#     all_community_desc = " ".join(community_descriptions.values())

#     synthetic_personas = persona_generator.generate(
#         subject="storm_scenario_persona",
#         extra={
#             "age": random.choices([random.randint(18, 24), random.randint(25, 44), random.randint(45, 64), random.randint(65, 85)],
#                                     weights=[0.15, 0.40, 0.30, 0.15], k=1)[0],
#             "race_ethnicity": random.choices(["White", "Black", "Hispanic", "Asian", "Native American", "Mixed"],
#                                              weights=[0.60, 0.13, 0.19, 0.06, 0.01, 0.01], k=1)[0],
#             "selected_tier": random.choices(["High-income jobs", "Middle-income jobs", "Low-income jobs"],
#                                             weights=[0.15, 0.50, 0.35], k=1)[0],
#             "occupation": lambda selected_tier: random.choice(occupation_tiers[selected_tier]),
#             "income_bracket": lambda selected_tier: random.choice(income_bracket_mapping[selected_tier]),
#             "work_from_home": lambda selected_tier: random.random() < work_from_home_likelihood[selected_tier],
#             "commute_duration": lambda work_from_home: random.choices([5, 15, 30, 45, 60, 90],
#                                                                        weights=[0.05, 0.15, 0.40, 0.25, 0.10, 0.05],
#                                                                        k=1)[0] if not work_from_home else 0,
#             "community_description": all_community_desc
#         },
#         runs=num_personas
#     )[:num_personas]

#     for i, persona in enumerate(synthetic_personas):
#         persona.persona_id = f"P{i+1:03d}"
#         persona.community_description = all_community_desc  # Assign overall community context


#     return synthetic_personas

In [None]:
def generate_personas(num_personas: int) -> List[PersonaInformation]:
    """Generate synthetic personas with the module-level ``persona_generator``.

    Every returned persona is given a sequential id (``P001``, ``P002``, ...),
    the combined description of all communities, and a placeholder occupation
    if the generator left it empty.

    Args:
        num_personas: Number of personas to request.

    Returns:
        The generated personas (at most ``num_personas`` of them).
    """
    # One description per community, joined into a single context string.
    all_community_desc = " ".join(
        generate_community_description(community, infrastructure_status[community], context)
        for community, context in community_contexts.items()
    )

    # Construct prompt dynamically instead of using predefined lists
    extra_prompt = f"""Create a realistic persona ensuring diversity in:
        - Age (18-85+) with logical employment or retirement status
        - Occupation should be inferred and realistic based on education level and age.
        - Income logically aligned with occupation, household size, and employment type
        - Commute Duration dependent on work location (remote workers = 0)
        - If income bracket is less than $25K and household size is 1, then poverty status must be "Below Poverty Line".
        - If poverty status is "Above Poverty Line", then income bracket must be at least '$25K-$50K` or higher.
        - If household size is 1, then `responsible_for_grandchildren` must be False.
        - If `responsible_for_grandchildren is True, then household size` must be at least 2.
        - Disability rating should only be assigned to those with military service
        - Housing status aligned with economic situation (ownership vs rental)
        - Disaster Experience varies based on geographic vulnerability
       
        
    Ensure personas reflect real-life distributions, but DO NOT hardcode specific examples.

    Community descriptions: {all_community_desc}"""

    generated = persona_generator.generate(
        subject="storm_scenario_persona",
        extra=extra_prompt,
        runs=num_personas
    )
    # NOTE(review): SyntheticDataGenerator.generate appears to return the
    # generator's cumulative result list, so on a second call the *first*
    # `num_personas` entries would be stale. Taking the tail keeps the most
    # recent results and is equivalent when only fresh results are returned.
    synthetic_personas = generated[-num_personas:] if num_personas > 0 else []

    print(f"Generated {len(synthetic_personas)} personas.")

    # Assign ids and community descriptions, and fill in missing occupations.
    for index, persona in enumerate(synthetic_personas, start=1):
        persona.persona_id = f"P{index:03d}"
        persona.community_description = all_community_desc  # Assign overall community context

        # Ensure LLM output includes an occupation, fallback if missing
        if not getattr(persona, "occupation", None):
            persona.occupation = "Unknown Profession"

    return synthetic_personas
    
def build_full_persona_string(persona):
    """
    Render a persona as a multi-line "Label: value" description.

    One line is produced per attribute, in a fixed order; the commute
    duration line carries a trailing " minutes". Leading and trailing
    whitespace of the whole text is stripped.
    """
    labelled_fields = [
        ("Persona ID", "persona_id"),
        ("Age", "age"),
        ("Gender", "gender"),
        ("Race/Ethnicity", "race_ethnicity"),
        ("Marital Status", "marital_status"),
        ("Citizenship Status", "citizenship_status"),
        ("Birth Place", "birth_place"),
        ("Year of Immigration", "year_of_immigration"),
        ("Primary Language", "primary_language"),
        ("English Proficiency", "english_proficiency"),
        ("Education Level", "education_level"),
        ("School Enrollment", "school_enrolment"),
        ("Employment Status", "employment_status"),
        ("Occupation", "occupation"),
        ("Industry of Employment", "industry_of_employment"),
        ("Class of Worker", "class_of_worker"),
        ("Work Hours Per Week", "work_hours_per_week"),
        ("Income Bracket", "income_bracket"),
        ("Poverty Status", "poverty_status"),
        ("Government Assistance", "government_assistance"),
        ("Household Relationship", "household_relationship"),
        ("Household Size", "household_size"),
        ("Number of Children", "number_of_children"),
        ("Responsible for Grandchildren", "responsible_for_grandchildren"),
        ("Home Ownership", "home_ownership"),
        ("Housing Type", "housing_type"),
        ("Utilities Access", "utilities_access"),
        ("Housing Costs", "utilities_housing_costs"),
        ("Transportation Mode", "transportation_mode"),
        ("Commute Duration", "commute_duration"),
        ("Work from Home", "work_from_home"),
        ("Health Insurance", "health_insurance"),
        ("Disability Status", "disability_status"),
        ("Disability Type", "disability_type"),
        ("Military Service", "military_service"),
        ("Military Service Period", "military_service_period"),
        ("Disability Rating", "disability_rating"),
        ("Internet Access", "internet_access"),
        ("Device Access", "device_access"),
        ("Recent Birth", "recent_birth"),
        ("Moved in Last Year", "moved_in_last_year"),
        ("Previous Residence", "previous_residence"),
        ("Disaster Experience", "disaster_experience"),
        ("Geographical Class", "geographical_class"),
    ]

    rendered = []
    for label, attribute in labelled_fields:
        entry = f"{label}: {getattr(persona, attribute)}"
        if attribute == "commute_duration":
            # Only this field carries a unit suffix.
            entry += " minutes"
        rendered.append(entry)

    return "\n".join(rendered).strip()


In [None]:
# #Generate personas
# num_personas_to_generate = 200
# print("\n--- Generating Personas ---")
# synthetic_personas = generate_personas(num_personas_to_generate)


# # Print generated personas
# for persona in synthetic_personas:
#     print(persona.model_dump_json(indent=2))


In [None]:


# with open("synthetic_personas_unbiased.json", "w") as f:
#     json.dump([p.dict() for p in synthetic_personas], f, indent=2)


In [None]:
# Load the previously saved personas from JSON and rebuild the model objects.

with open("data/synthetic_personas_unbiased.json", "r") as f:
    persona_dicts = json.load(f)

synthetic_personas = [PersonaInformation(**record) for record in persona_dicts]


In [None]:
synthetic_personas

In [None]:
# Every persona is now asked the same full set of questions.

# Initialize the language model.
# This module-level chat model is the one invoked by
# generate_responses_for_personas.
llm = ChatOpenAI(model_name="gpt-3.5-turbo",temperature=0.7)
#llm=ChatOpenAI(temperature=0.7)

def _extract_labeled_field(lines: List[str], label: str) -> Optional[str]:
    """Return the text after ``label`` on the first line that starts with it.

    Leading/trailing whitespace on the line is ignored. Returns ``None`` when
    no line starts with the label, and an empty string when the label is
    present but nothing follows it on that line.
    """
    for line in lines:
        stripped = line.strip()
        if stripped.startswith(label):
            return stripped[len(label):].strip()
    return None


def generate_responses_for_personas(
    personas: List[PersonaInformation],
    questions: List[Question],
    max_attempts: int = 5
) -> Tuple[List[dict], List[dict]]:
    """Ask every persona every question and collect the validated answers.

    For each (persona, question) pair the module-level ``llm`` is prompted up
    to ``max_attempts`` times until a response parses and passes
    ``validate_response``. Errors raised during an attempt are printed and
    count as a failed attempt.

    Args:
        personas: Personas to role-play.
        questions: Questions put to each persona.
        max_attempts: Maximum number of LLM calls per (persona, question).

    Returns:
        A tuple ``(responses, raw_outputs)``:
        - ``responses``: one dict per accepted response, holding the
          ``ResponseData`` fields plus selected persona attributes.
        - ``raw_outputs``: one dict per accepted response with the formatted
          prompt and the raw LLM text, sharing the same ``response_id``.
    """
    # The template does not depend on the persona or question, so build it once.
    # NOTE(review): the text below says SVS ranges from 0 to 1, while
    # generate_community_description renders scores as "<score>/10" —
    # confirm which scale is intended before changing the wording.
    prompt = PromptTemplate(
        input_variables=["full_persona", "question", "option1", "option2", "all_community_descriptions"],
        template="""You are a {full_persona}, and you are aware that a tornado has impacted three communities (1, 2, and 3), causing damage to critical infrastructure.

                        You are familiar with the conditions in all communities, described as follows: {all_community_descriptions}. Social Vulnerability Scores (SVS) range from 0 (least vulnerable) to 1 (most vulnerable), with higher values indicating greater vulnerability.

                        Consider the situation, taking into account any details you find relevant.

                        Question: {question}  
                        Before choosing between '{option1}' and '{option2}', first explain your reasoning.

                        Provide your answer in the following format:
                        Reasoning: [Explain your selection with a concise rationale, highlighting relevant factors influencing your decision.]
                        Priority Choice: [Your chosen option]  
                        """
    )

    responses = []
    raw_outputs = []
    response_counter = 1

    for persona in personas:
        print(f"Generating responses for Persona ID: {persona.persona_id}, Occupation: {persona.occupation}")

        for question in questions:  # Now, every persona gets all questions
            print(f"\nProcessing question: {question.text}")
            valid_response = None
            attempts = 0

            while valid_response is None and attempts < max_attempts:
                # Count the attempt up front so every exit path (continue,
                # exception, validation failure) consumes exactly one attempt.
                attempts += 1
                try:
                    full_persona_string = build_full_persona_string(persona)

                    formatted_prompt = prompt.format(
                        full_persona=full_persona_string,
                        question=question.text,
                        option1=question.option1,
                        option2=question.option2,
                        all_community_descriptions=persona.community_description
                    )

                    response = llm.invoke([HumanMessage(content=formatted_prompt)])

                    # Parse the response
                    response_text = response.content
                    print(f"Response: {response_text}")
                    lines = response_text.split('\n')
                    priority_choice = _extract_labeled_field(lines, "Priority Choice:")
                    reasoning = _extract_labeled_field(lines, "Reasoning:")

                    if priority_choice not in (question.option1, question.option2):
                        print(f"Invalid priority choice: {priority_choice}. Retrying.")
                        continue

                    if not reasoning:
                        print("No reasoning provided. Retrying.")
                        continue

                    # Chosen option first, the rejected option second.
                    ordered_pair = [
                        priority_choice,
                        question.option2 if priority_choice == question.option1 else question.option1
                    ]

                    response_id = f"R{response_counter:03d}"
                    response_data = ResponseData(
                        response_id=response_id,
                        scenario_id="S001",
                        persona_id=persona.persona_id,
                        question_id=question.question_id,
                        option1=question.option1,
                        option2=question.option2,
                        priority_choice=priority_choice,
                        reasoning=reasoning,
                        ordered_pair=ordered_pair
                    )

                    validation_result, validation_message = validate_response(response_data, persona, question)
                    if not validation_result:
                        print(f"Response validation failed: {validation_message}. Retrying...")
                        continue

                    valid_response = {
                        **response_data.model_dump(),
                        'age': persona.age,
                        'occupation': persona.occupation,
                        'geographical_class': persona.geographical_class,
                        'disaster_experience': persona.disaster_experience,
                        'income_bracket': persona.income_bracket,
                        'community_description': persona.community_description
                    }
                    responses.append(valid_response)

                    # Store the full input/output for the accepted response.
                    # Previously this entry was built but never appended, so
                    # the returned raw_outputs list was always empty.
                    raw_outputs.append({
                        "response_id": response_id,
                        "persona_id": persona.persona_id,
                        "question_id": question.question_id,
                        "formatted_prompt": formatted_prompt,
                        "raw_response": response_text,
                        "priority_choice": priority_choice,
                        "reasoning": reasoning
                    })

                    response_counter += 1
                    print("Valid response generated.")

                except Exception as e:
                    # Best-effort: report the error and use up this attempt.
                    print(f"Error generating response: {str(e)}")

            if valid_response is None:
                print(f"Failed to generate valid response after {max_attempts} attempts")

    return responses, raw_outputs



# Driver: summarize the loaded personas, build the questions, collect the
# responses, and write the results to disk.
print("\n--- Summary of Generated Personas ---")
for persona in synthetic_personas:
    print(f"Persona ID: {persona.persona_id}, Occupation: {persona.occupation}")

# Generate questions
questions = generate_all_possible_questions()

# Generate responses (one LLM call per attempt for every persona/question pair)
responses, raw_outputs = generate_responses_for_personas(synthetic_personas, questions)
# Persist the raw prompt/response records returned alongside the responses.
with open("data/raw_prompt_responses_rewording_reasoning.json", "w") as f:
    json.dump(raw_outputs, f, indent=2)

# Create DataFrame for responses (which now include persona information)
responses_df = pd.DataFrame(responses)

# Save to a single CSV file
responses_df.to_csv('data/personas_and_responses_unbiased_rewording_reasoning.csv', index=False)
# NOTE: the message below names "personas_and_responses", not the full CSV
# path written above.
print("\nPersona and Response information has been written to personas_and_responses")

# Print results
print(f"\nGenerated {len(responses)} total responses.")
print(responses_df[['response_id', 'persona_id', 'occupation', 'priority_choice']])
