<a href="https://colab.research.google.com/github/ZLY1223/Vaccine_LLM_Prompt_Engineering/blob/main/Counterfactual_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.1


In [None]:
import os
from getpass import getpass

# Never store the key in the notebook itself: reuse a value that is already in
# the environment, otherwise ask for it interactively without echoing it.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the Excel inputs and result files referenced
# by the cells in this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu


In [None]:
import torch
# Release cached, currently unused GPU memory held by PyTorch.
# NOTE(review): none of the visible cells runs a local torch model — confirm this cell is still needed.
torch.cuda.empty_cache()

In [None]:

from transformers import set_seed

In [None]:
pip install openai



In [None]:
import openai

# Counterfactual Analysis

## M2: random diet assignment

### GPT

In [None]:
import pandas as pd
import random
import re

# ---- Inputs: persona task prompts, the article pool, and the diet-by-category quota matrix ----
task_df = pd.read_excel("/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/task_inputs_two_options_M0.xlsx")
media_df = pd.read_excel("/content/drive/MyDrive/LLM Vaccination Attitudes/Input/Media Diet Data/media_diet_with_extracted_words.xlsx")
matrix_df = pd.read_excel(
    "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/Media Diet Data/Media_Diets.xlsx",
    sheet_name="Media Diets",
    index_col=0
)

# Collapse the per-source categories into four coarser groups.
category_mapping = {
    "Left": "Left Echochamber",
    "Left-Center": "Left Echochamber",
    "Center": "Center-ish",
    "Right": "Right Echochamber",
    "Right-Center": "Right Echochamber",
    "Low Credibility": "Misinformation Only"
}

# Articles whose category is not in the mapping get no group and are dropped.
media_df["Group"] = media_df["Category"].map(category_mapping)
media_df = media_df[media_df["Group"].notna()]

# Draw up to 30 articles per group with a fixed seed.
# Sampling each group explicitly (instead of groupby.apply) keeps the "Group"
# column in the result on every pandas version and avoids the deprecation
# warning about apply operating on the grouping columns; the drawn rows and
# their order (groups in sorted key order) are the same as before.
grouped_media_df = pd.concat(
    [
        group_rows.sample(n=min(30, len(group_rows)), random_state=42)
        for _, group_rows in media_df.groupby("Group")
    ],
    ignore_index=True,
)

grouped_media_df["Group"].value_counts()

  grouped_media_df = media_df.groupby("Group").apply(lambda g: g.sample(n=min(30, len(g)), random_state=42)).reset_index(drop=True)


Unnamed: 0_level_0,count
Group,Unnamed: 1_level_1
Center-ish,30
Left Echochamber,30
Misinformation Only,30
Right Echochamber,30


In [None]:
# Write the balanced per-group sample to Drive (no index column).
# NOTE(review): later cells read "media_diet_balanced_30each.xlsx", not this file name — confirm which one is intended.
balanced_pool_path = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/Media Diet Data/media_diet_balanced_30.xlsx"
grouped_media_df.to_excel(balanced_pool_path, index=False)

In [None]:
# ==== Establishing Quotas for Each Category of Media Diet ====
# The matrix names its column "Misinfo"; rename it to match the key used in category_mapping.
matrix_df = matrix_df.rename(columns={"Misinfo": "Low Credibility"})

# Keep only the matrix columns whose category is a key of category_mapping.
valid_categories = list(category_mapping)
matrix_df_filtered = matrix_df.loc[:, valid_categories]

# For every diet (matrix row) record {category: article count}, positive counts only.
diet_to_article_quota = {}
for diet in matrix_df_filtered.index:
    row_counts = matrix_df_filtered.loc[diet]
    diet_to_article_quota[diet] = row_counts[row_counts > 0].to_dict()

diet_to_article_quota["Right Echochamber"]

{'Right': 2, 'Right-Center': 3}

In [None]:
# ==== Randomly Assign Media Diet + Construct Prompt ====

def build_rag_prompt(task_prompt, diet):
    """Insert a media-diet block and sampled articles before the "% Task Prompt" marker.

    For each category in the quota of ``diet`` (looked up in the module-level
    ``diet_to_article_quota``), articles are sampled from ``grouped_media_df``
    using a seed drawn from the global ``random`` generator. The pooled
    articles are shuffled and at most the first five are listed.

    Args:
        task_prompt: Prompt text; returned unchanged if it has no "% Task Prompt" marker.
        diet: Media diet label; an unknown label yields an empty article list.

    Returns:
        The prompt with the media block and article listing inserted before the marker.
    """
    article_pool = []

    for category, n_wanted in diet_to_article_quota.get(diet, {}).items():
        in_category = grouped_media_df["Category"] == category
        candidates = grouped_media_df.loc[in_category, ["Headline", "First_100_Words"]].dropna()
        if candidates.empty:
            print(f"[Warning] No articles for category '{category}' under diet '{diet}'")
            continue
        # One randint call per non-empty category keeps the draw tied to random.seed().
        picked = candidates.sample(
            n=min(n_wanted, len(candidates)),
            random_state=random.randint(0, 10000),
        )
        article_pool += list(zip(picked["Headline"], picked["First_100_Words"]))

    random.shuffle(article_pool)

    disease_block = """% Media Diet Information
Assume today is June 29, 2022. Your decision about receiving a COVID-19 vaccine is primarily shaped by your demographics. However, the kind of information you are exposed to in the media may also influence your thinking.
Below are five real-world news articles that reflect the type of media you are likely to encounter.
As you read them, consider:
- Do any of these articles reinforce your existing beliefs?
- Do any contradict them or introduce new concerns or reassurances?
Unless strongly challenged by the articles, your decision should remain aligned with your prior beliefs. Only adjust your position if a particular article clearly shifts your perception.
If one article is especially influential, briefly mention it in your reasoning.
"""

    # Number the (at most five) shuffled articles starting from 1.
    listing = "".join(
        f"Article {position}: {headline.strip()}\n{body.strip()}\n"
        for position, (headline, body) in enumerate(article_pool[:5], start=1)
    )
    media_context = "The articles are listed below:\n" + listing

    marker_at = task_prompt.find("% Task Prompt")
    if marker_at == -1:
        return task_prompt
    return task_prompt[:marker_at] + disease_block + media_context + task_prompt[marker_at:]

def safe_build(row):
    """Row adapter for ``DataFrame.apply``: build the full prompt from one task row.

    Reads the ``Task_Prompt`` and ``media_diet_label`` fields of ``row`` and
    forwards them to ``build_rag_prompt``.
    """
    prompt_text = row["Task_Prompt"]
    diet_label = row["media_diet_label"]
    return build_rag_prompt(prompt_text, diet_label)

import os  # imported here so this cell also runs on its own

media_diet_choices = list(diet_to_article_quota.keys())

task_input_path = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/task_inputs_two_options_M0.xlsx"
m2_output_dir = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_media_diet"

# to_excel does not create missing folders, so make sure the target exists.
os.makedirs(m2_output_dir, exist_ok=True)

# The base prompts are the same for every seed: read the workbook once and
# hand each iteration its own copy.
base_task_df = pd.read_excel(task_input_path)

for i in range(10):
    # Seeding here fixes both the diet assignment and the article draws
    # (build_rag_prompt takes its sampling seeds from the same generator).
    random.seed(1000 + i)

    task_df = base_task_df.copy()
    task_df["media_diet_label"] = [random.choice(media_diet_choices) for _ in range(len(task_df))]

    task_df["Task_Prompt_Full"] = task_df.apply(safe_build, axis=1)

    # Plain substring replacement: the phrase has no regex metacharacters.
    task_df["Task_Prompt_Full"] = task_df["Task_Prompt_Full"].str.replace(
        "Now, based on the information above,",
        "Now, based on your demographics and the content of the five articles above,",
        regex=False,
    )

    output_path = f"{m2_output_dir}/task_inputs_two_options_M2_seed{1000+i}.xlsx"
    task_df[["id", "Task_Prompt_Full", "media_diet_label"]].to_excel(output_path, index=False)
    print(f"[Saved] Seed {1000 + i} -> {output_path}")


[Saved] Seed 1000 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_media_diet/task_inputs_two_options_M2_seed1000.xlsx
[Saved] Seed 1001 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_media_diet/task_inputs_two_options_M2_seed1001.xlsx
[Saved] Seed 1002 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_media_diet/task_inputs_two_options_M2_seed1002.xlsx
[Saved] Seed 1003 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_media_diet/task_inputs_two_options_M2_seed1003.xlsx
[Saved] Seed 1004 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_media_diet/task_inputs_two_options_M2_seed1004.xlsx
[Saved] Seed 1005 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_media_diet/task_inputs_two_options_M2_seed1005.xlsx
[Saved] Seed 1006 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_me

In [None]:
import pandas as pd
import re
import time
import openai

import os  # imported here so this cell also runs on its own

# Credentials come from the environment; never embed a real key in the notebook.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

input_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_media_diet/"
output_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/Chatgpt3.5/M2_random_media_diet/"

# Each seed selects one pre-built prompt workbook and is also sent to the API as the sampling seed.
seeds_to_run = list(range(1000, 1010))

for seed in seeds_to_run:
    input_file = f"{input_base}task_inputs_two_options_M2_seed{seed}.xlsx"
    output_file = f"{output_base}Task_Results_Two_Options_M2_random_media_diet_4omini_seed{seed}.xlsx"

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"Failed to read {input_file}: {e}")
        continue

    print(f"\n===== Running with seed={seed} =====\n")

    task_results = []

    for idx, row in df.iterrows():
        sample_id = row["id"]  # named sample_id to avoid shadowing the built-in id()
        task_prompt = row["Task_Prompt_Full"]

        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": "You are simulating a human persona's decision-making process regarding whether to receive a COVID-19 vaccine."
                    },
                    {
                        "role": "user",
                        "content": task_prompt
                    }
                ],
                seed=seed,
                temperature=0.5,
                max_tokens=1024
            )

            assistant_response = response.choices[0].message.content
            print(f"Row {idx} Seed {seed} Response:\n{assistant_response}\n")

            match_vaccinated = re.search(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)
            match_not_vaccinated = re.search(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)

            likelihood_vaccinated = int(match_vaccinated.group(1)) if match_vaccinated else "Extraction Failed"
            likelihood_not_vaccinated = int(match_not_vaccinated.group(1)) if match_not_vaccinated else "Extraction Failed"

            # Compare the two numbers directly. The previous value->label dict
            # collapsed to a single key on a tie (e.g. 50/50), so the label was
            # decided by insertion order; ties now go to "Vaccinated", matching
            # the `>=` rule used by the other runner cells.
            if isinstance(likelihood_vaccinated, int) and isinstance(likelihood_not_vaccinated, int):
                selected_option = "Vaccinated" if likelihood_vaccinated >= likelihood_not_vaccinated else "Not Vaccinated"
            else:
                selected_option = "Undetermined"

            task_results.append({
                "id": sample_id,
                "Task_Prompt": task_prompt,
                "Task_Response": assistant_response,
                "Vaccination Decision": selected_option,
                "Likelihood of Choosing Vaccinated (%)": f"{likelihood_vaccinated}%",
                "Likelihood of Choosing Not Vaccinated (%)": f"{likelihood_not_vaccinated}%"
            })

        except Exception as e:
            # Best effort: any exception while calling or parsing skips this row only.
            print(f"Error at row {idx}, seed {seed}: {e}")
            continue

        time.sleep(0.2)

    # Results are written once per seed, after all rows were attempted.
    task_results_df = pd.DataFrame(task_results)
    task_results_df.to_excel(output_file, index=False)
    print(f"Results saved to {output_file}\n")



===== Running with seed=1000 =====

Row 0 Seed 1000 Response:
```
Brief Reasoning: The articles raise significant concerns about the safety and effectiveness of COVID-19 vaccines, particularly regarding autoimmune risks and alleged fraud in vaccine trials, which reinforce my hesitance about vaccination. Therefore, I decide not to take the vaccine due to these concerns.

Likelihood of Choosing Vaccinated: 20%
Likelihood of Choosing Not Vaccinated: 80%
```

Row 1 Seed 1000 Response:
```
Brief Reasoning: The articles highlight the ongoing rise in COVID-19 infections and hospitalizations, reinforcing my belief that vaccination is important for protection against the virus. Additionally, the study showing reduced hospitalization rates for infants born to vaccinated mothers further supports the vaccine's effectiveness and necessity. 

Likelihood of Choosing Vaccinated: 85%
Likelihood of Choosing Not Vaccinated: 15%
```

Row 2 Seed 1000 Response:
```
Brief Reasoning: The articles raise signi

KeyboardInterrupt: 

### Gemini-2.5-flash

In [None]:
import pandas as pd
import re
import time
import google.generativeai as genai

import os  # imported here so this cell also runs on its own

# Credentials come from the environment (set GOOGLE_API_KEY before running);
# never embed a real key in the notebook.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

model = genai.GenerativeModel(model_name="gemini-2.5-flash")

input_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_media_diet/"
output_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/gemini-2.5-flash/M2/"

# Here the seed only selects which pre-built prompt workbook is read;
# it is not passed to generate_content.
seeds_to_run = list(range(1000, 1010))

for seed in seeds_to_run:
    input_file = f"{input_base}task_inputs_two_options_M2_seed{seed}.xlsx"
    output_file = f"{output_base}Task_Results_Two_Options_M2_random_media_diet_gemini2.5flash_seed{seed}.xlsx"

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"Failed to read {input_file}: {e}")
        continue

    task_results = []

    for idx, row in df.iterrows():
        sample_id = row["id"]  # named sample_id to avoid shadowing the built-in id()
        task_prompt = row["Task_Prompt_Full"]

        try:
            response = model.generate_content(task_prompt)
            assistant_response = response.text.strip()

            print(f"Seed {seed} - Row {idx} Response:\n{assistant_response}\n")

            match_vaccinated = re.search(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)
            match_not_vaccinated = re.search(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)

            # -1 marks a percentage that could not be extracted from the reply.
            l_vac = int(match_vaccinated.group(1)) if match_vaccinated else -1
            l_not = int(match_not_vaccinated.group(1)) if match_not_vaccinated else -1

            if l_vac == -1 or l_not == -1:
                decision = "Extraction Failed"
            else:
                # Ties count as "Vaccinated".
                decision = "Vaccinated" if l_vac >= l_not else "Not Vaccinated"

            task_results.append({
                "id": sample_id,
                "Task_Prompt": task_prompt,
                "Task_Response": assistant_response,
                "Vaccination Decision": decision,
                "Likelihood of Choosing Vaccinated (%)": f"{l_vac}%" if l_vac >= 0 else "NA",
                "Likelihood of Choosing Not Vaccinated (%)": f"{l_not}%" if l_not >= 0 else "NA"
            })

        except Exception as e:
            # Best effort: log the failure, back off a little longer, and move on.
            print(f"Error on seed {seed}, row {idx}: {e}")
            time.sleep(1.5)

        time.sleep(0.5)

    df_results = pd.DataFrame(task_results)
    df_results.to_excel(output_file, index=False)
    print(f"Seed {seed} finished. Results saved to {output_file}\n")


### Llama-4-17b

In [None]:
import pandas as pd
import re
import time
import os
from openai import OpenAI

# Credentials come from the environment (set HF_TOKEN before running);
# never embed a real token in the notebook.
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.environ["HF_TOKEN"],
)

# The generator cell writes the seed workbooks directly into M2_random_media_diet/
# (the GPT and Gemini runners read them from there too); the extra
# "task_inputs_two_options_M2/" sub-folder used before is not produced by this notebook.
input_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_media_diet/"
output_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/Llama-4-17b/M2/"

# Here the seed only selects which pre-built prompt workbook is read;
# it is not passed to the completion call.
seeds_to_run = list(range(1000, 1010))

for seed in seeds_to_run:
    input_file = f"{input_base}task_inputs_two_options_M2_seed{seed}.xlsx"
    output_file = f"{output_base}Task_Results_Two_Options_M2_random_media_diet_Llama_4_17b_{seed}.xlsx"

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"Failed to read {input_file}: {e}")
        continue

    print(f"\n===== Running with seed={seed} =====\n")

    task_results = []

    for idx, row in df.iterrows():
        sample_id = row["id"]  # named sample_id to avoid shadowing the built-in id()
        task_prompt = row["Task_Prompt_Full"]

        try:
            completion = client.chat.completions.create(
                model="meta-llama/Llama-4-Scout-17B-16E-Instruct:cerebras",
                messages=[
                    {
                        "role": "user",
                        "content": task_prompt
                    }
                ]
            )

            assistant_response = completion.choices[0].message.content

            print(f"Seed {seed} - Row {idx} Response:\n{assistant_response}\n")

            match_vaccinated = re.search(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)
            match_not_vaccinated = re.search(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)

            likelihood_vaccinated = int(match_vaccinated.group(1)) if match_vaccinated else "Extraction Failed"
            likelihood_not_vaccinated = int(match_not_vaccinated.group(1)) if match_not_vaccinated else "Extraction Failed"

            # Compare the two numbers directly. The previous value->label dict
            # collapsed to a single key on a tie (e.g. 50/50), so the label was
            # decided by insertion order; ties now go to "Vaccinated", matching
            # the `>=` rule used by the other runner cells.
            if isinstance(likelihood_vaccinated, int) and isinstance(likelihood_not_vaccinated, int):
                selected_option = "Vaccinated" if likelihood_vaccinated >= likelihood_not_vaccinated else "Not Vaccinated"
            else:
                selected_option = "Undetermined"

            task_results.append({
                "id": sample_id,
                "Task_Prompt": task_prompt,
                "Task_Response": assistant_response,
                "Vaccination Decision": selected_option,
                "Likelihood of Choosing Vaccinated (%)": f"{likelihood_vaccinated}%",
                "Likelihood of Choosing Not Vaccinated (%)": f"{likelihood_not_vaccinated}%"
            })

        except Exception as e:
            # Best effort: any exception while calling or parsing skips this row only.
            print(f"Error at row {idx}, seed {seed}: {e}")
            continue

        time.sleep(0.2)

    task_results_df = pd.DataFrame(task_results)
    task_results_df.to_excel(output_file, index=False)
    print(f"Results saved to {output_file}\n")


## M2: random 5 articles sampling

### GPT

In [None]:
import pandas as pd
import random
import re

# ==== 1. Data Loading ====
task_df = pd.read_excel("/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/task_inputs_two_options_M0.xlsx")

# Load the article pool and keep only rows that have both a headline and an excerpt.
# NOTE(review): an earlier cell saves "media_diet_balanced_30.xlsx"; confirm that
# "media_diet_balanced_30each.xlsx" is the intended file.
media_df = pd.read_excel(
    "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/Media Diet Data/media_diet_balanced_30each.xlsx"
).dropna(subset=["Headline", "First_100_Words"])

# ==== 2. Prompt Constructor: Without Media Diet, Only Randomly Selecting Articles ====
def build_random_articles_prompt(task_prompt):
    """Insert a media block with five randomly drawn articles before "% Task Prompt".

    The five rows are sampled from the module-level ``media_df`` with a seed
    taken from the global ``random`` generator, so the draw follows ``random.seed``.

    Args:
        task_prompt: Prompt text; returned unchanged if it has no "% Task Prompt" marker.

    Returns:
        The prompt with the media block and numbered article listing inserted before the marker.
    """
    draw_seed = random.randint(0, 10000)
    sampled = media_df.sample(n=5, random_state=draw_seed)

    disease_block = """% Media Diet Information
Assume today is June 29, 2022. Your decision about receiving a COVID-19 vaccine is primarily shaped by your demographics. However, the kind of information you are exposed to in the media may also influence your thinking.
Below are five real-world news articles that reflect the type of media you are likely to encounter.
As you read them, consider:
- Do any of these articles reinforce your existing beliefs?
- Do any contradict them or introduce new concerns or reassurances?
Unless strongly challenged by the articles, your decision should remain aligned with your prior beliefs. Only adjust your position if a particular article clearly shifts your perception.
If one article is especially influential, briefly mention it in your reasoning.
"""

    # Number the sampled articles from 1 in the order they were drawn.
    entries = [
        f"Article {position}: {headline.strip()}\n{body.strip()}\n"
        for position, (headline, body) in enumerate(
            zip(sampled["Headline"], sampled["First_100_Words"]), start=1
        )
    ]
    media_context = "The articles are listed below:\n" + "".join(entries)

    marker_at = task_prompt.find("% Task Prompt")
    if marker_at == -1:
        return task_prompt
    return task_prompt[:marker_at] + disease_block + media_context + task_prompt[marker_at:]

def safe_build_random(row):
    """Row adapter for ``DataFrame.apply``: build the random-articles prompt for one task row."""
    prompt_text = row["Task_Prompt"]
    return build_random_articles_prompt(prompt_text)

# ==== 3. Generating Different Versions Through Multiple Rounds of Random Construction ====
import os  # imported here so this cell also runs on its own

task_input_path = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/task_inputs_two_options_M0.xlsx"
random_articles_dir = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_articles"

# to_excel does not create missing folders, so make sure the target exists.
os.makedirs(random_articles_dir, exist_ok=True)

# The base prompts are the same for every seed: read the workbook once and
# hand each iteration its own copy.
base_task_df = pd.read_excel(task_input_path)

for i in range(10):
    seed = 1000 + i
    # Seeding here fixes the article draws made inside build_random_articles_prompt.
    random.seed(seed)

    task_df = base_task_df.copy()
    task_df["Task_Prompt_Full"] = task_df.apply(safe_build_random, axis=1)

    # Plain substring replacement: the phrase has no regex metacharacters.
    task_df["Task_Prompt_Full"] = task_df["Task_Prompt_Full"].str.replace(
        "Now, based on the information above,",
        "Now, based on your demographics and the content of the five articles above,",
        regex=False,
    )

    output_path = f"{random_articles_dir}/task_inputs_two_options_M2_seed{seed}.xlsx"
    task_df[["id", "Task_Prompt_Full"]].to_excel(output_path, index=False)
    print(f"[Saved] Seed {seed} -> {output_path}")


[Saved] Seed 1000 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_articles/task_inputs_two_options_M2_seed1000.xlsx
[Saved] Seed 1001 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_articles/task_inputs_two_options_M2_seed1001.xlsx
[Saved] Seed 1002 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_articles/task_inputs_two_options_M2_seed1002.xlsx
[Saved] Seed 1003 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_articles/task_inputs_two_options_M2_seed1003.xlsx
[Saved] Seed 1004 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_articles/task_inputs_two_options_M2_seed1004.xlsx
[Saved] Seed 1005 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_articles/task_inputs_two_options_M2_seed1005.xlsx
[Saved] Seed 1006 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_articles/task_

In [None]:
import pandas as pd
import re
import time
import openai

import os  # imported here so this cell also runs on its own

input_dir = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_articles"
output_dir = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/Chatgpt/M2_random_5_articles"

# Credentials come from the environment; never embed a real key in the notebook.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# ==== Start multi-round seed loop ====
# Each seed selects one pre-built prompt workbook and is also sent to the API as the sampling seed.
for seed in list(range(1000, 1010)):
    input_path = f"{input_dir}/task_inputs_two_options_M2_seed{seed}.xlsx"

    # A missing or unreadable workbook skips this seed instead of aborting the whole run.
    try:
        df = pd.read_excel(input_path)
    except Exception as e:
        print(f"Failed to read {input_path}: {e}")
        continue

    task_results = []

    for idx, row in df.iterrows():
        task_prompt = row["Task_Prompt_Full"]
        sample_id = row.get("id", f"sample_{idx}")  # fall back to a positional id if the column is absent

        # A single failed request used to abort the cell and lose every result
        # collected for the seed; handle it per row like the other runner cells.
        try:
            # === Calling the Model ===
            response = client.chat.completions.create(
                model="gpt-4.1",
                messages=[
                    {
                        "role": "system",
                        "content": "You are simulating a human persona's decision-making process regarding whether to receive a COVID-19 vaccine."
                    },
                    {
                        "role": "user",
                        "content": task_prompt
                    }
                ],
                seed=seed,
                temperature=0.5,
                max_tokens=1024
            )

            assistant_response = response.choices[0].message.content
            print(f"[Seed {seed}] Row {idx} Response:\n{assistant_response}\n")

            # === Probability of Extraction ===
            match_vac = re.search(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)
            match_not_vac = re.search(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)

            likelihood_vac = int(match_vac.group(1)) if match_vac else "Extraction Failed"
            likelihood_not_vac = int(match_not_vac.group(1)) if match_not_vac else "Extraction Failed"

            if isinstance(likelihood_vac, int) and isinstance(likelihood_not_vac, int):
                # Ties count as "Vaccinated".
                selected = "Vaccinated" if likelihood_vac >= likelihood_not_vac else "Not Vaccinated"
            else:
                selected = "Extraction Failed"

            task_results.append({
                "id": sample_id,
                "Task_Prompt": task_prompt,
                "Task_Response": assistant_response,
                "Vaccination Decision": selected,
                "Likelihood of Choosing Vaccinated (%)": f"{likelihood_vac}%",
                "Likelihood of Choosing Not Vaccinated (%)": f"{likelihood_not_vac}%"
            })

        except Exception as e:
            print(f"Error at row {idx}, seed {seed}: {e}")
            continue

        time.sleep(0.2)

    results_df = pd.DataFrame(task_results)
    output_path = f"{output_dir}/Task_Results_Two_Options_M2_random_5_articles_4.1_seed{seed}.xlsx"
    results_df.to_excel(output_path, index=False)
    print(f"[Saved] Seed {seed} -> {output_path}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Likelihood of Choosing Vaccinated: 40%
Likelihood of Choosing Not Vaccinated: 60%

[Seed 1009] Row 170 Response:
Brief Reasoning: Based on my background and the media articles provided, I feel there is still a lot of controversy and negative sentiment around the vaccine, especially in Article 1, which reinforces skepticism and concerns about how unvaccinated people are treated. None of the articles strongly reassure me about the necessity or safety of the vaccine for someone like me, so I remain hesitant.

Likelihood of Choosing Vaccinated: 35%
Likelihood of Choosing Not Vaccinated: 65%

[Seed 1009] Row 171 Response:
Brief Reasoning: As a 54-year-old woman with a lower income and less formal education, I am cautious about new medical treatments, and the articles I encounter raise significant concerns about vaccine safety, especially regarding boosters and possible autoimmune risks. While one article mentions benefits in p

### Gemini-2.5-flash

In [None]:
import pandas as pd
import re
import time
import google.generativeai as genai

import os  # imported here so this cell also runs on its own

# Credentials come from the environment (set GOOGLE_API_KEY before running);
# never embed a real key in the notebook.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

model = genai.GenerativeModel(model_name="gemini-2.5-flash")

# The random-5 generator cell writes its workbooks to M2_random_articles/ (and the
# GPT runner reads them from there); "M2_random_5_articles/" is not produced by this notebook.
input_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_articles/"
output_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/gemini-2.5-flash/M2/"

# Here the seed only selects which pre-built prompt workbook is read;
# it is not passed to generate_content.
seeds_to_run = list(range(1000, 1010))

for seed in seeds_to_run:
    input_file = f"{input_base}task_inputs_two_options_M2_seed{seed}.xlsx"
    output_file = f"{output_base}Task_Results_Two_Options_M2_random_5_articles_gemini2.5flash_seed{seed}.xlsx"

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"Failed to read {input_file}: {e}")
        continue

    task_results = []

    for idx, row in df.iterrows():
        sample_id = row["id"]  # named sample_id to avoid shadowing the built-in id()
        task_prompt = row["Task_Prompt_Full"]

        try:
            response = model.generate_content(task_prompt)
            assistant_response = response.text.strip()

            print(f"Seed {seed} - Row {idx} Response:\n{assistant_response}\n")

            match_vaccinated = re.search(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)
            match_not_vaccinated = re.search(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)

            # -1 marks a percentage that could not be extracted from the reply.
            l_vac = int(match_vaccinated.group(1)) if match_vaccinated else -1
            l_not = int(match_not_vaccinated.group(1)) if match_not_vaccinated else -1

            if l_vac == -1 or l_not == -1:
                decision = "Extraction Failed"
            else:
                # Ties count as "Vaccinated".
                decision = "Vaccinated" if l_vac >= l_not else "Not Vaccinated"

            task_results.append({
                "id": sample_id,
                "Task_Prompt": task_prompt,
                "Task_Response": assistant_response,
                "Vaccination Decision": decision,
                "Likelihood of Choosing Vaccinated (%)": f"{l_vac}%" if l_vac >= 0 else "NA",
                "Likelihood of Choosing Not Vaccinated (%)": f"{l_not}%" if l_not >= 0 else "NA"
            })

        except Exception as e:
            # Best effort: log the failure, back off a little longer, and move on.
            print(f"Error on seed {seed}, row {idx}: {e}")
            time.sleep(1.5)

        time.sleep(1.2)

    df_results = pd.DataFrame(task_results)
    df_results.to_excel(output_file, index=False)
    print(f"Seed {seed} finished. Results saved to {output_file}\n")


### Llama-4-17b

In [None]:
import pandas as pd
import re
import time
import os
from openai import OpenAI

# Credentials come from the environment (set HF_TOKEN before running);
# never embed a real token in the notebook.
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.environ["HF_TOKEN"],
)

# The random-5 generator cell writes its workbooks to M2_random_articles/ (and the
# GPT runner reads them from there); "M2_random_5_articles/" is not produced by this notebook.
input_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_articles/"
output_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/Llama-4-17b/M2/"

# Here the seed only selects which pre-built prompt workbook is read;
# it is not passed to the completion call.
seeds_to_run = list(range(1000, 1010))

for seed in seeds_to_run:
    input_file = f"{input_base}task_inputs_two_options_M2_seed{seed}.xlsx"
    output_file = f"{output_base}Task_Results_Two_Options_M2_random_5_articles_Llama_4_17b_{seed}.xlsx"

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"Failed to read {input_file}: {e}")
        continue

    print(f"\n===== Running with seed={seed} =====\n")

    task_results = []

    for idx, row in df.iterrows():
        sample_id = row["id"]  # named sample_id to avoid shadowing the built-in id()
        task_prompt = row["Task_Prompt_Full"]

        try:
            completion = client.chat.completions.create(
                model="meta-llama/Llama-4-Scout-17B-16E-Instruct:cerebras",
                messages=[
                    {
                        "role": "user",
                        "content": task_prompt
                    }
                ]
            )

            assistant_response = completion.choices[0].message.content

            print(f"Seed {seed} - Row {idx} Response:\n{assistant_response}\n")

            match_vaccinated = re.search(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)
            match_not_vaccinated = re.search(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)

            likelihood_vaccinated = int(match_vaccinated.group(1)) if match_vaccinated else "Extraction Failed"
            likelihood_not_vaccinated = int(match_not_vaccinated.group(1)) if match_not_vaccinated else "Extraction Failed"

            # Compare the two numbers directly. The previous value->label dict
            # collapsed to a single key on a tie (e.g. 50/50), so the label was
            # decided by insertion order; ties now go to "Vaccinated", matching
            # the `>=` rule used by the other runner cells.
            if isinstance(likelihood_vaccinated, int) and isinstance(likelihood_not_vaccinated, int):
                selected_option = "Vaccinated" if likelihood_vaccinated >= likelihood_not_vaccinated else "Not Vaccinated"
            else:
                selected_option = "Undetermined"

            task_results.append({
                "id": sample_id,
                "Task_Prompt": task_prompt,
                "Task_Response": assistant_response,
                "Vaccination Decision": selected_option,
                "Likelihood of Choosing Vaccinated (%)": f"{likelihood_vaccinated}%",
                "Likelihood of Choosing Not Vaccinated (%)": f"{likelihood_not_vaccinated}%"
            })

        except Exception as e:
            # Best effort: any exception while calling or parsing skips this row only.
            print(f"Error at row {idx}, seed {seed}: {e}")
            continue

        time.sleep(0.2)

    task_results_df = pd.DataFrame(task_results)
    task_results_df.to_excel(output_file, index=False)
    print(f"Results saved to {output_file}\n")


## M2: random 10 articles sampling

### GPT

In [None]:
import pandas as pd
import random
import re

# Persona task prompts (base version without media content).
task_df = pd.read_excel("/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/task_inputs_two_options_M0.xlsx")

# Load the article pool and keep only rows that have both a headline and an excerpt.
# NOTE(review): an earlier cell saves "media_diet_balanced_30.xlsx"; confirm that
# "media_diet_balanced_30each.xlsx" is the intended file.
media_df = pd.read_excel(
    "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/Media Diet Data/media_diet_balanced_30each.xlsx"
).dropna(subset=["Headline", "First_100_Words"])

def build_random_articles_prompt(task_prompt):
    """Splice a media-diet section with ten randomly drawn articles into a prompt.

    Ten rows are sampled from the module-level ``media_df``. The pandas
    ``random_state`` is itself drawn from Python's global ``random`` generator,
    so the draw is reproducible once ``random.seed`` has been called.

    Args:
        task_prompt: Prompt text. The new section is inserted immediately
            before the first ``"% Task Prompt"`` marker.

    Returns:
        The prompt with the instructions and article list inserted, or
        ``task_prompt`` unchanged when the marker is not present.
    """
    sample_seed = random.randint(0, 10000)
    articles = media_df.sample(n=10, random_state=sample_seed).reset_index(drop=True)

    instructions = """% Media Diet Information
Assume today is June 29, 2022. Your decision about receiving a COVID-19 vaccine is primarily shaped by your demographics. However, the kind of information you are exposed to in the media may also influence your thinking.
Below are ten real-world news articles that reflect the type of media you are likely to encounter.
As you read them, consider:
- Do any of these articles reinforce your existing beliefs?
- Do any contradict them or introduce new concerns or reassurances?
Unless strongly challenged by the articles, your decision should remain aligned with your prior beliefs. Only adjust your position if a particular article clearly shifts your perception.
If one article is especially influential, briefly mention it in your reasoning.
"""

    # Articles are numbered from 1 in sampled order.
    article_entries = [
        f"Article {number}: {headline.strip()}\n{excerpt.strip()}\n"
        for number, (headline, excerpt) in enumerate(
            zip(articles["Headline"], articles["First_100_Words"]), start=1
        )
    ]
    media_section = "The articles are listed below:\n" + "".join(article_entries)

    marker_pos = task_prompt.find("% Task Prompt")
    if marker_pos == -1:
        return task_prompt
    return task_prompt[:marker_pos] + instructions + media_section + task_prompt[marker_pos:]

def safe_build_random(row):
    """Row adapter for ``DataFrame.apply``: build the article prompt from ``row["Task_Prompt"]``."""
    base_prompt = row["Task_Prompt"]
    return build_random_articles_prompt(base_prompt)

# Write ten prompt files, one per seed 1000..1009. Reseeding Python's RNG at the
# start of each pass makes that file's article draws reproducible.
for seed in range(1000, 1010):
    random.seed(seed)

    task_df = pd.read_excel(
        "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/task_inputs_two_options_M0.xlsx"
    )

    # First insert the sampled articles, then reword the closing instruction so
    # it refers to the ten articles.
    prompts_with_articles = task_df.apply(safe_build_random, axis=1)
    task_df["Task_Prompt_Full"] = prompts_with_articles.apply(
        lambda text: re.sub(
            r"Now, based on the information above,",
            "Now, based on your demographics and the content of the ten articles above,",
            text,
        )
    )

    output_path = f"/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_10_articles/task_inputs_two_options_M2_random_10_articles_seed{seed}.xlsx"
    task_df[["id", "Task_Prompt_Full"]].to_excel(output_path, index=False)
    print(f"[Saved] Seed {seed} -> {output_path}")


In [None]:
import pandas as pd
import re
import time
import openai

input_dir = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_10_articles"
output_dir = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/Chatgpt/M2_random_10_articles"

# No key is passed here: the OpenAI client falls back to the OPENAI_API_KEY
# environment variable, which the setup cell at the top of the notebook sets.
client = openai.OpenAI()

# Query gpt-4o-mini once per persona for each of the ten prepared prompt files.
# The file's seed number is also sent as the API sampling seed.
for seed in range(1000, 1010):
    input_path = f"{input_dir}/task_inputs_two_options_M2_random_10_articles_seed{seed}.xlsx"
    df = pd.read_excel(input_path)

    task_results = []

    # NOTE(review): an API error in this loop aborts the cell before this seed's
    # rows are written; the Gemini and Llama cells skip failed rows instead.
    for idx, row in df.iterrows():
        task_prompt = row["Task_Prompt_Full"]
        # Named sample_id (not id) so the builtin id() is not shadowed.
        sample_id = row.get("id", f"sample_{idx}")

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are simulating a human persona's decision-making process regarding whether to receive a COVID-19 vaccine."
                },
                {
                    "role": "user",
                    "content": task_prompt
                }
            ],
            seed=seed,
            temperature=0.5,
            max_tokens=1024
        )

        assistant_response = response.choices[0].message.content
        print(f"[Seed {seed}] Row {idx} Response:\n{assistant_response}\n")

        # Pull the two integer percentages out of the free-text answer.
        match_vac = re.search(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)
        match_not_vac = re.search(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)

        likelihood_vac = int(match_vac.group(1)) if match_vac else "Extraction Failed"
        likelihood_not_vac = int(match_not_vac.group(1)) if match_not_vac else "Extraction Failed"

        # Ties resolve to "Vaccinated"; a missing number marks the row as failed.
        if isinstance(likelihood_vac, int) and isinstance(likelihood_not_vac, int):
            selected = "Vaccinated" if likelihood_vac >= likelihood_not_vac else "Not Vaccinated"
        else:
            selected = "Extraction Failed"

        task_results.append({
            "id": sample_id,
            "Task_Prompt": task_prompt,
            "Task_Response": assistant_response,
            "Vaccination Decision": selected,
            "Likelihood of Choosing Vaccinated (%)": f"{likelihood_vac}%",
            "Likelihood of Choosing Not Vaccinated (%)": f"{likelihood_not_vac}%"
        })

        time.sleep(0.2)

    results_df = pd.DataFrame(task_results)
    output_path = f"{output_dir}/Task_Results_Two_Options_M2_random_10_articles_4omini_seed{seed}.xlsx"
    results_df.to_excel(output_path, index=False)
    print(f"[Saved] Seed {seed} -> {output_path}")


### Gemini-2.5-flash

In [None]:
import pandas as pd
import re
import time
import google.generativeai as genai

genai.configure(api_key="xxxxxxx")

model = genai.GenerativeModel(model_name="gemini-2.5-flash")

input_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_10_articles/"
output_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/gemini-2.5-flash/M2/"

seeds_to_run = list(range(1000, 1010))

# Run Gemini over each prepared prompt file. Here `seed` only selects the input
# and output file names; it is not passed to the model call.
for seed in seeds_to_run:
    input_file = f"{input_base}task_inputs_two_options_M2_random_10_articles_seed{seed}.xlsx"
    output_file = f"{output_base}Task_Results_Two_Options_M2_random_10_articles_gemini2.5flash_seed{seed}.xlsx"

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"Failed to read {input_file}: {e}")
        continue

    task_results = []
    failed_rows = 0  # rows whose request or parsing raised; they are not written out

    for idx, row in df.iterrows():
        # Named sample_id (not id) so the builtin id() is not shadowed.
        sample_id = row["id"]
        task_prompt = row["Task_Prompt_Full"]

        try:
            response = model.generate_content(task_prompt)
            assistant_response = response.text.strip()

            print(f"Seed {seed} - Row {idx} Response:\n{assistant_response}\n")

            match_vaccinated = re.search(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)
            match_not_vaccinated = re.search(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)

            # -1 is the "not found" sentinel; matched values are never negative.
            l_vac = int(match_vaccinated.group(1)) if match_vaccinated else -1
            l_not = int(match_not_vaccinated.group(1)) if match_not_vaccinated else -1

            if l_vac == -1 or l_not == -1:
                decision = "Extraction Failed"
            else:
                decision = "Vaccinated" if l_vac >= l_not else "Not Vaccinated"

            task_results.append({
                "id": sample_id,
                "Task_Prompt": task_prompt,
                "Task_Response": assistant_response,
                "Vaccination Decision": decision,
                "Likelihood of Choosing Vaccinated (%)": f"{l_vac}%" if l_vac >= 0 else "NA",
                "Likelihood of Choosing Not Vaccinated (%)": f"{l_not}%" if l_not >= 0 else "NA"
            })

        except Exception as e:
            failed_rows += 1
            print(f"Error on seed {seed}, row {idx}: {e}")
            time.sleep(1.5)

        time.sleep(1.2)

    df_results = pd.DataFrame(task_results)
    df_results.to_excel(output_file, index=False)
    if failed_rows:
        # Make silent row loss visible: the saved file is shorter than the input.
        print(f"[Warning] Seed {seed}: {failed_rows} rows raised an error and are missing from the output.")
    print(f"Seed {seed} finished. Results saved to {output_file}\n")


### Llama-4-17b

In [None]:
import pandas as pd
import re
import time
import os
from openai import OpenAI

client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key="xxxxxxx",
)

input_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_random_10_articles/"
output_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/Llama-4-17b/M2/"

# seeds to run
seeds_to_run = list(range(1000, 1010))

# Run Llama over each prepared prompt file. Here `seed` only selects the input
# and output file names; no seed, temperature or system message is sent.
for seed in seeds_to_run:
    input_file = f"{input_base}task_inputs_two_options_M2_random_10_articles_seed{seed}.xlsx"
    output_file = f"{output_base}Task_Results_Two_Options_M2_random_10_articles_Llama_4_17b_{seed}.xlsx"

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"Failed to read {input_file}: {e}")
        continue

    print(f"\n===== Running with seed={seed} =====\n")

    task_results = []

    for idx, row in df.iterrows():
        # Named sample_id (not id) so the builtin id() is not shadowed.
        sample_id = row["id"]
        task_prompt = row["Task_Prompt_Full"]

        try:
            completion = client.chat.completions.create(
                model="meta-llama/Llama-4-Scout-17B-16E-Instruct:cerebras",
                messages=[
                    {
                        "role": "user",
                        "content": task_prompt
                    }
                ]
            )

            assistant_response = completion.choices[0].message.content

            print(f"Seed {seed} - Row {idx} Response:\n{assistant_response}\n")

            match_vaccinated = re.search(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)
            match_not_vaccinated = re.search(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)

            likelihood_vaccinated = int(match_vaccinated.group(1)) if match_vaccinated else "Extraction Failed"
            likelihood_not_vaccinated = int(match_not_vaccinated.group(1)) if match_not_vaccinated else "Extraction Failed"

            # Compare the two numbers directly. The earlier value->label dict
            # lost one entry when both likelihoods were equal, so a 50/50 answer
            # came out as "Not Vaccinated"; a tie now counts as "Vaccinated",
            # the same rule the GPT and Gemini cells apply.
            if isinstance(likelihood_vaccinated, int) and isinstance(likelihood_not_vaccinated, int):
                if likelihood_vaccinated >= likelihood_not_vaccinated:
                    selected_option = "Vaccinated"
                else:
                    selected_option = "Not Vaccinated"
            else:
                selected_option = "Undetermined"

            task_results.append({
                "id": sample_id,
                "Task_Prompt": task_prompt,
                "Task_Response": assistant_response,
                "Vaccination Decision": selected_option,
                "Likelihood of Choosing Vaccinated (%)": f"{likelihood_vaccinated}%",
                "Likelihood of Choosing Not Vaccinated (%)": f"{likelihood_not_vaccinated}%"
            })

        except Exception as e:
            # Failed rows are skipped and therefore absent from the saved file.
            print(f"Error at row {idx}, seed {seed}: {e}")
            continue

        time.sleep(0.2)

    task_results_df = pd.DataFrame(task_results)
    task_results_df.to_excel(output_file, index=False)
    print(f"Results saved to {output_file}\n")


## M2: demographics-only assignment

### GPT

In [None]:
import re
import pandas as pd

# Load the saved classification responses; the code below reads the "response"
# column and later writes the labelled table back to this same file.
results_df = pd.read_excel("/content/drive/MyDrive/LLM Vaccination Attitudes/Results/Media Diet/gpt4omini_demographics_results.xlsx")
def extract_answer(text):
    """Return the single digit that follows ``Answer:`` in ``text``.

    ``text`` is converted with ``str()`` first, so non-string cells are
    accepted. Only the first digit after the label is read.

    Returns:
        The digit as an ``int``, or ``None`` when no ``Answer: <digit>`` is found.
    """
    found = re.search(r'Answer:\s*(\d)', str(text))
    if found is None:
        return None
    return int(found.group(1))

# Numeric answer code -> media-diet label. Codes outside this table map to NaN.
label_map = {
    1: "Left Echochamber",
    2: "Right Echochamber",
    3: "Center-ish",
    4: "Misinformation Only"
}

results_df['predicted_answer'] = results_df['response'].apply(extract_answer)
results_df['media_diet_label'] = results_df['predicted_answer'].map(label_map)

# Persist the labelled table over the file it was loaded from.
results_df.to_excel(
    "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/Media Diet/gpt4omini_demographics_results.xlsx",
    index=False,
)

# Summarise how the personas were distributed across diets.
label_counts = results_df['media_diet_label'].value_counts().sort_index()
label_shares = results_df['media_diet_label'].value_counts(normalize=True).sort_index()

print("Frequency distribution:")
print(label_counts)

print("\nPercentage distribution:")
print(label_shares.apply(lambda share: f"{share:.1%}"))

Frequency distribution:
media_diet_label
Center-ish           324
Left Echochamber     227
Right Echochamber    449
Name: count, dtype: int64

Percentage distribution:
media_diet_label
Center-ish           32.4%
Left Echochamber     22.7%
Right Echochamber    44.9%
Name: proportion, dtype: object


In [None]:
import pandas as pd
import random
import re

# Inputs: persona prompts, the per-persona diet labels saved by the previous
# cell, the article table, and the diet-by-category quota matrix.
task_df = pd.read_excel(
    "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/task_inputs_two_options_M0.xlsx"
)
mapping_df = pd.read_excel(
    "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/Media Diet/gpt4omini_demographics_results.xlsx"
)
media_df = pd.read_excel(
    "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/Media Diet Data/media_diet_with_extracted_words.xlsx"
)

# Quota column names are compared with media_df["Category"] when sampling, so
# the "Misinfo" column is renamed to "Low Credibility".
matrix_df = pd.read_excel(
    "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/Media Diet Data/Media_Diets.xlsx",
    sheet_name="Media Diets",
    index_col=0
).rename(columns={
    "Misinfo": "Low Credibility",
})

diet_labels = mapping_df[["id", "media_diet_label"]]
task_df = task_df.merge(diet_labels, left_on="id", right_on="id", how="left")

# For every diet (matrix row) keep only the categories with a positive count.
diet_to_article_quota = {}
for diet_name in matrix_df.index:
    diet_row = matrix_df.loc[diet_name]
    diet_to_article_quota[diet_name] = diet_row[diet_row > 0].to_dict()

def build_rag_prompt(task_prompt, diet, _):
    """Splice up to five articles matching a media diet into a task prompt.

    For each category in the diet's quota (module-level
    ``diet_to_article_quota``), articles are sampled from the module-level
    ``media_df``. Categories with no usable rows are skipped with a warning.
    The pooled articles are shuffled and the first five are listed.

    Args:
        task_prompt: Prompt text. The section is inserted immediately before
            the first ``"% Task Prompt"`` marker.
        diet: Diet label used to look up the quota. An unknown label yields an
            empty article list.
        _: Ignored.

    Returns:
        The prompt with the media-diet section inserted, or ``task_prompt``
        unchanged when the marker is not present.
    """
    picked = []
    for category, wanted in diet_to_article_quota.get(diet, {}).items():
        in_category = media_df["Category"] == category
        pool = media_df.loc[in_category, ["Headline", "First_100_Words"]].dropna()
        if len(pool) == 0:
            print(f"[Warning] No articles available for category '{category}' (diet: {diet})")
            continue
        # Never request more rows than the category can supply.
        n_draw = min(wanted, len(pool))
        drawn = pool.sample(n=n_draw, random_state=random.randint(0, 10000))
        picked.extend(zip(drawn["Headline"], drawn["First_100_Words"]))

    random.shuffle(picked)

    instructions = """% Media Diet Information
Assume today is June 29, 2022. Your decision about receiving a COVID-19 vaccine is primarily shaped by your demographics. However, the kind of information you are exposed to in the media may also influence your thinking.
Below are five real-world news articles that reflect the type of media you are likely to encounter.
As you read them, consider:
- Do any of these articles reinforce your existing beliefs?
- Do any contradict them or introduce new concerns or reassurances?
Unless strongly challenged by the articles, your decision should remain aligned with your prior beliefs. Only adjust your position if a particular article clearly shifts your perception.
If one article is especially influential, briefly mention it in your reasoning.
"""

    article_text = "".join(
        f"Article {number}: {title.strip()}\n{content.strip()}\n"
        for number, (title, content) in enumerate(picked[:5], start=1)
    )
    media_section = "The articles are listed below:\n" + article_text

    marker_pos = task_prompt.find("% Task Prompt")
    if marker_pos == -1:
        return task_prompt
    return task_prompt[:marker_pos] + instructions + media_section + task_prompt[marker_pos:]

def safe_build(row):
    """Row adapter for ``DataFrame.apply``: build the diet-matched prompt for one persona.

    Prints a warning when the row's ``media_diet_label`` has no entry in
    ``diet_to_article_quota``; the prompt is still built in that case.
    """
    diet = row["media_diet_label"]
    is_known_diet = diet in diet_to_article_quota
    if not is_known_diet:
        print(f"[Warning] Diet not found in quota matrix: {diet}")
    base_prompt = row["Task_Prompt"]
    return build_rag_prompt(base_prompt, diet, None)

# Write ten prompt files, one per seed 1000..1009. Reseeding Python's RNG at the
# start of each pass makes that file's article draws reproducible.
for seed in range(1000, 1010):
    random.seed(seed)

    # Start from a fresh copy of the task table and attach each persona's diet.
    task_df = pd.read_excel(
        "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/task_inputs_two_options_M0.xlsx"
    ).merge(mapping_df[["id", "media_diet_label"]], on="id", how="left")

    # First insert the sampled articles, then reword the closing instruction so
    # it refers to the five articles.
    prompts_with_articles = task_df.apply(safe_build, axis=1)
    task_df["Task_Prompt_Full"] = prompts_with_articles.apply(
        lambda text: re.sub(
            r"Now, based on the information above,",
            "Now, based on your demographics and the content of the five articles above,",
            text,
        )
    )

    output_path = f"/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_demo_only_diet/task_inputs_two_options_M2_only_demographics_for_media_dietseed{seed}.xlsx"
    task_df[["id", "Task_Prompt_Full", "media_diet_label"]].to_excel(output_path, index=False)
    print(f"[Saved] Seed {seed} -> {output_path}")

[Saved] Seed 1000 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_demo_only_diet/task_inputs_two_options_M2_only_demographics_for_media_dietseed1000.xlsx
[Saved] Seed 1001 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_demo_only_diet/task_inputs_two_options_M2_only_demographics_for_media_dietseed1001.xlsx
[Saved] Seed 1002 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_demo_only_diet/task_inputs_two_options_M2_only_demographics_for_media_dietseed1002.xlsx
[Saved] Seed 1003 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_demo_only_diet/task_inputs_two_options_M2_only_demographics_for_media_dietseed1003.xlsx
[Saved] Seed 1004 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_demo_only_diet/task_inputs_two_options_M2_only_demographics_for_media_dietseed1004.xlsx
[Saved] Seed 1005 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_demo_only_diet

In [None]:
import pandas as pd
import re
import time
import openai

import os

input_dir = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_demo_only_diet"
output_dir = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/Chatgpt/M2_demo_only_diet"

# No key is passed here: the OpenAI client falls back to the OPENAI_API_KEY
# environment variable, which the setup cell at the top of the notebook sets.
client = openai.OpenAI()

# Query gpt-4o-mini once per persona for each of the ten prepared prompt files.
# The file's seed number is also sent as the API sampling seed.
for seed in range(1000, 1010):
    # The generator cell saves "...media_dietseed{seed}.xlsx" (no underscore
    # before "seed"). This cell used to look for "...media_diet_seed{seed}.xlsx",
    # so that spelling is kept as a fallback for files already on Drive.
    candidate_paths = [
        f"{input_dir}/task_inputs_two_options_M2_only_demographics_for_media_dietseed{seed}.xlsx",
        f"{input_dir}/task_inputs_two_options_M2_only_demographics_for_media_diet_seed{seed}.xlsx",
    ]
    input_path = next((p for p in candidate_paths if os.path.exists(p)), candidate_paths[0])
    df = pd.read_excel(input_path)

    task_results = []

    # NOTE(review): an API error in this loop aborts the cell before this seed's
    # rows are written; the Gemini and Llama cells skip failed rows instead.
    for idx, row in df.iterrows():
        task_prompt = row["Task_Prompt_Full"]
        # Named sample_id (not id) so the builtin id() is not shadowed.
        sample_id = row.get("id", f"sample_{idx}")

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are simulating a human persona's decision-making process regarding whether to receive a COVID-19 vaccine."
                },
                {
                    "role": "user",
                    "content": task_prompt
                }
            ],
            seed=seed,
            temperature=0.5,
            max_tokens=1024
        )

        assistant_response = response.choices[0].message.content
        print(f"[Seed {seed}] Row {idx} Response:\n{assistant_response}\n")

        # Pull the two integer percentages out of the free-text answer.
        match_vac = re.search(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)
        match_not_vac = re.search(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)

        likelihood_vac = int(match_vac.group(1)) if match_vac else "Extraction Failed"
        likelihood_not_vac = int(match_not_vac.group(1)) if match_not_vac else "Extraction Failed"

        # Ties resolve to "Vaccinated"; a missing number marks the row as failed.
        if isinstance(likelihood_vac, int) and isinstance(likelihood_not_vac, int):
            selected = "Vaccinated" if likelihood_vac >= likelihood_not_vac else "Not Vaccinated"
        else:
            selected = "Extraction Failed"

        task_results.append({
            "id": sample_id,
            "Task_Prompt": task_prompt,
            "Task_Response": assistant_response,
            "Vaccination Decision": selected,
            "Likelihood of Choosing Vaccinated (%)": f"{likelihood_vac}%",
            "Likelihood of Choosing Not Vaccinated (%)": f"{likelihood_not_vac}%"
        })

        time.sleep(0.8)

    results_df = pd.DataFrame(task_results)
    output_path = f"{output_dir}/Task_Results_Two_Options_M2_only_demographics_for_media_diet_4omini_seed{seed}.xlsx"
    results_df.to_excel(output_path, index=False)
    print(f"[Saved] Seed {seed} -> {output_path}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[Seed 1008] Row 697 Response:
```
Brief Reasoning: The articles highlight the severe impact of COVID-19, including the staggering death toll and the ongoing need for vaccination, particularly as new vaccines are being authorized for young children. This reinforces my belief in the importance of vaccination for protection against the virus, leading me to decide to receive the vaccine.

Likelihood of Choosing Vaccinated: 80%
Likelihood of Choosing Not Vaccinated: 20%
```

[Seed 1008] Row 698 Response:
```
Brief Reasoning: The articles highlight the ongoing risks associated with COVID-19, including the potential for reinfection and long COVID, which reinforces my belief in the importance of vaccination for protection against the virus. Given my age and demographic factors, I feel that vaccination is a necessary step to safeguard my health.

Likelihood of Choosing Vaccinated: 80%
Likelihood of Choosing Not Vaccinated: 20%
```

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

filtered_path = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/test_500/Chatgpt/Three_Task_Original_Response.xlsx"
# NOTE(review): this points at an M2_random_10_articles result under Chatgpt3.5/,
# although the cell sits in the demographics-only section — confirm the intended file.
results_path = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/Chatgpt3.5/M2_random_10_articles/Task_Results_Two_Options_M2_random_10_articles_4.1_seed1009.xlsx"
filtered_df = pd.read_excel(filtered_path)
task_results_df = pd.read_excel(results_path)

# Align key dtypes so the merge on "id" matches.
filtered_df["id"] = filtered_df["id"].astype(int)
task_results_df["id"] = task_results_df["id"].astype(int)

# Keep only personas present in both the ground-truth and the prediction tables.
merged_df = task_results_df.merge(
    filtered_df[["id", "Task3_Original_Response_Two_Options"]],
    on="id",
    how="inner"
)

merged_df["true_label"] = merged_df["Task3_Original_Response_Two_Options"]
merged_df["predicted_label"] = merged_df["Vaccination Decision"]

print("Label Set:", sorted(set(merged_df["true_label"]) | set(merged_df["predicted_label"])))

eps = 1e-10  # keeps log() finite when the stated probability is exactly 0 or 1

# The runner cells store failures as "Extraction Failed%" or "NA", so a plain
# float cast can raise. Unparseable entries become NaN instead.
likelihood_text = (
    merged_df["Likelihood of Choosing Vaccinated (%)"]
    .astype(str)
    .str.replace("%", "", regex=False)
)
merged_df["prob_vaccinated"] = pd.to_numeric(likelihood_text, errors="coerce") / 100

n_unparsed = int(merged_df["prob_vaccinated"].isna().sum())
if n_unparsed:
    print(f"[Warning] {n_unparsed} rows have no numeric likelihood and are excluded from the cross-entropy averages.")

merged_df["true_binary"] = (merged_df["true_label"] == "Vaccinated").astype(int)

# Per-row binary cross entropy of the stated P(Vaccinated) against the true label.
merged_df["cross_entropy"] = -(
    merged_df["true_binary"] * np.log(merged_df["prob_vaccinated"] + eps) +
    (1 - merged_df["true_binary"]) * np.log(1 - merged_df["prob_vaccinated"] + eps)
)

# groupby().mean() skips NaN, so rows without a probability do not affect the averages.
entropy_by_class = merged_df.groupby("true_label")["cross_entropy"].mean()

report = classification_report(
    merged_df["true_label"],
    merged_df["predicted_label"],
    labels=["Vaccinated", "Not Vaccinated"],
    target_names=["Vaccinated", "Not Vaccinated"],
    digits=4
)

print("\nClassification Report:")
print(report)

print("\nBinary Cross Entropy Loss by Class:")
for label, loss in entropy_by_class.items():
    print(f"{label}: {loss:.4f}")

Label Set: ['Not Vaccinated', 'Vaccinated']

Classification Report:
                precision    recall  f1-score   support

    Vaccinated     0.8591    0.7792    0.8172       806
Not Vaccinated     0.3383    0.4691    0.3931       194

      accuracy                         0.7190      1000
     macro avg     0.5987    0.6241    0.6051      1000
  weighted avg     0.7581    0.7190    0.7349      1000


Binary Cross Entropy Loss by Class:
Not Vaccinated: 1.0796
Vaccinated: 0.3767


### Gemini-2.5-flash

In [None]:
import pandas as pd
import re
import time
import google.generativeai as genai

genai.configure(api_key="xxxxxxx")

model = genai.GenerativeModel(model_name="gemini-2.5-flash")

# The demographics-only prompt files are written to M2_demo_only_diet/ by the
# generator cell. This previously pointed at M2_random_10_articles/, where the
# file names built below do not exist, so every seed was skipped.
input_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_demo_only_diet/"
output_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/gemini-2.5-flash/M2/"

# NOTE(review): only seeds 1004..1009 are run here, while the other cells run
# 1000..1009 — looks like a resumed run; confirm before a full re-execution.
seeds_to_run = list(range(1004, 1010))

# Here `seed` only selects the input and output file names; it is not passed
# to the model call.
for seed in seeds_to_run:
    input_file = f"{input_base}task_inputs_two_options_M2_only_demographics_for_media_dietseed{seed}.xlsx"
    output_file = f"{output_base}Task_Results_Two_Options_M2_demo_only_diet_gemini2.5flash_seed{seed}.xlsx"

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"Failed to read {input_file}: {e}")
        continue

    task_results = []
    failed_rows = 0  # rows whose request or parsing raised; they are not written out

    for idx, row in df.iterrows():
        # Named sample_id (not id) so the builtin id() is not shadowed.
        sample_id = row["id"]
        task_prompt = row["Task_Prompt_Full"]

        try:
            response = model.generate_content(task_prompt)
            assistant_response = response.text.strip()

            print(f"Seed {seed} - Row {idx} Response:\n{assistant_response}\n")

            match_vaccinated = re.search(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)
            match_not_vaccinated = re.search(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)

            # -1 is the "not found" sentinel; matched values are never negative.
            l_vac = int(match_vaccinated.group(1)) if match_vaccinated else -1
            l_not = int(match_not_vaccinated.group(1)) if match_not_vaccinated else -1

            if l_vac == -1 or l_not == -1:
                decision = "Extraction Failed"
            else:
                decision = "Vaccinated" if l_vac >= l_not else "Not Vaccinated"

            task_results.append({
                "id": sample_id,
                "Task_Prompt": task_prompt,
                "Task_Response": assistant_response,
                "Vaccination Decision": decision,
                "Likelihood of Choosing Vaccinated (%)": f"{l_vac}%" if l_vac >= 0 else "NA",
                "Likelihood of Choosing Not Vaccinated (%)": f"{l_not}%" if l_not >= 0 else "NA"
            })

        except Exception as e:
            failed_rows += 1
            print(f"Error on seed {seed}, row {idx}: {e}")
            time.sleep(1.5)

        time.sleep(0.5)

    df_results = pd.DataFrame(task_results)
    df_results.to_excel(output_file, index=False)
    if failed_rows:
        # Make silent row loss visible: the saved file is shorter than the input.
        print(f"[Warning] Seed {seed}: {failed_rows} rows raised an error and are missing from the output.")
    print(f"Seed {seed} finished. Results saved to {output_file}\n")


### Llama-4-17b

In [None]:
import pandas as pd
import re
import time
import os
from openai import OpenAI

client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key="xxxxxxx",
)

input_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_demo_only_diet/"
output_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/Llama-4-17b/M2/"

# seeds to run
seeds_to_run = list(range(1000, 1010))

# Run Llama over each prepared prompt file. Here `seed` only selects the input
# and output file names; no seed, temperature or system message is sent.
for seed in seeds_to_run:
    input_file = f"{input_base}task_inputs_two_options_M2_only_demographics_for_media_dietseed{seed}.xlsx"
    output_file = f"{output_base}Task_Results_Two_Options_M2_demo_only_diet_Llama_4_17b_{seed}.xlsx"

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"Failed to read {input_file}: {e}")
        continue

    print(f"\n===== Running with seed={seed} =====\n")

    task_results = []

    for idx, row in df.iterrows():
        # Named sample_id (not id) so the builtin id() is not shadowed.
        sample_id = row["id"]
        task_prompt = row["Task_Prompt_Full"]

        try:
            completion = client.chat.completions.create(
                model="meta-llama/Llama-4-Scout-17B-16E-Instruct:cerebras",
                messages=[
                    {
                        "role": "user",
                        "content": task_prompt
                    }
                ]
            )

            assistant_response = completion.choices[0].message.content

            print(f"Seed {seed} - Row {idx} Response:\n{assistant_response}\n")

            match_vaccinated = re.search(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)
            match_not_vaccinated = re.search(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?", assistant_response)

            likelihood_vaccinated = int(match_vaccinated.group(1)) if match_vaccinated else "Extraction Failed"
            likelihood_not_vaccinated = int(match_not_vaccinated.group(1)) if match_not_vaccinated else "Extraction Failed"

            # Compare the two numbers directly. The earlier value->label dict
            # lost one entry when both likelihoods were equal, so a 50/50 answer
            # came out as "Not Vaccinated"; a tie now counts as "Vaccinated",
            # the same rule the GPT and Gemini cells apply.
            if isinstance(likelihood_vaccinated, int) and isinstance(likelihood_not_vaccinated, int):
                if likelihood_vaccinated >= likelihood_not_vaccinated:
                    selected_option = "Vaccinated"
                else:
                    selected_option = "Not Vaccinated"
            else:
                selected_option = "Undetermined"

            task_results.append({
                "id": sample_id,
                "Task_Prompt": task_prompt,
                "Task_Response": assistant_response,
                "Vaccination Decision": selected_option,
                "Likelihood of Choosing Vaccinated (%)": f"{likelihood_vaccinated}%",
                "Likelihood of Choosing Not Vaccinated (%)": f"{likelihood_not_vaccinated}%"
            })

        except Exception as e:
            # Failed rows are skipped and therefore absent from the saved file.
            print(f"Error at row {idx}, seed {seed}: {e}")
            continue

        time.sleep(0.2)

    task_results_df = pd.DataFrame(task_results)
    task_results_df.to_excel(output_file, index=False)
    print(f"Results saved to {output_file}\n")


## M2: misinformation only

### GPT

In [None]:
import pandas as pd
import random
import re

# ==== Loading Tasks and Media Data ====
# Only the path is stored for the task file; the generation loop reads it per seed.
task_df_path = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/task_inputs_two_options_M0.xlsx"
media_df = pd.read_excel(
    "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/Media Diet Data/media_diet_with_extracted_words.xlsx"
)

# ==== Retain Only Low Credibility (Misinformation Only) Articles ====
# Rows lacking a headline or an excerpt are dropped so both can be stripped later.
is_low_credibility = media_df["Category"] == "Low Credibility"
misinfo_articles = media_df.loc[is_low_credibility].dropna(subset=["Headline", "First_100_Words"])

# ==== Constructing Misinformation Prompt ====
def build_misinfo_prompt(task_prompt):
    """Splice a media-diet section with five low-credibility articles into a prompt.

    Five rows are sampled from the module-level ``misinfo_articles``. The
    pandas ``random_state`` is drawn from Python's global ``random`` generator,
    so the draw is reproducible once ``random.seed`` has been called.

    Args:
        task_prompt: Prompt text. The section is inserted immediately before
            the first ``"% Task Prompt"`` marker.

    Returns:
        The prompt with the instructions and article list inserted, or
        ``task_prompt`` unchanged when the marker is not present.
    """
    sample_seed = random.randint(0, 10000)
    articles = misinfo_articles.sample(n=5, random_state=sample_seed).reset_index(drop=True)

    instructions = """% Media Diet Information
Assume today is June 29, 2022. Your decision about receiving a COVID-19 vaccine is primarily shaped by your demographics. However, the kind of information you are exposed to in the media may also influence your thinking.
Below are five real-world news articles from sources that may contain misinformation.
As you read them, consider:
- Do any of these articles reinforce your existing beliefs?
- Do any contradict them or introduce new concerns or reassurances?
Unless strongly challenged by the articles, your decision should remain aligned with your prior beliefs. Only adjust your position if a particular article clearly shifts your perception.
If one article is especially influential, briefly mention it in your reasoning.
"""

    # Articles are numbered from 1 in sampled order.
    article_entries = [
        f"Article {number}: {headline.strip()}\n{excerpt.strip()}\n"
        for number, (headline, excerpt) in enumerate(
            zip(articles["Headline"], articles["First_100_Words"]), start=1
        )
    ]
    media_section = "The articles are listed below:\n" + "".join(article_entries)

    marker_pos = task_prompt.find("% Task Prompt")
    if marker_pos == -1:
        return task_prompt
    return task_prompt[:marker_pos] + instructions + media_section + task_prompt[marker_pos:]

# ==== Multi-Round Generation ====
# Write ten prompt files, one per seed 1000..1009. Reseeding Python's RNG at the
# start of each pass makes that file's article draws reproducible.
for seed in range(1000, 1010):
    random.seed(seed)

    task_df = pd.read_excel(task_df_path)
    # Every persona is given the same diet label in this condition.
    task_df["media_diet_label"] = "Misinformation Only"

    prompts_with_articles = task_df["Task_Prompt"].apply(build_misinfo_prompt)

    # Replace the prompt's introductory statement
    task_df["Task_Prompt_Full"] = prompts_with_articles.apply(
        lambda text: re.sub(
            r"Now, based on the information above,",
            "Now, based on your demographics and the content of the five articles above,",
            text,
        )
    )

    output_path = f"/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_misinformation_for_everyone/task_inputs_two_options_M2_misinformation_for_everyone_seed{seed}.xlsx"
    task_df[["id", "Task_Prompt_Full", "media_diet_label"]].to_excel(output_path, index=False)
    print(f"[Saved] Seed {seed} -> {output_path}")


[Saved] Seed 1000 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_misinformation_for_everyone/task_inputs_two_options_M2_misinformation_for_everyone_seed1000.xlsx
[Saved] Seed 1001 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_misinformation_for_everyone/task_inputs_two_options_M2_misinformation_for_everyone_seed1001.xlsx
[Saved] Seed 1002 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_misinformation_for_everyone/task_inputs_two_options_M2_misinformation_for_everyone_seed1002.xlsx
[Saved] Seed 1003 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_misinformation_for_everyone/task_inputs_two_options_M2_misinformation_for_everyone_seed1003.xlsx
[Saved] Seed 1004 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_misinformation_for_everyone/task_inputs_two_options_M2_misinformation_for_everyone_seed1004.xlsx
[Saved] Seed 1005 -> /content/drive/MyDrive/LLM Vaccination 

In [None]:
import pandas as pd
import re
import time
import openai

# Query gpt-4o-mini once per row of each seeded misinformation prompt file and
# save one results workbook per seed.

# The misinformation prompt files are written to M2_misinformation_for_everyone
# by the generation cell; the previous value pointed at the public-health
# folder, where files with this name are not created.
input_dir = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_misinformation_for_everyone"
# NOTE(review): results for the misinformation condition are still written
# into a folder named M2_public_health_for_everyone (and the file name says
# "4.1" while the model below is gpt-4o-mini). Left unchanged because later
# analysis may read from this location -- confirm and rename if unintended.
output_dir = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/Chatgpt3.5/M2_public_health_for_everyone"

# With no api_key argument the client reads OPENAI_API_KEY from the
# environment (set in the notebook's setup cell), so no key is embedded here.
client = openai.OpenAI()

# Compiled once; the same two patterns are applied to every response.
vaccinated_pattern = re.compile(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?")
not_vaccinated_pattern = re.compile(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?")

for seed in range(1000, 1010):
    input_path = f"{input_dir}/task_inputs_two_options_M2_misinformation_for_everyone_seed{seed}.xlsx"
    df = pd.read_excel(input_path)

    task_results = []

    for idx, row in df.iterrows():
        task_prompt = row["Task_Prompt_Full"]
        # `row_id` rather than `id`, so the builtin id() is not shadowed for
        # the rest of the kernel session.
        row_id = row.get("id", f"sample_{idx}")

        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": "You are simulating a human persona's decision-making process regarding whether to receive a COVID-19 vaccine."
                    },
                    {
                        "role": "user",
                        "content": task_prompt
                    }
                ],
                seed=seed,
                temperature=0.5,
                max_tokens=1024
            )
        except Exception as e:
            # One failed request must not discard the rows already collected
            # for this seed; log it and move on, as the Gemini/Llama cells do.
            print(f"[Seed {seed}] Row {idx} request failed: {e}")
            time.sleep(1.5)
            continue

        # `content` can be None; fall back to "" so the regex search below
        # reports an extraction failure instead of raising TypeError.
        assistant_response = response.choices[0].message.content or ""
        print(f"[Seed {seed}] Row {idx} Response:\n{assistant_response}\n")

        match_vac = vaccinated_pattern.search(assistant_response)
        match_not_vac = not_vaccinated_pattern.search(assistant_response)

        likelihood_vac = int(match_vac.group(1)) if match_vac else "Extraction Failed"
        likelihood_not_vac = int(match_not_vac.group(1)) if match_not_vac else "Extraction Failed"

        # A decision is only recorded when both percentages were parsed;
        # a tie counts as "Vaccinated".
        if isinstance(likelihood_vac, int) and isinstance(likelihood_not_vac, int):
            selected = "Vaccinated" if likelihood_vac >= likelihood_not_vac else "Not Vaccinated"
        else:
            selected = "Extraction Failed"

        task_results.append({
            "id": row_id,
            "Task_Prompt": task_prompt,
            "Task_Response": assistant_response,
            "Vaccination Decision": selected,
            "Likelihood of Choosing Vaccinated (%)": f"{likelihood_vac}%",
            "Likelihood of Choosing Not Vaccinated (%)": f"{likelihood_not_vac}%"
        })

        time.sleep(0.2)

    results_df = pd.DataFrame(task_results)
    output_path = f"{output_dir}/Task_Results_Two_Options_M2_misinformation_for_everyone_4.1_seed{seed}.xlsx"
    results_df.to_excel(output_path, index=False)
    print(f"[Saved] Seed {seed} -> {output_path}")

### Gemini-2.5-flash

In [None]:
import pandas as pd
import re
import time
import google.generativeai as genai

# Query gemini-2.5-flash once per row of each seeded misinformation prompt
# file and save one results workbook per seed.
# NOTE(review): the API key is a literal in the notebook; prefer loading it
# from an environment variable or a secrets store before sharing.
genai.configure(api_key="xxxxxxx")

model = genai.GenerativeModel(model_name="gemini-2.5-flash")

input_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_misinformation_for_everyone/"
output_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/gemini-2.5-flash/M2/"

# NOTE(review): only seeds 1007-1009 are processed here, whereas the other
# model cells cover 1000-1009 -- presumably a resumed run; confirm.
# In this cell `seed` only selects the input/output file; it is not passed
# to generate_content.
seeds_to_run = list(range(1007, 1010))

# Compiled once; the same two patterns are applied to every response.
vaccinated_pattern = re.compile(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?")
not_vaccinated_pattern = re.compile(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?")

for seed in seeds_to_run:
    input_file = f"{input_base}task_inputs_two_options_M2_misinformation_for_everyone_seed{seed}.xlsx"
    output_file = f"{output_base}Task_Results_Two_Options_M2_misinformation_for_everyone_gemini2.5flash_seed{seed}.xlsx"

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"Failed to read {input_file}: {e}")
        continue

    task_results = []

    for idx, row in df.iterrows():
        # `row_id` rather than `id`, so the builtin id() is not shadowed for
        # the rest of the kernel session.
        row_id = row["id"]
        task_prompt = row["Task_Prompt_Full"]

        try:
            response = model.generate_content(task_prompt)
            assistant_response = response.text.strip()

            print(f"Seed {seed} - Row {idx} Response:\n{assistant_response}\n")

            match_vaccinated = vaccinated_pattern.search(assistant_response)
            match_not_vaccinated = not_vaccinated_pattern.search(assistant_response)

            # -1 is the sentinel for "percentage not found in the response".
            l_vac = int(match_vaccinated.group(1)) if match_vaccinated else -1
            l_not = int(match_not_vaccinated.group(1)) if match_not_vaccinated else -1

            if l_vac == -1 or l_not == -1:
                decision = "Extraction Failed"
            else:
                # A tie counts as "Vaccinated".
                decision = "Vaccinated" if l_vac >= l_not else "Not Vaccinated"

            task_results.append({
                "id": row_id,
                "Task_Prompt": task_prompt,
                "Task_Response": assistant_response,
                "Vaccination Decision": decision,
                "Likelihood of Choosing Vaccinated (%)": f"{l_vac}%" if l_vac >= 0 else "NA",
                "Likelihood of Choosing Not Vaccinated (%)": f"{l_not}%" if l_not >= 0 else "NA"
            })

        except Exception as e:
            # Best effort: a failed row is logged and left out of the results
            # file; the extra pause is followed by the regular one below.
            print(f"Error on seed {seed}, row {idx}: {e}")
            time.sleep(1.5)

        time.sleep(0.5)

    df_results = pd.DataFrame(task_results)
    df_results.to_excel(output_file, index=False)
    print(f"Seed {seed} finished. Results saved to {output_file}\n")

### Llama-4-17b

In [None]:
import pandas as pd
import re
import time
import os
from openai import OpenAI

# Query Llama-4-Scout-17B through the Hugging Face router once per row of each
# seeded misinformation prompt file and save one results workbook per seed.
# NOTE(review): the API key is a literal in the notebook; prefer loading it
# from an environment variable or a secrets store before sharing.
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key="xxxxxxx",
)

input_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_misinformation_for_everyone/"
output_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/Llama-4-17b/M2/"

# seeds to run
# In this cell `seed` only selects the input/output file; it is not passed
# to the completion request.
seeds_to_run = list(range(1000, 1010))

# Compiled once; the same two patterns are applied to every response.
vaccinated_pattern = re.compile(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?")
not_vaccinated_pattern = re.compile(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?")

for seed in seeds_to_run:
    input_file = f"{input_base}task_inputs_two_options_M2_misinformation_for_everyone_seed{seed}.xlsx"
    output_file = f"{output_base}Task_Results_Two_Options_M2_misinformation_for_everyone_Llama_4_17b_{seed}.xlsx"

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"Failed to read {input_file}: {e}")
        continue

    print(f"\n===== Running with seed={seed} =====\n")

    task_results = []

    for idx, row in df.iterrows():
        # `row_id` rather than `id`, so the builtin id() is not shadowed for
        # the rest of the kernel session.
        row_id = row["id"]
        task_prompt = row["Task_Prompt_Full"]

        try:
            completion = client.chat.completions.create(
                model="meta-llama/Llama-4-Scout-17B-16E-Instruct:cerebras",
                messages=[
                    {
                        "role": "user",
                        "content": task_prompt
                    }
                ]
            )

            assistant_response = completion.choices[0].message.content

            print(f"Seed {seed} - Row {idx} Response:\n{assistant_response}\n")

            match_vaccinated = vaccinated_pattern.search(assistant_response)
            match_not_vaccinated = not_vaccinated_pattern.search(assistant_response)

            likelihood_vaccinated = int(match_vaccinated.group(1)) if match_vaccinated else "Extraction Failed"
            likelihood_not_vaccinated = int(match_not_vaccinated.group(1)) if match_not_vaccinated else "Extraction Failed"

            # Compare the two percentages directly. The earlier
            # {likelihood: label} lookup collapsed to a single key when both
            # values were equal (e.g. 50/50), which labelled ties
            # "Not Vaccinated"; the GPT and Gemini cells label ties
            # "Vaccinated", so the same rule is applied here.
            if isinstance(likelihood_vaccinated, int) and isinstance(likelihood_not_vaccinated, int):
                selected_option = "Vaccinated" if likelihood_vaccinated >= likelihood_not_vaccinated else "Not Vaccinated"
            else:
                selected_option = "Undetermined"

            task_results.append({
                "id": row_id,
                "Task_Prompt": task_prompt,
                "Task_Response": assistant_response,
                "Vaccination Decision": selected_option,
                "Likelihood of Choosing Vaccinated (%)": f"{likelihood_vaccinated}%",
                "Likelihood of Choosing Not Vaccinated (%)": f"{likelihood_not_vaccinated}%"
            })

        except Exception as e:
            # Best effort: a failed row is logged and left out of the results file.
            print(f"Error at row {idx}, seed {seed}: {e}")
            continue

        time.sleep(0.2)

    task_results_df = pd.DataFrame(task_results)
    task_results_df.to_excel(output_file, index=False)
    print(f"Results saved to {output_file}\n")


## M2: public health only

### GPT

In [None]:
import pandas as pd
import random
import re

# Base prompts (re-read once per seed by the generation loop) and the article pool.
task_df_path = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/task_inputs_two_options_M0.xlsx"
media_df = pd.read_excel("/content/drive/MyDrive/LLM Vaccination Attitudes/Input/Media Diet Data/media_diet_with_extracted_words_edited.xlsx")

# Keep only "Public Health" articles that have both a headline and an excerpt.
# The variable keeps its `misinfo_` name from the misinformation cell, but in
# this section it holds public-health articles.
is_public_health = media_df["Category"] == "Public Health"
misinfo_articles = media_df.loc[is_public_health].dropna(subset=["Headline", "First_100_Words"])

def build_misinfo_prompt(task_prompt):
    """Insert a media-diet block and five sampled articles into a task prompt.

    Five rows are drawn from the module-level ``misinfo_articles`` frame, using
    a ``random_state`` taken from the global ``random`` module. The media-diet
    instructions and the article list are spliced in immediately before the
    first "% Task Prompt" marker.

    Args:
        task_prompt: The base prompt text.

    Returns:
        The prompt with the inserted block, or ``task_prompt`` unchanged when
        the "% Task Prompt" marker is not present.

    NOTE(review): in this section ``misinfo_articles`` is filtered to the
    "Public Health" category, yet the text below still introduces the articles
    as "from sources that may contain misinformation", and the function keeps
    the same name as the misinformation variant defined earlier. Confirm that
    both are intentional for this condition.
    """
    # Sampling happens before the marker check, so the global RNG is consumed
    # once per call whether or not the marker is found.
    selected = misinfo_articles.sample(n=5, random_state=random.randint(0, 10000)).reset_index(drop=True)

    disease_block = f"""% Media Diet Information
Assume today is June 29, 2022. Your decision about receiving a COVID-19 vaccine is primarily shaped by your demographics. However, the kind of information you are exposed to in the media may also influence your thinking.
Below are five real-world news articles from sources that may contain misinformation.
As you read them, consider:
- Do any of these articles reinforce your existing beliefs?
- Do any contradict them or introduce new concerns or reassurances?
Unless strongly challenged by the articles, your decision should remain aligned with your prior beliefs. Only adjust your position if a particular article clearly shifts your perception.
If one article is especially influential, briefly mention it in your reasoning.
"""

    # After reset_index the row labels are 0..4, so i+1 numbers the articles 1..5.
    article_entries = [
        f"Article {i+1}: {row['Headline'].strip()}\n{row['First_100_Words'].strip()}\n"
        for i, row in selected.iterrows()
    ]
    media_context = "The articles are listed below:\n" + "".join(article_entries)

    insert_point = task_prompt.find("% Task Prompt")
    if insert_point == -1:
        return task_prompt

    before_marker = task_prompt[:insert_point]
    from_marker = task_prompt[insert_point:]
    return before_marker + disease_block + media_context + from_marker

# Ten prompt files are produced, one per seed 1000..1009. Seeding the global
# `random` module at the top of each round fixes the sequence of values that
# build_misinfo_prompt draws from it during that round.
closing_pattern = r"Now, based on the information above,"
closing_replacement = "Now, based on your demographics and the content of the five articles above,"

for i in range(10):
    random.seed(1000 + i)

    # Reload the base prompts every round so each file starts from the same input.
    task_df = pd.read_excel(task_df_path)
    task_df["media_diet_label"] = "Public Health"

    # Step 1: insert the media-diet block and the sampled articles.
    prompts_with_articles = task_df["Task_Prompt"].apply(build_misinfo_prompt)

    # Step 2: reword the closing instruction so it refers to the articles.
    task_df["Task_Prompt_Full"] = [
        re.sub(closing_pattern, closing_replacement, text)
        for text in prompts_with_articles
    ]

    output_path = f"/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_public_health_for_everyone/task_inputs_two_options_M2_public_health_for_everyone_seed{1000 + i}.xlsx"
    task_df[["id", "Task_Prompt_Full", "media_diet_label"]].to_excel(output_path, index=False)
    print(f"[Saved] Seed {1000 + i} -> {output_path}")


[Saved] Seed 1000 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_public_health_for_everyone/task_inputs_two_options_M2_public_health_for_everyone_seed1000.xlsx
[Saved] Seed 1001 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_public_health_for_everyone/task_inputs_two_options_M2_public_health_for_everyone_seed1001.xlsx
[Saved] Seed 1002 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_public_health_for_everyone/task_inputs_two_options_M2_public_health_for_everyone_seed1002.xlsx
[Saved] Seed 1003 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_public_health_for_everyone/task_inputs_two_options_M2_public_health_for_everyone_seed1003.xlsx
[Saved] Seed 1004 -> /content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_public_health_for_everyone/task_inputs_two_options_M2_public_health_for_everyone_seed1004.xlsx
[Saved] Seed 1005 -> /content/drive/MyDrive/LLM Vaccination Attitudes/

In [None]:
import pandas as pd
import re
import time
import openai

# Query gpt-4o-mini once per row of each seeded public-health prompt file and
# save one results workbook per seed.
input_dir = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_public_health_for_everyone"
# NOTE(review): the output file name below says "4.1" while the model
# requested is gpt-4o-mini. Left unchanged because later analysis may read
# these file names -- confirm which is intended.
output_dir = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/Chatgpt/M2_public_health_for_everyone"

# With no api_key argument the client reads OPENAI_API_KEY from the
# environment (set in the notebook's setup cell), so no key is embedded here.
client = openai.OpenAI()

# Compiled once; the same two patterns are applied to every response.
vaccinated_pattern = re.compile(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?")
not_vaccinated_pattern = re.compile(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?")

for seed in range(1000, 1010):
    input_path = f"{input_dir}/task_inputs_two_options_M2_public_health_for_everyone_seed{seed}.xlsx"
    df = pd.read_excel(input_path)

    task_results = []

    for idx, row in df.iterrows():
        task_prompt = row["Task_Prompt_Full"]
        # `row_id` rather than `id`, so the builtin id() is not shadowed for
        # the rest of the kernel session.
        row_id = row.get("id", f"sample_{idx}")

        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": "You are simulating a human persona's decision-making process regarding whether to receive a COVID-19 vaccine."
                    },
                    {
                        "role": "user",
                        "content": task_prompt
                    }
                ],
                seed=seed,
                temperature=0.5,
                max_tokens=1024
            )
        except Exception as e:
            # One failed request must not discard the rows already collected
            # for this seed; log it and move on, as the Gemini/Llama cells do.
            print(f"[Seed {seed}] Row {idx} request failed: {e}")
            time.sleep(1.5)
            continue

        # `content` can be None; fall back to "" so the regex search below
        # reports an extraction failure instead of raising TypeError.
        assistant_response = response.choices[0].message.content or ""
        print(f"[Seed {seed}] Row {idx} Response:\n{assistant_response}\n")

        match_vac = vaccinated_pattern.search(assistant_response)
        match_not_vac = not_vaccinated_pattern.search(assistant_response)

        likelihood_vac = int(match_vac.group(1)) if match_vac else "Extraction Failed"
        likelihood_not_vac = int(match_not_vac.group(1)) if match_not_vac else "Extraction Failed"

        # A decision is only recorded when both percentages were parsed;
        # a tie counts as "Vaccinated".
        if isinstance(likelihood_vac, int) and isinstance(likelihood_not_vac, int):
            selected = "Vaccinated" if likelihood_vac >= likelihood_not_vac else "Not Vaccinated"
        else:
            selected = "Extraction Failed"

        task_results.append({
            "id": row_id,
            "Task_Prompt": task_prompt,
            "Task_Response": assistant_response,
            "Vaccination Decision": selected,
            "Likelihood of Choosing Vaccinated (%)": f"{likelihood_vac}%",
            "Likelihood of Choosing Not Vaccinated (%)": f"{likelihood_not_vac}%"
        })

        time.sleep(0.2)

    results_df = pd.DataFrame(task_results)
    output_path = f"{output_dir}/Task_Results_Two_Options_M2_public_health_for_everyone_4.1_seed{seed}.xlsx"
    results_df.to_excel(output_path, index=False)
    print(f"[Saved] Seed {seed} -> {output_path}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
```
Brief Reasoning: The articles emphasize the safety and effectiveness of vaccines, as well as the increasing COVID-19 cases, which reinforces my belief in the importance of vaccination for protection against serious illness. Therefore, I would choose to receive the vaccine. 
Likelihood of Choosing Vaccinated: 75%
Likelihood of Choosing Not Vaccinated: 25%
```

[Seed 1009] Row 335 Response:
```
Brief Reasoning: The articles provide reassurance about the safety and effectiveness of COVID-19 vaccines, emphasizing the importance of vaccination for protection against severe illness. Given this information, I believe vaccination is important for my health and the health of those around me. 
Likelihood of Choosing Vaccinated: 80%
Likelihood of Choosing Not Vaccinated: 20%
```

[Seed 1009] Row 336 Response:
```
Brief Reasoning: Given my age and the articles emphasizing the safety and effectiveness of COVID-19 vaccines, along w

### Gemini-2.5-flash

In [None]:
import pandas as pd
import re
import time
import google.generativeai as genai

# Query gemini-2.5-flash once per row of each seeded public-health prompt file
# and save one results workbook per seed.
# NOTE(review): the API key is a literal in the notebook; prefer loading it
# from an environment variable or a secrets store before sharing.
genai.configure(api_key="xxxxxxx")

model = genai.GenerativeModel(model_name="gemini-2.5-flash")

input_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_public_health_for_everyone/"
output_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/gemini-2.5-flash/M2/"

# In this cell `seed` only selects the input/output file; it is not passed
# to generate_content.
seeds_to_run = list(range(1000, 1010))

# Compiled once; the same two patterns are applied to every response.
vaccinated_pattern = re.compile(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?")
not_vaccinated_pattern = re.compile(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?")

for seed in seeds_to_run:
    input_file = f"{input_base}task_inputs_two_options_M2_public_health_for_everyone_seed{seed}.xlsx"
    output_file = f"{output_base}Task_Results_Two_Options_M2_public_health_for_everyone_gemini2.5flash_seed{seed}.xlsx"

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"Failed to read {input_file}: {e}")
        continue

    task_results = []

    for idx, row in df.iterrows():
        # `row_id` rather than `id`, so the builtin id() is not shadowed for
        # the rest of the kernel session.
        row_id = row["id"]
        task_prompt = row["Task_Prompt_Full"]

        try:
            response = model.generate_content(task_prompt)
            assistant_response = response.text.strip()

            print(f"Seed {seed} - Row {idx} Response:\n{assistant_response}\n")

            match_vaccinated = vaccinated_pattern.search(assistant_response)
            match_not_vaccinated = not_vaccinated_pattern.search(assistant_response)

            # -1 is the sentinel for "percentage not found in the response".
            l_vac = int(match_vaccinated.group(1)) if match_vaccinated else -1
            l_not = int(match_not_vaccinated.group(1)) if match_not_vaccinated else -1

            if l_vac == -1 or l_not == -1:
                decision = "Extraction Failed"
            else:
                # A tie counts as "Vaccinated".
                decision = "Vaccinated" if l_vac >= l_not else "Not Vaccinated"

            task_results.append({
                "id": row_id,
                "Task_Prompt": task_prompt,
                "Task_Response": assistant_response,
                "Vaccination Decision": decision,
                "Likelihood of Choosing Vaccinated (%)": f"{l_vac}%" if l_vac >= 0 else "NA",
                "Likelihood of Choosing Not Vaccinated (%)": f"{l_not}%" if l_not >= 0 else "NA"
            })

        except Exception as e:
            # Best effort: a failed row is logged and left out of the results
            # file; the extra pause is followed by the regular one below.
            print(f"Error on seed {seed}, row {idx}: {e}")
            time.sleep(1.5)

        time.sleep(0.3)

    df_results = pd.DataFrame(task_results)
    df_results.to_excel(output_file, index=False)
    print(f"Seed {seed} finished. Results saved to {output_file}\n")


### Llama-4-17b

In [None]:
import pandas as pd
import re
import time
import os
from openai import OpenAI

# Query Llama-4-Scout-17B through the Hugging Face router once per row of each
# seeded public-health prompt file and save one results workbook per seed.
# NOTE(review): the API key is a literal in the notebook; prefer loading it
# from an environment variable or a secrets store before sharing.
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key="xxxxxxx",
)

input_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Input/sample_1000/M2_public_health_for_everyone/"
output_base = "/content/drive/MyDrive/LLM Vaccination Attitudes/Results/sample_1000/Llama-4-17b/M2/"

# seeds to run
# In this cell `seed` only selects the input/output file; it is not passed
# to the completion request.
seeds_to_run = list(range(1000, 1010))

# Compiled once; the same two patterns are applied to every response.
vaccinated_pattern = re.compile(r"[Ll]ikelihood of Choosing Vaccinated[:：]?\s*\**(\d+)\s*%?")
not_vaccinated_pattern = re.compile(r"[Ll]ikelihood of Choosing Not Vaccinated[:：]?\s*\**(\d+)\s*%?")

for seed in seeds_to_run:
    input_file = f"{input_base}task_inputs_two_options_M2_public_health_for_everyone_seed{seed}.xlsx"
    output_file = f"{output_base}Task_Results_Two_Options_M2_public_health_for_everyone_Llama_4_17b_{seed}.xlsx"

    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"Failed to read {input_file}: {e}")
        continue

    print(f"\n===== Running with seed={seed} =====\n")

    task_results = []

    for idx, row in df.iterrows():
        # `row_id` rather than `id`, so the builtin id() is not shadowed for
        # the rest of the kernel session.
        row_id = row["id"]
        task_prompt = row["Task_Prompt_Full"]

        try:
            completion = client.chat.completions.create(
                model="meta-llama/Llama-4-Scout-17B-16E-Instruct:cerebras",
                messages=[
                    {
                        "role": "user",
                        "content": task_prompt
                    }
                ]
            )

            assistant_response = completion.choices[0].message.content

            print(f"Seed {seed} - Row {idx} Response:\n{assistant_response}\n")

            match_vaccinated = vaccinated_pattern.search(assistant_response)
            match_not_vaccinated = not_vaccinated_pattern.search(assistant_response)

            likelihood_vaccinated = int(match_vaccinated.group(1)) if match_vaccinated else "Extraction Failed"
            likelihood_not_vaccinated = int(match_not_vaccinated.group(1)) if match_not_vaccinated else "Extraction Failed"

            # Compare the two percentages directly. The earlier
            # {likelihood: label} lookup collapsed to a single key when both
            # values were equal (e.g. 50/50), which labelled ties
            # "Not Vaccinated"; the GPT and Gemini cells label ties
            # "Vaccinated", so the same rule is applied here.
            if isinstance(likelihood_vaccinated, int) and isinstance(likelihood_not_vaccinated, int):
                selected_option = "Vaccinated" if likelihood_vaccinated >= likelihood_not_vaccinated else "Not Vaccinated"
            else:
                selected_option = "Undetermined"

            task_results.append({
                "id": row_id,
                "Task_Prompt": task_prompt,
                "Task_Response": assistant_response,
                "Vaccination Decision": selected_option,
                "Likelihood of Choosing Vaccinated (%)": f"{likelihood_vaccinated}%",
                "Likelihood of Choosing Not Vaccinated (%)": f"{likelihood_not_vaccinated}%"
            })

        except Exception as e:
            # Best effort: a failed row is logged and left out of the results file.
            print(f"Error at row {idx}, seed {seed}: {e}")
            continue

        time.sleep(0.2)

    task_results_df = pd.DataFrame(task_results)
    task_results_df.to_excel(output_file, index=False)
    print(f"Results saved to {output_file}\n")