In [1]:
from openai import OpenAI
from dotenv import load_dotenv
import os

load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')

ollama_url = "http://localhost:11434/v1"
ollama = OpenAI(api_key="ollama", base_url=ollama_url)

openai = OpenAI()

In [2]:
# Generate synthetic dataset of 1000 American voters from 2016 election with demographics and issue priorities

In [3]:
import pandas as pd
import numpy as np
np.random.seed(42)

n = 1000
data = {
    'age': np.random.choice([18,25,35,45,55,65], n, p=[0.2,0.25,0.2,0.15,0.1,0.1]),
    'gender': np.random.choice(['Male', 'Female'], n, p=[0.52, 0.48]),
    'income': np.random.normal(50000, 20000, n).clip(15000, 150000).round(),
    'education': np.random.choice(['HS', 'Some College', 'College', 'Postgrad'], n, p=[0.3,0.3,0.25,0.15]),
    'vote_2016': np.random.choice(['Clinton', 'Trump', 'Other'], n, p=[0.48,0.46,0.06]),
    'govt_spending': np.random.choice(range(1,8), n),  # 1=less services, 7=more
    'healthcare_govt': np.random.choice(range(1,8), n),
    'immigration_concern': np.random.choice(range(1,8), n),
    'urban_rural': np.random.choice(['Urban', 'Suburban', 'Rural'], n, p=[0.4,0.4,0.2])
}

# Define main concerns based on issue scale priorities and demographics
concerns = []
issues = ['economy', 'healthcare', 'immigration', 'jobs', 'education', 'crime', 'environment']
priorities = ['top priority', 'major concern', 'important issue']

for i in range(n):
    # Determine top 1-3 concerns based on scale scores (higher score = more concern)
    scores = {
        'economy': 8 - data['govt_spending'][i],  # Higher concern if wants less spending
        'healthcare': data['healthcare_govt'][i],
        'immigration': data['immigration_concern'][i]
    }

    # Sort by concern level and pick top 2-3
    sorted_concerns = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    top_concerns = [issue for issue, score in sorted_concerns[:np.random.choice([2,3])]]

    # Add demographic-specific concerns
    if data['age'][i] >= 55:
        top_concerns.append('social security')
    if data['income'][i] < 30000:
        top_concerns.append('cost of living')
    if data['urban_rural'][i] == 'Rural':
        top_concerns.append('rural development')

    # Format as natural language string
    concern_str = ', '.join(top_concerns) + f" ({np.random.choice(priorities)})"
    concerns.append(concern_str)

data['main_concerns'] = concerns
df = pd.DataFrame(data)
df.to_csv('synthetic_voters_2016.csv', index=False)
print(df[['age', 'gender', 'vote_2016', 'main_concerns']].head(10))
print(f"Dataset saved: synthetic_voters_2016.csv ({len(df)} rows)")
print("\nSample LLM prompts ready:")
print("Persona: 35yo female college grad who voted Clinton, main concerns: healthcare (top priority), jobs, education")


   age  gender vote_2016                                      main_concerns
0   25    Male     Trump    economy, healthcare, immigration (top priority)
1   65  Female   Clinton  immigration, healthcare, social security (top ...
2   45  Female   Clinton                economy, immigration (top priority)
3   35  Female     Trump    immigration, economy, healthcare (top priority)
4   18  Female   Clinton  economy, immigration, healthcare, rural develo...
5   18  Female   Clinton  economy, immigration, healthcare (important is...
6   18  Female     Trump             healthcare, immigration (top priority)
7   55  Female     Other  healthcare, economy, immigration, social secur...
8   35    Male   Clinton               immigration, economy (major concern)
9   45    Male   Clinton             immigration, economy (important issue)
Dataset saved: synthetic_voters_2016.csv (1000 rows)

Sample LLM prompts ready:
Persona: 35yo female college grad who voted Clinton, main concerns: healthcare (top 

In [4]:
# Generate prompt for each voter
voter_prompts = []

for i in range(len(df)):
    row = df.iloc[0]
    prompt = (f"You are an American voter with these specifications: {row.age}yo {row.gender}, {row.education}, ${row.income:,} income, voted {row.vote_2016}, main concerns: {row.main_concerns}.\n"
              f"You will have to cast vote and respond with just a word (REP or DEM) for next presidential election based on arguments presented below: ")
    voter_prompts.append(prompt)

In [5]:
len(voter_prompts)

1000

In [6]:
# Prompts for REP and DEM representatives

rep_prompt = (
    "You are a Republican candidate running for President of the United States. "
    "Craft a persuasive argument to convince undecided voters to vote for you in the upcoming election. "
    "Focus on key issues such as the economy, national security, and traditional values. "
    "Keep your message clear, concise, and appealing to a broad audience."
)

dem_prompt = (
    "You are a Democratic candidate running for President of the United States. "
    "Craft a persuasive argument to convince undecided voters to vote for you in the upcoming election. "
    "Focus on key issues such as healthcare, social justice, and climate change. "
    "Keep your message clear, concise, and appealing to a broad audience."
)

In [7]:
rep_message = openai.chat.completions.create(
    model="gpt-5-nano",
    messages=[
        {"role": "system", "content": "You are a Republican candidate running for President of the United States."},
        {"role": "user", "content": rep_prompt}
    ]
)

dem_message = openai.chat.completions.create(
    model="gpt-5-nano",
    messages=[
        {"role": "system", "content": "You are a Democratic candidate running for President of the United States."},
        {"role": "user", "content": dem_prompt}
    ]
)

In [8]:
rep_message = "REP candidate: " + rep_message.choices[0].message.content + "\n"
dem_message = "DEM candidate: " + dem_message.choices[0].message.content + "\n"

In [11]:
from concurrent.futures import ThreadPoolExecutor, as_completed

SYSTEM_MSG = {
    "role": "system",
    "content": "You are an American voter deciding between REP and DEM candidates based on their arguments. Answer with just a single word: REP or DEM.",
}

def call_model(prompt_with_speech: str) -> str:
    resp = ollama.chat.completions.create(
        model="mistral",
        messages=[
            SYSTEM_MSG,
            {"role": "user", "content": prompt_with_speech},
        ],
    )
    return resp.choices[0].message.content

def get_vote_consensus(voter_prompts, rep_message, dem_message, max_workers=4):
    vote_consensus = [None] * len(voter_prompts)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        for i, prompt in enumerate(voter_prompts[:12]):
            prompt_with_speech = prompt + rep_message + dem_message
            fut = executor.submit(call_model, prompt_with_speech)
            futures[fut] = i

        for fut in as_completed(futures):
            idx = futures[fut]
            vote_consensus[idx] = fut.result()

    return vote_consensus

# usage
vote_consensus = get_vote_consensus(voter_prompts, rep_message, dem_message, max_workers=4)


In [12]:
print(vote_consensus[:10])

[' REP', ' REP', ' REP', ' REP', ' REP', " Based on the concerns of a 25yo male, some college education, $32,440 income, voting for Trump, with economy, healthcare, and immigration (top priority), the REP candidate appeals more as they address the economic interests and border security which aligns better with the voter's previous voting behavior.", ' REP', ' Based on the concerns of the American voter in question: REP', ' REP', ' REP']
