In [1]:
from search import search_library
import random
from dotenv import load_dotenv
from openai import OpenAI, OpenAIError
import os

def ask(sys_msg, usr_msg):
    """Send one system/user message pair to the chat model and return the reply.

    Relies on the module-level ``client`` and ``OPENAI_MODEL`` names, which are
    looked up when the function is called.

    Args:
        sys_msg: Content of the "system" role message.
        usr_msg: Content of the "user" role message.

    Returns:
        The ``message.content`` of the first choice in the completion response.
    """
    conversation = [
        {"role": "system", "content": sys_msg},
        {"role": "user", "content": usr_msg},
    ]
    completion = client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=conversation,
    )
    # Only the first returned choice is used.
    first_choice = completion.choices[0]
    return first_choice.message.content

def sample_text(text, x, rng=None):
    """Return a randomly positioned excerpt of at most ``x`` items from ``text``.

    The start offset is drawn uniformly from every position that still leaves
    room for a full ``x``-long window. When ``text`` is shorter than ``x`` the
    only possible start is 0, so the whole text is returned.

    Args:
        text: Sliceable sequence to sample from (typically a string).
        x: Maximum excerpt length; must be non-negative.
        rng: Optional object exposing ``randint(a, b)`` (e.g. a seeded
            ``random.Random``) for reproducible sampling. Defaults to the
            module-level ``random`` generator, matching the previous behavior.

    Returns:
        The slice ``text[start:start + x]``.

    Raises:
        ValueError: If ``x`` is negative.
    """
    # A negative length used to slip through: the start offset could land past
    # the end of the text and the slice end became end-relative, producing an
    # arbitrary fragment instead of an excerpt.
    if x < 0:
        raise ValueError(f"excerpt length must be non-negative, got {x}")

    source = random if rng is None else rng
    last_start = max(0, len(text) - x)
    start = source.randint(0, last_start)  # randint is inclusive on both ends
    return text[start:start + x]


# --- Runtime configuration ---
# load_dotenv() runs before the environment lookup below so that any values it
# loads are visible to it (presumably read from a local .env file - confirm).
load_dotenv()
# Shared client used by ask(); constructed with no explicit arguments.
client = OpenAI()
# Model name: taken from the OPENAI_MODEL environment variable, with a pinned fallback.
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "o4-mini-2025-04-16")
# NOTE(review): DB_FILE and MAX_RESULTS are not referenced in the visible cells;
# presumably intended for search_library - confirm against its signature.
DB_FILE = "library.sqlite"
MAX_RESULTS = 100

In [2]:
# Hypothesis statement (markdown text); interpolated into the user prompt passed to ask().
h1 = """1.  Effectiveness of Different Action Types  
   Hypothesis 1.1 (“force vs. non‐force”):  After controlling for opponent capability, allies, domestic politics, etc., deployment of actual forces (border incursions, no-fly zones) yields a higher probability of success than purely verbal actions (direct diplomacy or threats).  
   – Empirical test:  Include dummies for each action type in a pooled logit; test pairwise contrasts (e.g., β_force_deploy > β_verbal_threat)."""
# Implementation plan for the same hypothesis (markdown text with an embedded
# statsmodels code skeleton); also interpolated into the user prompt passed to ask().
p1 = """### Hypothesis 1.1: force_vs_nonforce  
• Variables  
  – Treatment(s): C(us_action) with “force_deployment” vs “verbal_threat”  
  – Outcome: success  
  – Key control(s): opponent_gdp_pc, opponent_mil_spend_pc, opponent_allies, us_ally_small, us_ally_great, presidential_approval, election_year, docs_statements_consistency, forces_pre_log  
• Model  
  – Equation: logit(success) = β₀+β₁·I(force_deployment)+β₂·I(verbal_threat)+…+α_opponent+γ_goal  
  – Estimation: logistic regression; opponent_fixed_effect & goal_fixed_effect; SE clustered by opponent_state  
• Diagnostic checks  
  – Pairwise contrast β₁>β₂; VIF for multicollinearity; Hosmer–Lemeshow  
• Code skeleton (`statsmodels` Python)  
```python
import statsmodels.formula.api as smf
model = smf.logit(
    "success ~ C(us_action) + opponent_gdp_pc + opponent_mil_spend_pc + "
    "opponent_allies + us_ally_small + us_ally_great + presidential_approval + "
    "election_year + docs_statements_consistency + forces_pre_log + "
    "C(opponent_fixed_effect) + C(goal_fixed_effect)",
    data=df
).fit(cov_type="cluster", cov_kwds={"groups": df.opponent_state})
```"""

# Ask the model which data must be collected for hypothesis h1 under plan p1,
# then print the reply as plain text.
data = ask(
    sys_msg="What data do I need to collect to test the following hypothesis with the following implementation plan?",
    usr_msg=f"Hypothesis:\n{h1}\n\nPlan:\n{p1}",
)
print(data)

Here’s the minimal “data recipe” you’ll need in order to estimate your logit( success ) model and test  
βforce_deployment > βverbal_threat.  You can think of each row in your data frame as “one U.S. action toward one opponent over one stated foreign-policy goal.”

1.  Unit of analysis  
    •  action_id (unique)  
    •  date (or at least year)  
    •  opponent_state_id  
    •  goal_id  

2.  Dependent variable  
    •  success  (0/1)  
       –  Definition: whether the U.S. achieved its stated objective vis-à-vis that opponent on that goal.  
       –  Possible sources: post-action diplomatic summaries, news‐coding (e.g. GDELT, ICEWS), archival/family dispatches (DDI), secondary scholarly codings.  

3.  Treatment variable  
    •  us_action  (categorical)  
       –  Must include at least:  
         ­ force_deployment  (border incursions, limited no-fly zones, show-of-force deployments)  
       –  (Later you’ll dummy-encode C(us_action) and test βforce_deployment>βverbal_threat.

In [None]:
# TODO: collect the data identified above (e.g. by querying the library via search_library).