### LLM predicting conflicts

In [1]:
import os
os.environ["OPENAI_API_KEY"] = "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"  # allen's personal key for the course
from openai import OpenAI
client = OpenAI()

import pandas as pd
import numpy as np
import random
import time
from tqdm.notebook import tqdm
import pickle

def call_gpt(
    prompt, temperature, max_length, model, stop=["\n"]
):
    while 1:
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {
                        "role": "system",
                        "content": 'You are a helpful Assistant, and you only response to the "Assistant". Remember, maintain a natural tone. Be precise, concise, and casual. Keep it short.',
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=temperature,
                max_tokens=max_length,
                stop=stop,
            )
            break
        except:
            print("Openai API error, sleep for 1s")
            time.sleep(1)
    return [response.choices[0].message.content.strip()]

#### Load data from paper

In [3]:
# Get features from dta file
feature_data_path = "../data/Replication/estimationdata_withlags.dta"
feature_data = pd.read_stata(feature_data_path)
# print(feature_data)

In [4]:
# Get all countries
countries = feature_data["statename"].unique()
country_codes = {}
for country in countries:
    country_codes[country] = feature_data[feature_data["statename"] == country]["gwno"].unique()[0]
print(country_codes, len(countries))

{'United States of America': 2, 'Canada': 20, 'Bahamas': 31, 'Cuba': 40, 'Haiti': 41, 'Dominican Republic': 42, 'Jamaica': 51, 'Trinidad and Tobago': 52, 'Barbados': 53, 'Mexico': 70, 'Belize': 80, 'Guatemala': 90, 'Honduras': 91, 'El Salvador': 92, 'Nicaragua': 93, 'Costa Rica': 94, 'Panama': 95, 'Colombia': 100, 'Venezuela': 101, 'Guyana': 110, 'Surinam': 115, 'Ecuador': 130, 'Peru': 135, 'Brazil': 140, 'Bolivia': 145, 'Paraguay': 150, 'Chile': 155, 'Argentina': 160, 'Uruguay': 165, 'United Kingdom': 200, 'Ireland': 205, 'Netherlands': 210, 'Belgium': 211, 'Luxembourg': 212, 'France': 220, 'Switzerland': 225, 'Spain': 230, 'Portugal': 235, 'German Federal Republic': 260, 'German Democratic Republic': 265, 'Poland': 290, 'Austria': 305, 'Hungary': 310, 'Czechoslovakia': 315, 'Czech Republic': 316, 'Slovakia': 317, 'Italy/Sardinia': 325, 'Malta': 338, 'Albania': 339, 'Macedonia (Former Yugoslav Republic of)': 343, 'Croatia': 344, 'Yugoslavia (Serbia)': 345, 'Bosnia-Herzegovina': 346, '

In [7]:
# from https://github.com/ragingstumpers/COS598_Hegre_A1/blob/master/simulator/defs.py
_MAP_CSV_NAME_TO_VARIABLE_ENUM_FOR_STATS_BASE = {
    'lc1': 'previous_year_was_minor_by_country',
    'lc2': 'previous_year_was_major_by_country',
    'ltsc0': 'previous_logs_no_conflict_by_country',
    'loi': 'current_oil_level_by_country',
    'loic1': 'current_oil_times_previous_year_was_minor_by_country',
    'loic2': 'current_oil_times_previous_year_was_major_by_country',
    'lois0': 'current_oil_times_previous_logs_no_conflict_by_country',
    'let': 'current_ethnic_dominance_projection_by_country',
    'letc1': 'current_ethnic_dominance_projection_times_previous_year_was_minor_by_country',
    'letc2':
        'current_ethnic_dominance_projection_times_previous_year_was_major_by_country',
    'lets0': 'current_ethnic_dominance_projection_times_previous_logs_no_conflict_by_country',
    'lli': 'current_imr_level_by_country',
    'limc1': 'current_imr_times_previous_year_was_minor_by_country',
    'limc2': 'current_imr_times_previous_year_was_major_by_country',
    'lims0': 'current_imr_times_previous_logs_no_conflict_by_country',
    'lyo': 'current_youth_level_by_country',
    'lyoc1': 'current_youth_times_previous_year_was_minor_by_country',
    'lyoc2': 'current_youth_times_previous_year_was_major_by_country',
    'lyos0': 'current_youth_times_previous_logs_no_conflict_by_country',
    'llpo': 'current_population_level_by_country',
    'lpoc1': 'current_population_times_previous_year_was_minor_by_country',
    'lpoc2': 'current_population_times_previous_year_was_major_by_country',
    'lpos0': 'current_population_times_previous_logs_no_conflict_by_country',
    'led': 'current_education_level_by_country',
    'ledc1': 'current_education_times_previous_year_was_minor_by_country',
    'ledc2': 'current_education_times_previous_year_was_major_by_country',
    'leds0': 'current_education_times_previous_logs_no_conflict_by_country',
    'llin': 'current_neighborhood_imr_avg_by_country',
    'ledn': 'current_neighborhood_education_avg_by_country',
    'lyon': 'current_neighborhood_youth_avg_by_country',
    'lnc1': 'previous_neighborhood_has_conflict_by_country',
    'lnc1c1': 'previous_neighborhood_has_conflict_times_previous_year_was_minor_by_country',
    'lnc1c2': 'previous_neighborhood_has_conflict_times_previous_year_was_major_by_country',
    'lnc1ts0': 'previous_neighborhood_has_conflict_times_previous_logs_no_conflict_by_country',
    'r4': 'country_in_west_asia_north_africa_region_by_country',
    'r6': 'country_in_west_africa_region_by_country',
    'r7': 'country_in_south_africa_region_by_country',
    'ltsc1': 'previous_logs_minor_conflict_by_country',
    'ltsc2': 'previous_logs_major_conflict_by_country',
}

#### Different models in the paper use different features (using model m23 for now)

In [32]:
""" from ../data/Replication/estimationreplication.do, the script for running simulation in the paper, where the varuables used in each model is defined
""" 
# /*Model m23*/
# mlogit conflict lc1 lc2 ltsc0 ltsc1 ltsc2 loi loic1 loic2 lois0 let letc1 letc2 lets0 lli limc1 limc2 lims0 lyo lyoc1 lyoc2 lyos0 ///
# 	 llpo lpoc1 lpoc2 lpos0 led ledc1 ledc2 leds0 llin ledn lyon lnc1 lnc1c1 lnc1c2 lnc1ts0 r4 r6 r7 ///
# 	 if year >= 1970 & year <= 2009,baseoutcome(0) cons(103/112 115/156 99/100)

# variable_names = ['lc1', 'lc2', 'ltsc0', 'ltsc1', 'ltsc2', 'loi', 'loic1', 'loic2', 'lois0', 'let', 'letc1', 'letc2', 'lets0', 'lli', 'limc1', 'limc2', 'lims0', 'lyo', 'lyoc1', 'lyoc2', 'lyos0', 'llpo', 'lpoc1', 'lpoc2', 'lpos0', 'led', 'ledc1', 'ledc2', 'leds0', 'llin', 'ledn', 'lyon', 'lnc1', 'lnc1c1', 'lnc1c2', 'lnc1ts0', 'r4', 'r6', 'r7']
# variable_names = ['llpo', 'lnc1', 'lnc1c1', 'lnc1c2', 'lnc1ts0', 'r4', 'r6', 'r7']
# for i, name in enumerate(variable_names):
#     print(_MAP_CSV_NAME_TO_VARIABLE_ENUM_FOR_STATS_BASE[name])

current_population_level_by_country
previous_neighborhood_has_conflict_by_country
previous_neighborhood_has_conflict_times_previous_year_was_minor_by_country
previous_neighborhood_has_conflict_times_previous_year_was_major_by_country
previous_neighborhood_has_conflict_times_previous_logs_no_conflict_by_country
country_in_west_asia_north_africa_region_by_country
country_in_west_africa_region_by_country
country_in_south_africa_region_by_country


In [74]:
# convert the data in those variables from year 1970 to 2009 for a country to a string
country_pred = 'Liberia'
his_start_year = 1991
his_end_year = 2000
pred_start_year = 2001
pred_end_year = 2019
variable_names = ['llpo', 'lnc1', 'lnc1c1', 'lnc1c2', 'lnc1ts0', 'r4', 'r6', 'r7']  # for m23

def get_example(country, start_year, end_year):
    full_str = ''
    for i in range(start_year, end_year):
        full_str += f'Year {i}:'
        
        # check if the year exists for this country
        if len(feature_data.loc[(feature_data['year'] == i) & (feature_data['statename'] == country)]) == 0:
            full_str += ' No data\n'
            continue

        for name in variable_names:
            value = feature_data.loc[(feature_data['year'] == i) & (feature_data['statename'] == country)][name].values[0]
            if name in ['r4', 'r6', 'r7']:
                value = int(value)
                full_str += f' {name} {value}'
            else:
                full_str += f' {name} {value:.2f}'
            
        # add conflict
        conflict = int(feature_data.loc[(feature_data['year'] == i) & (feature_data['statename'] == country)]["conflict"].values[0])
        full_str += f' conflict {conflict}'
        full_str += '\n'
    return full_str
example = get_example(country_pred, his_start_year, his_end_year)
print(example)    


Year 1991: llpo 7.67 lnc1 1.00 lnc1c1 1.00 lnc1c2 0.00 lnc1ts0 0.00 r4 0 r6 1 r7 0 conflict 1
Year 1992: llpo 7.65 lnc1 1.00 lnc1c1 0.00 lnc1c2 1.00 lnc1ts0 0.00 r4 0 r6 1 r7 0 conflict 2
Year 1993: llpo 7.64 lnc1 1.00 lnc1c1 1.00 lnc1c2 0.00 lnc1ts0 0.00 r4 0 r6 1 r7 0 conflict 1
Year 1994: llpo 7.63 lnc1 1.00 lnc1c1 0.00 lnc1c2 1.00 lnc1ts0 0.00 r4 0 r6 1 r7 0 conflict 1
Year 1995: llpo 7.64 lnc1 1.00 lnc1c1 1.00 lnc1c2 0.00 lnc1ts0 0.00 r4 0 r6 1 r7 0 conflict 1
Year 1996: llpo 7.67 lnc1 1.00 lnc1c1 1.00 lnc1c2 0.00 lnc1ts0 0.00 r4 0 r6 1 r7 0 conflict 0
Year 1997: llpo 7.73 lnc1 1.00 lnc1c1 1.00 lnc1c2 0.00 lnc1ts0 0.00 r4 0 r6 1 r7 0 conflict 0
Year 1998: llpo 7.81 lnc1 1.00 lnc1c1 0.00 lnc1c2 0.00 lnc1ts0 0.00 r4 0 r6 1 r7 0 conflict 0
Year 1999: llpo 7.90 lnc1 2.00 lnc1c1 0.00 lnc1c2 0.00 lnc1ts0 1.39 r4 0 r6 1 r7 0 conflict 0



In [77]:
def get_pred(country_pred, 
             his_start_year, 
             his_end_year, 
             pred_start_year,
             pred_end_year,
             use_cot=True,
             num_example=10,
             num_pred=1, 
             pred_T=0,
             max_pred_len=1024, # with CoT
             model="gpt-4-0125-preview",
    ):
    max_example = 2*num_example
    country_examples = random.sample(countries.tolist(), max_example)
    examples = []
    for country in country_examples:
        # try:    # sometimes some country miss data in some year
        example = get_example(country, his_start_year, his_end_year)
        examples.append(example)
        if len(examples) == num_example: break
        # except:
        #     print('Skip country:', country)
        #     continue

    prompt_bg = f"""
    You are going to predict the conflict level (no, minor, major) of a country from year {pred_start_year} to {pred_end_year}. You are given the history data from {his_start_year} to {his_end_year} for the country, as well as data from a few other countries. For each year, you are given the following features:

    - `llpo`: current_population_level
    - `lnc1`: previous_neighborhood_has_conflict
    - `lnc1c1`: previous_neighborhood_has_conflict_times_previous_year_was_minor
    - `lnc1c2`: previous_neighborhood_has_conflict_times_previous_year_was_major
    - `lnc1ts0`: previous_neighborhood_has_conflict_times_previous_logs_no_conflict
    - `r4`: country_in_west_asia_north_africa_region
    - `r6`: country_in_west_africa_region
    - `r7`: country_in_south_africa_region

    These are the examples:
    """.strip()
    prompt = prompt_bg
    for example in examples:
        prompt += f"\nCountry ?\n{example}"

    # add country to predict
    prompt += f"\nNow, this is the history data for the country you are going to predict:\n{get_example(country_pred, his_start_year, his_end_year)}"
    if use_cot:
        prompt += f'\nCarefully reason the relationship between the features and the conflict level, and then output:\n1. Your reasoning in about 3-5 sentences\n2. Your prediction with the format "Year [year]: [0/1/2]", for year from {pred_start_year} to {pred_end_year}.'
    else:
        prompt += f'\nOutput your prediction with the format "Year [year]: [0/1/2]", for year from {pred_start_year} to {pred_end_year}. Do not include reasoning.'
    # print(prompt)

    # get pred
    preds = []
    for _ in range(num_pred):
        pred = call_gpt(prompt, 
                        temperature=pred_T, 
                        max_length=max_pred_len, 
                        model=model,
                        stop=None)[0]
        preds.append(pred)
    return preds

def extract_pred(output, start_year, end_year):
    """Get numbers from GPT output"""
    preds = []
    for i in range(start_year, end_year+1):
        pred = output.split(f"Year {i}:")[1].split('\n')[0].strip()
        preds.append(int(pred))
    return preds

In [None]:
# test prediction
raw_pred = get_pred(country_pred, 
                    his_start_year, 
                    his_end_year, 
                    pred_start_year,
                    pred_end_year,
                    use_cot=False,
                    )[0]
print(raw_pred)

#### Sweep over countries

In [76]:
country = 'Czech Republic'
get_example(country, 1991, 2000)

'Year 1991: No data\nYear 1992: No data\nYear 1993: llpo nan lnc1 nan lnc1c1 nan lnc1c2 nan lnc1ts0 nan r4 1 r6 0 r7 0 conflict 0\nYear 1994: llpo 9.24 lnc1 2.00 lnc1c1 nan lnc1c2 nan lnc1ts0 0.00 r4 1 r6 0 r7 0 conflict 0\nYear 1995: llpo 9.24 lnc1 1.00 lnc1c1 0.00 lnc1c2 0.00 lnc1ts0 0.00 r4 1 r6 0 r7 0 conflict 0\nYear 1996: llpo 9.24 lnc1 1.00 lnc1c1 0.00 lnc1c2 0.00 lnc1ts0 0.69 r4 1 r6 0 r7 0 conflict 0\nYear 1997: llpo 9.24 lnc1 0.00 lnc1c1 0.00 lnc1c2 0.00 lnc1ts0 0.00 r4 1 r6 0 r7 0 conflict 0\nYear 1998: llpo 9.24 lnc1 0.00 lnc1c1 0.00 lnc1c2 0.00 lnc1ts0 0.00 r4 1 r6 0 r7 0 conflict 0\nYear 1999: llpo 9.24 lnc1 2.00 lnc1c1 0.00 lnc1c2 0.00 lnc1ts0 3.22 r4 1 r6 0 r7 0 conflict 0\n'

In [78]:
num_example = 20
his_start_year = 1991
his_end_year = 2000
pred_start_year = 2001
pred_end_year = 2009
use_cot = True
country_pred_all = countries.tolist()[30:50]
name = 'results'

results = {}
for country_pred in tqdm(country_pred_all):
    raw_preds = get_pred(country_pred, 
                     his_start_year, 
                     his_end_year, 
                     pred_start_year,
                     pred_end_year,
                     use_cot=use_cot,
                     num_example=num_example,
                     num_pred=1, 
                     pred_T=0)
    preds = [extract_pred(raw_pred, 
                          pred_start_year, 
                          pred_end_year,) for raw_pred in raw_preds]
    print(country_pred, preds)
    results[country_pred] = preds

# save
with open(name + '.pkl', 'wb') as f:
    pickle.dump(results, f)

  0%|          | 0/20 [00:00<?, ?it/s]

Ireland [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
Netherlands [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
Belgium [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
Luxembourg [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
France [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
Switzerland [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
Spain [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
Portugal [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
German Federal Republic [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
German Democratic Republic [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
Poland [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
Austria [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
Hungary [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
Czechoslovakia [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
Czech Republic [[0, 0, 0, 0, 0, 0, 0, 0, 0]]


ValueError: invalid literal for int() with base 10: '1 (Given the increasing `lnc1ts0`, suggesting growing pressure from neighboring conflicts, there might be a minor conflict.)'

In [79]:
# save
with open(name + '.pkl', 'wb') as f:
    pickle.dump(results, f)

In [80]:
# combine results
results = {}
paths = ['/home/allen/llm_civil_war_prediction/evaluation/results_cot_0-9_num-1.pkl', '/home/allen/llm_civil_war_prediction/evaluation/results_cot_10-29_num-1.pkl', '/home/allen/llm_civil_war_prediction/evaluation/results.pkl']
for path in paths:
    with open(path, 'rb') as f:
        results.update(pickle.load(f))
print('Total number of countries:', len(results))

Total number of countries: 45


 ### Compare with paper predictions

In [20]:
paper_pred_path = "../data/Replication/CountryYearResults.dta"
paper_pred_data = pd.read_stata(paper_pred_path)
print(paper_pred_data)

      gwno  year  sh_cnt_t1  sh_cnt_t2  sh_cnt_c  p10_t1_c  p10_t2_c  p10_c_c  \
0        2  2010   0.057222   0.005222  0.062444       0.0       0.0      0.0   
1        2  2011   0.045000   0.007056  0.052056       0.0       0.0      0.0   
2        2  2012   0.049056   0.016778  0.065833       0.0       0.0      0.0   
3        2  2013   0.060222   0.016778  0.077000       0.0       0.0      0.0   
4        2  2014   0.067611   0.018722  0.086333       0.0       0.0      0.0   
...    ...   ...        ...        ...       ...       ...       ...      ...   
6924   950  2046   0.000278   0.000167  0.000444       0.0       0.0      0.0   
6925   950  2047   0.000056   0.000000  0.000056       0.0       0.0      0.0   
6926   950  2048   0.000222   0.000333  0.000556       0.0       0.0      0.0   
6927   950  2049   0.000167   0.000000  0.000167       0.0       0.0      0.0   
6928   950  2050   0.000000   0.000000  0.000000       0.0       0.0      0.0   

      p50_t1_c  p50_t2_c   

In [21]:
paper_pred_path = "/home/allen/llm_civil_war_prediction/data/Analysis/data/tables/predictions_2001_2009.csv"
paper_pred_data = pd.read_csv(paper_pred_path)
print(paper_pred_data)

      gwcode    year     minor     major  combined
0        2.0  2001.0  0.015111  0.000889  0.016000
1        2.0  2002.0  0.012667  0.000111  0.012778
2        2.0  2003.0  0.010556  0.003556  0.014111
3        2.0  2004.0  0.014556  0.004222  0.018778
4        2.0  2005.0  0.019667  0.006333  0.026000
...      ...     ...       ...       ...       ...
1513   950.0  2005.0  0.000333  0.000222  0.000556
1514   950.0  2006.0  0.000333  0.000333  0.000667
1515   950.0  2007.0  0.000444  0.000111  0.000556
1516   950.0  2008.0  0.000222  0.000222  0.000444
1517   950.0  2009.0  0.000111  0.000111  0.000222

[1518 rows x 5 columns]


In [22]:
ft_data_path = "/home/allen/llm_civil_war_prediction/data/Analysis/data/tables/acd.csv"
ft_data = pd.read_csv(ft_data_path)
# country_code = country_codes[country_pred]
print(ft_data)
# print(country_code.columns)
# print(actual_data[actual_data["gwcode"] == country_code])
# actual_data_country = actual_data[actual_data["gwcode"] == country_code & (actual_data["year"] >= pred_start_year) & (actual_data["year"] <= pred_end_year)]

# # if the year is missing, add it with conflict 0
# for i in range(pred_start_year, pred_end_year+1):
#     if i not in actual_data_country["year"].tolist():
#         # add to dataframe
#         actual_data_country = pd.concat([actual_data_country, 
#                                 pd.DataFrame([[country_code, i, 0, 0, 0, 0]], 
#                                 columns=actual_data_country.columns)], 
#                                 ignore_index=True)

      year  gwcode  intensity_level  minor_actual  major_actual  either_actual
0     1946     339                1           1.0           0.0            1.0
1     1946     800                1           1.0           0.0            1.0
2     1946     145                2           0.0           1.0            1.0
3     1946     200                1           1.0           0.0            1.0
4     1946     220                1           1.0           0.0            1.0
...    ...     ...              ...           ...           ...            ...
1886  2018     770                1           1.0           0.0            1.0
1887  2018     775                1           1.0           0.0            1.0
1888  2018     800                1           1.0           0.0            1.0
1889  2018     840                1           1.0           0.0            1.0
1890  2018     850                1           1.0           0.0            1.0

[1891 rows x 6 columns]


In [88]:
paper_pred_threshold = 0.5

paper_success = []
llm_success = []
for country, preds in results.items():
    country_code = country_codes[country]

    # ground truth
    ft_country = ft_data[ft_data["gwcode"] == country_code & (ft_data["year"] >= pred_start_year) & (ft_data["year"] <= pred_end_year)]
    # print(ft_country)
    for year in range(pred_start_year, pred_end_year+1): # if the year is missing, add it with conflict 0
        if year not in ft_country["year"].tolist():
            # add to dataframe
            ft_country = pd.concat([ft_country, 
                            pd.DataFrame([[year, country_code, 0, 0, 0, 0]], 
                            columns=ft_country.columns)], 
                            ignore_index=True)

    # get paper prediction
    paper_pred_country = paper_pred_data[paper_pred_data["gwcode"] == country_code]
    preds = preds[0]    #! no ensembling, so each preds have only one set of prediction
    for year in range(pred_start_year, pred_end_year+1):
        
        # get ground truth
        assert ft_country[ft_country["year"] == year].minor_actual.values[0] + ft_country[ft_country["year"] == year].major_actual.values[0] < 2, "Cannot be both minor and major conflict at the same time."
        if ft_country[ft_country["year"] == year].minor_actual.values[0] > 0:
            ft = 1
        elif ft_country[ft_country["year"] == year].major_actual.values[0] > 0:
            ft = 2
        else:
            ft = 0

        # get llm prediction
        llm_pred = preds[year-pred_start_year]

        # get the paper prediction
        try:
            if paper_pred_country[paper_pred_country["year"] == year].major.values[0] > paper_pred_threshold:
                paper_pred = 1
            elif paper_pred_country[paper_pred_country["year"] == year].minor.values[0] > paper_pred_threshold:
                paper_pred = 2
            else:
                paper_pred = 0

            # compare - only consider if there is a conflict
            # print(ft, llm_pred, paper_pred)
            # llm_success.append(ft == llm_pred)
            # paper_success.append(ft == paper_pred)
            llm_success.append((ft > 0 and llm_pred > 0) or (ft == 0 and llm_pred == 0))
            paper_success.append((ft > 0 and paper_pred > 0) or (ft == 0 and paper_pred == 0))
        except:
            print('No paper prediction for', country, year)

print('Incidence of conflict:')
print('LLM accuracy:', np.mean(llm_success))
print('Paper accuracy:', np.mean(paper_success))

No paper prediction for German Democratic Republic 2001
No paper prediction for German Democratic Republic 2002
No paper prediction for German Democratic Republic 2003
No paper prediction for German Democratic Republic 2004
No paper prediction for German Democratic Republic 2005
No paper prediction for German Democratic Republic 2006
No paper prediction for German Democratic Republic 2007
No paper prediction for German Democratic Republic 2008
No paper prediction for German Democratic Republic 2009
No paper prediction for Czechoslovakia 2001
No paper prediction for Czechoslovakia 2002
No paper prediction for Czechoslovakia 2003
No paper prediction for Czechoslovakia 2004
No paper prediction for Czechoslovakia 2005
No paper prediction for Czechoslovakia 2006
No paper prediction for Czechoslovakia 2007
No paper prediction for Czechoslovakia 2008
No paper prediction for Czechoslovakia 2009
Incidence of conflict:
LLM accuracy: 0.9509043927648578
Paper accuracy: 0.9741602067183462


#### TODO: get LLM probs with ensemble, and compare AUC