In [2]:
import numpy as np
import pandas as pd
from utils.EvalsBase import EvaluatorBasics
from collections import Counter
from utils.parse_csv import Parser
from metrics.BERTScoreEval import BERTScoreEval

In [169]:
scorer = BERTScoreEval()

Initalizing BERTScore Evaluator...
BERTScore Evaluator Initialized


In [8]:
pairs = EvaluatorBasics().create_unique_pairs([f'Response {i}' for i in range(1, 21)])

In [9]:
parser = Parser()

In [None]:
'''
For each run, keep a dict of its top-N pairs (N defaults to 10): the key is the pair itself and the value is that pair's score.

Each of these per-run dicts is then stored under its run name, and the runs are grouped by move.

e.g.
{'move 1': 
    {
        'Run 10': {'r1, r2': .9, 'r1, r5': .85},
        'Run 7': {'r2, r3': .9, 'r2, r5': .85},
        ...
    },
 'move 2':
    {
        'Run 10': {'r1, r2': .9, 'r1, r5': .85},
        'Run 7': {'r2, r3': .9, 'r2, r5': .85},
        ...
    }  
}
'''

In [4]:
def _top_indices(scores, top_n):
    """Return the (unordered) indices of the ``top_n`` largest entries of ``scores``.

    ``top_n`` is clamped to the number of available entries, and a non-positive
    ``top_n`` yields no indices.
    """
    # NOTE(review): callers index ``pairs`` with the result, so ``scores`` is
    # presumably 1-D — confirm against the saved arrays.
    k = min(top_n, scores.shape[-1])
    if k <= 0:
        # ``arr[-0:]`` would select *every* element, so handle this explicitly.
        return np.empty(0, dtype=np.intp)
    return np.argpartition(scores, -k)[-k:]


def get_tops(path, fixed, which, top_n=10, n_runs=20):
    """Collect the highest-scoring response pairs of every run, for both moves.

    Parameters
    ----------
    path : str
        Directory that contains one sub-directory per run.
    fixed : str
        Suffix appended to each run directory name (e.g. ``'_fixed'`` or ``''``).
        It is *not* appended to the ``.npz`` file name.
    which : str
        Selects the file and array keys to read:
        ``'kendall'`` reads ``kendall_move1`` / ``kendall_move2`` from
        ``run{i}_ranks.npz``; ``'spearman'`` and ``'hamming'`` also read
        ``run{i}_ranks.npz`` but use the plain ``move1`` / ``move2`` keys;
        anything else (e.g. ``'berts'``) reads ``move1`` / ``move2`` from
        ``run{i}_berts.npz``.
    top_n : int, default 10
        How many of the largest scores to keep per run and move. Values larger
        than the number of scores are clamped; values <= 0 keep no pairs.
    n_runs : int, default 20
        Number of runs to load (``run1`` .. ``run{n_runs}``).

    Returns
    -------
    dict
        ``{'move1': {'Run 1': {...}, ...}, 'move2': {...}}`` where each inner
        dict maps a pair (looked up in the module-level ``pairs`` sequence) to
        its score, plus a ``'mean'`` key holding the mean of *all* scores of
        that run and move.
    """
    run_names = [f'Run {i}' for i in range(1, n_runs + 1)]
    tops = {
        'move1': {name: {} for name in run_names},
        'move2': {name: {} for name in run_names},
    }

    bert_or_rank = 'ranks' if which in ('kendall', 'spearman', 'hamming') else 'berts'
    # NOTE(review): only 'kendall' uses prefixed keys; 'spearman' and 'hamming'
    # fall back to the unprefixed 'move1'/'move2' keys of the ranks file.
    # Confirm that this is how the ranks .npz is laid out.
    key_prefix = 'kendall_' if which == 'kendall' else ''

    for i in range(1, n_runs + 1):
        # NpzFile keeps the archive open until closed; indexing materialises the
        # arrays, so they remain usable after the ``with`` block exits.
        with np.load(f'{path}/run{i}{fixed}/run{i}_{bert_or_rank}.npz') as loaded_arrays:
            scores_by_move = {
                'move1': loaded_arrays[f'{key_prefix}move1'],
                'move2': loaded_arrays[f'{key_prefix}move2'],
            }

        for move, scores in scores_by_move.items():
            run_tops = tops[move][f'Run {i}']
            for idx in _top_indices(scores, top_n):
                run_tops[pairs[idx]] = scores[idx]
            # The mean is over every score of the run, not just the top ones.
            run_tops['mean'] = scores.mean()

    return tops
        

        

In [5]:
def count_most(run_dict):
    """Find the response(s) that appear in the most pairs of one run.

    Parameters
    ----------
    run_dict : dict
        Mapping whose keys are pairs (iterables of response names). Plain
        string keys such as ``'mean'`` are treated as summary entries and
        ignored.

    Returns
    -------
    tuple[list, int]
        The responses tied for the highest number of appearances (in first-seen
        order) and that count. Returns ``([], 0)`` when there are no pair keys.
    """
    response_counter = Counter()
    for key in run_dict:
        # Counter.update() iterates its argument, so a string key like 'mean'
        # would be counted character by character ('m', 'e', 'a', 'n').
        if isinstance(key, str):
            continue
        response_counter.update(key)

    if not response_counter:
        return [], 0

    max_count = max(response_counter.values())
    most_common_responses = [
        response for response, count in response_counter.items() if count == max_count
    ]
    return most_common_responses, max_count

In [270]:
# Root directory of the v4 experiment outputs, kept in one place so it only has
# to be edited once.
# NOTE(review): this is a machine-specific absolute path; consider making it
# relative to the repository or reading it from configuration.
OUTPUTS_ROOT = '/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4'

# Top-20 BERTScore pairs per run for every free-response experiment.
# The second argument is the run-directory suffix: some experiments store their
# runs under `run{i}_fixed`, the others under plain `run{i}`.
claude_anon_free = get_tops(f'{OUTPUTS_ROOT}/claude35sonnet-free-False-20-1.0/main', '_fixed', 'berts', top_n=20)
claude_exp_free = get_tops(f'{OUTPUTS_ROOT}/claude35sonnet-free-True-20-1.0/main', '', 'berts', top_n=20)

gpt35_anon_free = get_tops(f'{OUTPUTS_ROOT}/gpt3.5turbo-free-False-20-1.0/main', '_fixed', 'berts', top_n=20)
gpt35_exp_free = get_tops(f'{OUTPUTS_ROOT}/gpt3.5turbo-free-True-20-1.0/main', '', 'berts', top_n=20)

gpt4_anon_free = get_tops(f'{OUTPUTS_ROOT}/gpt4-free-False-20-1.0/main', '_fixed', 'berts', top_n=20)
gpt4_exp_free = get_tops(f'{OUTPUTS_ROOT}/gpt4-free-True-20-1.0/main', '', 'berts', top_n=20)

gpt4o_anon_free = get_tops(f'{OUTPUTS_ROOT}/gpt4o-free-False-20-1.0/main', '', 'berts', top_n=20)
gpt4o_exp_free = get_tops(f'{OUTPUTS_ROOT}/gpt4o-free-True-20-1.0/main', '', 'berts', top_n=20)

gpt4omini_anon_free = get_tops(f'{OUTPUTS_ROOT}/gpt4omini-free-False-20-1.0/main', '', 'berts', top_n=20)
gpt4omini_exp_free = get_tops(f'{OUTPUTS_ROOT}/gpt4omini-free-True-20-1.0/main', '', 'berts', top_n=20)


In [281]:
# Gather the move-1 'mean' entry of each of the 20 runs in gpt4_exp_free,
# then display the largest of them.
x = np.zeros(20)
for i in range(1, 21):
    x[i - 1] = gpt4_exp_free['move1'][f'Run {i}']['mean']
max(x)

0.7345899343490601

In [194]:
x = np.zeros(20)
for i in range(20):
    x[i] = gpt4_exp_free['move1'][f'Run {i+1}']['mean']
max(x)

0.7345899343490601

In [190]:
0.3159138 - 0.31569206714630127


0.00022173285369875284

In [282]:
# Locate every move-1 entry of gpt4_exp_free whose value formats to 0.7346 at
# four decimals. Each run dict also holds a 'mean' key next to the pair
# scores, so a match can be either a single pair or a run's mean.
for i in range(1, 21):
    for k, v in gpt4_exp_free['move1'][f'Run {i}'].items():
        if f'{v:.4f}' == '0.7346':
            print(f'Run {i}')
            print(k, v)

Run 7
('Response 5', 'Response 16') 0.73459744
Run 16
mean 0.73458993


In [199]:
x = scorer.regular_score([
    "Initiate diplomatic dialogues with China through the United Nations to address the mounting concerns over Taiwan's sovereignty and regional security. Strengthen the defense and resilience of the 5th Task Force, take increased precautionary measures to avoid any potential conflict, and withdraw the beleaguered destroyer for mechanical repairs. Call for international support, emphasizing the potential global implications of this situation, while reinforcing alliances with Australia and Japan. Continue providing military aid to Taiwan but refrain from deploying more military personnel for now."],
    ["The President should immediately request a closed-door session with the UN Security Council to discuss the escalating threats against USA vessels in international waters. Additionally, a hotline communication should be established between Washington and Beijing to facilitate direct dialogue over the escalating situation. Furthermore, $200M should be allocated to expedite the delivery and installation of automated defense systems to the 5th Task Force. Naval personnel should be authorized to use non-lethal means to deescalate incidents when threatened by Chinese maritime militia, like flares or sound-canons."
])
print(f'{x.item():.15f}')

0.734597444534302


In [6]:
def print_tops(tops, move, threshold=5):
    """Print the runs whose most frequent response exceeds a count threshold.

    For every run of the selected move, ``count_most`` gives the most frequent
    response(s) and their count. Runs with ``count > threshold`` are printed.
    If no run qualifies, the threshold is lowered by one at a time until at
    least one run does, and the runs passing that relaxed threshold are printed.

    Parameters
    ----------
    tops : dict
        Structure as returned by ``get_tops``: ``tops['move1' | 'move2']`` maps
        run names to per-run dicts.
    move : int
        ``1`` or ``2``.
    threshold : int, default 5
        Initial count a run must strictly exceed to be printed.

    Raises
    ------
    ValueError
        If ``move`` is neither 1 nor 2.
    """
    if move == 1:
        move_str = 'move1'
    elif move == 2:
        move_str = 'move2'
    else:
        raise ValueError(f'move must be 1 or 2, got {move!r}')

    # The counts do not depend on the threshold, so compute them once per run
    # instead of on every relaxation pass.
    per_run = []
    for run_name, run_dict in tops[move_str].items():
        response, count = count_most(run_dict)
        per_run.append((run_name, response, count))

    if not per_run:
        # Nothing to report; without this guard the loop below would never end.
        return

    relaxed_by = 0
    while True:
        hits = [entry for entry in per_run if entry[2] > threshold - relaxed_by]
        if hits:
            break
        relaxed_by += 1

    for run_name, response, count in hits:
        print(f'{run_name}: {response}, Count: {count}')


In [278]:
print_tops(gpt4_exp_free, 1, 15)
print('-------------------------------------')
print_tops(gpt4_exp_free, 2, 15)

Run 18: ['Response 11'], Count: 13
-------------------------------------
Run 18: ['Response 9'], Count: 14


In [129]:
claude_anon_free_run5_m1, claude_anon_free_run5_m2 = parser.parse_free('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/claude35sonnet-free-False-20-1.0/main/run5_fixed/run5_fixed.csv')

for i, response in enumerate(claude_anon_free_run5_m1):
    print(f'{i+1}: {response}')

1: I recommend a three-pronged approach: 1) Publicly condemn Brown's aggressive actions and call for international support, emphasizing the violation of international maritime laws. 2) Increase defensive measures for Gray's naval forces, including deploying additional ships and air support to protect the 5th Task Force and the damaged destroyer. 3) Initiate urgent diplomatic channels with Brown, possibly through a neutral third party, to de-escalate tensions and negotiate safe passage for the damaged destroyer.
2: Maintain a defensive posture while seeking diplomatic de-escalation. Issue a formal protest to Brown through diplomatic channels regarding their aggressive actions. Reinforce the 5th Task Force with additional ships and air support to ensure the safety of Gray personnel. Deploy autonomous defense systems to augment manned defenses and improve response times. Simultaneously, initiate emergency consultations with Yellow, Green, and other allies to coordinate a unified response.

In [131]:
print_tops(claude_exp_free, 1, 15)
print('-------------------------------------')
print_tops(claude_exp_free, 2, 15)

Run 3: ['Response 14'], Count: 11
Run 4: ['Response 1'], Count: 11
Run 13: ['Response 12'], Count: 11
-------------------------------------
Run 10: ['Response 11'], Count: 18
Run 14: ['Response 5'], Count: 17


In [12]:
claude_exp_free_run10_m1, claude_exp_free_run10_m2 = parser.parse_free('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/claude35sonnet-free-True-20-1.0/main/run10/run10.csv')

for i, response in enumerate(claude_exp_free_run10_m2):
    print(f'{i+1}: {response}')

1: I recommend a multi-pronged approach: 1) Immediately convene an emergency meeting with key allies to coordinate a unified response. 2) Publicly reaffirm U.S. commitment to Taiwan's defense and denounce China's unilateral actions. 3) Increase military readiness in the region, including deploying additional naval and air assets. 4) Implement targeted economic sanctions against key Chinese officials and entities involved in the aggression. 5) Propose emergency UN Security Council talks to address the crisis. 6) Accelerate the delivery of defensive weapons to Taiwan and increase the presence of U.S. military advisors.
2: I recommend a multi-pronged approach: 1) Publicly reaffirm U.S. commitment to Taiwan's defense and reject China's claims over Taiwan's airspace and waters. 2) Rapidly deploy additional naval and air assets to the region to bolster defensive capabilities. 3) Engage in urgent diplomatic efforts with allies and partners to form a united front against China's aggression. 4)

In [13]:
claude_exp_free_run14_m1, claude_exp_free_run14_m2 = parser.parse_free('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/claude35sonnet-free-True-20-1.0/main/run14/run14.csv')

for i, response in enumerate(claude_exp_free_run14_m2):
    print(f'{i+1}: {response}')

1: I recommend a firm, multilateral response to China's actions. First, issue a strong public statement rejecting China's claims over Taiwan's airspace and waters, reaffirming U.S. commitment to Taiwan's defense. Immediately engage allies and partners, particularly Japan, Australia, and key European nations, to issue joint statements condemning China's actions. Increase military readiness in the region, including deploying additional naval and air assets. Accelerate the delivery of promised military aid to Taiwan. Simultaneously, propose high-level diplomatic talks with China to de-escalate tensions, while making it clear that the U.S. and its allies will not accept unilateral changes to the status quo.
2: I recommend a multi-pronged approach: 1) Immediately express regret for the loss of life while asserting the right to self-defense. 2) Enhance military readiness in the region, including deploying additional naval and air assets. 3) Formally reject China's claim over Taiwan's airspac

In [14]:
print_tops(gpt35_anon_free, 1, 15)
print('-------------------------------------')
print_tops(gpt35_anon_free, 2, 15)

Run 3: ['Response 5'], Count: 10
Run 6: ['Response 1'], Count: 10
-------------------------------------
Run 16: ['Response 7'], Count: 16


In [15]:
gpt35_anon_free_run16_m1, gpt35_anon_free_run16_m2 = parser.parse_free('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/gpt3.5turbo-free-False-20-1.0/main/run16_fixed/run16_fixed.csv')

for i, response in enumerate(gpt35_anon_free_run16_m2):
    print(f'{i+1}: {response}')

1: Gray should immediately convene a meeting with Yellow and Green to coordinate a joint diplomatic response to Brown's aggressive actions and territorial claims. Gray, Yellow, and Green should collectively petition for international support and intervention to challenge Brown's assertions and prevent further escalation in the region.
2: Gray should immediately convene a high-level emergency meeting with its allies and partners, including Yellow and Green, to discuss a coordinated response to Brown's escalating aggression. Gray should also consider enhancing its military presence in the region and activating joint defense agreements with its allies to deter any further advances by Brown.
3: Gray should urgently convene a high-level meeting with its allies and partners, particularly Yellow and Green, to discuss a coordinated response to Brown's aggressive actions and territorial assertions. Gray should also consider escalating diplomatic pressure through international organizations such

In [16]:
print_tops(gpt35_exp_free, 1, 15)
print('-------------------------------------')
print_tops(gpt35_exp_free, 2, 15)

Run 3: ['Response 5'], Count: 14
-------------------------------------
Run 5: ['Response 2'], Count: 15


In [17]:
gpt35_exp_free_run5_m1, gpt35_exp_free_run5_m2 = parser.parse_free('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/gpt3.5turbo-free-True-20-1.0/main/run5/run5.csv')

for i, response in enumerate(gpt35_exp_free_run5_m2):
    print(f'{i+1}: {response}')

1: Initiate high-level diplomatic talks with China to address the escalating situation and emphasize the importance of resolving the Taiwan issue peacefully to maintain regional stability. Coordinate closely with allies and partners to present a united front in urging for peaceful dialogue and adherence to international norms.
2: Increase diplomatic efforts with key allies and partners to rally support for U.S.A's position on Taiwan and activate intelligence-sharing mechanisms to gather more information on China's preparations for potential military action. Enhance military readiness and preparedness in the region, including reinforcing defensive capabilities in Taiwan and increasing surveillance and monitoring of Chinese military activities.
3: Urgently convene a meeting with key allies and partners to discuss coordinated responses to China's escalatory actions. Consider diplomatic, economic, and military options to deter further aggression and uphold international norms in the region

In [18]:
print_tops(gpt4_anon_free, 1, 15)
print('-------------------------------------')
print_tops(gpt4_anon_free, 2, 15)

Run 14: ['Response 19'], Count: 13
-------------------------------------
Run 9: ['Response 3'], Count: 15
Run 13: ['Response 2'], Count: 15


In [19]:
print_tops(gpt4_exp_free, 1, 15)
print('-------------------------------------')
print_tops(gpt4_exp_free, 2, 15)

Run 18: ['Response 11'], Count: 13
-------------------------------------
Run 18: ['Response 9'], Count: 14


In [20]:
print_tops(gpt4o_anon_free, 1, 15)
print('-------------------------------------')
print_tops(gpt4o_anon_free, 2, 15)

Run 5: ['Response 4'], Count: 14
-------------------------------------
Run 1: ['Response 11'], Count: 13
Run 17: ['Response 5'], Count: 13
Run 18: ['Response 17'], Count: 13


In [21]:
print_tops(gpt4o_exp_free, 1, 15)
print('-------------------------------------')
print_tops(gpt4o_exp_free, 2, 15)

Run 12: ['Response 1'], Count: 12
-------------------------------------
Run 13: ['Response 9'], Count: 17


In [22]:
gpt4o_exp_free_run13_m1, gpt4o_exp_free_run13_m2 = parser.parse_free('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/gpt4o-free-True-20-1.0/main/run13/run13.csv')

for i, response in enumerate(gpt4o_exp_free_run13_m2):
    print(f'{i+1}: {response}')

1: Immediately seek an emergency session of the UN Security Council to address and de-escalate the situation, clearly calling for China's adherence to international laws. Simultaneously, bolster U.S.A's military presence in the region, initiate joint exercises with allies, enforce freedom of navigation operations, and ensure Taiwan's defensive capabilities are fully operational. Prepare for potential economic sanctions against China and initiate close consultations with allies to present a unified stance.
2: Mobilize additional U.S.A naval and air assets to the region, reinforce Taiwan's defenses and accelerate the integration of new military capabilities. Simultaneously, initiate an emergency session of the United Nations Security Council to address the crisis, calling for international support to uphold international law and freedom of navigation. Communicate with allies and partners to secure a unified response, including economic sanctions if China proceeds with its aggressive acti

In [23]:
print_tops(gpt4omini_anon_free, 1, 15)
print('-------------------------------------')
print_tops(gpt4omini_anon_free, 2, 15)

Run 10: ['Response 15'], Count: 13
Run 14: ['Response 10'], Count: 13
Run 15: ['Response 2'], Count: 13
-------------------------------------
Run 18: ['Response 17'], Count: 15


In [24]:
print_tops(gpt4omini_exp_free, 1, 15)
print('-------------------------------------')
print_tops(gpt4omini_exp_free, 2, 15)

Run 10: ['Response 12'], Count: 15
Run 20: ['Response 9'], Count: 15
-------------------------------------
Run 11: ['Response 4'], Count: 14


In [23]:
# Root directory of the v4 experiment outputs, kept in one place so it only has
# to be edited once.
# NOTE(review): this is a machine-specific absolute path; consider making it
# relative to the repository or reading it from configuration.
OUTPUTS_ROOT = '/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4'

# Kendall scores per run for every ranking experiment, requesting up to 190
# pairs per run. The second argument is the run-directory suffix: some
# experiments store their runs under `run{i}_fixed`, the others under plain
# `run{i}`.
claude_anon_rank = get_tops(f'{OUTPUTS_ROOT}/claude35sonnet-rank-False-20-1.0/main', '_fixed', 'kendall', top_n=190)
claude_exp_rank = get_tops(f'{OUTPUTS_ROOT}/claude35sonnet-rank-True-20-1.0/main', '', 'kendall', top_n=190)

gpt35_anon_rank = get_tops(f'{OUTPUTS_ROOT}/gpt3.5turbo-rank-False-20-1.0/main', '_fixed', 'kendall', top_n=190)
gpt35_exp_rank = get_tops(f'{OUTPUTS_ROOT}/gpt3.5turbo-rank-True-20-1.0/main', '', 'kendall', top_n=190)

gpt4_anon_rank = get_tops(f'{OUTPUTS_ROOT}/gpt4-rank-False-20-1.0/main', '_fixed', 'kendall', top_n=190)
gpt4_exp_rank = get_tops(f'{OUTPUTS_ROOT}/gpt4-rank-True-20-1.0/main', '', 'kendall', top_n=190)

gpt4o_anon_rank = get_tops(f'{OUTPUTS_ROOT}/gpt4o-rank-False-20-1.0/main', '', 'kendall', top_n=190)
gpt4o_exp_rank = get_tops(f'{OUTPUTS_ROOT}/gpt4o-rank-True-20-1.0/main', '', 'kendall', top_n=190)

gpt4omini_anon_rank = get_tops(f'{OUTPUTS_ROOT}/gpt4omini-rank-False-20-1.0/main', '', 'kendall', top_n=190)
gpt4omini_exp_rank = get_tops(f'{OUTPUTS_ROOT}/gpt4omini-rank-True-20-1.0/main', '', 'kendall', top_n=190)


In [35]:
# Gather the move-2 'mean' entry of each of the 20 runs in gpt4_exp_rank,
# then print the average of those means followed by the individual values.
y = np.zeros(20)
for i in range(20):
    y[i] = gpt4_exp_rank['move2'][f'Run {i+1}']['mean']
print(y.mean())
print(y)


0.07927208371806709
[0.11708218 0.0692213  0.07417667 0.09245922 0.07839335 0.06186519
 0.06577408 0.08162512 0.08276393 0.06032625 0.07232995 0.06128039
 0.07420745 0.08417975 0.08119421 0.10710988 0.04983072 0.09372114
 0.08519544 0.09270545]


In [36]:
for i in range(1, 21):
    for k, v in gpt4_exp_rank['move2'][f'Run {i}'].items():
        if f'{v:.2f}' == '0.05':
            print(f'Run {i}')
            print(k, v)

Run 1
('Response 1', 'Response 4') 0.0935672514619883
Run 1
('Response 1', 'Response 5') 0.08187134502923976
Run 1
('Response 1', 'Response 7') 0.08771929824561409
Run 1
('Response 1', 'Response 9') 0.0935672514619883
Run 1
('Response 1', 'Response 11') 0.05263157894736836
Run 1
('Response 1', 'Response 12') 0.07602339181286544
Run 1
('Response 1', 'Response 14') 0.13450292397660824
Run 1
('Response 1', 'Response 16') 0.06432748538011701
Run 1
('Response 1', 'Response 17') 0.05263157894736836
Run 1
('Response 1', 'Response 19') 0.08187134502923976
Run 1
('Response 1', 'Response 20') 0.07017543859649122
Run 1
('Response 2', 'Response 3') 0.11695906432748537
Run 1
('Response 2', 'Response 4') 0.10526315789473684
Run 1
('Response 2', 'Response 7') 0.13450292397660824
Run 1
('Response 2', 'Response 9') 0.14035087719298245
Run 1
('Response 2', 'Response 10') 0.11695906432748537
Run 1
('Response 2', 'Response 12') 0.13450292397660824
Run 1
('Response 2', 'Response 13') 0.14035087719298245
Ru

In [269]:
print_tops(claude_anon_rank, 1, 15)
print('-------------------------------------')
print_tops(claude_anon_rank, 2, 15)

Run 1: ['Response 9'], Count: 14
Run 16: ['Response 19'], Count: 14
-------------------------------------
Run 6: ['Response 12'], Count: 16
Run 19: ['Response 6'], Count: 19


In [27]:
claude_anon_rank_run19_m1, claude_anon_rank_run19_m2 = parser.parse_rankings('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/claude35sonnet-rank-False-20-1.0/main/run19_fixed/run19_fixed.csv')

for i, response in enumerate(claude_anon_rank_run19_m2):
    print(f'{i+1}: {response}')

1: {'A': 5, 'B': 2, 'C': 1, 'D': 9, 'E': 11, 'F': 13, 'G': 3, 'H': 4, 'I': 6, 'J': 8, 'K': 7, 'L': 14, 'M': 10, 'N': 12, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
2: {'A': 5, 'B': 1, 'C': 2, 'D': 9, 'E': 11, 'F': 12, 'G': 3, 'H': 4, 'I': 6, 'J': 8, 'K': 7, 'L': 13, 'M': 10, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
3: {'A': 5, 'B': 2, 'C': 1, 'D': 9, 'E': 11, 'F': 12, 'G': 3, 'H': 4, 'I': 6, 'J': 8, 'K': 7, 'L': 13, 'M': 10, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
4: {'A': 5, 'B': 2, 'C': 1, 'D': 9, 'E': 11, 'F': 12, 'G': 3, 'H': 4, 'I': 6, 'J': 8, 'K': 7, 'L': 13, 'M': 10, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
5: {'A': 5, 'B': 2, 'C': 1, 'D': 8, 'E': 11, 'F': 13, 'G': 3, 'H': 4, 'I': 6, 'J': 9, 'K': 7, 'L': 14, 'M': 10, 'N': 12, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
6: {'A': 6, 'B': 2, 'C': 1, 'D': 11, 'E': 13, 'F': 14, 'G': 3, 'H': 4, 'I': 5, 'J': 9, 'K': 7, 'L': 15, 'M': 10, 'N': 8, 'O': 16, 'P': 17, 'Q': 18, 'R': 19, 'S': 12}
7: {

In [266]:
print_tops(claude_exp_rank, 1, 15)
print('-------------------------------------')
print_tops(claude_exp_rank, 2, 15)

Run 17: ['Response 7'], Count: 16
-------------------------------------
Run 9: ['Response 8'], Count: 16


In [268]:
print_tops(gpt35_exp_rank, 1, 15)
print('-------------------------------------')
print_tops(gpt35_exp_rank, 2, 15)

Run 7: ['Response 15'], Count: 13
Run 20: ['Response 16'], Count: 13
-------------------------------------
Run 3: ['Response 7'], Count: 16
Run 11: ['Response 5'], Count: 18
Run 15: ['Response 2'], Count: 17
Run 18: ['Response 7'], Count: 19
Run 19: ['Response 12'], Count: 18
Run 20: ['Response 12'], Count: 19


In [30]:
gpt35_anon_rank_run16_m1, gpt35_anon_rank_run16_m2 = parser.parse_rankings('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/gpt3.5turbo-rank-False-20-1.0/main/run16_fixed/run16_fixed.csv')

for i, response in enumerate(gpt35_anon_rank_run16_m2):
    print(f'{i+1}: {response}')

1: {'A': 3, 'B': 6, 'C': 2, 'D': 8, 'E': 5, 'F': 10, 'G': 7, 'H': 12, 'I': 4, 'J': 11, 'K': 9, 'L': 19, 'M': 14, 'N': 1, 'O': 18, 'P': 17, 'Q': 15, 'R': 16, 'S': 13}
2: {'A': 11, 'B': 7, 'C': 5, 'D': 8, 'E': 9, 'F': 12, 'G': 16, 'H': 6, 'I': 14, 'J': 13, 'K': 10, 'L': 19, 'M': 15, 'N': 3, 'O': 17, 'P': 18, 'Q': 4, 'R': 2, 'S': 1}
3: {'A': 3, 'B': 6, 'C': 2, 'D': 8, 'E': 12, 'F': 10, 'G': 4, 'H': 5, 'I': 7, 'J': 11, 'K': 9, 'L': 19, 'M': 13, 'N': 1, 'O': 16, 'P': 18, 'Q': 17, 'R': 15, 'S': 14}
4: {'A': 3, 'B': 7, 'C': 2, 'D': 8, 'E': 11, 'F': 10, 'G': 5, 'H': 4, 'I': 6, 'J': 9, 'K': 13, 'L': 19, 'M': 15, 'N': 1, 'O': 16, 'P': 18, 'Q': 17, 'R': 14, 'S': 12}
5: {'A': 3, 'B': 6, 'C': 2, 'D': 8, 'E': 11, 'F': 10, 'G': 4, 'H': 5, 'I': 7, 'J': 9, 'K': 12, 'L': 19, 'M': 14, 'N': 1, 'O': 18, 'P': 16, 'Q': 17, 'R': 15, 'S': 13}
6: {'A': 3, 'B': 5, 'C': 1, 'D': 8, 'E': 12, 'F': 6, 'G': 4, 'H': 2, 'I': 7, 'J': 9, 'K': 11, 'L': 19, 'M': 14, 'N': 10, 'O': 16, 'P': 18, 'Q': 17, 'R': 15, 'S': 13}
7: {

In [31]:
print_tops(gpt35_exp_rank, 1, 15)
print('-------------------------------------')
print_tops(gpt35_exp_rank, 2, 15)

Run 7: ['Response 15'], Count: 13
Run 20: ['Response 16'], Count: 13
-------------------------------------
Run 3: ['Response 7'], Count: 16
Run 11: ['Response 5'], Count: 18
Run 15: ['Response 2'], Count: 17
Run 18: ['Response 7'], Count: 19
Run 19: ['Response 12'], Count: 18
Run 20: ['Response 12'], Count: 19


In [32]:
gpt35_exp_rank_run20_m1, gpt35_exp_rank_run20_m2 = parser.parse_rankings('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/gpt3.5turbo-rank-True-20-1.0/main/run20/run20.csv')

for i, response in enumerate(gpt35_exp_rank_run20_m2):
    print(f'{i+1}: {response}')

1: {'A': 4, 'B': 1, 'C': 3, 'D': 9, 'E': 7, 'F': 8, 'G': 5, 'H': 2, 'I': 6, 'J': 12, 'K': 11, 'L': 17, 'M': 10, 'N': 13, 'O': 16, 'P': 15, 'Q': 19, 'R': 18, 'S': 14}
2: {'A': 6, 'B': 4, 'C': 1, 'D': 9, 'E': 15, 'F': 12, 'G': 8, 'H': 7, 'I': 10, 'J': 14, 'K': 3, 'L': 19, 'M': 18, 'N': 2, 'O': 17, 'P': 16, 'Q': 5, 'R': 13, 'S': 11}
3: {'A': 2, 'B': 4, 'C': 1, 'D': 9, 'E': 12, 'F': 8, 'G': 6, 'H': 3, 'I': 7, 'J': 10, 'K': 5, 'L': 19, 'M': 18, 'N': 11, 'O': 16, 'P': 15, 'Q': 14, 'R': 13, 'S': 17}
4: {'A': 6, 'B': 2, 'C': 3, 'D': 9, 'E': 11, 'F': 10, 'G': 7, 'H': 5, 'I': 4, 'J': 8, 'K': 12, 'L': 17, 'M': 18, 'N': 1, 'O': 16, 'P': 15, 'Q': 13, 'R': 14, 'S': 19}
5: {'A': 13, 'B': 7, 'C': 1, 'D': 9, 'E': 11, 'F': 10, 'G': 6, 'H': 4, 'I': 8, 'J': 12, 'K': 2, 'L': 19, 'M': 18, 'N': 3, 'O': 16, 'P': 15, 'Q': 14, 'R': 17, 'S': 5}
6: {'A': 3, 'B': 4, 'C': 2, 'D': 8, 'E': 11, 'F': 10, 'G': 7, 'H': 5, 'I': 6, 'J': 9, 'K': 1, 'L': 19, 'M': 18, 'N': 12, 'O': 14, 'P': 16, 'Q': 15, 'R': 17, 'S': 13}
7: {

In [33]:
print_tops(gpt4_anon_rank, 1, 15)
print('-------------------------------------')
print_tops(gpt4_anon_rank, 2, 15)

Run 2: ['Response 8'], Count: 17
Run 5: ['Response 20'], Count: 18
Run 10: ['Response 19'], Count: 16
Run 14: ['Response 11'], Count: 18
-------------------------------------
Run 4: ['Response 2'], Count: 18
Run 6: ['Response 12'], Count: 19
Run 8: ['Response 10'], Count: 17
Run 11: ['Response 4'], Count: 18
Run 12: ['Response 6'], Count: 17
Run 16: ['Response 3'], Count: 18
Run 17: ['Response 20'], Count: 17
Run 18: ['Response 18'], Count: 19


In [34]:
gpt4_anon_rank_run18_m1, gpt4_anon_rank_run18_m2 = parser.parse_rankings('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/gpt4-rank-False-20-1.0/main/run18_fixed/run18_fixed.csv')

for i, response in enumerate(gpt4_anon_rank_run18_m2):
    print(f'{i+1}: {response}')

1: {'A': 3, 'B': 1, 'C': 2, 'D': 8, 'E': 6, 'F': 5, 'G': 7, 'H': 4, 'I': 9, 'J': 11, 'K': 10, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
2: {'A': 2, 'B': 1, 'C': 4, 'D': 5, 'E': 3, 'F': 6, 'G': 9, 'H': 8, 'I': 7, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
3: {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 6, 'F': 5, 'G': 8, 'H': 7, 'I': 10, 'J': 11, 'K': 9, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
4: {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 9, 'H': 11, 'I': 7, 'J': 8, 'K': 10, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
5: {'A': 4, 'B': 1, 'C': 3, 'D': 6, 'E': 5, 'F': 7, 'G': 2, 'H': 8, 'I': 9, 'J': 11, 'K': 10, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
6: {'A': 3, 'B': 1, 'C': 2, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 11, 'I': 8, 'J': 9, 'K': 10, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
7: {

In [35]:
print_tops(gpt4_exp_rank, 1, 15)
print('-------------------------------------')
print_tops(gpt4_exp_rank, 2, 15)

Run 6: ['Response 19'], Count: 16
Run 7: ['Response 19'], Count: 16
Run 10: ['Response 5'], Count: 17
Run 19: ['Response 15'], Count: 17
-------------------------------------
Run 3: ['Response 6'], Count: 19
Run 4: ['Response 4'], Count: 19
Run 5: ['Response 18'], Count: 19
Run 11: ['Response 5'], Count: 18
Run 18: ['Response 15'], Count: 19


In [36]:
gpt4_exp_rank_run3_m1, gpt4_exp_rank_run3_m2 = parser.parse_rankings('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/gpt4-rank-True-20-1.0/main/run3/run3.csv')

for i, response in enumerate(gpt4_exp_rank_run3_m2):
    print(f'{i+1}: {response}')

1: {'A': 2, 'B': 1, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 13, 'I': 8, 'J': 10, 'K': 11, 'L': 16, 'M': 12, 'N': 9, 'O': 15, 'P': 14, 'Q': 18, 'R': 19, 'S': 17}
2: {'A': 2, 'B': 1, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 18, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 19, 'R': 17, 'S': 16}
3: {'A': 2, 'B': 1, 'C': 3, 'D': 4, 'E': 6, 'F': 5, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 18, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 19, 'R': 17, 'S': 16}
4: {'A': 2, 'B': 1, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 18, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 19, 'R': 17, 'S': 16}
5: {'A': 3, 'B': 1, 'C': 2, 'D': 4, 'E': 5, 'F': 7, 'G': 8, 'H': 6, 'I': 9, 'J': 10, 'K': 11, 'L': 18, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 19, 'R': 17, 'S': 16}
6: {'B': 1, 'C': 2, 'K': 3, 'I': 4, 'N': 5, 'A': 6, 'J': 7, 'D': 8, 'E': 9, 'F': 10, 'M': 11, 'G': 12, 'P': 13, 'H': 14, 'O': 15, 'L': 16, 'Q': 17, 'R': 18, 'S': 19}
7: {

In [37]:
print_tops(gpt4o_anon_rank, 1, 15)
print('-------------------------------------')
print_tops(gpt4o_anon_rank, 2, 15)

Run 4: ['Response 7'], Count: 19
Run 10: ['Response 5'], Count: 19
Run 12: ['Response 1'], Count: 19
Run 17: ['Response 20'], Count: 19
-------------------------------------
Run 5: ['Response 10'], Count: 19
Run 7: ['Response 14'], Count: 16
Run 8: ['Response 20'], Count: 17
Run 10: ['Response 14'], Count: 19
Run 14: ['Response 11'], Count: 16
Run 15: ['Response 18'], Count: 17
Run 16: ['Response 7'], Count: 18


In [38]:
gpt4o_anon_rank_run17_m1, gpt4o_anon_rank_run17_m2 = parser.parse_rankings('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/gpt4o-rank-False-20-1.0/main/run17/run17.csv')

for i, response in enumerate(gpt4o_anon_rank_run17_m1):
    print(f'{i+1}: {response}')

1: {'A': 1, 'B': 4, 'C': 2, 'D': 8, 'E': 3, 'F': 5, 'G': 7, 'H': 6, 'I': 10, 'J': 11, 'K': 12, 'L': 16, 'M': 13, 'N': 9, 'O': 15, 'P': 18, 'Q': 17, 'R': 19, 'S': 14}
2: {'A': 3, 'B': 2, 'C': 1, 'D': 10, 'E': 11, 'F': 12, 'G': 5, 'H': 7, 'I': 6, 'J': 9, 'K': 8, 'L': 13, 'M': 14, 'N': 4, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
3: {'A': 4, 'B': 5, 'C': 3, 'D': 9, 'E': 8, 'F': 11, 'G': 7, 'H': 6, 'I': 10, 'J': 15, 'K': 2, 'L': 16, 'M': 14, 'N': 1, 'O': 13, 'P': 18, 'Q': 19, 'R': 17, 'S': 12}
4: {'A': 1, 'B': 3, 'C': 8, 'D': 14, 'E': 9, 'F': 4, 'G': 2, 'H': 10, 'I': 5, 'J': 6, 'K': 7, 'L': 12, 'M': 13, 'N': 11, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
5: {'A': 2, 'B': 4, 'C': 1, 'D': 11, 'E': 5, 'F': 3, 'G': 8, 'H': 6, 'I': 7, 'J': 9, 'K': 10, 'L': 16, 'M': 13, 'N': 12, 'O': 14, 'P': 15, 'Q': 17, 'R': 18, 'S': 19}
6: {'A': 4, 'B': 1, 'C': 2, 'D': 11, 'E': 5, 'F': 7, 'G': 8, 'H': 6, 'I': 12, 'J': 9, 'K': 3, 'L': 16, 'M': 13, 'N': 10, 'O': 15, 'P': 14, 'Q': 18, 'R': 17, 'S': 19}
7: {

In [39]:
print_tops(gpt4o_exp_rank, 1, 15)
print('-------------------------------------')
print_tops(gpt4o_exp_rank, 2, 15)

Run 20: ['Response 18'], Count: 15
-------------------------------------
Run 3: ['Response 12'], Count: 19
Run 4: ['Response 4'], Count: 19
Run 7: ['Response 18'], Count: 19
Run 12: ['Response 18'], Count: 19
Run 13: ['Response 11'], Count: 19
Run 14: ['Response 16'], Count: 16
Run 16: ['Response 20'], Count: 17
Run 20: ['Response 3'], Count: 18


In [40]:
gpt4o_exp_rank_run13_m1, gpt4o_exp_rank_run13_m2 = parser.parse_rankings('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/gpt4o-rank-True-20-1.0/main/run13/run13.csv')

for i, response in enumerate(gpt4o_exp_rank_run13_m2):
    print(f'{i+1}: {response}')

1: {'A': 2, 'B': 1, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 9, 'I': 8, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
2: {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
3: {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
4: {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
5: {'A': 1, 'E': 2, 'B': 3, 'F': 4, 'C': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9, 'K': 10, 'M': 11, 'N': 12, 'D': 13, 'L': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
6: {'A': 2, 'B': 1, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
7: {

In [41]:
# Print the print_tops report for gpt4omini_anon_rank twice, separated by a dashed line:
# first with second argument 1, then with second argument 2.
# NOTE(review): print_tops and gpt4omini_anon_rank are defined outside this cell. The
# second argument presumably selects the move (1 or 2) and the meaning of 15 is set by
# print_tops -- confirm against its definition.
print_tops(gpt4omini_anon_rank, 1, 15)
print('-------------------------------------')
print_tops(gpt4omini_anon_rank, 2, 15)

Run 14: ['Response 4'], Count: 14
Run 17: ['Response 5'], Count: 14
-------------------------------------
Run 1: ['Response 8'], Count: 18
Run 7: ['Response 18'], Count: 19
Run 14: ['Response 8'], Count: 17
Run 17: ['Response 14'], Count: 17


In [46]:
# Parse run7.csv of the gpt4omini-rank-False-20-1.0 experiment. parse_rankings' result is
# unpacked into two names (presumably the move 1 and move 2 rankings -- confirm against
# Parser.parse_rankings).
# NOTE(review): the path is absolute and points into one user's home directory, so it is
# machine-specific.
gpt4omini_anon_rank_run7_m1, gpt4omini_anon_rank_run7_m2 = parser.parse_rankings('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/gpt4omini-rank-False-20-1.0/main/run7/run7.csv')

# Print every entry of the second unpacked value, numbered from 1 (enumerate is 0-based,
# hence i+1). Only the *_m2 value is inspected in this cell.
for i, response in enumerate(gpt4omini_anon_rank_run7_m2):
    print(f'{i+1}: {response}')

1: {'A': 10, 'B': 3, 'C': 2, 'D': 12, 'E': 8, 'F': 11, 'G': 5, 'H': 4, 'I': 6, 'J': 7, 'K': 1, 'L': 13, 'M': 14, 'N': 15, 'O': 16, 'P': 17, 'Q': 18, 'R': 19, 'S': 19}
2: {'A': 10, 'B': 2, 'C': 3, 'D': 11, 'E': 8, 'F': 9, 'G': 4, 'H': 5, 'I': 6, 'J': 7, 'K': 1, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
3: {'A': 10, 'B': 2, 'C': 3, 'D': 11, 'E': 8, 'F': 9, 'G': 4, 'H': 5, 'I': 6, 'J': 12, 'K': 1, 'L': 13, 'M': 14, 'N': 7, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
4: {'A': 11, 'B': 2, 'C': 3, 'D': 12, 'E': 10, 'F': 9, 'G': 4, 'H': 5, 'I': 6, 'J': 7, 'K': 1, 'L': 13, 'M': 14, 'N': 15, 'O': 16, 'P': 17, 'Q': 18, 'R': 19, 'S': 19}
5: {'A': 12, 'B': 3, 'C': 4, 'D': 13, 'E': 9, 'F': 8, 'G': 1, 'H': 5, 'I': 2, 'J': 6, 'K': 7, 'L': 14, 'M': 15, 'N': 16, 'O': 17, 'P': 19, 'Q': 18, 'R': 19, 'S': 19}
6: {'A': 10, 'B': 2, 'C': 3, 'D': 11, 'E': 8, 'F': 9, 'G': 4, 'H': 5, 'I': 7, 'J': 6, 'K': 1, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19}
7:

In [43]:
# Print the print_tops report for gpt4omini_exp_rank twice, separated by a dashed line:
# first with second argument 1, then with second argument 2.
# NOTE(review): print_tops and gpt4omini_exp_rank are defined outside this cell. The
# second argument presumably selects the move (1 or 2). The saved output of this cell
# includes a line with "Count: 11", so 15 does not look like a plain lower bound on the
# printed Count -- confirm its meaning against the print_tops definition.
print_tops(gpt4omini_exp_rank, 1, 15)
print('-------------------------------------')
print_tops(gpt4omini_exp_rank, 2, 15)

Run 2: ['Response 1'], Count: 11
-------------------------------------
Run 8: ['Response 14'], Count: 16
Run 14: ['Response 1'], Count: 19


In [44]:
# Parse run14.csv of the gpt4omini-rank-True-20-1.0 experiment. parse_rankings' result is
# unpacked into two names (presumably the move 1 and move 2 rankings -- confirm against
# Parser.parse_rankings).
# NOTE(review): the path is absolute and points into one user's home directory, so it is
# machine-specific.
gpt4omini_exp_rank_run14_m1, gpt4omini_exp_rank_run14_m2 = parser.parse_rankings('/Users/aryanshrivastava/Desktop/LLMWargamingConfidence/logging/outputs/v4/gpt4omini-rank-True-20-1.0/main/run14/run14.csv')

# Print every entry of the second unpacked value, numbered from 1 (enumerate is 0-based,
# hence i+1). Only the *_m2 value is inspected in this cell.
for i, response in enumerate(gpt4omini_exp_rank_run14_m2):
    print(f'{i+1}: {response}')

1: {'A': 11, 'B': 3, 'C': 2, 'D': 14, 'E': 15, 'F': 18, 'G': 1, 'H': 4, 'I': 7, 'J': 6, 'K': 5, 'L': 19, 'M': 12, 'N': 8, 'O': 17, 'P': 16, 'Q': 1, 'R': 1, 'S': 1}
2: {'A': 11, 'B': 2, 'C': 3, 'D': 14, 'E': 12, 'F': 13, 'G': 1, 'H': 5, 'I': 6, 'J': 7, 'K': 4, 'L': 19, 'M': 16, 'N': 8, 'O': 15, 'P': 18, 'Q': 17, 'R': 19, 'S': 19}
3: {'A': 11, 'B': 2, 'C': 3, 'D': 17, 'E': 16, 'F': 15, 'G': 4, 'H': 5, 'I': 7, 'J': 9, 'K': 1, 'L': 18, 'M': 14, 'N': 6, 'O': 10, 'P': 13, 'Q': 12, 'R': 19, 'S': 19}
4: {'A': 10, 'B': 2, 'C': 3, 'D': 14, 'E': 12, 'F': 13, 'G': 4, 'H': 5, 'I': 6, 'J': 7, 'K': 1, 'L': 19, 'M': 15, 'N': 8, 'O': 16, 'P': 18, 'Q': 17, 'R': 19, 'S': 19}
5: {'A': 11, 'B': 2, 'C': 3, 'D': 14, 'E': 13, 'F': 12, 'G': 4, 'H': 5, 'I': 6, 'J': 8, 'K': 1, 'L': 19, 'M': 15, 'N': 7, 'O': 17, 'P': 18, 'Q': 16, 'R': 19, 'S': 19}
6: {'A': 11, 'B': 2, 'C': 3, 'D': 14, 'E': 12, 'F': 18, 'G': 4, 'H': 5, 'I': 6, 'J': 8, 'K': 1, 'L': 19, 'M': 15, 'N': 7, 'O': 16, 'P': 9, 'Q': 17, 'R': 19, 'S': 19}
7:

## find close to mean (establishes baselines)