In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# Apply seaborn's default plot theme to all matplotlib figures created below.
sns.set()

In [2]:
# Trigger sentences evaluated in this notebook. The 1-based position of each
# entry is the trigger number used in the CSV-Summary file names.
triggers = [
    "Let's reason the events according to the reference game",
    # NOTE(review): this entry uses a typographic apostrophe (’) unlike the
    # others; kept verbatim - confirm it matches the text used in the runs.
    "Let’s think step by step.",
    "According to the rules of the game",
    "The reference game is",
    "Let's think like a game tester",
    "First,",
    "Let's think like a game designer.",
]

In [3]:
# Nested result store, filled below as accuracies[model][description][trigger number].
accuracies = dict()

In [4]:
# For every model / description / trigger combination, load the matching
# summary CSV and store the share of rows where 'Prediction' equals 'GT',
# expressed as a percentage.
for model in ['OPT-66B', 'OPT-175B', 'text-ada-001', 'text-babbage-001', 'text-curie-001', 'text-davinci-002']:
  accuracies[model] = {}
  for desc in ['Descr1', 'Descr2']:
    accuracies[model][desc] = {}
    # Files are numbered from 1, one per entry of `triggers`.
    for i in range(len(triggers)):
      summary = pd.read_csv(f'CSV-Summary/{model}/{desc}-trigger-{i+1}-{model}.csv')
      is_match = summary['GT'] == summary['Prediction']
      accuracies[model][desc][i+1] = np.mean(is_match)*100

In [5]:
# Build one frame per model (rows: trigger number, columns: description) and
# place them side by side; the dict keys become the outer column level.
frames_by_model = {}
for model_name, per_description in accuracies.items():
  frames_by_model[model_name] = pd.DataFrame(per_description)

accuracy_df = pd.concat(frames_by_model, axis=1)
accuracy_df

Unnamed: 0_level_0,OPT-66B,OPT-66B,OPT-175B,OPT-175B,text-ada-001,text-ada-001,text-babbage-001,text-babbage-001,text-curie-001,text-curie-001,text-davinci-002,text-davinci-002
Unnamed: 0_level_1,Descr1,Descr2,Descr1,Descr2,Descr1,Descr2,Descr1,Descr2,Descr1,Descr2,Descr1,Descr2
1,15.568862,23.353293,14.97006,32.934132,31.137725,22.155689,49.101796,29.94012,43.113772,27.54491,70.658683,59.88024
2,15.568862,31.137725,15.568862,32.934132,34.131737,19.161677,49.101796,31.137725,41.317365,29.94012,62.874251,58.083832
3,48.502994,31.736527,13.772455,31.137725,16.167665,6.586826,49.700599,31.736527,41.317365,31.137725,52.095808,58.682635
4,44.311377,31.137725,16.167665,31.736527,7.784431,2.994012,47.904192,30.538922,44.91018,32.335329,52.694611,55.688623
5,26.946108,37.125749,13.173653,31.137725,27.54491,19.161677,47.904192,32.934132,36.526946,31.736527,50.898204,50.898204
6,28.143713,29.94012,19.161677,31.736527,20.958084,8.982036,49.101796,31.137725,43.113772,29.341317,45.508982,50.299401
7,22.155689,36.526946,13.173653,31.137725,23.353293,17.964072,49.101796,31.736527,39.520958,32.934132,43.113772,50.299401


In [6]:
# Show the accuracy table rounded to two decimals; the result is only
# displayed, so accuracy_df keeps its full precision.
accuracy_df.round(2)

Unnamed: 0_level_0,OPT-66B,OPT-66B,OPT-175B,OPT-175B,text-ada-001,text-ada-001,text-babbage-001,text-babbage-001,text-curie-001,text-curie-001,text-davinci-002,text-davinci-002
Unnamed: 0_level_1,Descr1,Descr2,Descr1,Descr2,Descr1,Descr2,Descr1,Descr2,Descr1,Descr2,Descr1,Descr2
1,15.57,23.35,14.97,32.93,31.14,22.16,49.1,29.94,43.11,27.54,70.66,59.88
2,15.57,31.14,15.57,32.93,34.13,19.16,49.1,31.14,41.32,29.94,62.87,58.08
3,48.5,31.74,13.77,31.14,16.17,6.59,49.7,31.74,41.32,31.14,52.1,58.68
4,44.31,31.14,16.17,31.74,7.78,2.99,47.9,30.54,44.91,32.34,52.69,55.69
5,26.95,37.13,13.17,31.14,27.54,19.16,47.9,32.93,36.53,31.74,50.9,50.9
6,28.14,29.94,19.16,31.74,20.96,8.98,49.1,31.14,43.11,29.34,45.51,50.3
7,22.16,36.53,13.17,31.14,23.35,17.96,49.1,31.74,39.52,32.93,43.11,50.3


# Statistical Analysis

In [7]:
from scipy.stats import mannwhitneyu, wilcoxon
from statannotations.Annotator import Annotator

In [8]:
# Per-sample correctness for the statistical tests: for each description and
# trigger number, a 0/1 Series marking whether 'Prediction' equals 'GT'.
# Only text-davinci-002 is analysed here.
correctness = {}

for model in ['text-davinci-002']:
  for desc in ['Descr1', 'Descr2']:
    correctness[desc] = {}
    # Files are numbered from 1, one per entry of `triggers`.
    for i in range(len(triggers)):
      df = pd.read_csv(f'CSV-Summary/{model}/{desc}-trigger-{i+1}-{model}.csv')
      is_match = df['GT'] == df['Prediction']
      correctness[desc][i+1] = is_match.astype(int)

In [9]:
# One frame per description (columns: trigger number, rows: samples), joined
# side by side so columns are addressed as (description, trigger number).
frames_by_description = {}
for description, per_trigger in correctness.items():
  frames_by_description[description] = pd.DataFrame(per_trigger)

correctness_df = pd.concat(frames_by_description, axis=1)

In [10]:
# Display the per-sample 0/1 correctness table (columns: (description, trigger number)).
correctness_df

Unnamed: 0_level_0,Descr1,Descr1,Descr1,Descr1,Descr1,Descr1,Descr1,Descr2,Descr2,Descr2,Descr2,Descr2,Descr2,Descr2
Unnamed: 0_level_1,1,2,3,4,5,6,7,1,2,3,4,5,6,7
0,1,1,1,0,0,0,0,0,0,0,0,0,1,1
1,0,0,0,0,0,0,0,1,1,0,0,0,0,0
2,0,1,1,0,1,1,1,1,0,1,0,0,0,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,1,0,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,1,1,1,1,1,1,1,1,1,1,1,1,1,1
163,1,0,0,0,0,0,0,1,0,1,1,1,0,0
164,1,1,1,1,1,0,0,1,0,0,1,1,1,1
165,0,1,1,1,0,1,1,0,0,1,0,1,1,0


In [11]:
# p-value matrix: entry [X-1, Y-1] holds the two-sided Wilcoxon signed-rank
# p-value comparing Descr1 under trigger X with Descr2 under trigger Y.
# NOTE(review): the inputs are 0/1 correctness indicators, so the paired
# differences can only be -1, 0 or 1 - confirm this test is the intended one.
siginficanc_matrix = np.zeros((7,7))

for X in range(1, 8):
  descr1_correct = correctness_df[('Descr1', X)]
  for Y in range(1, 8):
    descr2_correct = correctness_df[('Descr2', Y)]
    test_result = wilcoxon(descr1_correct, descr2_correct, alternative="two-sided")
    siginficanc_matrix[X-1,Y-1] = test_result.pvalue

In [12]:
# Display the 7x7 p-value matrix (rows: Descr1 trigger, columns: Descr2 trigger).
siginficanc_matrix

array([[3.89474557e-02, 1.26939701e-02, 2.90963317e-02, 3.43319285e-03,
        2.92084890e-04, 2.07501594e-04, 2.89619410e-04],
       [5.91919663e-01, 3.76991993e-01, 4.63071015e-01, 2.05903211e-01,
        2.90963317e-02, 2.60144796e-02, 2.43579157e-02],
       [1.63394231e-01, 2.86422023e-01, 2.21623602e-01, 5.12690760e-01,
        8.23063274e-01, 7.47730165e-01, 7.44881620e-01],
       [1.63024384e-01, 3.05058859e-01, 2.51349109e-01, 5.63702862e-01,
        7.32439900e-01, 6.54720846e-01, 6.46355196e-01],
       [1.07798214e-01, 1.85112017e-01, 1.63394231e-01, 3.58795358e-01,
        1.00000000e+00, 9.14621388e-01, 9.13626661e-01],
       [1.33082930e-02, 2.43579157e-02, 1.39062969e-02, 6.51964191e-02,
        2.92171299e-01, 3.82733089e-01, 3.93768635e-01],
       [1.74511870e-03, 6.06758497e-03, 4.55634980e-03, 2.43579157e-02,
        1.38476715e-01, 2.05903211e-01, 1.90430264e-01]])

In [13]:
# The main diagonal holds the comparisons where both descriptions use the
# same trigger number.
pvalues = siginficanc_matrix.diagonal()

# Text labels of the form 'p=<value>' in scientific notation with two decimals.
formatted_pvalues = [
  f'p={pvalue:.2e}'
  for pvalue in pvalues
]

In [14]:
# Display the same-trigger p-values (diagonal of the p-value matrix).
pvalues

array([0.03894746, 0.37699199, 0.2216236 , 0.56370286, 1.        ,
       0.38273309, 0.19043026])