In [1]:
import pandas as pd
import pickle
import jsonlines

In [2]:
from collections import Counter

In [3]:
import ast

In [4]:
import choix
from itertools import combinations

In [5]:
from scipy.stats import spearmanr, kendalltau

In [6]:
import numpy as np

### Load Data

In [7]:
# Pairwise human annotations for the 7B models.
# The "texts" column is serialized in the CSV as a Python-literal string, so it is
# parsed back into a Python object with ast.literal_eval (safe: literals only).
all_7b = pd.read_csv("7b_pairwise_annotations.csv")
all_7b["texts"] = all_7b["texts"].apply(ast.literal_eval)

In [8]:
# Pairwise human annotations for the 3B models.
# The "texts" column is serialized in the CSV as a Python-literal string, so it is
# parsed back into a Python object with ast.literal_eval (safe: literals only).
all_3b = pd.read_csv("3b_pairwise_annotations.csv")
all_3b["texts"] = all_3b["texts"].apply(ast.literal_eval)

In [9]:
automated_metrics = pd.read_csv("./raw_automated_metrics.csv")

In [10]:
# Annotation dimensions: one DataFrame column per dimension, each holding the
# annotator's preferred option (or 'same').
votecols = [
    'plot',
    'character',
    'creativity',
    'development',
    'language',
    'overall_quality',
]

In [11]:
genres_we_care_about = ['scifi', 'fantasy', 'romance', 'historical']

# Clean util

In [12]:
def clean(continuation, withwords=False):
    """Trim a generated continuation down to its non-degenerate prefix.

    Steps, applied in order:
      1. Cut at the first occurrence of each known end-of-chapter marker,
         stripping surrounding whitespace after every cut.
      2. Keep paragraphs (split on a blank line) up to, but not including,
         the first paragraph that exactly repeats an earlier one.
      3. Keep lines until a line of at least 10 space-separated tokens has
         already been kept 3 times.
      4. Only if ``withwords`` is true: walk the text word by word and stop as
         soon as the preceding 20-word window has been seen before and is
         either very frequent (seen >= 10 times) or low-diversity
         (<= 10 distinct words).

    Args:
        continuation: raw generated text.
        withwords: enable the word-window repetition cut (step 4).

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    end_markers = (
        "<END_OF_CHAPTER>",
        "### END_OF_CHAPTER ###",
        "### End of Chapter",
        "**To be continued...** ",
        "---\n\nThis chapter",
    )
    for marker in end_markers:
        continuation = continuation.split(marker)[0].strip()

    # Paragraph level: the first exact repeat ends the text.
    kept_paragraphs = []
    known_paragraphs = set()
    for paragraph in continuation.split("\n\n"):
        if paragraph in known_paragraphs:
            break
        known_paragraphs.add(paragraph)
        kept_paragraphs.append(paragraph)
    continuation = "\n\n".join(kept_paragraphs)

    # Line level: long lines may appear at most three times.
    kept_lines = []
    times_seen = {}
    for line in continuation.split("\n"):
        is_long = len(line.split(" ")) >= 10
        if is_long and times_seen.get(line, 0) >= 3:
            break
        times_seen[line] = times_seen.get(line, 0) + 1
        kept_lines.append(line)
    continuation = "\n".join(kept_lines)

    # Word level: detect loops via repeated 20-word windows.
    if withwords:
        window = 20
        words = continuation.split(" ")
        window_counts = {}
        kept_words = words[:window]
        for pos in range(window, len(words)):
            recent = words[pos - window:pos]
            signature = " ".join(recent)
            if signature in window_counts:
                too_frequent = window_counts[signature] >= 10
                low_diversity = len(set(recent)) <= 10
                if too_frequent or low_diversity:
                    break
            window_counts[signature] = window_counts.get(signature, 0) + 1
            kept_words.append(words[pos])
        continuation = " ".join(kept_words)

    return continuation.strip()

# Choix util (pairwise win rates, strength, etc.)

In [13]:
def get_pairwise_winner_data_params(df, key='overall_quality', options = ['base7b', 'basewr_7b']):
    """Collect (winner, loser) index pairs and fit pairwise-comparison strengths.

    For every unordered pair drawn from ``options``, the rows of ``df`` whose
    ``'options'`` value equals the alphabetically sorted, comma-joined pair are
    scanned. Each vote in column ``key`` other than ``'same'`` contributes one
    ``(winner_idx, loser_idx)`` tuple, where indices are positions in ``options``.

    Args:
        df: annotation frame with an ``'options'`` column and a ``key`` column.
        key: name of the vote column to read.
        options: option labels; their order defines the item indices.

    Returns:
        ``(data, params)`` where ``data`` is the list of index pairs and
        ``params`` is the result of ``choix.ilsr_pairwise(len(options), data)``.

    Raises:
        ValueError: if a vote is neither ``'same'`` nor one of ``options``.
    """
    data = []
    for first, second in list(combinations(options, r=2)):
        # The 'options' column stores each pair sorted alphabetically.
        pair_label = ",".join(sorted([first, second]))
        pair_rows = df[df['options'] == pair_label]
        for _, annotation in pair_rows.iterrows():
            vote = annotation[key]
            if vote != 'same':
                winner_idx = options.index(vote)
                other = second if vote == first else first
                data.append((winner_idx, options.index(other)))

    return data, choix.ilsr_pairwise(len(options), data)

# Tables

## Preference Probability by Dimension - BT-Probability A wins (A vs B)

In [16]:
def get_preference_data(df, ordered_columns):
    """Build a table of pairwise preference probabilities per annotation dimension.

    For every dimension in the notebook-level ``votecols`` and every comparison
    in ``ordered_columns`` (strings of the form ``"A,B"``), strengths are fitted
    with ``get_pairwise_winner_data_params`` and the probability returned by
    ``choix.probabilities([0, 1], params)`` for item 0 (i.e. ``A``) is stored as
    a percentage.

    Args:
        df: annotation frame with an ``'options'`` column and one column per
            entry of ``votecols``.
        ordered_columns: comparison labels ``"A,B"``; ``A`` is the option whose
            probability is reported.

    Returns:
        DataFrame with a ``"Dimension"`` column followed by one column per
        entry of ``ordered_columns``.
    """
    preference_data = []
    for key in votecols:
        row = [key]
        for cur_options in ordered_columns:
            # Only the fitted parameters are needed; the raw pair list is discarded.
            _, compparams = get_pairwise_winner_data_params(
                df, key=key, options=cur_options.split(",")
            )
            prob_0_wins, _ = choix.probabilities([0, 1], compparams)
            row.append(prob_0_wins * 100)
        preference_data.append(row)
    return pd.DataFrame(data=preference_data, columns=["Dimension", *ordered_columns])

### 7B

In [17]:
columns = ['basesft_7b,base7b', 'basewr_7b,base7b', 'basewr_7b,basesft_7b',
       'trained_7b,basesft_7b', 'trained_7b,base7b',
       'trained_7b,basewr_7b']

In [18]:
get_preference_data(all_7b, columns)

Unnamed: 0,Dimension,"basesft_7b,base7b","basewr_7b,base7b","basewr_7b,basesft_7b","trained_7b,basesft_7b","trained_7b,base7b","trained_7b,basewr_7b"
0,plot,5.263158,72.222222,83.333333,89.473684,52.941176,62.5
1,character,26.315789,68.75,84.210526,89.473684,46.666667,61.538462
2,creativity,21.052632,57.142857,82.352941,84.210526,81.25,70.588235
3,development,22.222222,68.75,88.235294,89.473684,52.941176,66.666667
4,language,35.0,66.666667,76.470588,88.888889,80.0,58.823529
5,overall_quality,15.0,66.666667,83.333333,90.0,76.470588,64.705882


### 3B

In [19]:
columns = ['basesft_3b,base3b', 'basewr_3b,base3b', 'basewr_3b,basesft_3b',
       'trained_3b,basesft_3b', 'trained_3b,base3b',
       'trained_3b,basewr_3b']

In [20]:
get_preference_data(all_3b, columns)

Unnamed: 0,Dimension,"basesft_3b,base3b","basewr_3b,base3b","basewr_3b,basesft_3b","trained_3b,basesft_3b","trained_3b,base3b","trained_3b,basewr_3b"
0,plot,5.555556,58.823529,83.333333,83.333333,66.666667,57.142857
1,character,17.647059,52.941176,80.0,94.117647,52.941176,69.230769
2,creativity,17.647059,66.666667,77.777778,86.666667,53.846154,38.461538
3,development,22.222222,61.111111,73.684211,92.857143,66.666667,57.142857
4,language,15.0,62.5,78.947368,94.444444,46.666667,42.857143
5,overall_quality,5.882353,60.0,85.0,94.117647,52.941176,62.5


## Win-Rate Tables (A / (A + Same + B))

In [21]:
def get_winrate_data(df, ordered_columns, nice_comb_names=None):
    """Build a table of raw win rates, A / (A + same + B), per annotation dimension.

    Args:
        df: annotation frame with an ``'options'`` column (pair labels sorted
            alphabetically and comma-joined) and one column per entry of the
            notebook-level ``votecols``.
        ordered_columns: comparison labels ``"A,B"``; the win rate is reported
            for ``A``, the first option.
        nice_comb_names: optional mapping from each comparison label to a short
            display name; when given, result columns are named ``"WR-<name>"``.

    Returns:
        DataFrame with a ``"Dimension"`` column followed by one win-rate column
        (in percent) per entry of ``ordered_columns``.

    Raises:
        ZeroDivisionError: if ``df`` has no rows for one of the comparisons.
        KeyError: if ``nice_comb_names`` lacks an entry of ``ordered_columns``.
    """
    winrate_data = []
    for key in votecols:
        row = [key]
        # Iterate the argument, not the notebook-global `columns`.
        for cur_options in ordered_columns:
            pair = cur_options.split(",")
            opt1 = pair[0]
            nice_options = ",".join(sorted(pair))
            all_count = Counter(df[df['options'] == nice_options][key])
            a_winrate = all_count[opt1] / sum(all_count.values())
            row.append(a_winrate * 100)
        winrate_data.append(row)

    nice_columns = ordered_columns
    if nice_comb_names is not None:
        nice_columns = ["WR-" + nice_comb_names[c] for c in ordered_columns]

    return pd.DataFrame(data=winrate_data, columns=["Dimension", *nice_columns])

### 7B

In [22]:
columns = ['basesft_7b,base7b', 'basewr_7b,base7b', 'basewr_7b,basesft_7b',
       'trained_7b,basesft_7b', 'trained_7b,base7b',
       'trained_7b,basewr_7b']

In [23]:
nice_comb_names = {
    'basesft_7b,base7b': 'sft+base',
    'basewr_7b,base7b': 'br+base',
    'basewr_7b,basesft_7b': 'br+sft',
    'trained_7b,basesft_7b': 'rl+sft',
    'trained_7b,base7b': 'rl+base',
    'trained_7b,basewr_7b': 'rl+br'
}

In [24]:
get_winrate_data(all_7b, columns, nice_comb_names)

Unnamed: 0,Dimension,WR-sft+base,WR-br+base,WR-br+sft,WR-rl+sft,WR-rl+base,WR-rl+br
0,plot,5.0,65.0,75.0,85.0,45.0,50.0
1,character,25.0,55.0,80.0,85.0,35.0,40.0
2,creativity,20.0,40.0,70.0,80.0,65.0,60.0
3,development,20.0,55.0,75.0,85.0,45.0,50.0
4,language,35.0,40.0,65.0,80.0,60.0,50.0
5,overall_quality,15.0,60.0,75.0,90.0,65.0,55.0


### 3B

In [25]:
columns = ['basesft_3b,base3b', 'basewr_3b,base3b', 'basewr_3b,basesft_3b',
       'trained_3b,basesft_3b', 'trained_3b,base3b',
       'trained_3b,basewr_3b']

In [26]:
nice_comb_names = {
    'basesft_3b,base3b': 'sft+base',
    'basewr_3b,base3b': 'br+base',
    'basewr_3b,basesft_3b': 'br+sft',
    'trained_3b,basesft_3b': 'rl+sft',
    'trained_3b,base3b': 'rl+base',
    'trained_3b,basewr_3b': 'rl+br'
}

In [27]:
get_winrate_data(all_3b, columns, nice_comb_names)

Unnamed: 0,Dimension,WR-sft+base,WR-br+base,WR-br+sft,WR-rl+sft,WR-rl+base,WR-rl+br
0,plot,5.0,50.0,75.0,75.0,50.0,40.0
1,character,15.0,45.0,80.0,80.0,45.0,45.0
2,creativity,15.0,60.0,70.0,65.0,35.0,25.0
3,development,20.0,55.0,70.0,65.0,50.0,40.0
4,language,15.0,50.0,75.0,85.0,35.0,30.0
5,overall_quality,5.0,60.0,85.0,80.0,45.0,50.0


## Improvement By Genre

In [28]:
all_genres = ['scifi', 'fantasy', 'romance', 'historical']

In [36]:
nice_genre_names = {
    'scifi':'Scifi',
    'fantasy':'Fantasy',
    'romance':'Romance',
    'historical':'Historical',
}

# GENRE Win-Rates and Pref Probs for Both

In [37]:
df = pd.concat([all_7b, all_3b])

In [38]:
columns = ['basesft_3b,base3b', 'basewr_3b,base3b', 'basewr_3b,basesft_3b',
       'trained_3b,basesft_3b', 'trained_3b,base3b',
       'trained_3b,basewr_3b']

In [39]:
nice_comb_names = {
    'basesft_3b,base3b': 'sft+base',
    'basewr_3b,base3b': 'br+base',
    'basewr_3b,basesft_3b': 'br+sft',
    'trained_3b,basesft_3b': 'rl+sft',
    'trained_3b,base3b': 'rl+base',
    'trained_3b,basewr_3b': 'rl+br'
}

In [40]:
# Overall-quality win rate of option A, A / (A + same + B), per genre and model size.
# `columns` must hold the 3B comparison labels; the 7B labels are derived from them.
genre_pref_data = []
for genre in genres_we_care_about:
    # Rows flagged for this genre; the same subset serves both model sizes.
    subgenre = df[df[genre]]
    for is_7b in [False, True]:
        size = "7b" if is_7b else "3b"
        row = [genre + "-" + size]
        for cur_options in columns:
            if is_7b:
                # 3B -> 7B label, e.g. 'basesft_3b,base3b' -> 'basesft_7b,base7b'
                cur_options = cur_options.replace("3", "7")
            opt1, opt2 = cur_options.split(",")
            # The 'options' column stores each pair sorted alphabetically.
            nice_options = ",".join(sorted([opt1, opt2]))
            all_count = Counter(subgenre[subgenre['options'] == nice_options]['overall_quality'])
            # Check before dividing so an empty comparison fails with a clear message
            # instead of a ZeroDivisionError.
            # fine if not true just need to handle it explicitly
            assert opt1 in all_count or opt2 in all_count, (
                "no decisive votes for " + nice_options + " in genre " + genre
            )
            a_winrate = all_count.get(opt1, 0) / sum(all_count.values())

            row.append(a_winrate*100)
        genre_pref_data.append(row)

In [41]:
# Display headers for the win-rate table: "WR-" plus the short comparison name.
nice_columns = ["WR-" + nice_comb_names[c] for c in columns]

In [42]:
gen_pref_7b = pd.DataFrame(data=genre_pref_data, columns=["Dimension", *nice_columns])
gen_pref_7b

Unnamed: 0,Dimension,WR-sft+base,WR-br+base,WR-br+sft,WR-rl+sft,WR-rl+base,WR-rl+br
0,scifi-3b,0.0,80.0,80.0,80.0,60.0,60.0
1,scifi-7b,0.0,60.0,100.0,100.0,100.0,60.0
2,fantasy-3b,0.0,60.0,100.0,80.0,30.0,60.0
3,fantasy-7b,30.0,60.0,70.0,80.0,40.0,60.0
4,romance-3b,0.0,40.0,100.0,80.0,20.0,40.0
5,romance-7b,0.0,40.0,100.0,100.0,40.0,60.0
6,historical-3b,10.0,60.0,80.0,80.0,50.0,50.0
7,historical-7b,30.0,70.0,50.0,80.0,60.0,50.0


# Automated Metrics

In [43]:
cats = ['num_words', 'pct_unique_words',
       'pct_tri_overlap',
       'rouge-l-f', 'rouge-l-p']

In [44]:
# Display names for automated-metric categories.
nice_cats = {
    # Raw string: the backslash is a literal LaTeX escape for '#'. The previous
    # non-raw '\#' was an invalid Python escape sequence (a warning on modern
    # Python) that evaluated to the same two characters.
    'num_words': r'\# Words',
    'pct_unique_words': 'Unique Words',
    'pct_tri_overlap': 'Unseen Trigrams',
    'rouge-l-f': 'Rouge-L F1',
    'rouge-l-p': 'Rouge-L Prec',
    'story_text_rouge_p': "Prev. Chap. Rouge-L Prec.",
    'next_chapter_header_rouge_p': "Next Chap. Header Rouge-L Prec.",
    'prior_plot_summary_rouge_p': "Prior Sum. Rouge-L Prec.",
    'high_level_plot_summary_rouge_p': "Plot Sketch Rouge-L Prec.",
    'character_sheets_rouge_p': "CSheets Rouge-L Prec.",
    'next_chapter_synopsis_rouge_p': "Next Chap. Syn. Rouge-L Prec.",
}

In [45]:
keys = ['base3', 'sft3', 'basewr3', 'rl3', 'base7', 'sft7', 'basewr7', 'rl7', 
        'reas3', 'trainreas3', 'reas7', 'trainreas7']

In [46]:
nice_keys = {'sft3':"3B-SFT", 
'sft7':"7B-SFT", 
'base3':"3B-B", 
'basewr3':"3B-BR",
'rl3':"3B-RL-Trained", 
'rl7':"7B-RL-Trained", 
'base7':"7B-B",
'basewr7':"7B-BR", 
'reas7':"7B-BR-REAS", 
'trainreas7':"7B-RL-Trained-REAS", 
'reas3':"3B-BR-REAS", 
'trainreas3':"3B-RL-Trained-REAS"
}

In [47]:
# One row per model: display name followed by the mean of each metric in `cats`.
aut_data = []
for key in keys:
    model_rows = automated_metrics[automated_metrics['key'] == key]
    row = [nice_keys[key]]
    for cat in cats:
        # Mean over every sample of this (model, metric) pair; NaN if there are none.
        row.append(model_rows[model_rows['cat'] == cat]['val'].mean())
    aut_data.append(row)

In [48]:
aut_df = pd.DataFrame(data=aut_data, columns=['Model', *[nice_cats[c] for c in cats]])
aut_df

Unnamed: 0,Model,\# Words,Unique Words,Unseen Trigrams,Rouge-L F1,Rouge-L Prec
0,3B-B,1383.093923,35.576036,83.326706,0.097232,0.283545
1,3B-SFT,1149.287293,25.292415,56.010547,0.083985,0.320041
2,3B-BR,1441.209945,34.131509,82.798635,0.098992,0.277007
3,3B-RL-Trained,1353.226519,34.945049,82.491012,0.096647,0.282993
4,7B-B,1486.453039,32.72641,77.789219,0.103015,0.286289
5,7B-SFT,1278.066298,23.729973,53.835538,0.086784,0.307694
6,7B-BR,1567.972376,31.843103,77.488631,0.105944,0.282245
7,7B-RL-Trained,1479.502762,33.228115,77.240235,0.109658,0.305675
8,3B-BR-REAS,845.839779,29.758648,62.170551,0.089131,0.380485
9,3B-RL-Trained-REAS,721.574586,25.489611,48.840001,0.078574,0.405359


### Rouge-L By Story-Information Section

In [57]:
prompt_part_rouge_cats = ['story_text_rouge_p',
#  'next_chapter_header_rouge_p',
 'prior_plot_summary_rouge_p',
 'high_level_plot_summary_rouge_p',
 'character_sheets_rouge_p',
 'next_chapter_synopsis_rouge_p']

In [58]:
# One row per model: display name followed by the mean Rouge-L precision against
# each prompt section listed in `prompt_part_rouge_cats`.
prompt_aut_data = []
for key in keys:
    model_rows = automated_metrics[automated_metrics['key'] == key]
    row = [nice_keys[key]]
    for cat in prompt_part_rouge_cats:
        # Mean over every sample of this (model, section) pair; NaN if there are none.
        row.append(model_rows[model_rows['cat'] == cat]['val'].mean())
    prompt_aut_data.append(row)

In [59]:
prompt_aut_df = pd.DataFrame(data=prompt_aut_data, columns=['Model', *[nice_cats[c] for c in prompt_part_rouge_cats]])
prompt_aut_df

Unnamed: 0,Model,Prev. Chap. Rouge-L Prec.,Prior Sum. Rouge-L Prec.,Plot Sketch Rouge-L Prec.,CSheets Rouge-L Prec.,Next Chap. Syn. Rouge-L Prec.
0,3B-B,0.196772,0.106383,0.116176,0.171054,0.051314
1,3B-SFT,0.238306,0.109784,0.120971,0.178466,0.070356
2,3B-BR,0.191268,0.101448,0.111783,0.165952,0.047863
3,3B-RL-Trained,0.19678,0.105952,0.115856,0.170689,0.053452
4,7B-B,0.206881,0.098693,0.108412,0.159731,0.060753
5,7B-SFT,0.228981,0.105275,0.116716,0.169398,0.065658
6,7B-BR,0.199698,0.097129,0.107095,0.157653,0.055705
7,7B-RL-Trained,0.214421,0.104532,0.11541,0.166833,0.065041
8,3B-BR-REAS,0.234572,0.166864,0.183171,0.256898,0.113126
9,3B-RL-Trained-REAS,0.247983,0.177626,0.195036,0.242686,0.186144


### Aut Statistical Tests

In [60]:
from scipy.stats import f_oneway

In [61]:
chapter_keys_3b = ['base3',
#  'sft3',
 'basewr3',
 'rl3']
chapter_keys_7b = ['base7',
#  'sft7',
 'basewr7',
 'rl7'
]

In [62]:
reasoning_keys = [
    'reas3', 'trainreas3', 'reas7', 'trainreas7'
]

In [63]:
# One-way ANOVA per automated metric across the 3B chapter-generation variants.
for cat in cats:
    metric_rows = automated_metrics[automated_metrics['cat'] == cat]
    vals = [
        list(metric_rows[metric_rows['key'] == key]['val'].values)
        for key in chapter_keys_3b
    ]
    print(cat, f_oneway(*vals))

num_words F_onewayResult(statistic=1.132940473502349, pvalue=0.3228491315614238)
pct_unique_words F_onewayResult(statistic=2.338759308869122, pvalue=0.09742346585550624)
pct_tri_overlap F_onewayResult(statistic=0.6274088418521276, pvalue=0.5343624084124453)
rouge-l-f F_onewayResult(statistic=1.2573715776907501, pvalue=0.28481540360478286)
rouge-l-p F_onewayResult(statistic=2.729494016419706, pvalue=0.06570121375499259)


In [64]:
# One-way ANOVA per automated metric across the 7B chapter-generation variants.
for cat in cats:
    metric_rows = automated_metrics[automated_metrics['cat'] == cat]
    vals = [
        list(metric_rows[metric_rows['key'] == key]['val'].values)
        for key in chapter_keys_7b
    ]
    print(cat, f_oneway(*vals))

num_words F_onewayResult(statistic=1.1358047110545213, pvalue=0.3219295957847666)
pct_unique_words F_onewayResult(statistic=2.507804585513545, pvalue=0.08239506361844508)
pct_tri_overlap F_onewayResult(statistic=0.10851523073295723, pvalue=0.8971847893105264)
rouge-l-f F_onewayResult(statistic=1.986163866356637, pvalue=0.13772033184956198)
rouge-l-p F_onewayResult(statistic=7.298766909039793, pvalue=0.0007101635878570675)


In [65]:
# One-way ANOVA per automated metric across the 3B chapter-generation variants.
# NOTE(review): this repeats the 3B test run two cells earlier (same keys, same
# output). `reasoning_keys` is defined above but never used - confirm whether this
# cell was meant to test those keys instead.
for cat in cats:
    metric_rows = automated_metrics[automated_metrics['cat'] == cat]
    vals = [
        list(metric_rows[metric_rows['key'] == key]['val'].values)
        for key in chapter_keys_3b
    ]
    print(cat, f_oneway(*vals))

num_words F_onewayResult(statistic=1.132940473502349, pvalue=0.3228491315614238)
pct_unique_words F_onewayResult(statistic=2.338759308869122, pvalue=0.09742346585550624)
pct_tri_overlap F_onewayResult(statistic=0.6274088418521276, pvalue=0.5343624084124453)
rouge-l-f F_onewayResult(statistic=1.2573715776907501, pvalue=0.28481540360478286)
rouge-l-p F_onewayResult(statistic=2.729494016419706, pvalue=0.06570121375499259)


# Perplexity Correlation

In [111]:
def _improvement_by_synopsis(syn_to_completion_to_ppl):
    """Map each synopsis to the 'percent_improvement' of its single completion.

    Asserts that no synopsis contributes more than one completion.
    """
    improvements = {}
    for syn, comp_to_ppl_data in syn_to_completion_to_ppl.items():
        for ppl_data in comp_to_ppl_data.values():
            assert syn not in improvements
            improvements[syn] = ppl_data['percent_improvement']
    return improvements


# NOTE(review): both input dicts are not defined in any visible cell of this
# notebook - presumably loaded in a removed cell; confirm before Run All.
syn_to_improvement = _improvement_by_synopsis(syn_to_completion_to_ppl_qwen7B)
syn_to_improvement_trained = _improvement_by_synopsis(syn_to_completion_to_ppl_qwen7B_trained)

In [112]:
# Pair every 7B annotation of base / base+reasoning / RL-trained comparisons with
# the perplexity-improvement difference between the two compared systems.
# Produces two parallel lists: `cats` (ordinal vote) and `imps` (improvement diff).
#
# NOTE(review): `cats` rebinds the list of metric categories defined in the
# "Automated Metrics" section; re-running those cells after this one will break.
# NOTE(review): `text_to_syn_type_genre` is not defined in any visible cell, and
# `cur_options` / `imp_bools` are assigned here but never read in this cell.
key = 'overall_quality'
cur_options = ['base7b','same', 'other']
df = all_7b[all_7b['options'].isin(['base7b,trained_7b', 'base7b,basewr_7b', 'basewr_7b,trained_7b'])]

cats = []
imps = []
imp_bools = []
for i, row in df.iterrows():
    setting = row['options']
    a, b = setting.split(",")
    # Ordinal encoding of the vote: 0 -> a preferred, 1 -> 'same', 2 -> b preferred.
    opts = [a, "same", b]
    val = row[key]
#     if val == b:
# #         val = 'trained_7b' if 'trained' in setting else 'basewr_7b'
#         val = 'other'
    
#     cat = cur_options.index(val)
    cat = opts.index(val)
    cats.append(cat)
    
    # Sanity checks: both compared texts belong to the same synopsis and genres but
    # come from different systems, and every vote names one of the two systems or 'same'.
    real = row['real_text']
    first, second = real
    fsyn, ftype, fgenres = text_to_syn_type_genre[first]
    ssyn, stype, sgenres = text_to_syn_type_genre[second]
    assert fgenres == sgenres
    assert fsyn == ssyn
    assert ftype != stype
    seen_types = set(row[v] for v in votecols)
    assert len(seen_types) <= 3
    assert all(t in ['same', ftype, stype] for t in seen_types)
    assert all(t in ['same', a, b] for t in seen_types)
    
    # .item() unwraps a single-element container - presumably a numpy/torch scalar; TODO confirm.
    br_imp = syn_to_improvement[fsyn].item()
    timp = syn_to_improvement_trained[fsyn].item()
    
    # Improvement of system b relative to system a:
    #   base vs base+reasoning    -> br_imp
    #   base vs trained           -> timp
    #   base+reasoning vs trained -> timp - br_imp
    imp_diff = br_imp # base7b,basewr_7b
    if 'basewr_7b' == a and b == 'trained_7b':
        imp_diff = timp - br_imp
    elif 'base7b' == a and b == 'trained_7b':
        imp_diff = timp
    elif 'base7b' == a and b == 'basewr_7b':
        pass
    else:
        raise Exception("INVALID COMBINATION")
    
    imps.append(imp_diff)
# Number of annotation rows used for the correlation.
len(cats)

60

In [113]:
spearmanr(cats, imps)

SignificanceResult(statistic=0.3258817301597572, pvalue=0.011055278814122951)

In [114]:
import seaborn as sns

In [115]:
tempdf = pd.DataFrame(data=[[c, i] for c, i in zip(cats, imps)], columns=['annot', 'imp'])

# Fleiss Kappa 7B

In [122]:
from statsmodels.stats.inter_rater import fleiss_kappa

In [123]:
base_to_basewr7b = pd.read_csv("final_7bbase_basewr_annotations.csv")

In [124]:
votecols = ['plot', 'character', 'creativity',
       'development', 'language', 'overall_quality']

In [125]:
t_cols = ['t_plot',
       't_character', 't_creativity', 't_development', 't_language',
       't_overall_quality', 't_optional_comments']

In [126]:
# try and recreate fleiss kappa from all_used
# key = "overall_quality"
# Fleiss' kappa per annotation dimension for the base vs base+reasoning 7B study.
# Each text is a subject rated by three annotators into one of three categories.
titles = ['base7b', 'same', 'basewr_7b']
for key in votecols + ['fake']:
    subject_category_counts = []
    for text_id in base_to_basewr7b['text'].unique():
        sub = base_to_basewr7b[base_to_basewr7b['text'] == text_id]
        # every text must have exactly three annotations
        assert len(sub) == 3
        # votes per category, in the order given by `titles`
        counts = [0] * 3
        for vote in sub[key]:
            counts[titles.index(vote)] += 1
        subject_category_counts.append(counts)
    print(key, fleiss_kappa(subject_category_counts))

plot 0.1795806745670011
character 0.1021126760563379
creativity 0.21807124239791476
development 0.1863517060367455
language 0.06011854360711254
overall_quality 0.20975609756097566
fake 0.19881305637982202


In [127]:
raw_base_to_basewr7b = pd.read_csv("raw_7b_base_basewr.csv")

In [128]:
# try and recreate fleiss kappa from all_used
# key = "overall_quality"
# Collapse the three annotator votes per text into one label per dimension:
# the majority label, or 'same' when no label received more than one vote.
titles = ['base7b', 'same', 'basewr_7b']
new_data = []
for t in base_to_basewr7b['text'].unique():
    sub = base_to_basewr7b[base_to_basewr7b['text'] == t]
    # every text must have exactly three annotations (checked once per text)
    assert len(sub) == 3
    row = [t]
    for key in votecols + ['fake']:
        votes = list(sub[key].values)
        most_common, top_count = Counter(votes).most_common(1)[0]
        if top_count == 1:
            # no two annotators agree: show the split and break the tie with 'same'
            print(votes)
            most_common = 'same'
        row.append(most_common)
    new_data.append(row)
agreed_base_basewr7b = pd.DataFrame(data=new_data, columns=['text', *votecols, 'fake'])

['same', 'basewr_7b', 'base7b']
['same', 'basewr_7b', 'base7b']
['basewr_7b', 'same', 'base7b']
['basewr_7b', 'base7b', 'same']
['basewr_7b', 'same', 'base7b']
['basewr_7b', 'same', 'base7b']
['same', 'base7b', 'basewr_7b']
['base7b', 'same', 'basewr_7b']
['base7b', 'same', 'basewr_7b']
['same', 'base7b', 'basewr_7b']
['basewr_7b', 'base7b', 'same']
['basewr_7b', 'base7b', 'same']
['same', 'base7b', 'basewr_7b']
['same', 'base7b', 'basewr_7b']


In [129]:
# Map each text id to its serialized pair of real texts, asserting that repeated
# ids always carry the same value.
raw_base_to_basewr7b_textid_to_text = {}
for text_id, real_text in zip(raw_base_to_basewr7b['text'], raw_base_to_basewr7b['real_text']):
    if text_id in raw_base_to_basewr7b_textid_to_text:
        assert real_text == raw_base_to_basewr7b_textid_to_text[text_id]
    raw_base_to_basewr7b_textid_to_text[text_id] = real_text

In [130]:
agreed_base_basewr7b['real_text'] = [ast.literal_eval(raw_base_to_basewr7b_textid_to_text[t]) for t in agreed_base_basewr7b['text']]

In [131]:
agreed_base_basewr7b['options'] = ['base7b,basewr_7b' for i in range(len(agreed_base_basewr7b))]

In [132]:
# agreed_base_basewr7b.to_csv("agreed_base_basewr_7b.csv", index=False)

## Dump

In [133]:
# old table

\begin{table}[t]
\begin{center}
\small
% \begin{tabular}{lllllll}
\begin{tabular}{lcccc}
\toprule
\multicolumn{1}{c}{Genre}  & \multicolumn{1}{c}{3B Win Rate} & \multicolumn{1}{c}{3B Pref. Prob} & \multicolumn{1}{c}{7B Win Rate} & \multicolumn{1}{c}{7B Pref. Prob} \\
\midrule
Scifi & 80\% & \textbf{80\%} & 60\% & 75.0\% \\
Fantasy & 60\% & \textbf{60\%} & 50\% & 55.6\% \\
Romance & 40\% & 40\% & 60\% & \textbf{75.0\%} \\
% Adult & 60\% & 60\% & 70\% & \textbf{77.8\%} \\
% YA & 40\% & 40\% & 60\% & \textbf{75.0\%} \\
Historical & 60\% & 60\% & 60\% & \textbf{60.0\%} \\
\bottomrule
\end{tabular}
\end{center}
\caption{The win-rate ($\frac{|\text{br}|}{|\text{br}| + |\text{b}| + |\text{same}|}$) and Bradley-Terry preference probability for Base-Reasoning vs Base, by 3B and 7B models and by genre. We find that including reasoning in Scifi significantly improves 3B performance, but that it may decrease performance in Romance books. In contrast, reasoning consistently helps the 7B model across genres with the most pronounced effect in Adult fiction. Note the win-rate percentage includes `same' annotations, while preference probabilities do not. Also note that these results are taken from a subset of the already-small test-set, so a few of these genres only apply to one book (and five annotations).}\label{table:base_basewr_winrates_genre}
\end{table}