In [60]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
# (several cells below rely on this to show multiple results).
InteractiveShell.ast_node_interactivity = 'all'

In [61]:
import pandas as pd
from functools import partial, reduce
import statsmodels.stats.inter_rater as ir
import numpy as np
import plotly.express as px
import glob
from collections import defaultdict

# Loading and Combining Reviews

In [62]:

# Keys are the model identifiers used when building review file names
# (see review_filename); values are the short names stored in the
# model_a / model_b columns.
models_map = {
    'answer_gpt4': 'gpt-4', 
    'answer_gpt35': 'gpt-3.5',
    'answer_bard': 'bard',
    'answer_claude': 'claude', 
    'answer_vicuna-13b': 'vicuna',
}
# Keys are the reviewer identifiers used in review file names; values are the
# short names stored in the reviewer column. They intentionally reuse the same
# short names as models_map so per-model scores can be looked up per reviewer.
reviewers_map = {
    'gpt-4': 'gpt-4',
    'gpt-3.5-turbo-0301': 'gpt-3.5',
    'text-bison@001': 'bard',
    'claude-1': 'claude',
    'vicuna-13b': 'vicuna',
}


In [63]:
def review_filename(modelA, modelB, reviewer):
    """Locate the review file for one (model A, model B, reviewer) combination.

    Searches every ``./ratings-*/`` directory (relative to the current working
    directory) for ``{modelA}-vs-{modelB}-{reviewer}-reviewer*.jsonl``.

    Args:
        modelA: identifier of the first model as it appears in the file name.
        modelB: identifier of the second model as it appears in the file name.
        reviewer: identifier of the reviewer as it appears in the file name.

    Returns:
        The path of the first match in lexicographic order, or ``None`` when
        nothing matches.
    """
    pattern = f'./ratings-*/{modelA}-vs-{modelB}-{reviewer}-reviewer*.jsonl'
    # glob.glob returns matches in filesystem order, which is not guaranteed to
    # be stable; sort so the same file is picked on every run and machine.
    matches = sorted(glob.glob(pattern))
    return matches[0] if matches else None

def format_df(df):
    """Recode raw review scores and order the rows by question, in place.

    Raw scores 1 / 2 / 3 become -1 (A wins) / 1 (B wins) / 0 (tie); a raw
    score of -1 is also recoded to 0. Any other raw value becomes NaN.
    The frame is modified in place and nothing is returned.
    """
    raw_to_signed = {-1: 0, 1: -1, 2: 1, 3: 0}
    df['score'] = df['score'].map(raw_to_signed)
    df.sort_values(by='question_id', inplace=True)

def load_reviews(models_map, reviewers_map):
    """Load every available pairwise review file into one shuffled DataFrame.

    For each ordered pair of distinct models and each reviewer, the matching
    review file (if any) is read, its scores are recoded by ``format_df`` and
    the short model / reviewer names are attached as columns.

    Args:
        models_map: file-name model identifier -> short model name.
        reviewers_map: file-name reviewer identifier -> short reviewer name.

    Returns:
        DataFrame with columns question_id, score, model_a, model_b, reviewer,
        shuffled with a fixed seed.
    """
    frames = []
    for modelA in models_map:
        for modelB in models_map:
            if modelA == modelB:
                # a model is never compared against itself
                continue

            for reviewer in reviewers_map:
                path = review_filename(modelA, modelB, reviewer)
                if path is None:
                    print(f'No review file for {modelA} vs {modelB} by {reviewer}')
                    continue

                ratings = pd.read_json(path, lines=True)[['question_id', 'score']]
                format_df(ratings)

                # scores that format_df could not recode show up as NaN
                bad_rows = ratings.score.isna()
                bad_count = bad_rows.sum()
                if bad_count > 0:
                    print(bad_count, f'#invalid. {modelA} vs {modelB} by {reviewer}')
                    print(ratings[bad_rows])

                ratings['model_a'] = models_map[modelA]
                ratings['model_b'] = models_map[modelB]
                ratings['reviewer'] = reviewers_map[reviewer]
                frames.append(ratings)

    # stack everything and shuffle reproducibly
    return pd.concat(frames).sample(frac=1, random_state=42)

In [64]:
# if there are multiple reviews for a question from a given reviewer, take the majority vote
def human_majority(df):
    """Collapse repeated reviews of the same matchup into a single vote.

    Rows are grouped by (question_id, model_a, model_b, reviewer) and each
    group's score becomes the sign of its mean: -1, 0 or 1.
    """
    group_keys = ['question_id', 'model_a', 'model_b', 'reviewer']

    def sign_of_mean(votes):
        return np.sign(votes.mean())

    grouped = df.groupby(group_keys, as_index=False)
    return grouped.agg({'score': sign_of_mean})

In [65]:
# reviews written by the automatic (model) reviewers
auto_reviews = load_reviews(models_map, reviewers_map)
# human reviews, with repeated reviews of the same matchup collapsed to one vote
human_reviews = human_majority(load_reviews(models_map, {'human': 'human'}))
# subset of the automatic reviews written by gpt-4
gpt4_reviews = auto_reviews[auto_reviews.reviewer == 'gpt-4']

No review file for answer_gpt4 vs answer_claude by human
No review file for answer_gpt35 vs answer_claude by human
No review file for answer_bard vs answer_claude by human
No review file for answer_vicuna-13b vs answer_claude by human


In [66]:
# number of rows in each review set
auto_reviews.index.size
human_reviews.index.size
gpt4_reviews.index.size

8000

800

1600

# Win Rates

## Pairwise

In [67]:
# function adapted from https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5
def compute_pairwise_win_fraction(battles):
    """Fraction of battles the row model wins against the column model.

    Args:
        battles: DataFrame with columns model_a, model_b and score, where
            score == -1 means model_a won and score == 1 means model_b won.
            Every row counts as a battle, whatever its score.

    Returns:
        Square DataFrame (index ``model_a``, columns ``model_b``) ordered by
        each model's mean win fraction, best first. Entries for pairs that
        never met (including the diagonal) are NaN.
    """
    # Every model that appears on either side, in a fixed order.
    models = sorted(set(battles['model_a']) | set(battles['model_b']))
    row_labels = pd.Index(models, name='model_a')
    col_labels = pd.Index(models, name='model_b')

    def count_pairs(frame):
        # model_a x model_b count table, padded with zeros to the full grid.
        # Without the padding a model that never wins on one side is missing
        # from the table and the additions below would produce NaN instead
        # of a zero win count.
        table = pd.pivot_table(
            frame, index="model_a", columns="model_b", aggfunc="size", fill_value=0)
        return table.reindex(index=row_labels, columns=col_labels, fill_value=0)

    # Times each model wins as Model A
    a_win_ptbl = count_pairs(battles[battles['score'] == -1])

    # Times each model wins as Model B
    b_win_ptbl = count_pairs(battles[battles['score'] == 1])

    # Number of battles for each ordered A-B pair
    num_battles_ptbl = count_pairs(battles)

    # Proportion of wins for each model, counting both seatings of every pair
    row_beats_col_freq = (
        (a_win_ptbl + b_win_ptbl.T) /
        (num_battles_ptbl + num_battles_ptbl.T)
    ).rename_axis(index='model_a', columns='model_b')

    # Arrange ordering according to proportion of wins
    prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
    model_names = list(prop_wins.keys())
    return row_beats_col_freq.loc[model_names, model_names]

# function adapted from https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5
def visualize_pairwise_win_fraction(battles, title):
    """Print the pairwise win-fraction table and return it as a heatmap.

    The per-model row mean is printed as well and appended to the plotted
    matrix as an extra "average" column.

    Args:
        battles: review DataFrame accepted by compute_pairwise_win_fraction.
        title: figure title.

    Returns:
        The plotly figure.
    """
    win_matrix = compute_pairwise_win_fraction(battles)
    print(win_matrix)

    row_means = win_matrix.mean(axis=1)
    print(row_means)
    # named only after printing; the name becomes the extra column's label
    row_means.name = "average"

    heatmap_data = win_matrix.merge(row_means, on='model_a')
    fig = px.imshow(
        heatmap_data,
        color_continuous_scale='RdBu',
        text_auto=".2f",
        title=title,
    )
    fig.update_layout(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        title_y=0.07,
        title_x=0.5,
    )
    fig.update_traces(
        hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>"
    )
    return fig

In [68]:
# ties are dropped first, so the fractions are computed over decisive reviews only
fig = visualize_pairwise_win_fraction(auto_reviews[auto_reviews.score != 0],
      title = "Fraction of Model A Wins for All A vs. B Battles (excluding ties)")
fig

model_b     gpt-4    claude    vicuna   gpt-3.5      bard
model_a                                                  
gpt-4         NaN  0.607727  0.824701  0.819178  0.831341
claude   0.392273       NaN  0.750337  0.757997  0.794355
vicuna   0.175299  0.249663       NaN  0.528571  0.597183
gpt-3.5  0.180822  0.242003  0.471429       NaN  0.557471
bard     0.168659  0.205645  0.402817  0.442529       NaN
model_a
gpt-4      0.770737
claude     0.673741
vicuna     0.387679
gpt-3.5    0.362931
bard       0.304912
dtype: float64


## Integral Combined Win Rates

In [69]:
# regular win fractions
def compute_winrates(battles):
    """Win rate per model: battles won divided by battles played.

    A row counts as a win for model_a when score == -1 and for model_b when
    score == 1; any other score adds to both models' battle counts only.

    Returns:
        dict of model -> win rate, ordered from highest to lowest.
    """
    played = defaultdict(int)
    won = defaultdict(int)

    rows = battles[['model_a', 'model_b', 'score']].itertuples(index=False)
    for model_a, model_b, winner in rows:
        played[model_a] += 1
        played[model_b] += 1
        if winner == -1:
            won[model_a] += 1
        elif winner == 1:
            won[model_b] += 1

    rates = [(model, won[model] / count) for model, count in played.items()]
    rates.sort(key=lambda pair: pair[1], reverse=True)
    return dict(rates)


In [70]:
# integral win rates (wins / battles, ties count as battles only) per review set
print("Auto Winrates:")
compute_winrates(auto_reviews)
print("Human Winrates:")
compute_winrates(human_reviews)
print("GPT4 Winrates:")
compute_winrates(gpt4_reviews)

Auto Winrates:


{'gpt-4': 0.704375,
 'claude': 0.61125,
 'vicuna': 0.3471875,
 'gpt-3.5': 0.32,
 'bard': 0.273125}

Human Winrates:


{'gpt-4': 0.76875,
 'claude': 0.628125,
 'vicuna': 0.315625,
 'gpt-3.5': 0.253125,
 'bard': 0.234375}

GPT4 Winrates:


{'gpt-4': 0.7890625,
 'claude': 0.6171875,
 'vicuna': 0.284375,
 'gpt-3.5': 0.25,
 'bard': 0.184375}

## Fractional Combined Win Rates

In [71]:
def compute_fractional_winrates(battles):
    """Fractional win rate per model, where a tie is worth half a win.

    Scores are signed towards model_b, so a model's total is the negated
    score sum over its model_a rows plus the score sum over its model_b rows.
    The mean signed score is then rescaled from [-1, 1] to [0, 1].

    Returns:
        dict of model -> win rate, ordered from highest to lowest.
    """
    as_model_a = battles.groupby(['model_a']).agg(
        sum=("score", lambda x: -x.sum()),
        count=("score", len),
    )
    as_model_b = battles.groupby(['model_b']).agg(
        sum=("score", 'sum'),
        count=("score", len),
    )

    # a model that only ever appears on one side contributes 0 for the other
    totals = as_model_a.add(as_model_b, fill_value=0)
    totals.index.name = 'model'
    winrate = (totals['sum'] / totals['count'] + 1) / 2
    return winrate.sort_values(ascending=False).to_dict()

In [72]:
# fractional win rates (ties worth half a win) per review set
print("Auto Fractional Winrates:")
compute_fractional_winrates(auto_reviews)
print("Human Fractional Winrates:")
compute_fractional_winrates(human_reviews)
print("GPT4 Fractional Winrates:")
compute_fractional_winrates(gpt4_reviews)

Auto Fractional Winrates:


{'gpt-4': 0.74984375,
 'claude': 0.66171875,
 'vicuna': 0.3934375,
 'gpt-3.5': 0.37546875,
 'bard': 0.31953125}

Human Fractional Winrates:


{'gpt-4': 0.821875,
 'claude': 0.6890625,
 'vicuna': 0.3890625,
 'gpt-3.5': 0.3140625,
 'bard': 0.2859375}

GPT4 Fractional Winrates:


{'gpt-4': 0.85625,
 'claude': 0.70859375,
 'vicuna': 0.3484375,
 'gpt-3.5': 0.3421875,
 'bard': 0.24453124999999998}

# Elo Calculations

In [73]:
# function adapted from https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5
def compute_elo(battles, K=32, SCALE=400, BASE=10, INIT_RATING=1000):
    """Run sequential Elo updates over the battles, in row order.

    Args:
        battles: DataFrame with columns model_a, model_b and score, where
            score lies in [-1, 1] (-1: model_a wins, 1: model_b wins).
        K: update step size.
        SCALE: rating difference scale in the expected-score formula.
        BASE: base of the exponent in the expected-score formula.
        INIT_RATING: rating every model starts with.

    Returns:
        dict of model -> rating, ordered from highest to lowest.

    Raises:
        Exception: if a score lies outside [-1, 1] (beyond a small tolerance).
    """
    ratings = defaultdict(lambda: INIT_RATING)

    rows = battles[['model_a', 'model_b', 'score']].itertuples(index=False)
    for model_a, model_b, score in rows:
        if abs(score) > 1.001:
            print("problem @", model_a, model_b)
            raise Exception(f"unexpected vote {score}")

        rating_a = ratings[model_a]
        rating_b = ratings[model_b]
        expected_a = 1 / (1 + BASE ** ((rating_b - rating_a) / SCALE))
        expected_b = 1 / (1 + BASE ** ((rating_a - rating_b) / SCALE))

        # map the vote from [-1, 1] onto model_a's actual score in [1, 0]
        actual_a = (1 - score) / 2

        ratings[model_a] += K * (actual_a - expected_a)
        ratings[model_b] += K * (1 - actual_a - expected_b)

    ranked = sorted(ratings.items(), key=lambda entry: entry[1], reverse=True)
    return dict(ranked)


In [74]:
# Elo updates are applied row by row, so each set is reshuffled with a fixed seed first
print("Auto ELO:")
compute_elo(auto_reviews.sample(frac=1, random_state=42))
print("Human ELO:")
compute_elo(human_reviews.sample(frac=1, random_state=42))
print("GPT4 ELO:")
compute_elo(gpt4_reviews.sample(frac=1, random_state=42))

Auto ELO:


{'gpt-4': 1153.4482785233479,
 'claude': 1129.869490579247,
 'vicuna': 928.4186687458919,
 'gpt-3.5': 894.7596510899301,
 'bard': 893.5039110615811}

Human ELO:


{'gpt-4': 1268.5002092438044,
 'claude': 1157.0433884486936,
 'vicuna': 891.2900929821122,
 'gpt-3.5': 868.6004571051353,
 'bard': 814.5658522202546}

GPT4 ELO:


{'gpt-4': 1250.9361746398897,
 'claude': 1136.0313085098653,
 'bard': 888.2487114878877,
 'vicuna': 867.9868600806446,
 'gpt-3.5': 856.7969452817135}

# Normalization functions

In [75]:
def get_norm(weights, op, div_factor=pd.DataFrame.sum):
    """Transform a dict of weights with ``op`` and rescale by ``div_factor``.

    Args:
        weights: dict mapping a name to a numeric weight.
        op: callable applied to the weights as a pandas Series; returns the
            transformed Series.
        div_factor: reduction applied to the transformed Series; every
            transformed weight is divided by its result. The default (sum)
            makes the returned weights add up to 1.

    Returns:
        dict with the same keys and the normalized weights as values.
    """
    frame = pd.DataFrame(list(weights.items()), columns=['reviewer', 'weight'])
    frame = frame.set_index('reviewer')
    transformed = op(frame['weight'])

    try:
        divisor = div_factor(transformed)
    except TypeError:
        # Unbound DataFrame reductions such as pd.DataFrame.sum are passed in
        # here but applied to a Series. Some pandas releases appear to reject
        # that with a TypeError; in that case apply the reduction to a
        # one-column frame and take its single result.
        divisor = div_factor(transformed.to_frame()).iloc[0]

    frame['weight'] = transformed / divisor
    return frame.to_dict()['weight']

In [76]:
def minmax_norm(weights):
    """Min-max rescale the weights, then normalize them through get_norm.

    The smallest weight maps to 0 and the largest to 1 before get_norm's
    default normalization is applied.
    """
    # NOTE(review): when all weights are equal the range is 0 and this divides by zero.
    rescale = lambda x: (x - x.min()) / (x.max() - x.min())
    return get_norm(weights, op=rescale)

In [77]:
def softmax_norm(weights):
    """Softmax-normalize the weights through get_norm.

    Returns:
        dict with the same keys whose values are exp(weight) divided by the
        sum of exp over all weights.
    """
    def shifted_exp(x):
        # Subtracting the maximum leaves the softmax unchanged (the common
        # factor cancels in get_norm's division by the sum) but keeps exp()
        # from overflowing on large inputs such as Elo ratings.
        return np.exp(x - x.max())

    return get_norm(weights, op=shifted_exp)

In [78]:
def zscore_norm(weights):
    """Standardize the weights to z-scores, then normalize them through get_norm."""
    # NOTE(review): z-scores are mean-centred, so their sum is ~0 and
    # get_norm's default division by the sum is ill-conditioned here.
    standardize = lambda values: (values - values.mean()) / values.std()
    return get_norm(weights, op=standardize)

# Weighted Normal Reviews

In [79]:
def run_normal_test(reviews, iterations, metric):
    """Run ``metric(reviews, iterations)`` and print its per-model scores.

    Args:
        reviews: review DataFrame handed to the metric.
        iterations: number of iterations handed to the metric.
        metric: callable returning a dict with a 'scores' entry
            (model -> score).
    """
    print(f"{metric.__name__} for {iterations} iters")
    outcome = metric(reviews, iterations)

    table = pd.DataFrame(outcome['scores'].items(), columns=['model', 'score'])
    table = table.set_index('model')
    print(table)
    print()

## Fractional Winrates

In [80]:
def compute_weighted_fractional_winrates(battles, weights=None):
    """Fractional win rates where each review is scaled by its reviewer's weight.

    Args:
        battles: DataFrame with columns model_a, model_b, reviewer and score.
        weights: optional dict of reviewer -> weight. The weights are first
            rescaled through get_norm with a mean divisor. With ``None``
            every review counts equally.

    Returns:
        dict of model -> win rate, ordered from highest to lowest.
    """
    if weights is None:
        reviewer_weights = None
    else:
        reviewer_weights = get_norm(weights, op=lambda x: x, div_factor=pd.DataFrame.mean)

    def side_totals(group, negate=False):
        # (weighted) signed score total and number of scored rows for one model
        if reviewer_weights is None:
            total = group['score'].sum()
        else:
            per_row = group['reviewer'].map(reviewer_weights).values
            total = (group['score'] * per_row).sum()
        if negate:
            total = -total
        return pd.Series([total, group['score'].count()], index=['sum', 'count'])

    # scores are signed towards model_b, hence the negation on the model_a side
    as_model_a = battles.groupby(['model_a']).apply(lambda group: side_totals(group, negate=True))
    as_model_b = battles.groupby(['model_b']).apply(side_totals)

    totals = as_model_a.add(as_model_b, fill_value=0)
    totals.index.name = 'model'
    totals['winrate'] = (totals['sum'] / totals['count'] + 1) / 2
    return totals['winrate'].sort_values(ascending=False).to_dict()

In [81]:
def normal_fractional_winrate(reviews, num_iterations, norm_func=minmax_norm):
    """Iterate weighted fractional win rates, feeding scores back in as weights.

    The first pass is unweighted; after every pass the win rates are turned
    into the next pass's reviewer weights by ``norm_func``.

    Returns:
        dict with the final 'scores' and the 'weights' derived from them.
    """
    # NOTE: num_iterations must be at least 1, otherwise no scores exist to return.
    reviewer_weights = None
    for _ in range(num_iterations):
        scores = compute_weighted_fractional_winrates(reviews, weights=reviewer_weights)
        reviewer_weights = norm_func(scores)
    return {'scores': scores, 'weights': reviewer_weights}

In [82]:
# weighted fractional win rates on the automatic reviews, 5 reweighting passes
run_normal_test(auto_reviews, 5, normal_fractional_winrate)

normal_fractional_winrate for 5 iters
            score
model            
gpt-4    0.802025
claude   0.684978
vicuna   0.376249
gpt-3.5  0.346164
bard     0.290584



## Elo scores

In [83]:
def compute_weighted_elo(battles, K=32, SCALE=400, BASE=10, INIT_RATING=1000, weights=None):
    """Sequential Elo updates where each update is scaled by the reviewer's weight.

    Args:
        battles: DataFrame with columns model_a, model_b, reviewer and score,
            where score lies in [-1, 1] (-1: model_a wins, 1: model_b wins).
        K: update step size.
        SCALE: rating difference scale in the expected-score formula.
        BASE: base of the exponent in the expected-score formula.
        INIT_RATING: rating every model starts with.
        weights: optional dict of reviewer -> weight, rescaled through
            get_norm with a mean divisor. With ``None`` every reviewer has
            weight 1.

    Returns:
        dict of model -> rating, ordered from highest to lowest.

    Raises:
        Exception: if a score lies outside [-1, 1] (beyond a small tolerance).
    """
    if weights is not None:
        reviewer_weights = get_norm(weights, op=lambda x: x, div_factor=pd.DataFrame.mean)
    else:
        reviewer_weights = defaultdict(lambda: 1)
    ratings = defaultdict(lambda: INIT_RATING)

    rows = battles[['model_a', 'model_b', 'reviewer', 'score']].itertuples(index=False)
    for model_a, model_b, reviewer, score in rows:
        step = reviewer_weights[reviewer] * K
        rating_a = ratings[model_a]
        rating_b = ratings[model_b]
        expected_a = 1 / (1 + BASE ** ((rating_b - rating_a) / SCALE))
        expected_b = 1 / (1 + BASE ** ((rating_a - rating_b) / SCALE))

        # map the vote from [-1, 1] onto model_a's actual score in [1, 0]
        actual_a = (1 - score) / 2

        if abs(score) > 1.001:
            print("problem @", model_a, model_b)
            raise Exception(f"unexpected vote {score}")
        ratings[model_a] += step * (actual_a - expected_a)
        ratings[model_b] += step * (1 - actual_a - expected_b)

    ranked = sorted(ratings.items(), key=lambda entry: entry[1], reverse=True)
    return dict(ranked)

In [84]:
def normal_elo(reviews, num_iterations, norm_func=minmax_norm):
    """Iterate weighted Elo, feeding the ratings back in as reviewer weights.

    The first pass is unweighted; after every pass the ratings are turned
    into the next pass's reviewer weights by ``norm_func``.

    Returns:
        dict with the final 'scores' and the 'weights' derived from them.
    """
    # NOTE: num_iterations must be at least 1, otherwise no ratings exist to return.
    reviewer_weights = None
    for _ in range(num_iterations):
        ratings = compute_weighted_elo(reviews, weights=reviewer_weights)
        reviewer_weights = norm_func(ratings)
    return {'scores': ratings, 'weights': reviewer_weights}

In [85]:
# weighted Elo on the automatic reviews, 5 reweighting passes
run_normal_test(auto_reviews, 5, normal_elo)

normal_elo for 5 iters
               score
model               
gpt-4    1221.808346
vicuna   1015.114640
claude    991.452391
gpt-3.5   916.227387
bard      855.397236



# Weighted Majorities

In [86]:
def weighted_winnertakeall_vote(df, weights=None, EPS=0.1):
    """Collapse all reviews of a matchup into one winner-take-all vote.

    Reviews are grouped by (question_id, model_a, model_b). The (optionally
    reviewer-weighted) mean score of each group becomes 1 if it exceeds EPS,
    -1 if it is below -EPS and 0 otherwise.

    Args:
        df: DataFrame with columns question_id, model_a, model_b, reviewer, score.
        weights: optional dict of reviewer -> weight; ``None`` means equal weights.
        EPS: half-width of the band around 0 that is treated as a tie.
    """
    def collapse(group):
        reviewer_w = None if weights is None else group['reviewer'].map(weights).values
        mean_vote = np.average(group['score'], weights=reviewer_w)
        verdict = np.where(mean_vote > EPS, 1, np.where(mean_vote < -EPS, -1, 0))
        return pd.Series(verdict, index=['score'])

    per_matchup = df.groupby(['question_id', 'model_a', 'model_b'], as_index=False)
    return per_matchup.apply(collapse)

In [87]:
def weighted_average_vote(df, weights=None):
    """Collapse all reviews of a matchup into their (weighted) mean score.

    Reviews are grouped by (question_id, model_a, model_b) and each group's
    score becomes the mean of its scores, weighted per reviewer when
    ``weights`` is given.

    Args:
        df: DataFrame with columns question_id, model_a, model_b, reviewer, score.
        weights: optional dict of reviewer -> weight; ``None`` means equal weights.
    """
    def collapse(group):
        reviewer_w = None if weights is None else group['reviewer'].map(weights).values
        mean_vote = np.average(group['score'], weights=reviewer_w)
        return pd.Series(mean_vote, index=['score'])

    per_matchup = df.groupby(['question_id', 'model_a', 'model_b'], as_index=False)
    return per_matchup.apply(collapse)

In [88]:
def run_majority_test(reviews, iterations, metric):
    """Run a majority-vote metric in four configurations and print each result.

    The configurations combine the metric's default voting function or the
    winner-take-all vote with keeping or removing ties.

    Args:
        reviews: review DataFrame handed to the metric.
        iterations: number of iterations handed to the metric.
        metric: callable accepting (reviews, iterations, voting_func=...,
            remove_ties=...) and returning a dict with a 'scores' entry.
    """
    def report(outcome):
        table = pd.DataFrame(outcome['scores'].items(), columns=['model', 'score'])
        table = table.set_index('model')
        print(table)
        print()

    print(f"{iterations} iters\n")

    configurations = (
        (f"{metric.__name__} w ties and average vote", {}),
        (f"{metric.__name__} w/o ties and average vote", {'remove_ties': True}),
        (f"{metric.__name__} w ties and wta vote",
         {'voting_func': weighted_winnertakeall_vote}),
        (f"{metric.__name__} w/o ties and wta vote",
         {'voting_func': weighted_winnertakeall_vote, 'remove_ties': True}),
    )
    for header, options in configurations:
        print(header)
        report(metric(reviews, iterations, **options))


## Integral Winrates

In [89]:
def majority_winrate(reviews, num_iterations, voting_func=weighted_average_vote, norm_func=minmax_norm, remove_ties=False):
    """Iterate integral win rates over per-matchup aggregated votes.

    Each pass aggregates the reviews of every matchup with ``voting_func``
    (unweighted on the first pass), optionally drops votes equal to 0,
    computes win rates and turns them into the next pass's reviewer weights
    with ``norm_func``.

    Returns:
        dict with the final 'scores' and the 'weights' derived from them.
    """
    # NOTE: num_iterations must be at least 1, otherwise no scores exist to return.
    weights = None
    for _ in range(num_iterations):
        votes = voting_func(reviews, weights=weights)
        decided = votes[votes.score != 0] if remove_ties else votes
        winrates = compute_winrates(decided)
        weights = norm_func(winrates)
    return {'scores': winrates, 'weights': weights}

In [90]:
# integral win rates on aggregated votes; compute_winrates only counts scores of exactly -1 or 1 as wins
run_majority_test(auto_reviews, 2, majority_winrate)

2 iters

majority_winrate w ties and average vote


            score
model            
gpt-4    0.382812
claude   0.270313
vicuna   0.068750
gpt-3.5  0.054688
bard     0.018750

majority_winrate w/o ties and average vote
            score
model            
gpt-4    0.382812
claude   0.270313
vicuna   0.068750
gpt-3.5  0.054688
bard     0.018750

majority_winrate w ties and wta vote
            score
model            
gpt-4    0.790625
claude   0.623437
vicuna   0.281250
gpt-3.5  0.242188
bard     0.173437

majority_winrate w/o ties and wta vote
            score
model            
gpt-4    0.898757
claude   0.745794
vicuna   0.333952
gpt-3.5  0.285451
bard     0.212644



## Fractional Winrates

In [91]:
def majority_fractional_winrate(reviews, num_iterations, voting_func=weighted_average_vote, norm_func=minmax_norm, remove_ties=False):
    """Iterate fractional win rates over per-matchup aggregated votes.

    Each pass aggregates the reviews of every matchup with ``voting_func``
    (unweighted on the first pass), optionally drops votes equal to 0,
    computes fractional win rates and turns them into the next pass's
    reviewer weights with ``norm_func``.

    Returns:
        dict with the final 'scores' and the 'weights' derived from them.
    """
    # NOTE: num_iterations must be at least 1, otherwise no scores exist to return.
    weights = None
    for _ in range(num_iterations):
        votes = voting_func(reviews, weights=weights)
        decided = votes[votes.score != 0] if remove_ties else votes
        winrates = compute_fractional_winrates(decided)
        weights = norm_func(winrates)
    return {'scores': winrates, 'weights': weights}

In [92]:
# baseline for comparison with the majority-vote run below that keeps ties
# and uses the average vote (same number of iterations)
run_normal_test(auto_reviews, 2, normal_fractional_winrate)

normal_fractional_winrate for 2 iters
            score
model            
gpt-4    0.801478
claude   0.684828
vicuna   0.376569
gpt-3.5  0.346210
bard     0.290916



In [93]:
# fractional win rates on aggregated votes, all four vote / tie configurations
run_majority_test(auto_reviews, 2, majority_fractional_winrate)

2 iters

majority_fractional_winrate w ties and average vote
            score
model            
gpt-4    0.801478
claude   0.684828
vicuna   0.376569
gpt-3.5  0.346210
bard     0.290916

majority_fractional_winrate w/o ties and average vote
            score
model            
gpt-4    0.801272
claude   0.684714
vicuna   0.376654
gpt-3.5  0.346247
bard     0.291113

majority_fractional_winrate w ties and wta vote
            score
model            
gpt-4    0.850781
claude   0.705469
vicuna   0.360156
gpt-3.5  0.317969
bard     0.265625

majority_fractional_winrate w/o ties and wta vote
            score
model            
gpt-4    0.898757
claude   0.745794
vicuna   0.333952
gpt-3.5  0.285451
bard     0.212644



## Elo scores

In [94]:
def majority_elo(reviews, num_iterations, voting_func=weighted_average_vote, norm_func=minmax_norm, remove_ties=False):
    """Iterate Elo ratings over per-matchup aggregated votes.

    Each pass aggregates the reviews of every matchup with ``voting_func``
    (unweighted on the first pass), shuffles the aggregated votes with a
    fixed seed, optionally drops votes equal to 0, computes Elo ratings and
    turns them into the next pass's reviewer weights with ``norm_func``.

    Args:
        reviews: DataFrame with columns question_id, model_a, model_b,
            reviewer and score.
        num_iterations: number of passes; must be at least 1.
        voting_func: aggregates the reviews of each matchup into one vote.
        norm_func: converts the ratings of one pass into reviewer weights.
        remove_ties: drop aggregated votes equal to 0 before rating.

    Returns:
        dict with the final 'scores' and the 'weights' derived from them.
    """
    weights = None
    for _ in range(num_iterations):
        maj = voting_func(reviews, weights=weights)

        # Elo depends on battle order, so randomize it (reproducibly)
        maj = maj.sample(frac=1, random_state=73)
        if remove_ties:
            maj = maj[maj.score != 0]
        elos = compute_elo(maj)
        # honour the caller's norm_func instead of always using minmax_norm
        weights = norm_func(elos)
    return {'scores': elos, 'weights': weights}

In [95]:
# same call as the weighted Elo run earlier; presumably repeated as a baseline
# for the majority-vote Elo below
run_normal_test(auto_reviews, 5, normal_elo)

normal_elo for 5 iters
               score
model               
gpt-4    1221.808346
vicuna   1015.114640
claude    991.452391
gpt-3.5   916.227387
bard      855.397236



In [96]:
# Elo on aggregated votes, all four vote / tie configurations
run_majority_test(auto_reviews, 5, majority_elo)

5 iters

majority_elo w ties and average vote


               score
model               
gpt-4    1202.663763
claude   1070.430278
vicuna    951.354291
bard      901.966066
gpt-3.5   873.585602

majority_elo w/o ties and average vote
               score
model               
gpt-4    1202.686433
claude   1070.417732
vicuna    951.356626
bard      901.963553
gpt-3.5   873.575656

majority_elo w ties and wta vote
               score
model               
gpt-4    1320.125321
claude   1112.204590
vicuna    908.340523
bard      846.146818
gpt-3.5   813.182749

majority_elo w/o ties and wta vote
               score
model               
gpt-4    1331.787077
claude   1104.842206
vicuna    915.034849
bard      834.270473
gpt-3.5   814.065395



# Correlations and Accuracy

In [97]:
human_copy = human_reviews.copy()
# switch model_a and model_b
human_copy['model_a'], human_copy['model_b'] = human_copy['model_b'], human_copy['model_a']
# flip the sign so each score still points at the same winning model after the swap
human_copy['score'] = -human_copy['score']

# every human review now appears once per model ordering (A vs. B and B vs. A)
doubled_human_reviews = pd.concat([human_reviews, human_copy])

In [98]:
def correlate(*review_dfs):
    """Fleiss' kappa between the score columns of several review sets.

    Each frame is treated as one rater. Frames are ordered by
    (question_id, model_a, model_b) and compared row by row, so they must
    describe the same matchups. The caller's frames are left untouched.

    Args:
        *review_dfs: DataFrames with columns question_id, model_a, model_b
            and score.

    Returns:
        Fleiss' kappa over all rows.

    Raises:
        ValueError: if the frames do not all have the same number of rows.
    """
    sort_keys = ['question_id', 'model_a', 'model_b']
    # sort copies rather than the inputs, so calling this does not silently
    # reorder frames that other cells still use
    ordered = [df.sort_values(by=sort_keys) for df in review_dfs]

    row_counts = [len(df) for df in ordered]
    if len(set(row_counts)) > 1:
        raise ValueError(f"review sets differ in length: {row_counts}")

    # one row per matchup, one column per review set
    scores = np.array([df['score'].values for df in ordered]).T
    print(scores)
    subject_category_matrix = ir.aggregate_raters(scores)[0]
    print(subject_category_matrix)
    return ir.fleiss_kappa(subject_category_matrix)

In [99]:
# agreement between the unweighted winner-take-all vote of the automatic reviewers and the human reviews
correlate(weighted_winnertakeall_vote(auto_reviews),  doubled_human_reviews)

[[ 1. -0.]
 [-1. -1.]
 [ 1.  1.]
 ...
 [ 1. -0.]
 [ 1.  1.]
 [ 1.  1.]]
[[0 1 1]
 [2 0 0]
 [0 0 2]
 ...
 [0 1 1]
 [0 0 2]
 [0 0 2]]


0.39221819366847316

In [100]:
def accuracy(*review_dfs):
    """Fraction of matchups on which pairs of review sets give the same score.

    Frames are ordered by (question_id, model_a, model_b) and compared row by
    row, so they must describe the same matchups. Every unordered pair of
    frames is compared and the matches are pooled. The caller's frames are
    left untouched.

    Args:
        *review_dfs: two or more DataFrames with columns question_id,
            model_a, model_b and score.

    Returns:
        Number of matching scores divided by the number of comparisons made.

    Raises:
        TypeError: if fewer than two review sets are given.
    """
    if len(review_dfs) < 2:
        raise TypeError("accuracy() needs at least two review sets to compare")

    sort_keys = ['question_id', 'model_a', 'model_b']
    # sort copies rather than the inputs, so calling this does not silently
    # reorder frames that other cells still use
    ordered = [df.sort_values(by=sort_keys) for df in review_dfs]

    scores = np.array([df['score'].values for df in ordered])
    len_comparisons = scores.shape[1]

    matches = 0
    num_comparisons = 0
    for i in range(len(scores)):
        for j in range(i + 1, len(scores)):
            num_comparisons += 1
            matches += np.sum(scores[i] == scores[j])

    return matches / (num_comparisons * len_comparisons)
    

In [101]:
# share of matchups where the unweighted winner-take-all vote equals the human score
accuracy(weighted_winnertakeall_vote(auto_reviews),  doubled_human_reviews)

0.64375