In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import glob
from collections import defaultdict

# Loading and Combining Reviews

In [3]:

# Keys are the answer-set identifiers that appear in review filenames
# (see review_filename); values are the short model names that
# load_reviews writes into the model_a / model_b columns.
models_map = {
    'answer_gpt4': 'gpt-4', 
    'answer_gpt35': 'gpt-3.5',
    'answer_bard': 'bard',
    'answer_claude': 'claude', 
    'answer_vicuna-13b': 'vicuna',
}
# Keys are the reviewer identifiers that appear in review filenames;
# values are the short names that load_reviews writes into the reviewer column.
reviewers_map = {
    'gpt-4': 'gpt-4',
    'gpt-3.5-turbo-0301': 'gpt-3.5',
    'text-bison@001': 'bard',
    'claude-1': 'claude',
    'vicuna-13b': 'vicuna',
}


In [4]:
def review_filename(modelA, modelB, reviewer):
    """Locate the review file for one (modelA, modelB, reviewer) combination.

    Searches every ``./ratings-*/`` directory for a file named
    ``{modelA}-vs-{modelB}-{reviewer}-reviewer*.jsonl``.

    Returns:
        The matching path, or None when nothing matches. When several files
        match, the lexicographically smallest path is returned so the choice
        is reproducible across runs and machines.
    """
    pattern = f'./ratings-*/{modelA}-vs-{modelB}-{reviewer}-reviewer*.jsonl'
    matches = glob.glob(pattern)
    # glob.glob makes no ordering guarantee, so pick deterministically
    # instead of relying on whichever match happens to come first.
    return min(matches) if matches else None

def format_df(df):
    """Recode raw review scores and order the rows by question, in place.

    Raw score codes are translated as follows: 1 becomes -1 (A wins),
    2 becomes 1 (B wins), and both 3 and -1 become 0 (the tie value).
    Any other raw value ends up as NaN. ``df`` itself is modified and
    nothing is returned.
    """
    recode = {1: -1, 2: 1, 3: 0, -1: 0}
    df['score'] = df.score.map(recode)
    df.sort_values('question_id', inplace=True)

def load_reviews(models_map, reviewers_map):
    """Load every available review file into one shuffled DataFrame.

    For each ordered pair of distinct models and each reviewer, the matching
    file is looked up with review_filename. Missing files are reported with a
    printed message and skipped. Each file is read as JSON lines, reduced to
    ``question_id`` and ``score``, recoded by format_df, and tagged with the
    short model and reviewer names taken from the two maps.

    Args:
        models_map: filename model identifier -> short model name.
        reviewers_map: filename reviewer identifier -> short reviewer name.

    Returns:
        The concatenation of all loaded frames, with rows shuffled using a
        fixed random_state of 42. pd.concat is called on the collected list,
        so at least one review file has to be found.
    """
    ordered_pairs = [
        (first, second)
        for first in models_map.keys()
        for second in models_map.keys()
        if first != second
    ]

    frames = []
    for modelA, modelB in ordered_pairs:
        for reviewer in reviewers_map.keys():
            path = review_filename(modelA, modelB, reviewer)
            if path is None:
                print(f'No review file for {modelA} vs {modelB} by {reviewer}')
                continue

            ratings = pd.read_json(path, lines=True)[['question_id', 'score']]
            format_df(ratings)

            # Scores that format_df could not recode come back as NaN;
            # report them but keep the rows.
            unparsed = ratings.score.isna()
            unparsed_count = unparsed.sum()
            if unparsed_count > 0:
                print(unparsed_count, f'#invalid. {modelA} vs {modelB} by {reviewer}')
                print(ratings[unparsed])

            ratings['model_a'] = models_map[modelA]
            ratings['model_b'] = models_map[modelB]
            ratings['reviewer'] = reviewers_map[reviewer]
            frames.append(ratings)

    # Shuffle with a fixed seed so downstream order-dependent steps are repeatable.
    return pd.concat(frames).sample(frac=1, random_state=42)

In [5]:
def human_majority(df):
    """Collapse repeated reviews into one score per question, pairing and reviewer.

    When the same reviewer scored the same question for the same
    (model_a, model_b) pairing more than once, the scores are averaged and
    only the sign of the average is kept (-1, 0 or 1), i.e. a majority vote.
    """
    group_keys = ['question_id', 'model_a', 'model_b', 'reviewer']

    def majority_sign(scores):
        return np.sign(scores.mean())

    grouped = df.groupby(group_keys, as_index=False)
    return grouped.agg({'score': majority_sign})

In [6]:
# Reviews from every reviewer listed in reviewers_map, one score per
# (question, pairing, reviewer) after the majority vote.
reviews = human_majority(load_reviews(models_map, reviewers_map))
# Same as above, with the 'human' reviewer files added to the lookup.
human_reviews = human_majority(load_reviews(models_map, {**reviewers_map, 'human': 'human'}))


No review file for answer_gpt4 vs answer_claude by human
No review file for answer_gpt35 vs answer_claude by human
No review file for answer_bard vs answer_claude by human
No review file for answer_vicuna-13b vs answer_claude by human


In [7]:
# Mean human score per ordered (model_a, model_b) pairing.
# Scores use -1 for an A win and 1 for a B win, so a negative mean favours model_a.
hr = human_reviews[human_reviews.reviewer == 'human']
hr.groupby(['model_a', 'model_b'], as_index=False).agg({'score': 'mean'})

Unnamed: 0,model_a,model_b,score
0,bard,gpt-3.5,-0.225
1,bard,gpt-4,0.595745
2,bard,vicuna,0.357143
3,claude,bard,-0.7375
4,claude,gpt-3.5,-0.625
5,claude,gpt-4,0.625
6,claude,vicuna,-0.775
7,gpt-3.5,bard,-0.225
8,gpt-3.5,gpt-4,0.755556
9,gpt-3.5,vicuna,0.285714


# Win Rates

## Pairwise

In [8]:
# function adapted from https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5
def compute_pairwise_win_fraction(battles):
    """Compute, for every pair of models, the fraction of battles the row model won.

    Args:
        battles: DataFrame with ``model_a``, ``model_b`` and ``score`` columns,
            where a score of -1 means model A won and 1 means model B won.
            Rows with any other score count as battles but as a win for
            neither side.

    Returns:
        A square DataFrame with models on both axes. Cell (r, c) is the
        number of battles r won against c (in either position) divided by
        the number of battles between r and c. Cells for pairs that never
        met, including the diagonal, are NaN. Rows and columns are ordered
        by each model's mean win fraction, best first.
    """
    # Every model seen on either side, so that all count tables share one
    # square shape. Without this, a model that only appears as A (or never
    # wins in one position) is missing from some tables, and the additions
    # below would align to NaN for its whole row.
    models = pd.unique(battles[['model_a', 'model_b']].to_numpy().ravel())

    def pair_counts(frame):
        # Number of rows of `frame` per (model_a, model_b), zero-padded to models x models.
        if frame.empty:
            return pd.DataFrame(0, index=models, columns=models)
        counts = frame.groupby(['model_a', 'model_b']).size().unstack(fill_value=0)
        return counts.reindex(index=models, columns=models, fill_value=0)

    # Times each model wins as Model A
    a_win_ptbl = pair_counts(battles[battles['score'] == -1])

    # Times each model wins as Model B
    b_win_ptbl = pair_counts(battles[battles['score'] == 1])

    # Number of battles for each ordered A-B pair
    num_battles_ptbl = pair_counts(battles)

    # Wins of the row model over the column model in either position,
    # divided by all battles between the two (0 / 0 yields NaN).
    row_beats_col_freq = (
        (a_win_ptbl + b_win_ptbl.T) /
        (num_battles_ptbl + num_battles_ptbl.T)
    )

    # Arrange ordering according to proportion of wins
    prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
    model_names = list(prop_wins.keys())
    row_beats_col = row_beats_col_freq.loc[model_names, model_names]
    return row_beats_col

# function adapted from https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5
def visualize_pairwise_win_fraction(battles, title):
    """Render the pairwise win-fraction matrix of ``battles`` as a plotly heatmap.

    The matrix comes from compute_pairwise_win_fraction; rows are labelled
    "Model A" and columns "Model B". Returns the plotly figure.
    """
    win_matrix = compute_pairwise_win_fraction(battles)

    heatmap = px.imshow(
        win_matrix,
        color_continuous_scale='RdBu',
        text_auto=".2f",
        title=title,
    )
    heatmap.update_layout(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        title_y=0.07,
        title_x=0.5,
    )

    hover_text = "Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>"
    heatmap.update_traces(hovertemplate=hover_text)
    return heatmap

In [9]:
# Ties (score == 0) are filtered out before the win fractions are computed.
fig = visualize_pairwise_win_fraction(reviews[reviews.score != 0],
      title = "Fraction of Model A Wins for All A vs. B Battles (excluding ties)")
fig

## Integral Combined Win Rates

In [11]:
# regular win fractions
def compute_winrates(battles):
    """Compute each model's share of battles won, counting both positions.

    Every row of ``battles`` counts as one battle for both ``model_a`` and
    ``model_b``. A score of -1 is a win for model A and 1 a win for model B;
    any other score (such as a tie) adds to the battle counts only.

    Returns:
        dict of model -> wins / battles, ordered from highest to lowest rate.
    """
    battle_counts = defaultdict(int)
    win_counts = defaultdict(int)

    rows = battles[['model_a', 'model_b', 'score']].itertuples(index=False, name=None)
    for first, second, outcome in rows:
        battle_counts[first] += 1
        battle_counts[second] += 1
        if outcome == -1:
            win_counts[first] += 1
        elif outcome == 1:
            win_counts[second] += 1

    rates = {}
    for model, played in battle_counts.items():
        rates[model] = win_counts[model] / played

    ranked = sorted(rates.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)


In [24]:
# Win rate per model over all per-reviewer scores; ties count as battles but not as wins.
compute_winrates(reviews)

{'gpt-4': 0.704375,
 'claude': 0.61125,
 'vicuna': 0.3471875,
 'gpt-3.5': 0.32,
 'bard': 0.273125}

## Fractional Combined Win Rates

In [76]:
def compute_fractional_winrates(battles):
    """Compute a win rate per model from the mean signed score, counting both positions.

    Scores are -1 when model A is preferred and 1 when model B is preferred,
    so a model's total is the negated score sum over its battles as A plus
    the plain score sum over its battles as B. The mean of that total over
    all of the model's battles lies in [-1, 1] and is rescaled to [0, 1].

    Args:
        battles: DataFrame with ``model_a``, ``model_b`` and ``score`` columns.

    Returns:
        dict of model -> win rate in [0, 1].
    """
    scores_a = battles.groupby(['model_a']).agg(
        sum=pd.NamedAgg(column="score", aggfunc=lambda x: -x.sum()),
        count=pd.NamedAgg(column="score", aggfunc=len))

    scores_b = battles.groupby(['model_b']).agg(
        sum=pd.NamedAgg(column="score", aggfunc='sum'),
        count=pd.NamedAgg(column="score", aggfunc=len))

    # A model that only ever appears on one side is missing from the other
    # table. Treat the missing side as zero score over zero battles instead
    # of letting a plain `+` turn that model's totals into NaN.
    sum_scores = scores_a.add(scores_b, fill_value=0)
    sum_scores.index.name = 'model'
    sum_scores['winrate'] = (sum_scores['sum'] / sum_scores['count'] + 1) / 2
    return sum_scores['winrate'].to_dict()

In [77]:
# Fractional win rate per model over all per-reviewer scores (a tie contributes 0 to the score sum).
compute_fractional_winrates(reviews)

{'bard': 0.31953125,
 'claude': 0.66171875,
 'gpt-3.5': 0.37546875,
 'gpt-4': 0.74984375,
 'vicuna': 0.3934375}

# Elo Calculations

In [13]:

# function adapted from https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5
def compute_elo(battles, K=32, SCALE=400, BASE=10, INIT_RATING=1000):
    """Run sequential Elo updates over ``battles`` and return the final ratings.

    Rows are processed in the order they appear, so the result depends on
    row order. A score of -1 is a win for model A, 1 a win for model B and
    0 a draw (half a point each).

    Args:
        battles: DataFrame with ``model_a``, ``model_b`` and ``score`` columns.
        K: update step size.
        SCALE: rating difference scale used in the expected-score formula.
        BASE: exponent base used in the expected-score formula.
        INIT_RATING: rating given to a model the first time it is seen.

    Returns:
        defaultdict of model -> rating; unseen models read as INIT_RATING.

    Raises:
        Exception: if a score's absolute value exceeds 1.001.
    """
    rating = defaultdict(lambda: INIT_RATING)

    rounds_played = 0
    rows = battles[['model_a', 'model_b', 'score']].itertuples(index=False, name=None)
    for rounds_played, (model_a, model_b, winner) in enumerate(rows, start=1):
        rating_a, rating_b = rating[model_a], rating[model_b]
        expected_a = 1 / (1 + BASE ** ((rating_b - rating_a) / SCALE))
        expected_b = 1 / (1 + BASE ** ((rating_a - rating_b) / SCALE))
        # Points earned by model B: 0 if A won, 0.5 for a draw, 1 if B won.
        points_b = (1 + winner) / 2

        if abs(winner) > 1.001:
            print("problem @", model_a, model_b)
            raise Exception(f"unexpected vote {winner}")

        rating[model_a] += K * (1 - points_b - expected_a)
        rating[model_b] += K * (points_b - expected_b)

    print("Iterations: ", rounds_played)
    return rating


In [23]:
# Elo ratings over all per-reviewer scores, processed in the shuffled row order of `reviews`.
compute_elo(reviews)

Iterations:  8000


defaultdict(<function __main__.compute_elo.<locals>.<lambda>()>,
            {'claude': 1031.6725418197743,
             'gpt-3.5': 1007.557519196017,
             'bard': 865.786377478712,
             'vicuna': 979.8998841581231,
             'gpt-4': 1115.0836773473761})

# Weighted Majorities

In [15]:
def weighted_majority_vote(df, weights=None, EPS=0.0):
    """Combine all reviewers' scores for each (question, model_a, model_b) into one vote.

    Args:
        df: DataFrame with ``question_id``, ``model_a``, ``model_b``,
            ``reviewer`` and ``score`` columns.
        weights: optional dict of reviewer name -> weight. When None, every
            reviewer counts equally.
        EPS: dead zone around zero; an average within [-EPS, EPS] is a tie.

    Returns:
        DataFrame with one row per group whose ``score`` is 1 when the
        (weighted) average score is above EPS, -1 when it is below -EPS,
        and 0 otherwise.
    """
    group_keys = ['question_id', 'model_a', 'model_b']

    def cast_vote(group):
        reviewer_weights = (
            None if weights is None
            else group['reviewer'].map(weights).values
        )
        mean_score = np.average(group['score'], weights=reviewer_weights)
        verdict = np.where(mean_score > EPS, 1,
                           np.where(mean_score < -EPS, -1, 0))
        return pd.Series(verdict, index=['score'])

    per_group = df.groupby(group_keys, as_index=False)
    return per_group.apply(cast_vote)

## Normalization functions

In [16]:
def minmax_norm(weights):
    """Linearly rescale the values of ``weights`` so the smallest maps to 0 and the largest to 1.

    Takes a dict of key -> number and returns a new dict with the same keys.
    """
    values = pd.Series(list(weights.values()), index=list(weights.keys()))
    lowest = values.min()
    highest = values.max()
    rescaled = (values - lowest) / (highest - lowest)
    return rescaled.to_dict()

In [17]:
def exponential_weights(weights):
    """Replace every value in ``weights`` with its natural exponential.

    Takes a dict of key -> number and returns a new dict with the same keys.
    """
    values = pd.Series(list(weights.values()), index=list(weights.keys()))
    return np.exp(values).to_dict()

In [18]:
def normalize_to_sum_one(weights):
    """Divide every value in ``weights`` by their total so the values sum to 1.

    Takes a dict of key -> number and returns a new dict with the same keys.
    """
    values = pd.Series(list(weights.values()), index=list(weights.keys()))
    total = values.sum()
    return (values / total).to_dict()
    

## Weighted Winrates

In [19]:
def weighted_winrate(reviews, num_iterations, remove_ties=False, eps=0.01):
    """Iteratively compute win rates, weighting each reviewer's vote by its own win rate.

    The first pass takes an unweighted vote across reviewers. Each later
    pass weights the vote with the exponentiated, sum-normalized win rates
    from the previous pass. The weights are keyed by model name and looked
    up by reviewer name in weighted_majority_vote, so they only take effect
    for reviewers whose names match a model name.

    Args:
        reviews: DataFrame accepted by weighted_majority_vote.
        num_iterations: number of vote / win-rate passes; must be at least 1.
        remove_ties: drop tied votes before computing win rates.
        eps: tie threshold passed to weighted_majority_vote as EPS.

    Returns:
        dict with ``'winrates'`` (from the last pass) and ``'weights'``
        (derived from those win rates).

    Raises:
        ValueError: if num_iterations is less than 1, since no pass would
            run and there would be no result to return.
    """
    if num_iterations < 1:
        raise ValueError(f"num_iterations must be at least 1, got {num_iterations}")

    weights = None
    for _ in range(num_iterations):
        maj = weighted_majority_vote(reviews, weights=weights, EPS=eps)
        if remove_ties:
            maj = maj[maj.score != 0]
        winrates = compute_winrates(maj)
        weights = exponential_weights(winrates)
        weights = normalize_to_sum_one(weights)
    return { 'winrates': winrates, 'weights': weights }

In [20]:
# Human reviews only, single pass (so the vote is unweighted):
# wr1 keeps ties, wr2 drops them before computing win rates.
wr1 = weighted_winrate(human_reviews[human_reviews.reviewer == 'human'], 1)['winrates']
wr2 = weighted_winrate(human_reviews[human_reviews.reviewer == 'human'], 1, remove_ties=True)['winrates']

print(wr1)
print(wr2)

{'gpt-4': 0.76875, 'claude': 0.628125, 'vicuna': 0.315625, 'gpt-3.5': 0.253125, 'bard': 0.234375}
{'gpt-4': 0.8601398601398601, 'claude': 0.7153024911032029, 'vicuna': 0.36996336996337, 'gpt-3.5': 0.28825622775800713, 'bard': 0.2613240418118467}


In [21]:
# Reviews by the 'gpt-4' reviewer only, single pass (so the vote is unweighted):
# wr1 keeps ties, wr2 drops them before computing win rates.
wr1 = weighted_winrate(reviews[reviews.reviewer == 'gpt-4'], 1)['winrates']
wr2 = weighted_winrate(reviews[reviews.reviewer == 'gpt-4'], 1, remove_ties=True)['winrates']

print(wr1)
print(wr2)

{'gpt-4': 0.7890625, 'claude': 0.6171875, 'vicuna': 0.284375, 'gpt-3.5': 0.25, 'bard': 0.184375}
{'gpt-4': 0.9115523465703971, 'claude': 0.7552581261950286, 'vicuna': 0.32616487455197135, 'gpt-3.5': 0.3065134099616858, 'bard': 0.20959147424511546}


In [22]:
# All reviewers in `reviews`, single pass (unweighted vote across reviewers):
# wr1 keeps ties, wr2 drops them before computing win rates.
wr1 = weighted_winrate(reviews, 1)['winrates']
wr2 = weighted_winrate(reviews, 1, remove_ties=True)['winrates']

print(wr1)
print(wr2)

{'gpt-4': 0.803125, 'claude': 0.6859375, 'vicuna': 0.2953125, 'gpt-3.5': 0.2890625, 'bard': 0.20625}
{'gpt-4': 0.8697123519458545, 'claude': 0.7453310696095077, 'vicuna': 0.3241852487135506, 'gpt-3.5': 0.32342657342657344, 'bard': 0.22641509433962265}


## Weighted Elo scores

In [89]:
def weighted_elo(reviews, num_iterations, remove_ties=False, eps=0.01):
    """Iteratively compute Elo ratings, weighting each reviewer's vote by its own rating.

    The first pass takes an unweighted vote across reviewers. Each later
    pass weights the vote with the min-max-normalized, sum-normalized Elo
    ratings from the previous pass, so the lowest-rated entry gets weight 0.
    The weights are keyed by model name and looked up by reviewer name in
    weighted_majority_vote. The ratings of every pass are printed.

    Args:
        reviews: DataFrame accepted by weighted_majority_vote.
        num_iterations: number of vote / Elo passes; must be at least 1.
        remove_ties: drop tied votes before computing ratings.
        eps: tie threshold passed to weighted_majority_vote as EPS.

    Returns:
        dict with ``'elos'`` (from the last pass) and ``'weights'``
        (derived from those ratings).

    Raises:
        ValueError: if num_iterations is less than 1, since no pass would
            run and there would be no result to return.
    """
    if num_iterations < 1:
        raise ValueError(f"num_iterations must be at least 1, got {num_iterations}")

    weights = None
    for _ in range(num_iterations):
        maj = weighted_majority_vote(reviews, weights=weights, EPS=eps)
        # compute_elo is order-dependent; shuffle with a fixed seed so every
        # pass (and every run) sees the battles in the same order.
        maj = maj.sample(frac=1, random_state=73)
        if remove_ties:
            maj = maj[maj.score != 0]
        elos = dict(compute_elo(maj))
        print(elos)
        weights = minmax_norm(elos)
        weights = normalize_to_sum_one(weights)
    return { 'elos': elos, 'weights': weights }

In [90]:
# Two passes: the second vote is weighted by the normalized Elo ratings from the first.
weighted_elo(reviews, 2)

Iterations:  1600
{'bard': 849.9097764644408, 'claude': 1106.8992733633418, 'gpt-4': 1270.4909290460278, 'vicuna': 918.5838236699275, 'gpt-3.5': 854.1161974562614}
Iterations:  1600
{'bard': 833.7801380542992, 'claude': 1101.2617524590241, 'gpt-4': 1338.1348443680863, 'vicuna': 913.2390204201415, 'gpt-3.5': 813.5842446984492}


{'elos': {'bard': 833.7801380542992,
  'claude': 1101.2617524590241,
  'gpt-4': 1338.1348443680863,
  'vicuna': 913.2390204201415,
  'gpt-3.5': 813.5842446984492},
 'weights': {'bard': 0.021667582037988794,
  'claude': 0.30864076622195424,
  'gpt-4': 0.5627749637589491,
  'vicuna': 0.10691668798110783,
  'gpt-3.5': 0.0}}