In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import glob

# Loading and Combining Reviews

In [2]:

# Answer-file prefix -> short model label.
# The keys are interpolated into the review file names that get globbed;
# the values become the `model_a` / `model_b` labels in the combined frame.
# Insertion order is kept because it drives the order files are loaded in.
models_map = dict([
    ('answer_gpt4', 'gpt-4'),
    ('answer_gpt35', 'gpt-3.5'),
    ('answer_bard', 'bard'),
    ('answer_claude', 'claude'),
    ('answer_vicuna-13b', 'vicuna'),
])
# Reviewer id as it appears in the review file names -> short reviewer label.
# The labels deliberately reuse the model labels so that a per-model score
# can later be looked up by reviewer name.
reviewers_map = dict([
    ('gpt-4', 'gpt-4'),
    ('gpt-3.5-turbo-0301', 'gpt-3.5'),
    ('text-bison@001', 'bard'),
    ('claude-1', 'claude'),
    ('vicuna-13b', 'vicuna'),
])


In [3]:
def review_filename(modelA, modelB, reviewer):
    """Locate the review file for one (modelA, modelB, reviewer) combination.

    Looks under every ``./ratings-*/`` directory for a file named
    ``{modelA}-vs-{modelB}-{reviewer}-reviewer*.jsonl``.

    Args:
        modelA: file-name prefix of the model shown as "A".
        modelB: file-name prefix of the model shown as "B".
        reviewer: reviewer id as it appears in the file name.

    Returns:
        The path of the matching file. If several files match, the
        lexicographically smallest path is returned so the choice does not
        depend on the order in which the filesystem lists entries.

    Raises:
        FileNotFoundError: if no file matches the pattern.
    """
    name = f'{modelA}-vs-{modelB}-{reviewer}-reviewer*.jsonl'
    glob_path = './ratings-*/' + name
    # glob.glob returns matches in arbitrary order; sort for reproducibility.
    matches = sorted(glob.glob(glob_path))
    if not matches:
        raise FileNotFoundError(f'no review file matches {glob_path!r}')
    return matches[0]

def format_df(df):
    """Recode raw review scores and order the rows by question, in place.

    Raw code 1 (A wins) becomes -1, raw code 2 (B wins) becomes 1, and raw
    code 3 (tie) becomes 0. Raw code -1 is also folded into 0 -- presumably a
    sentinel for a review that could not be scored; TODO confirm. Any other
    raw code turns into NaN, which callers can use to spot invalid rows.

    The frame is modified in place and nothing is returned.
    """
    recode = {-1: 0, 1: -1, 2: 1, 3: 0}
    df['score'] = df['score'].map(recode)
    df.sort_values('question_id', inplace=True)

def load_reviews(models_map, reviewers_map):
    """Read every pairwise review file and stack them into one long frame.

    For each ordered pair of distinct models and each reviewer, the matching
    JSON-lines file is located with `review_filename`, reduced to the
    `question_id` and `score` columns, and recoded with `format_df`. Rows whose
    score could not be recoded (NaN) are reported on stdout but kept.

    Args:
        models_map: file-name model prefix -> model label.
        reviewers_map: file-name reviewer id -> reviewer label.

    Returns:
        A DataFrame with columns `question_id`, `score`, `model_a`, `model_b`
        and `reviewer`, concatenated in load order (the per-file indexes are
        kept as they are; no shuffling is done here).
    """
    # Ordered (A, B) pairs, skipping a model against itself.
    matchups = [
        (modelA, modelB)
        for modelA in models_map
        for modelB in models_map
        if modelA != modelB
    ]

    frames = []
    for modelA, modelB in matchups:
        for reviewer in reviewers_map:
            path = review_filename(modelA, modelB, reviewer)
            df = pd.read_json(path, lines=True)[['question_id', 'score']]
            format_df(df)

            # Scores that format_df could not recode come back as NaN.
            invalid = df.score.isna()
            ninvalid = invalid.sum()
            if ninvalid > 0:
                print(ninvalid, f'#invalid. {modelA} vs {modelB} by {reviewer}')
                print(df[invalid])

            # Tag the rows with the short labels rather than the file-name ids.
            df['model_a'] = models_map[modelA]
            df['model_b'] = models_map[modelB]
            df['reviewer'] = reviewers_map[reviewer]
            frames.append(df)

    return pd.concat(frames)

In [4]:
# If a reviewer produced several reviews for the same question and model pair,
# collapse them into a single score by majority vote.
def majority_vote_review(df):
    """Reduce repeated reviews to one score per (question, pair, reviewer).

    Rows are grouped by `question_id`, `model_a`, `model_b` and `reviewer`;
    each group's score becomes the sign of its mean score, i.e. -1 or 1 for
    whichever side has more votes and 0 when they balance out.

    Returns:
        A DataFrame with the four grouping columns plus `score`.
    """
    group_keys = ['question_id', 'model_a', 'model_b', 'reviewer']
    grouped = df.groupby(group_keys, as_index=False)
    return grouped.agg({'score': lambda votes: np.sign(votes.mean())})

In [5]:
# Load every pairwise review file, then collapse repeated reviews so there is
# one score per (question, model pair, reviewer).
reviews = majority_vote_review(load_reviews(models_map, reviewers_map))
reviews

Unnamed: 0,question_id,model_a,model_b,reviewer,score
0,1,bard,claude,bard,-1.0
1,1,bard,claude,claude,1.0
2,1,bard,claude,gpt-3.5,1.0
3,1,bard,claude,gpt-4,-1.0
4,1,bard,claude,vicuna,1.0
...,...,...,...,...,...
7995,80,vicuna,gpt-4,bard,1.0
7996,80,vicuna,gpt-4,claude,1.0
7997,80,vicuna,gpt-4,gpt-3.5,1.0
7998,80,vicuna,gpt-4,gpt-4,1.0


# Pairwise Win Fractions

In [6]:
# function adapted from https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5
def compute_pairwise_win_fraction(battles):
    """Compute how often each model beats each other model.

    Args:
        battles: DataFrame with columns `model_a`, `model_b` and `score`,
            where score -1 means model A won, 1 means model B won and any
            other value (e.g. 0 for a tie) counts as a battle nobody won.

    Returns:
        A square DataFrame whose (row, column) entry is the fraction of all
        row-vs-column battles (in either A/B order) that the row model won.
        Entries for pairs that never met, including the diagonal, are NaN.
        Rows and columns are ordered by mean win fraction, best first.
    """
    models = sorted(set(battles['model_a']).union(battles['model_b']))

    def count_table(frame):
        # Count rows per (model_a, model_b) and expand to the full model grid.
        # Without the reindex, a model that never wins in one position is
        # absent from the table and the aligned sum below yields NaN, not 0.
        if frame.empty:
            return pd.DataFrame(0, index=models, columns=models)
        table = pd.pivot_table(
            frame,
            index="model_a", columns="model_b", aggfunc="size", fill_value=0)
        return table.reindex(index=models, columns=models, fill_value=0)

    # Times each model wins as Model A
    a_win_ptbl = count_table(battles[battles['score'] == -1])

    # Times each model wins as Model B
    b_win_ptbl = count_table(battles[battles['score'] == 1])

    # Number of battles for each ordered A-B pair
    num_battles_ptbl = count_table(battles)

    # Wins of the row model over the column model, from either position,
    # divided by the number of battles between the two in either order.
    row_beats_col_freq = (
        (a_win_ptbl + b_win_ptbl.T) /
        (num_battles_ptbl + num_battles_ptbl.T)
    )

    # Arrange ordering according to proportion of wins
    prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
    model_names = list(prop_wins.keys())
    row_beats_col = row_beats_col_freq.loc[model_names, model_names]
    return row_beats_col

# function adapted from https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5
def visualize_pairwise_win_fraction(battles, title):
    """Draw the pairwise win-fraction matrix as an annotated heatmap.

    Args:
        battles: frame passed straight to `compute_pairwise_win_fraction`.
        title: figure title (placed below the plot, centred).

    Returns:
        The plotly figure; rows are "Model A", columns are "Model B".
    """
    win_fractions = compute_pairwise_win_fraction(battles)

    layout = dict(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        title_y=0.07,
        title_x=0.5,
    )
    hover_text = "Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>"

    fig = px.imshow(
        win_fractions,
        color_continuous_scale='RdBu',
        text_auto=".2f",
        title=title,
    )
    fig.update_layout(**layout)
    fig.update_traces(hovertemplate=hover_text)
    return fig

In [7]:
# Heatmap of how often the row model beats the column model, pooled over all
# reviewers. Ties count as battles but not as wins for either side.
fig = visualize_pairwise_win_fraction(reviews,
      title = "Fraction of Model A Wins for All A vs. B Battles (including ties)")
fig

# Regular Win Fraction

In [8]:
# regular win fractions
def compute_winrates(battles):
    """Compute each model's overall win rate across all of its battles.

    Args:
        battles: DataFrame with columns `model_a`, `model_b` and `score`,
            where score -1 means model A won and 1 means model B won. Any
            other score counts as a battle that neither side won.

    Returns:
        dict mapping model name -> wins / battles, where both counts include
        appearances as model A and as model B. Models that never win get 0.0.
        Entries are ordered from highest to lowest win rate; equal rates keep
        alphabetical order.
    """
    # Wins from each position.
    a_wins = battles.loc[battles['score'] == -1, 'model_a'].value_counts()
    b_wins = battles.loc[battles['score'] == 1, 'model_b'].value_counts()

    # Battles played in either position. fill_value=0 keeps models that only
    # ever appear on one side instead of turning them into NaN.
    num_battles = battles['model_a'].value_counts().add(
        battles['model_b'].value_counts(), fill_value=0)

    # Align wins to every model that played, with 0 wins where none recorded.
    wins = a_wins.add(b_wins, fill_value=0).reindex(num_battles.index, fill_value=0)

    # Arrange ordering according to proportion of wins
    prop_wins = (wins / num_battles).sort_index().sort_values(
        ascending=False, kind='stable')
    return prop_wins.to_dict()

# Elo Calculations

In [9]:
from collections import defaultdict

# function adapted from https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5
def compute_elo(battles, K=32, SCALE=400, BASE=10, INIT_RATING=1000):
    """Replay the battles in row order and return the resulting Elo ratings.

    Args:
        battles: DataFrame with columns `model_a`, `model_b` and `score`
            (-1: model A won, 1: model B won, 0: tie).
        K: step size of each rating update.
        SCALE, BASE: shape of the expected-score curve,
            ``1 / (1 + BASE ** (rating_gap / SCALE))``.
        INIT_RATING: rating given to a model the first time it is seen.

    Returns:
        defaultdict mapping model name -> rating (unseen models read as
        INIT_RATING). The number of processed rows is printed.

    Raises:
        Exception: if a score is not -1, 0 or 1.

    Because ratings are updated sequentially, the result depends on the
    order of the rows.
    """
    def score_for_a(vote):
        # Actual score of model A for this vote; None marks an unknown vote.
        if vote == -1:
            return 1
        if vote == 1:
            return 0
        if vote == 0:
            return 0.5
        return None

    rating = defaultdict(lambda: INIT_RATING)

    rounds = battles[['model_a', 'model_b', 'score']].itertuples(index=False, name=None)
    for model_a, model_b, winner in rounds:
        ra = rating[model_a]
        rb = rating[model_b]
        # Expected scores given the current rating gap.
        ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
        eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))

        sa = score_for_a(winner)
        if sa is None:
            print("problem @", model_a, model_b)
            raise Exception(f"unexpected vote {winner}")

        rating[model_a] += K * (sa - ea)
        rating[model_b] += K * (1 - sa - eb)

    print("Iterations: ", len(battles))
    return rating


In [10]:
# Elo updates are sequential, so the ratings depend on battle order: shuffle
# all rows with a fixed seed to get one reproducible ordering.
compute_elo(reviews.sample(frac=1, random_state=128))

Iterations:  8000


defaultdict(<function __main__.compute_elo.<locals>.<lambda>()>,
            {'claude': 1031.6725418197743,
             'gpt-3.5': 1007.557519196017,
             'bard': 865.786377478712,
             'vicuna': 979.8998841581231,
             'gpt-4': 1115.0836773473761})

# Weighted Majorities

In [11]:
def weighted_majority_vote(df, weights=None, EPS=0.0):
    """Merge the reviewers' scores into one vote per question and model pair.

    Args:
        df: DataFrame with columns `question_id`, `model_a`, `model_b`,
            `reviewer` and `score`.
        weights: optional mapping reviewer -> weight; None weighs every
            reviewer equally.
        EPS: dead zone around zero. The (weighted) mean score must exceed
            EPS to count as 1, or fall below -EPS to count as -1; anything
            in between is a 0.

    Returns:
        A DataFrame with one row per (`question_id`, `model_a`, `model_b`)
        and the combined vote in `score`.
    """
    def vote_for_group(frame):
        reviewer_weights = (
            None if weights is None
            else frame['reviewer'].map(weights).values
        )
        mean_score = np.average(frame['score'], weights=reviewer_weights)
        vote = np.where(mean_score > EPS, 1,
                        np.where(mean_score < -EPS, -1, 0))
        return pd.Series(vote, index=['score'])

    group_keys = ['question_id', 'model_a', 'model_b']
    return df.groupby(group_keys, as_index=False).apply(vote_for_group)

## Normalization functions

In [41]:
def zscore(weights):
    """Squash standardized weights into the open interval (0, 2).

    Each weight is standardized with the sample standard deviation and then
    mapped through ``tanh(z) + 1``, so an average weight becomes 1.0.

    Args:
        weights: mapping reviewer -> raw numeric weight.

    Returns:
        dict mapping reviewer -> normalized weight, in the input's key order.
        When the standard deviation is zero or undefined (all weights equal,
        or fewer than two weights) every reviewer gets the neutral weight 1.0
        instead of NaN.
    """
    raw = pd.Series(weights, dtype=float)
    spread = raw.std()
    if spread > 0:
        z = (raw - raw.mean()) / spread
    else:
        # Nothing distinguishes the reviewers: treat every deviation as zero.
        z = pd.Series(0.0, index=raw.index)
    return (np.tanh(z) + 1).to_dict()

In [82]:
def minmax_norm(weights):
    """Rescale weights linearly onto the interval [1, 2].

    The smallest weight maps to 1.0 and the largest to 2.0.

    Args:
        weights: mapping reviewer -> raw numeric weight.

    Returns:
        dict mapping reviewer -> normalized weight, in the input's key order.
        When all weights are equal (including a single weight) every
        reviewer gets 1.0 instead of the NaN that 0/0 would produce.
    """
    raw = pd.Series(weights, dtype=float)
    span = raw.max() - raw.min()
    if span > 0:
        scaled = (raw - raw.min()) / span
    else:
        # No spread to rescale: everyone sits at the bottom of the range.
        scaled = pd.Series(0.0, index=raw.index)
    return (scaled + 1).to_dict()

## Weighted Winrates

In [78]:
def weighted_winrate(reviews, num_iterations):
    """Iteratively re-weight reviewers by their own win rate as a model.

    Round one combines reviewers with equal weight. Each later round weighs
    every reviewer by the min-max normalized win rate that the previous round
    assigned to the model of the same name. The win rates of every round are
    printed.

    Args:
        reviews: per-reviewer battle frame accepted by `weighted_majority_vote`.
        num_iterations: number of vote/re-weight rounds; must be at least 1.

    Returns:
        dict with the final round's `winrates` and the `weights` derived
        from them.

    Raises:
        ValueError: if `num_iterations` is smaller than 1 (there would be no
            result to return).
    """
    if num_iterations < 1:
        raise ValueError(f"num_iterations must be at least 1, got {num_iterations}")

    weights = None
    for _ in range(num_iterations):
        maj = weighted_majority_vote(reviews, weights=weights)
        winrates = compute_winrates(maj)
        print(winrates)
        weights = minmax_norm(winrates)
    return { 'winrates': winrates, 'weights': weights }

In [83]:
# Two rounds: an unweighted vote first, then a re-vote in which each reviewer
# is weighted by the round-one win rate of the model with the same name.
weighted_winrate(reviews, 2)

{'gpt-4': 0.803125, 'claude': 0.6859375, 'vicuna': 0.2953125, 'gpt-3.5': 0.2890625, 'bard': 0.20625}
{'gpt-4': 0.8515625, 'claude': 0.7328125, 'vicuna': 0.34375, 'gpt-3.5': 0.3375, 'bard': 0.234375}


{'winrates': {'gpt-4': 0.8515625,
  'claude': 0.7328125,
  'vicuna': 0.34375,
  'gpt-3.5': 0.3375,
  'bard': 0.234375},
 'weights': {'gpt-4': 2.0,
  'claude': 1.8075949367088606,
  'vicuna': 1.1772151898734178,
  'gpt-3.5': 1.1670886075949367,
  'bard': 1.0}}

## Weighted Elo scores

In [80]:
def weighted_elo(reviews, num_iterations):
    """Iteratively re-weight reviewers by their own Elo rating as a model.

    Round one combines reviewers with equal weight. Each later round weighs
    every reviewer by the min-max normalized Elo rating that the previous
    round assigned to the model of the same name. Battles are replayed in the
    row order of the combined-vote frame, and each round's ratings are
    printed.

    Args:
        reviews: per-reviewer battle frame accepted by `weighted_majority_vote`.
        num_iterations: number of vote/re-weight rounds; must be at least 1.

    Returns:
        dict with the final round's `elos` and the `weights` derived from
        them.

    Raises:
        ValueError: if `num_iterations` is smaller than 1 (there would be no
            result to return).
    """
    if num_iterations < 1:
        raise ValueError(f"num_iterations must be at least 1, got {num_iterations}")

    weights = None
    for _ in range(num_iterations):
        maj = weighted_majority_vote(reviews, weights=weights)
        # Copy into a plain dict so the defaultdict factory is not carried along.
        elos = dict(compute_elo(maj))
        print(elos)
        weights = minmax_norm(elos)
    return { 'elos': elos, 'weights': weights }

In [84]:
# Two rounds: an unweighted vote first, then a re-vote in which each reviewer
# is weighted by the round-one Elo rating of the model with the same name.
weighted_elo(reviews, 2)

Iterations:  1600
{'bard': 828.5795371027081, 'claude': 1051.4767206705333, 'gpt-3.5': 954.1763813758497, 'gpt-4': 1379.2469712285902, 'vicuna': 786.5203896223201}
Iterations:  1600
{'bard': 837.9256782006146, 'claude': 1048.7173371647682, 'gpt-3.5': 923.8760067453912, 'gpt-4': 1400.1219577957413, 'vicuna': 789.3590200934827}


{'elos': {'bard': 837.9256782006146,
  'claude': 1048.7173371647682,
  'gpt-3.5': 923.8760067453912,
  'gpt-4': 1400.1219577957413,
  'vicuna': 789.3590200934827},
 'weights': {'bard': 1.0795180177268837,
  'claude': 1.424646456196267,
  'gpt-3.5': 1.220244186980259,
  'gpt-4': 2.0,
  'vicuna': 1.0}}