In [7]:
import pandas as pd
import numpy as np
import plotly.express as px
import glob

# Names of the answer sets that are compared pairwise.
models = [
    'answer_gpt4',
    'answer_gpt35',
    'answer_bard',
    'answer_claude',
    'answer_vicuna-13b',
]
# Names of the reviewers whose rating files are loaded.
reviewers = [
    'gpt-4',
    'claude-1',
    'vicuna-13b',
    'gpt-3.5-turbo-0301',
    'text-bison@001',
]

def review_filename(modelA, modelB, reviewer):
    """Return the path of the rating file for one (modelA, modelB, reviewer) triple.

    Every ``./ratings-*/`` directory is searched for a file named
    ``{modelA}-vs-{modelB}-{reviewer}-reviewer*.jsonl``.

    Args:
        modelA: name of the model in the "A" position of the file name.
        modelB: name of the model in the "B" position of the file name.
        reviewer: name of the reviewer that produced the ratings.

    Returns:
        The first matching path in sorted order, so the choice is
        deterministic when more than one file matches.

    Raises:
        FileNotFoundError: if no file matches the pattern.
    """
    name = f'{modelA}-vs-{modelB}-{reviewer}-reviewer*.jsonl'
    glob_path = './ratings-*/' + name
    # glob.glob returns matches in arbitrary order; sorting makes repeated
    # runs pick the same file.
    matches = sorted(glob.glob(glob_path))
    if not matches:
        raise FileNotFoundError(f'no rating file matches {glob_path!r}')
    return matches[0]

def format_df(df):
    """Recode the raw ``score`` column and order rows by ``question_id``, in place.

    Raw 1 becomes -1, raw 2 becomes 1, and raw -1 and 3 both become 0. Any
    other raw value is not in the mapping and therefore becomes NaN.
    The frame is modified in place; nothing is returned.
    """
    # map A wins to -1, B wins to 1, and ties to 0
    recode = {1: -1, 2: 1, 3: 0, -1: 0}
    df['score'] = df.score.map(recode)
    df.sort_values('question_id', inplace=True)

def load_reviews(model, reviewers):
    """Load the ratings of every reviewer for every ordered pair of distinct models.

    For each (modelA, modelB, reviewer) combination the matching rating file is
    located with ``review_filename``, its ``question_id`` and ``score`` columns
    are read, and the scores are recoded with ``format_df``. Rows whose score is
    NaN after recoding are reported on stdout but kept in the result.

    Args:
        model: iterable of model names to pair up. (The parameter name is kept
            for backward compatibility; it is the full collection of models.)
        reviewers: iterable of reviewer names.

    Returns:
        A DataFrame with columns ``question_id``, ``score``, ``model_a``,
        ``model_b`` and ``reviewer``. Each file's row index is kept as-is, so
        index values repeat across files. If there is nothing to load (fewer
        than two distinct models, or no reviewers) an empty frame with those
        columns is returned.
    """
    columns = ['question_id', 'score', 'model_a', 'model_b', 'reviewer']
    # Materialise both inputs: they are iterated repeatedly in nested loops,
    # which would silently exhaust a generator.
    model_names = list(model)
    reviewer_names = list(reviewers)

    dfs_list = []
    for modelA in model_names:
        for modelB in model_names:
            if modelA == modelB:
                continue

            for reviewer in reviewer_names:
                filename = review_filename(modelA, modelB, reviewer)
                # get df and add to array
                df = pd.read_json(filename, lines=True)[['question_id', 'score']]
                format_df(df)
                invalid = df.score.isna()
                ninvalid = invalid.sum()
                if ninvalid > 0:
                    print(ninvalid, f'#invalid. {modelA} vs {modelB} by {reviewer}')
                    print(df[invalid])
                df['model_a'] = modelA
                df['model_b'] = modelB
                df['reviewer'] = reviewer
                dfs_list.append(df)

    if not dfs_list:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(columns=columns)

    # combine all dfs (rows stay in load order; no shuffling happens here)
    reviews = pd.concat(dfs_list)
    return reviews

In [39]:
def majority_vote(df):
    """Collapse the scores of all reviewers into one score per battle.

    Rows are grouped by ``question_id``, ``model_a`` and ``model_b``; the
    combined score of a group is the sign of the mean of its ``score`` values
    (-1, 0 or 1). The result has one row per group, with the three key columns
    and ``score``.
    """
    def sign_of_mean(votes):
        return np.sign(votes.mean())

    battle_keys = ['question_id', 'model_a', 'model_b']
    per_battle = df.groupby(battle_keys, as_index=False)
    return per_battle.agg({'score': sign_of_mean})

In [40]:
from collections import defaultdict

# function adapted from https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5
def compute_elo(battles, K=32, SCALE=400, BASE=10, INIT_RATING=1000):
    """Run sequential Elo updates over ``battles`` and return the final ratings.

    Args:
        battles: DataFrame with ``model_a``, ``model_b`` and ``score`` columns.
            A score of -1 counts as a win for model A, 1 as a win for model B
            and 0 as a tie. Rows are processed in the frame's order.
        K: step size of each rating update.
        SCALE: rating difference divisor in the expected-score formula.
        BASE: base of the exponent in the expected-score formula.
        INIT_RATING: rating given to a model the first time it appears.

    Returns:
        A ``defaultdict`` mapping model name to rating.

    Raises:
        Exception: if a score is not -1, 0 or 1 (NaN included).
    """
    rating = defaultdict(lambda: INIT_RATING)

    rows = battles[['model_a', 'model_b', 'score']]
    i = 0  # stays 0 when there are no battles
    for i, (model_a, model_b, winner) in enumerate(rows.itertuples(index=False), start=1):
        rating_a = rating[model_a]
        rating_b = rating[model_b]
        expected_a = 1 / (1 + BASE ** ((rating_b - rating_a) / SCALE))
        expected_b = 1 / (1 + BASE ** ((rating_a - rating_b) / SCALE))

        # Actual result from model A's point of view.
        if winner == 0:
            actual_a = 0.5
        elif winner == 1:
            actual_a = 0
        elif winner == -1:
            actual_a = 1
        else:
            print("problem @", model_a, model_b)
            raise Exception(f"unexpected vote {winner}")

        rating[model_a] += K * (actual_a - expected_a)
        rating[model_b] += K * (1 - actual_a - expected_b)

    print("Iterations: ", i)
    return rating


In [14]:
# function adapted from https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5
def compute_pairwise_win_fraction(battles):
    """Return the matrix of how often the row model beats the column model.

    Args:
        battles: DataFrame with ``model_a``, ``model_b`` and ``score`` columns,
            where a score of -1 is a win for model A and 1 is a win for model B.
            Every other score (ties, NaN) counts as a battle nobody won.

    Returns:
        A square DataFrame indexed by model on both axes. Entry (row, col) is
        the number of battles the row model won against the column model, in
        either position, divided by the number of battles between the two.
        Pairs that never met (including the diagonal) are NaN. Rows and columns
        are ordered by mean win fraction, best first.
    """
    # Every model that appears in either position. All count tables are laid
    # out on this grid so they can be added without index misalignment.
    all_models = sorted(set(battles["model_a"]) | set(battles["model_b"]))

    def count_pairs(rows):
        # Number of rows per (model_a, model_b), with 0 for absent pairs.
        if rows.empty:
            return pd.DataFrame(0, index=all_models, columns=all_models)
        counts = pd.pivot_table(
            rows,
            index="model_a", columns="model_b", aggfunc="size", fill_value=0)
        # A model that never wins in one position has no row/column in the
        # pivot; without this reindex the sums below would turn into NaN.
        return counts.reindex(index=all_models, columns=all_models, fill_value=0)

    # Times each model wins as Model A
    a_win_ptbl = count_pairs(battles[battles['score'] == -1])

    # Table counting times each model wins as Model B
    b_win_ptbl = count_pairs(battles[battles['score'] == 1])

    # Table counting number of A-B pairs
    num_battles_ptbl = count_pairs(battles)

    # Computing the proportion of wins for each model as A and as B
    # against all other models
    row_beats_col_freq = (
        (a_win_ptbl + b_win_ptbl.T) /
        (num_battles_ptbl + num_battles_ptbl.T)
    )

    # Arrange ordering according to proportion of wins
    prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
    model_names = list(prop_wins.keys())
    row_beats_col = row_beats_col_freq.loc[model_names, model_names]
    return row_beats_col

# function adapted from https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5
def visualize_pairwise_win_fraction(battles, title):
    """Build a heatmap figure of the pairwise win fractions in ``battles``.

    The matrix comes from ``compute_pairwise_win_fraction``; rows are labelled
    "Model A" and columns "Model B". ``title`` is used as the figure title.
    Returns the plotly figure without displaying it.
    """
    win_matrix = compute_pairwise_win_fraction(battles)

    heatmap = px.imshow(
        win_matrix,
        color_continuous_scale='RdBu',
        text_auto=".2f",
        title=title,
    )

    layout_options = {
        'xaxis_title': "Model B",
        'yaxis_title': "Model A",
        'xaxis_side': "top",
        'title_y': 0.07,
        'title_x': 0.5,
    }
    heatmap.update_layout(**layout_options)

    hover_text = "Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>"
    heatmap.update_traces(hovertemplate=hover_text)

    return heatmap

In [10]:
# Load the ratings of every reviewer for every ordered model pair and show them.
reviews = load_reviews(model=models, reviewers=reviewers)
reviews

Unnamed: 0,question_id,score,model_a,model_b,reviewer
0,1,-1,answer_gpt4,answer_gpt35,gpt-4
1,2,-1,answer_gpt4,answer_gpt35,gpt-4
2,3,-1,answer_gpt4,answer_gpt35,gpt-4
3,4,-1,answer_gpt4,answer_gpt35,gpt-4
4,5,-1,answer_gpt4,answer_gpt35,gpt-4
...,...,...,...,...,...
75,76,-1,answer_vicuna-13b,answer_claude,text-bison@001
76,77,-1,answer_vicuna-13b,answer_claude,text-bison@001
77,78,-1,answer_vicuna-13b,answer_claude,text-bison@001
78,79,-1,answer_vicuna-13b,answer_claude,text-bison@001


In [15]:
# Heatmap of how often the row model beats the column model, over all reviews.
fig = visualize_pairwise_win_fraction(
    battles=reviews,
    title="Fraction of Model A Wins for All A vs. B Battles (including ties)",
)
fig

In [6]:
# Needs the cells defining majority_vote and compute_elo to have run first.
maj = majority_vote(df=reviews)
# Elo updates are sequential, so the battle order affects the ratings; the
# battles are shuffled with a fixed seed to keep the result reproducible.
compute_elo(battles=maj.sample(frac=1, random_state=128))

NameError: name 'majority_vote' is not defined