In [13]:
import pandas as pd
import numpy as np
import glob
from pathlib import Path

# Names of the answer sets being compared; they fill the two model slots of
# the ratings file names built by review_filename.
models = ['answer_gpt4', 'answer_gpt35', 'answer_bard', 'answer_claude', 'answer_vicuna-13b']
# Names of the reviewers whose ratings files are loaded; they fill the reviewer
# slot of the ratings file names.
reviewers = ['gpt-4', 'claude-1', 'vicuna-13b', 'gpt-3.5-turbo-0301', 'text-bison@001']

def review_filename(modelA, modelB, reviewer):
    """Locate the ratings file for one ordered model pair judged by one reviewer.

    Searches every ``./ratings-*/`` directory (relative to the current working
    directory) for a file named
    ``{modelA}-vs-{modelB}-{reviewer}-reviewer*.jsonl``.

    Args:
        modelA: Name of the model in the "A" position of the comparison.
        modelB: Name of the model in the "B" position of the comparison.
        reviewer: Name of the reviewer whose ratings are wanted.

    Returns:
        The path of the matching file. When several files match, the
        lexicographically smallest path is returned so the choice is
        reproducible from run to run.

    Raises:
        FileNotFoundError: If no file matches the pattern.
    """
    name = f'{modelA}-vs-{modelB}-{reviewer}-reviewer*.jsonl'
    glob_path = './ratings-*/' + name
    # glob.glob returns matches in arbitrary order, so sort before picking one.
    matches = sorted(glob.glob(glob_path))
    if not matches:
        # Name the pattern: a bare IndexError gives no hint which pair/reviewer is missing.
        raise FileNotFoundError(f'no ratings file matches {glob_path!r}')
    return matches[0]

def format_df(df):
    """Normalise one ratings frame in place; returns nothing.

    The ``score`` column is recoded (1 -> -1 for an A win, 2 -> 1 for a B win,
    3 -> 0 for a tie; any other value becomes NaN) and the rows are then
    ordered by ``question_id``.
    """
    recode = {1: -1, 2: 1, 3: 0}
    df['score'] = df.score.map(recode)
    df.sort_values('question_id', inplace=True)

def load_reviews(model, reviewers):
    """Load every reviewer's ratings for every ordered pair of distinct models.

    For each ordered pair ``(modelA, modelB)`` drawn from ``model`` and each
    reviewer, the matching ratings file is located with ``review_filename``,
    read as JSON Lines, reduced to its ``question_id`` and ``score`` columns,
    normalised with ``format_df`` and tagged with the pair and reviewer names.
    Rows whose score is NaN after ``format_df`` are reported on stdout but
    kept in the result.

    Args:
        model: Iterable of model names to compare pairwise. (The parameter
            name is singular for backward compatibility; pass the full
            collection of names.)
        reviewers: Iterable of reviewer names.

    Returns:
        A DataFrame with columns ``question_id``, ``score``, ``model_a``,
        ``model_b`` and ``reviewer``. Each per-file frame keeps its own row
        index, so index labels repeat across files. If there is nothing to
        load (no pair of distinct models, or no reviewers) an empty frame
        with those columns is returned.
    """
    columns = ['question_id', 'score', 'model_a', 'model_b', 'reviewer']
    # Materialise once: the nested loops below iterate both collections repeatedly.
    model_names = list(model)
    reviewer_names = list(reviewers)

    dfs_list = []
    for modelA in model_names:
        for modelB in model_names:
            if modelA == modelB:
                continue

            for reviewer in reviewer_names:
                filename = review_filename(modelA, modelB, reviewer)
                df = pd.read_json(filename, lines=True)[['question_id', 'score']]
                format_df(df)
                # Report (but keep) rows whose raw score had no mapping.
                invalid = df.score.isna()
                ninvalid = invalid.sum()
                if ninvalid > 0:
                    print(ninvalid, f'#invalid. {modelA} vs {modelB} by {reviewer}')
                    print(df[invalid])
                df['model_a'] = modelA
                df['model_b'] = modelB
                df['reviewer'] = reviewer
                dfs_list.append(df)

    # pd.concat raises on an empty list, so return an empty, correctly-shaped frame.
    if not dfs_list:
        return pd.DataFrame(columns=columns)
    # Stack the per-file frames in load order (no shuffling happens here).
    return pd.concat(dfs_list)

In [39]:
def majority_vote(df):
    """Collapse the reviewers' scores into one score per question and model pair.

    Rows are grouped by ``question_id``, ``model_a`` and ``model_b``. Each
    group's ``score`` becomes the sign of the group's mean score: -1 when the
    votes lean negative, 1 when they lean positive, 0 when they balance out.
    Note this is the sign of the mean, not a strict majority count.

    Returns a DataFrame holding the three grouping columns plus ``score``.
    """
    keys = ['question_id', 'model_a', 'model_b']

    def net_sign(votes):
        # direction of the average vote within one group
        return np.sign(votes.mean())

    per_battle = df.groupby(keys, as_index=False)
    return per_battle.agg({'score': net_sign})

In [40]:
from collections import defaultdict

# function obtained from https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5
def compute_elo(battles, K=32, SCALE=400, BASE=10, INIT_RATING=1000):
    """Run sequential Elo updates over ``battles`` and return the final ratings.

    ``battles`` must provide ``model_a``, ``model_b`` and ``score`` columns.
    A score of -1 counts as a win for ``model_a``, 1 as a win for ``model_b``
    and 0 as a tie; any other value prints the offending pair and raises
    ``Exception``. Rows are processed in frame order and each update uses the
    ratings left by the previous rows, so the result depends on row order.

    Args:
        battles: DataFrame of battles to replay.
        K: Update step size.
        SCALE: Rating-difference divisor in the expected-score formula.
        BASE: Exponent base in the expected-score formula.
        INIT_RATING: Rating given to a model the first time it appears.

    Returns:
        A ``defaultdict`` mapping model name to rating. The number of processed
        rows is printed before returning.
    """
    rating = defaultdict(lambda: INIT_RATING)

    def points_for_a(winner, name_a, name_b):
        # actual score earned by model_a for this vote
        if winner == -1:
            return 1
        if winner == 1:
            return 0
        if winner == 0:
            return 0.5
        print("problem @", name_a, name_b)
        raise Exception(f"unexpected vote {winner}")

    games = battles[['model_a', 'model_b', 'score']]
    n_rounds = 0
    for n_rounds, (_, model_a, model_b, winner) in enumerate(games.itertuples(), start=1):
        # look up A before B so first-seen models enter the dict in the same order
        ra, rb = rating[model_a], rating[model_b]
        expected_a = 1 / (1 + BASE ** ((rb - ra) / SCALE))
        expected_b = 1 / (1 + BASE ** ((ra - rb) / SCALE))
        sa = points_for_a(winner, model_a, model_b)
        rating[model_a] += K * (sa - expected_a)
        rating[model_b] += K * (1 - sa - expected_b)

    print("Iterations: ", n_rounds)
    return rating


In [15]:
# Load the ratings for every ordered model pair and reviewer. Rows whose score
# could not be mapped are printed by load_reviews and remain in the frame as NaN.
reviews = load_reviews(models, reviewers)
reviews

11 #invalid. answer_gpt4 vs answer_gpt35 by vicuna-13b
    question_id  score
60           61    NaN
61           62    NaN
62           63    NaN
63           64    NaN
64           65    NaN
65           66    NaN
66           67    NaN
67           68    NaN
68           69    NaN
69           70    NaN
75           76    NaN
10 #invalid. answer_gpt4 vs answer_bard by vicuna-13b
    question_id  score
60           61    NaN
61           62    NaN
62           63    NaN
63           64    NaN
64           65    NaN
65           66    NaN
66           67    NaN
67           68    NaN
68           69    NaN
69           70    NaN
11 #invalid. answer_gpt4 vs answer_claude by vicuna-13b
    question_id  score
60           61    NaN
61           62    NaN
62           63    NaN
63           64    NaN
64           65    NaN
65           66    NaN
66           67    NaN
67           68    NaN
68           69    NaN
69           70    NaN
78           79    NaN
10 #invalid. answer_gpt4 vs an

Unnamed: 0,question_id,score,model_a,model_b,reviewer
0,1,-1.0,answer_gpt4,answer_gpt35,gpt-4
1,2,-1.0,answer_gpt4,answer_gpt35,gpt-4
2,3,-1.0,answer_gpt4,answer_gpt35,gpt-4
3,4,-1.0,answer_gpt4,answer_gpt35,gpt-4
4,5,-1.0,answer_gpt4,answer_gpt35,gpt-4
...,...,...,...,...,...
75,76,-1.0,answer_vicuna-13b,answer_claude,text-bison@001
76,77,-1.0,answer_vicuna-13b,answer_claude,text-bison@001
77,78,-1.0,answer_vicuna-13b,answer_claude,text-bison@001
78,79,-1.0,answer_vicuna-13b,answer_claude,text-bison@001


In [42]:
# Reduce the reviewers' scores to a single vote per (question, model pair).
maj = majority_vote(reviews)
# compute_elo applies its updates row by row, so the battle order affects the
# ratings; shuffle all rows with a fixed seed to get a reproducible order.
compute_elo(maj.sample(frac=1, random_state=128))

Iterations:  1600


defaultdict(<function __main__.compute_elo.<locals>.<lambda>()>,
            {'answer_vicuna-13b': 829.7941341216695,
             'answer_bard': 768.194934341448,
             'answer_claude': 1270.690444483077,
             'answer_gpt35': 840.0506986772573,
             'answer_gpt4': 1291.2697883765477})