In [106]:
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path

# Answer sets to compare; each name is substituted into the review file name
# pattern '<modelA>-vs-<modelB>-<reviewer>-reviewer*.jsonl'.
models = [
    'answer_gpt4',
    'answer_gpt35',
    'answer_bard',
    'answer_claude',
    'answer_vicuna-13b',
]

# Reviewers whose votes are combined for every model pair.
reviewers = [
    'gpt-4',
    'claude-1',
]

class ReviewLoader:
    """Load pairwise model reviews and merge the reviewers' votes per question.

    For every ordered pair of distinct models, one JSONL review file per
    reviewer is read. The reviewers' votes are averaged question by question
    and the sign of that average is kept as the agreed outcome.

    Attributes:
        models: model names, as they appear in the review file names.
        reviewers: reviewer names, as they appear in the review file names.
        seed: value forwarded as ``random_state`` to the final shuffle.
        verbose: whether per-file diagnostics are printed while loading.
        reviews: shuffled DataFrame with columns ``score``, ``model_a`` and
            ``model_b``. ``score`` is -1 when the reviewers favour model A,
            1 when they favour model B and 0 when the votes cancel out or
            tie. It is NaN when any reviewer's raw vote is not 1, 2 or 3.
    """

    def __init__(self, models, reviewers, seed=None, verbose=True):
        """Store the configuration and load all reviews immediately.

        Args:
            models: iterable of model names to compare pairwise.
            reviewers: sequence of reviewer names whose votes are combined.
            seed: optional ``random_state`` for the final shuffle; ``None``
                (the default) keeps the shuffle unseeded.
            verbose: print per-file diagnostics while loading (default True).

        Raises:
            ValueError: if ``reviewers`` is empty or the reviewers' files for
                a model pair do not cover the same question ids.
            FileNotFoundError: if a review file cannot be found.
        """
        self.models = models
        self.reviewers = reviewers
        self.seed = seed
        self.verbose = verbose
        self._init_reviews()

    @staticmethod
    def review_filename(modelA, modelB, reviewer):
        """Return the path of the review file for one model pair and reviewer.

        Files are looked up as
        ``./ratings-*/<modelA>-vs-<modelB>-<reviewer>-reviewer*.jsonl``
        relative to the current working directory.

        Raises:
            FileNotFoundError: if no file matches the pattern.
        """
        name = f'{modelA}-vs-{modelB}-{reviewer}-reviewer*.jsonl'
        glob_path = './ratings-*/' + name
        # glob.glob returns matches in arbitrary order; sort so the same file
        # is picked on every run when several ratings-* directories match.
        matches = sorted(glob.glob(glob_path))
        if not matches:
            raise FileNotFoundError(f'no review file matches {glob_path!r}')
        return matches[0]

    @staticmethod
    def _format_df(df):
        """Recode votes and order rows by question, modifying ``df`` in place.

        Raw votes 1 / 2 / 3 become -1 (A wins) / 1 (B wins) / 0 (tie); any
        other raw value becomes NaN because it has no entry in the mapping.
        """
        # map A wins to -1, B wins to 1, and ties to 0
        df['score'] = df['score'].map({1: -1, 2: 1, 3: 0})
        df.sort_values(by='question_id', inplace=True)

    def _init_reviews(self):
        """Build ``self.reviews`` from the review files of every model pair."""
        if len(self.reviewers) == 0:
            raise ValueError('at least one reviewer is required')

        dfs_list = []
        for modelA in self.models:
            for modelB in self.models:
                if modelA == modelB:
                    continue

                scores = []
                question_ids = None
                for reviewer in self.reviewers:
                    filename = ReviewLoader.review_filename(modelA, modelB, reviewer)
                    # get df and add to array
                    df = pd.read_json(filename, lines=True)
                    if self.verbose:
                        print(modelA, modelB, reviewer)
                        # number of raw votes equal to -1 (outside the 1/2/3 mapping)
                        print(df[df.score == -1].index.size)
                    ReviewLoader._format_df(df)

                    # Votes are combined by row position, so every reviewer
                    # must have rated exactly the same questions.
                    ids = df['question_id'].to_numpy()
                    if question_ids is None:
                        question_ids = ids
                    elif not np.array_equal(question_ids, ids):
                        raise ValueError(
                            f'question ids in {filename!r} do not match those of '
                            f'the other reviewers for {modelA} vs {modelB}'
                        )
                    scores.append(df['score'].to_numpy())

                # calculate agreed score of reviewers
                mean_scores = np.array(scores).mean(axis=0)
                if self.verbose:
                    print("nan: ", np.count_nonzero(np.isnan(mean_scores)))
                signed_scores = np.sign(mean_scores)

                # add model info and append to list
                df = pd.DataFrame(signed_scores, columns=['score'])
                df['model_a'] = modelA
                df['model_b'] = modelB
                dfs_list.append(df)

        # combine all dfs and shuffle (reproducibly when a seed was given)
        self.reviews = pd.concat(dfs_list).sample(frac=1, random_state=self.seed)

# Load and merge all reviews; the constructor reads every review file and
# prints diagnostics for each one as it goes.
# NOTE(review): the saved output below lists reviewers that are not in
# `reviewers` (e.g. gpt-3.5-turbo-0301), so it looks stale — re-run to confirm.
loader = ReviewLoader(models, reviewers)
# Shuffled table of agreed outcomes with columns score / model_a / model_b.
reviews = loader.reviews

answer_gpt4 answer_gpt35 gpt-3.5-turbo-0301
0
answer_gpt4 answer_gpt35 gpt-4
0
answer_gpt4 answer_gpt35 vicuna-13b
11
answer_gpt4 answer_gpt35 claude-1
0
answer_gpt4 answer_gpt35 text-bison@001
0
nan:  0
answer_gpt4 answer_bard gpt-3.5-turbo-0301
0
answer_gpt4 answer_bard gpt-4
0
answer_gpt4 answer_bard vicuna-13b
10
answer_gpt4 answer_bard claude-1
0
answer_gpt4 answer_bard text-bison@001
0
nan:  0
answer_gpt4 answer_claude gpt-3.5-turbo-0301
0
answer_gpt4 answer_claude gpt-4
0
answer_gpt4 answer_claude vicuna-13b
11
answer_gpt4 answer_claude claude-1
0
answer_gpt4 answer_claude text-bison@001
1
nan:  0
answer_gpt4 answer_vicuna-13b gpt-3.5-turbo-0301
0
answer_gpt4 answer_vicuna-13b gpt-4
0
answer_gpt4 answer_vicuna-13b vicuna-13b
10
answer_gpt4 answer_vicuna-13b claude-1
0
answer_gpt4 answer_vicuna-13b text-bison@001
0
nan:  0
answer_gpt35 answer_gpt4 gpt-3.5-turbo-0301
0
answer_gpt35 answer_gpt4 gpt-4
0
answer_gpt35 answer_gpt4 vicuna-13b
11
answer_gpt35 answer_gpt4 claude-1
0
answe

In [107]:
from collections import defaultdict

def compute_elo(battles, K=32, SCALE=400, BASE=10, INIT_RATING=1000):
    """Run sequential Elo updates over a table of pairwise battles.

    Args:
        battles: DataFrame with columns ``model_a``, ``model_b`` and ``score``.
            A score of -1 is a win for model A, 1 a win for model B and 0 a
            tie. Rows are processed in the frame's order.
        K: step size applied to each rating update.
        SCALE: rating difference scale in the expected-score formula.
        BASE: base of the exponent in the expected-score formula.
        INIT_RATING: rating given to a model the first time it is seen.

    Returns:
        defaultdict mapping each model name to its final rating.

    Raises:
        Exception: if a score is not -1, 0 or 1; the offending pair is
            printed first.
    """
    rating = defaultdict(lambda: INIT_RATING)
    votes = battles[['model_a', 'model_b', 'score']]

    for first, second, vote in votes.itertuples(index=False):
        rating_a = rating[first]
        rating_b = rating[second]

        # expected score of each side given the current rating gap
        expected_a = 1 / (1 + BASE ** ((rating_b - rating_a) / SCALE))
        expected_b = 1 / (1 + BASE ** ((rating_a - rating_b) / SCALE))

        # reject anything that is not a recognised outcome
        if vote not in (-1, 1, 0):
            print("problem @", first, second)
            raise Exception(f"unexpected vote {vote}")

        # actual score of model A: win, loss or draw
        sa = 1 if vote == -1 else (0 if vote == 1 else 0.5)

        rating[first] += K * (sa - expected_a)
        rating[second] += K * (1 - sa - expected_b)

    return rating


In [108]:
# Elo ratings for every model. Updates are applied row by row and `reviews`
# is shuffled without a fixed seed here, so the exact values change per run.
compute_elo(reviews)

defaultdict(<function __main__.compute_elo.<locals>.<lambda>()>,
            {'answer_claude': 1135.3033085039308,
             'answer_gpt4': 1219.4542471951718,
             'answer_bard': 884.8087872551939,
             'answer_vicuna-13b': 930.7379563447608,
             'answer_gpt35': 829.6957007009396})