In [1]:
import pandas as pd
import numpy as np
import glob
from pathlib import Path

# Answer sets that get compared pairwise; these strings are interpolated into
# the review file names.
models = [
    'answer_gpt4',
    'answer_gpt35',
]
# Reviewer identifiers; one review file is looked up per (pair, reviewer).
reviewers = [
    'gpt-3.5-turbo-0301',
    'gpt-4',
]

class ReviewLoader:
    """Load pairwise model reviews and merge the reviewers' votes per question.

    For every ordered pair of distinct models and every reviewer, one JSON-lines
    file is read from ``./eval/ratings-*/``. Each file must provide at least the
    columns ``question_id`` and ``score`` (1 = A wins, 2 = B wins, 3 = tie).
    The per-reviewer scores are averaged row by row and reduced to their sign,
    giving one agreed vote per row: -1 (A wins), 1 (B wins) or 0 (tie / split).

    Attributes:
        models: model names as passed to the constructor.
        reviewers: reviewer names as passed to the constructor.
        random_state: seed forwarded to the final shuffle (``None`` = unseeded).
        reviews: DataFrame with columns ``score``, ``model_a``, ``model_b``,
            in shuffled row order.
    """

    def __init__(self, models, reviewers, random_state=None):
        """Store the configuration and eagerly load all reviews.

        Args:
            models: iterable of model names; every ordered pair of distinct
                names is loaded.
            reviewers: iterable of reviewer names.
            random_state: optional seed for the row shuffle so that results
                can be reproduced; the default ``None`` keeps the shuffle
                unseeded.
        """
        self.models = models
        self.reviewers = reviewers
        self.random_state = random_state
        self._init_reviews()

    @staticmethod
    def review_filename(modelA, modelB, reviewer):
        """Return the path of the review file for one (modelA, modelB, reviewer).

        If several ``ratings-*`` directories contain a matching file, the first
        glob result is returned (glob does not guarantee an order).

        Raises:
            IndexError: if no file matches the pattern.
        """
        name = f'{modelA}-vs-{modelB}-{reviewer}-reviewer-threeclass.jsonl'
        glob_path = './eval/ratings-*/' + name
        matches = glob.glob(glob_path)
        if not matches:
            # Same exception type as indexing an empty list, but with a message
            # that names the file that could not be found.
            raise IndexError(f'no review file matches {glob_path!r}')
        return matches[0]

    @staticmethod
    def _format_df(df):
        """Recode and sort a raw review frame in place (returns ``None``)."""
        # map A wins to -1, B wins to 1, and ties to 0
        df['score'] = df['score'].map({1: -1, 2: 1, 3: 0})
        # Sorting by question lines up the rows of different reviewers.
        # NOTE(review): assumes every reviewer file covers the same questions.
        df.sort_values(by='question_id', inplace=True)

    def _init_reviews(self):
        """Build ``self.reviews`` from all model pairs and reviewers."""
        dfs_list = []
        for modelA in self.models:
            for modelB in self.models:
                if modelA == modelB:
                    continue

                scores = []
                for reviewer in self.reviewers:
                    filename = ReviewLoader.review_filename(modelA, modelB, reviewer)
                    # get df and add to array
                    df = pd.read_json(filename, lines=True)
                    ReviewLoader._format_df(df)
                    scores.append(df['score'].to_numpy())

                # calculate agreed score of reviewers
                mean_scores = np.array(scores).mean(axis=0)
                signed_scores = np.sign(mean_scores)

                # add model info and append to list
                df = pd.DataFrame(signed_scores, columns=['score'])
                df['model_a'] = modelA
                df['model_b'] = modelB
                dfs_list.append(df)

        if not dfs_list:
            # Fewer than two distinct models: there is nothing to compare, and
            # pd.concat would raise on an empty list.
            self.reviews = pd.DataFrame(columns=['score', 'model_a', 'model_b'])
            return

        # combine all dfs and shuffle
        self.reviews = pd.concat(dfs_list).sample(frac=1, random_state=self.random_state)

# Load every model-pair review file once; `reviews` is the shuffled table of
# agreed votes (columns: score, model_a, model_b).
loader = ReviewLoader(models=models, reviewers=reviewers)
reviews = loader.reviews

Unnamed: 0,review_id,question_id,answer1_id,answer2_id,reviewer_id,metadata,text,score
0,4gLzStHWqR2s33juPh4UuS,1,k3KTH9U8v39Sqqb2Z4jo8C,BZGowHM7L3RvtWRktKZjLT,1,{},Assistant 1 provided a comprehensive and detai...,1
1,XrmReMXYKSMN4fDsjpiAwz,2,asuUCLx4qNGr4TgeHVDoxe,GsrPLfiNEcRDYAC5aUoY23,1,{},Assistant 1's answer is very detailed and prov...,1
2,VR9QTtbrTHXmY2ngvur8sn,3,iekZGGkvTnwm5T4J6MtvDi,5SGfexmPoJTWoVbPMB87KB,1,{},Assistant 1 provided a more comprehensive and ...,1
3,EmE9XALN58re3tgxLgsNMu,4,eo5mhtwgZYFrqp4wXrR3Z6,RcqQg364kgCkYPshqvmGSp,1,{},Assistant 1's answer is very comprehensive and...,1
4,GdqwTrZFoFRPppANNyv7sj,5,H6mF8revhnrFJ7XGJfD6pn,3R9jAndzLHQqJwmMi5mNox,1,{},"Assistant 1's answer is very helpful, relevant...",1


In [None]:
from collections import defaultdict

def compute_elo(battles, K=32, SCALE=400, BASE=10, INIT_RATING=1000):
    """Replay a sequence of pairwise battles and return Elo ratings.

    Args:
        battles: DataFrame with columns ``model_a``, ``model_b`` and ``score``,
            where ``score`` is -1 (A wins), 1 (B wins) or 0 (tie). Rows are
            processed in the frame's current order.
        K: step size applied to each rating update.
        SCALE: divisor of the rating difference in the expected-score formula.
        BASE: base of the exponent in the expected-score formula.
        INIT_RATING: rating assigned to a model the first time it is seen.

    Returns:
        A ``defaultdict`` mapping model name to rating; names that never
        appeared resolve to ``INIT_RATING``.

    Raises:
        Exception: if a ``score`` value is not -1, 1 or 0.
    """
    rating = defaultdict(lambda: INIT_RATING)
    matchups = battles[['model_a', 'model_b', 'score']]

    for name_a, name_b, outcome in matchups.itertuples(index=False):
        old_a = rating[name_a]
        old_b = rating[name_b]

        # Expected scores from the logistic curve on the rating gap.
        expected_a = 1 / (1 + BASE ** ((old_b - old_a) / SCALE))
        expected_b = 1 / (1 + BASE ** ((old_a - old_b) / SCALE))

        # Reject anything that is not a recognised vote before touching ratings.
        if not (outcome == -1 or outcome == 1 or outcome == 0):
            raise Exception(f"unexpected vote {outcome}")

        # Actual score from A's point of view: win 1, loss 0, tie 0.5.
        if outcome == -1:
            actual_a = 1
        else:
            actual_a = 0 if outcome == 1 else 0.5

        rating[name_a] += K * (actual_a - expected_a)
        rating[name_b] += K * (1 - actual_a - expected_b)

    return rating
