# Elo rating system

In [4]:
from collections import defaultdict
import pandas as pd

# Elo state and configuration.
# Every model starts at 1000 the first time its rating is looked up.
ratings = defaultdict(lambda: 1000)
K = 32  # K-factor: upper bound on how far one comparison can move a rating
df = pd.read_csv('elo_ratings.csv')

# Replay the pairwise comparisons in file order (Elo updates are
# order-dependent). The three columns are walked in lockstep rather than
# through DataFrame.iterrows(), which builds a Series for every row and
# yields a row index that was never used here.
for model_a, model_b, winner in zip(df['model_a'], df['model_b'], df['winner']):
    # Ratings going into this comparison; the lookup also creates the
    # default entry for a model that has not been seen before.
    Ra = ratings[model_a]
    Rb = ratings[model_b]

    # Expected scores from the logistic Elo curve (400-point scale).
    Ea = 1 / (1 + 10 ** ((Rb - Ra) / 400))
    Eb = 1 / (1 + 10 ** ((Ra - Rb) / 400))

    # Actual scores: 1 for a win, 0 for a loss. Any `winner` value that
    # matches neither model is scored as a tie (0.5 each).
    if winner == model_a:
        Sa, Sb = 1.0, 0.0
    elif winner == model_b:
        Sa, Sb = 0.0, 1.0
    else:
        Sa, Sb = 0.5, 0.5

    # Shift each rating by K times (actual score - expected score).
    ratings[model_a] = Ra + K * (Sa - Ea)
    ratings[model_b] = Rb + K * (Sb - Eb)

In [5]:
ratings

defaultdict(<function __main__.<lambda>()>,
            {'GPT-4': 1015.3991542567205,
             'Claude': 1000.6684908377623,
             'LLaMA': 999.1995875226886,
             'GPT-3.5': 984.7327673828286})