In [1]:
from huggingface_hub import HfFileSystem, hf_hub_download
from typing import Literal
from utils import apply_template


def download_latest_data_from_space(
    repo_id: str, file_type: Literal["pkl", "csv"]
) -> str:
    """
    Downloads the latest data file of the specified file type from the given repository space.

    Only files at the top level of the Space whose file name contains
    "leaderboard_table" or "elo_results" are considered. "Latest" is decided
    by comparing, as strings, the last "_"-separated token of the file stem
    (e.g. "20250128" in "elo_results_20250128.pkl").

    Args:
        repo_id (str): The ID of the repository space.
        file_type (Literal["pkl", "csv"]): The type of the data file to download. Must be either "pkl" or "csv".
    Returns:
        str: The local file path of the downloaded data file.
    Raises:
        FileNotFoundError: If the Space contains no matching file.
    """

    def basename(path):
        return path.split("/")[-1]

    def extract_date(path):
        # "elo_results_20250128.pkl" -> "20250128"
        return basename(path).split(".")[0].split("_")[-1]

    fs = HfFileSystem()
    pattern = f"spaces/{repo_id}/*.{file_type}"
    # Match on the file name only, so a repo_id that happens to contain one of
    # the markers cannot make every file in the Space look like a candidate.
    candidates = [
        path
        for path in fs.glob(pattern)
        if "leaderboard_table" in basename(path) or "elo_results" in basename(path)
    ]
    if not candidates:
        raise FileNotFoundError(
            f"No 'leaderboard_table' or 'elo_results' file matching {pattern!r} was found"
        )

    # max() returns the first maximal element, i.e. the same file the previous
    # stable descending sort put at index 0.
    latest_filename = basename(max(candidates, key=extract_date))

    latest_filepath_local = hf_hub_download(
        repo_id=repo_id,
        filename=latest_filename,
        repo_type="space",
    )
    print(latest_filename)
    return latest_filepath_local


def get_constants(dfs):
    """
    Calculate and return the minimum and maximum Elo scores, as well as the maximum number of models per month.
    Parameters:
    - dfs (dict): A dictionary containing DataFrames for different categories.
    Returns:
    - min_elo_score (float): The minimum Elo score across all DataFrames.
    - max_elo_score (float): The maximum Elo score across all DataFrames.
    """
    filter_ranges = {}
    for k, df in dfs.items():
        filter_ranges[k] = {
            "min_elo_score": df["rating"].min().round(),
            "max_elo_score": df["rating"].max().round(),
        }

    min_elo_score = float("inf")
    max_elo_score = float("-inf")

    for _, value in filter_ranges.items():
        min_elo_score = min(min_elo_score, value["min_elo_score"])
        max_elo_score = max(max_elo_score, value["max_elo_score"])

    return min_elo_score, max_elo_score


def format_data(df):
    """
    Returns a formatted copy of the given DataFrame; the input is left untouched.

    The following operations are applied:
    - Converts the 'License' column values to 'Proprietary LLM' if they are in PROPRIETARY_LICENSES, otherwise 'Open LLM'.
    - Converts the 'Release Date' column to datetime format.
    - Adds a new 'Month-Year' column holding the monthly period of the 'Release Date' column.
    - Rounds the 'rating' column to the nearest integer.
    - Resets the index of the DataFrame.

    Note: the body references ``pd``, so pandas must be imported as ``pd`` in
    the notebook namespace before this function is called.

    Args:
        df (pandas.DataFrame): The DataFrame to be formatted. Must contain the
            'License', 'Release Date' and 'rating' columns.
    Returns:
        pandas.DataFrame: The formatted DataFrame.
    """

    # Both spellings are listed so either one is classified as proprietary.
    PROPRIETARY_LICENSES = ["Proprietary", "Proprietory"]

    # Work on a copy so the caller's frame is not modified as a side effect.
    formatted = df.copy()

    is_proprietary = formatted["License"].isin(PROPRIETARY_LICENSES)
    formatted["License"] = is_proprietary.map(
        {True: "Proprietary LLM", False: "Open LLM"}
    )
    formatted["Release Date"] = pd.to_datetime(formatted["Release Date"])
    formatted["Month-Year"] = formatted["Release Date"].dt.to_period("M")
    formatted["rating"] = formatted["rating"].round()
    return formatted.reset_index(drop=True)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import pickle
import pandas as pd

# Maps the category keys looked up in the Elo results to the display names used
# as keys of `arena_dfs`. Keys that are absent from the results are skipped when
# the per-category tables are collected.
KEY_TO_CATEGORY_NAME = {
    "full": "Overall",
    "dedup": "De-duplicate Top Redundant Queries (soon to be default)",
    "math": "Math",
    "if": "Instruction Following",
    "multiturn": "Multi-Turn",
    "coding": "Coding",
    "hard_6": "Hard Prompts (Overall)",
    "hard_english_6": "Hard Prompts (English)",
    "long_user": "Longer Query",
    "english": "English",
    "chinese": "Chinese",
    "french": "French",
    "german": "German",
    "spanish": "Spanish",
    "russian": "Russian",
    "japanese": "Japanese",
    "korean": "Korean",
    "no_tie": "Exclude Ties",
    "no_short": "Exclude Short Query (< 5 tokens)",
    "no_refusal": "Exclude Refusal",
    "overall_limit_5_user_vote": "overall_limit_5_user_vote",
    "full_old": "Overall (Deprecated)",
}

# Download the newest Elo results pickle published in the leaderboard Space.
latest_elo_file_local = download_latest_data_from_space(
    file_type="pkl", repo_id="lmsys/chatbot-arena-leaderboard"
)

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# this is only safe as long as the Space it was downloaded from is trusted.
with open(latest_elo_file_local, "rb") as fin:
    # TO-DO: need to also include vision
    elo_results = pickle.load(fin)["text"]

# One leaderboard table per category that is present in the results, keyed by
# the category's display name.
arena_dfs = {
    KEY_TO_CATEGORY_NAME[k]: elo_results[k]["leaderboard_table_df"]
    for k in KEY_TO_CATEGORY_NAME
    if k in elo_results
}

# Download the newest leaderboard table (CSV) from the same Space.
latest_leaderboard_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
)
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)

# Release dates come from a JSON file stored alongside the notebook.
release_date_mapping = pd.read_json("data/release_date_mapping.json", orient="records")

# Per category: join the Elo table (its index is matched against the `key`
# column of the leaderboard table), order by rating, attach release dates with
# a left join so rows without a known date are kept, then format the result.
merged_dfs = {
    category: format_data(
        arena_df.merge(leaderboard_df, left_index=True, right_on="key")
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
        .merge(
            release_date_mapping[["key", "Release Date"]],
            on="key",
            how="left",
        )
    )
    for category, arena_df in arena_dfs.items()
}

# Summary values derived from the merged tables.
min_elo_score, max_elo_score = get_constants(merged_dfs)
date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0]
orgs = merged_dfs["Overall"]["Organization"].unique().tolist()

elo_results_20250128.pkl
leaderboard_table_20250128.csv


In [5]:
# Select the "Overall" category table; the following cells filter and plot it.
df = merged_dfs["Overall"]

In [6]:
# Rank organizations by the rating of their best model, take the 11 highest,
# then drop the three organizations excluded from the chart.
top_orgs = [
    org
    for org in df.groupby("Organization")["rating"].max().nlargest(11).index.tolist()
    if org not in ("NexusFlow", "Princeton", "Nvidia")
]
top_orgs

['Google',
 'OpenAI',
 'DeepSeek',
 'StepFun',
 'xAI',
 '01 AI',
 'Anthropic',
 'Alibaba',
 'Zhipu AI']

In [7]:
# Keep only models from the selected organizations with a rating above 1000.
df = df[df["Organization"].isin(top_orgs) & df["rating"].gt(1000)]

# Show the rows lacking a release date, then drop them.
display(df[df["Release Date"].isna()])
df = df[df["Release Date"].notna()]

Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking,key,Model,MT-bench (score),MMLU,Knowledge cutoff date,License,Organization,Link,Release Date,Month-Year
0,1382.0,11.677728,1388.882744,1376.601738,7505,2,gemini-2.0-flash-thinking-exp-01-21,Gemini-2.0-Flash-Thinking-Exp-01-21,-,-,-,Proprietary LLM,Google,https://aistudio.google.com/prompts/new_chat?m...,NaT,NaT
14,1304.0,15.970325,1313.363176,1297.231103,4774,44,step-2-16k-exp-202412,Step-2-16K-Exp,-,-,-,Proprietary LLM,StepFun,https://platform.stepfun.com/docs/llm/text,NaT,NaT


In [11]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np
from scipy.special import expit

# Assuming your DataFrame is named 'df'
# df = pd.read_csv('your_data.csv')

# Convert Release Date to datetime if it's not already
df["Release Date"] = pd.to_datetime(df["Release Date"])

# Sort the DataFrame by Release Date and rating (descending); the per-date
# groupby(...).first() in the plotting loop below relies on this ordering to
# pick the highest-rated model of each date.
df = df.sort_values(["Release Date", "rating"], ascending=[True, False])

# Reference date (January 31, 2025): curves are extended up to it and it is the
# right edge of the x-axis range.
current_date = pd.Timestamp("2025-01-31")


# Logistic curve used to ease between two rating levels
def sigmoid_transition(x, x0, k=0.1):
    """Evaluate the logistic sigmoid of ``x`` centred at ``x0``.

    Args:
        x: Scalar or array of positions.
        x0: Position at which the result equals 0.5.
        k: Steepness factor applied to ``x - x0`` (default 0.1).
    Returns:
        ``expit(k * (x - x0))``, with the same shape as ``x``.
    """
    scaled_offset = (x - x0) * k
    return expit(scaled_offset)


# Per-organization plot styling: organization name -> (line/marker color, country flag emoji).
# Organizations missing from this mapping fall back to grey with no flag when
# the traces are built.
org_info = {
    "OpenAI": ("#00A67E", "🇺🇸"),  # Teal
    "Google": ("#4285F4", "🇺🇸"),  # Google Blue
    "xAI": ("black", "🇺🇸"),  # Black
    "Anthropic": ("#cc785c", "🇺🇸"),  # Brown (as requested)
    "Meta": ("#0064E0", "🇺🇸"),  # Facebook Blue
    "Alibaba": ("#6958cf", "🇨🇳"),
    "DeepSeek": ("#C70039", "🇨🇳"),
    "01 AI": ("#11871e", "🇨🇳"),  # Green
    "DeepSeek AI": ("#9900CC", "🇨🇳"),  # Purple
    "Mistral": ("#ff7000", "🇫🇷"),  # Mistral Orange (as requested)
    "AI21 Labs": ("#1E90FF", "🇮🇱"),  # Dodger Blue
    "Reka AI": ("#FFC300", "🇺🇸"),
    "Zhipu AI": ("#FFC300", "🇨🇳"),  # NOTE(review): same color as "Reka AI"
}

def _best_model_curve(org_data, current_date):
    """Trace one organization's best rating over time.

    Walks the organization's release dates in order and keeps every model whose
    rating beats all earlier ones. Consecutive record holders are joined by a
    sigmoid-shaped ramp, and the last level is held until ``current_date``.

    Args:
        org_data (pandas.DataFrame): Rows of a single organization with
            "Release Date" and "rating" columns. Rows are expected to be sorted
            by rating (descending) within each release date, because
            ``groupby(...).first()`` is used to pick one row per date.
        current_date (pandas.Timestamp): Date up to which the curve is extended.
    Returns:
        tuple: ``(x_values, y_values, best_models_df)`` where the first two are
        the line coordinates and the last is a DataFrame holding one row per
        record-setting model (empty if no row has a usable rating).
    """
    x_values = []
    y_values = []
    current_best = -np.inf
    best_models = []

    # One row per release date (first non-null value of each column).
    daily_best = org_data.groupby("Release Date").first().reset_index()

    for _, row in daily_best.iterrows():
        if not row["rating"] > current_best:
            # Not a new record for this organization.
            continue

        if x_values:
            # Ramp from the previous record to the new one, using at least
            # 100 evenly spaced timestamps between the two release dates.
            transition_days = (row["Release Date"] - x_values[-1]).days
            transition_points = pd.date_range(
                x_values[-1],
                row["Release Date"],
                periods=max(100, transition_days),
            )
            x_values.extend(transition_points)

            # Sigmoid evaluated over [-6, 6] scales the rating gain.
            transition_y = current_best + (
                row["rating"] - current_best
            ) * sigmoid_transition(
                np.linspace(-6, 6, len(transition_points)), 0, k=1
            )
            y_values.extend(transition_y)

        x_values.append(row["Release Date"])
        y_values.append(row["rating"])
        current_best = row["rating"]
        best_models.append(row)

    # Hold the last level flat up to the reference date.
    if x_values and x_values[-1] < current_date:
        x_values.append(current_date)
        y_values.append(current_best)

    return x_values, y_values, pd.DataFrame(best_models)


# Create figure
fig = go.Figure()

# Organizations ordered by their best rating, so the number shown in the
# legend is the organization's rank.
orgs_by_best_rating = (
    df.groupby("Organization")["rating"]
    .max()
    .sort_values(ascending=False)
    .index.tolist()
)

for i, org in enumerate(orgs_by_best_rating):
    x_values, y_values, best_models_df = _best_model_curve(
        df[df["Organization"] == org], current_date
    )
    if best_models_df.empty:
        # Nothing to draw for this organization.
        continue

    # Get org color and flag (grey, no flag when the organization is unknown)
    color, flag = org_info.get(org, ("#808080", ""))

    # Add line plot
    fig.add_trace(
        go.Scatter(
            x=x_values,
            y=y_values,
            mode="lines",
            name=f"{i + 1}. {org} {flag}",
            line=dict(color=color, width=2),
            hoverinfo="skip",
        )
    )

    # Add scatter plot for best model points (hover shows the model name)
    fig.add_trace(
        go.Scatter(
            x=best_models_df["Release Date"],
            y=best_models_df["rating"],
            mode="markers",
            name=org,
            showlegend=False,
            marker=dict(color=color, size=8, symbol="circle"),
            text=best_models_df["Model"],
            hovertemplate="<b>%{text}</b><br>Date: %{x}<br>ELO Score: %{y:.2f}<extra></extra>",
        )
    )


# Update layout (French-language labels). The legend title names the month of
# the data snapshot; it must match the reference date used for the curves.
fig.update_layout(
    xaxis_title="Date",
    title="La course au classement",
    yaxis_title="Score ELO",
    legend_title="Classement en Janvier 2025",
    xaxis_range=[pd.Timestamp("2024-01-01"), current_date],  # From 2024-01-01 to the reference date
    yaxis_range=[1103, 1400],
)
apply_template(fig)

# Month-year tick labels on the date axis
fig.update_xaxes(
    tickformat="%m-%Y",
)

fig.show()


Discarding nonzero nanoseconds in conversion.



In [14]:
# NOTE(review): this loop repeats the trace-building code of the previous
# plotting cell; consider factoring it into a shared function.
fig = go.Figure()

# Organizations are visited from highest to lowest best rating, so `i + 1`
# is the organization's rank shown in the legend.
for i, org in enumerate(
    df.groupby("Organization")["rating"]
    .max()
    .sort_values(ascending=False)
    .index.tolist()
):
    org_data = df[df["Organization"] == org]

    if len(org_data) > 0:
        x_values = []
        y_values = []
        current_best = -np.inf
        best_models = []

        # Group by date and get the best model for each date
        # (first() takes the first non-null value per column; this assumes df is
        # still sorted by rating descending within each release date).
        daily_best = org_data.groupby("Release Date").first().reset_index()

        for _, row in daily_best.iterrows():
            # Only models that beat the organization's previous best are kept.
            if row["rating"] > current_best:
                if len(x_values) > 0:
                    # Create smooth transition: at least 100 evenly spaced
                    # timestamps from the previous record date to this one.
                    transition_days = (row["Release Date"] - x_values[-1]).days
                    transition_points = pd.date_range(
                        x_values[-1],
                        row["Release Date"],
                        periods=max(100, transition_days),
                    )
                    x_values.extend(transition_points)

                    # Sigmoid evaluated over [-6, 6] scales the rating gain.
                    transition_y = current_best + (
                        row["rating"] - current_best
                    ) * sigmoid_transition(
                        np.linspace(-6, 6, len(transition_points)), 0, k=1
                    )
                    y_values.extend(transition_y)

                x_values.append(row["Release Date"])
                y_values.append(row["rating"])
                current_best = row["rating"]
                best_models.append(row)

        # Extend the line to the current date
        if x_values[-1] < current_date:
            x_values.append(current_date)
            y_values.append(current_best)

        # Get org color and flag (grey, no flag when the organization is unknown)
        color, flag = org_info.get(org, ("#808080", ""))

        # Add line plot
        fig.add_trace(
            go.Scatter(
                x=x_values,
                y=y_values,
                mode="lines",
                name=f"{i + 1}. {org} {flag}",
                line=dict(color=color, width=2),
                hoverinfo="skip",
            )
        )

        # Add scatter plot for best model points (hover shows the model name)
        best_models_df = pd.DataFrame(best_models)
        fig.add_trace(
            go.Scatter(
                x=best_models_df["Release Date"],
                y=best_models_df["rating"],
                mode="markers",
                name=org,
                showlegend=False,
                marker=dict(color=color, size=8, symbol="circle"),
                text=best_models_df["Model"],
                hovertemplate="<b>%{text}</b><br>Date: %{x}<br>ELO Score: %{y:.2f}<extra></extra>",
            )
        )

# English-language labels and axis ranges for this version of the chart
layout_options = {
    "xaxis_title": "Date",
    "yaxis_title": "ELO score on Chatbot Arena",
    "legend_title": "Ranking as of Jan '25",
    "hovermode": "closest",
    # From 2024-01-01 to the reference date
    "xaxis_range": [pd.Timestamp("2024-01-01"), current_date],
    "yaxis_range": [1103, 1400],
}
fig.update_layout(**layout_options)
apply_template(fig, annotation_text="Aymeric Roucher")

# Optional static export (left disabled):
# fig.write_image("elo_race.png", scale=2)
fig.show()


Discarding nonzero nanoseconds in conversion.

