# Chargement du jeu de données

In [None]:
import os
from getpass import getpass

# Point both Hugging Face caches (datasets and hub) at one user-supplied directory.
cache_dir = input("Indicate path to all Hugging Face caches:")
os.environ.update({"HF_DATASETS_CACHE": cache_dir, "HF_HUB_CACHE": cache_dir})
# The token is read with getpass so that it is not echoed while being typed.
os.environ["HF_TOKEN"] = getpass("Enter your HuggingFace token:")

In [2]:
from rank_comparia.utils import load_comparia

# Load the `comparia-reactions` dataset through the project's `load_comparia` helper.
# The recorded log below shows that the `comparia-conversations` cache is read as well.
reactions = load_comparia("ministere-culture/comparia-reactions")

Using the latest cached version of the dataset since ministere-culture/comparia-reactions couldn't be found on the Hugging Face Hub (offline mode is enabled).
Found the latest cached dataset configuration 'default' at /home/jupyterhub-users/shared/projet_comparia/huggingface_hub/ministere-culture___comparia-reactions/default/0.0.0/80befa851337d9f295096cef3d100b40d220dc07 (last modified on Mon Jul 28 10:06:54 2025).
Using the latest cached version of the dataset since ministere-culture/comparia-conversations couldn't be found on the Hugging Face Hub (offline mode is enabled).
Found the latest cached dataset configuration 'default' at /home/jupyterhub-users/shared/projet_comparia/huggingface_hub/ministere-culture___comparia-conversations/default/0.0.0/dc40af6af1c14e68bf39d55f6e1573d2d6582f19 (last modified on Wed Jun  4 17:40:30 2025).


## Calcul des scores

On calcule des scores comme dans le notebook `rankers.ipynb`.

In [3]:
from rank_comparia.data_transformation import get_matches_with_score

# Build the per-match score table from the reactions (its columns are shown in the next cell).
matches = get_matches_with_score(reactions)

In [4]:
# Preview the first five scored matches.
matches.head(5)

model_a_name,model_b_name,conversation_pair_id,score_a,score_b
str,str,str,i64,i64
"""mixtral-8x7b-instruct-v0.1""","""gemma-2-9b-it""","""622ef8352512492fa2239daa1f785f…",2,0
"""llama-3.1-70b""","""ministral-8b-instruct-2410""","""3c340eca372942b78191a9988326b3…",0,1
"""mixtral-8x22b-instruct-v0.1""","""gpt-4o-2024-08-06""","""428999cee1f44c0ba0bba50a7b86d0…",-1,3
"""lfm-40b""","""ministral-8b-instruct-2410""","""497ad5a7b27844b3bb71c40941e63d…",-2,-2
"""mixtral-8x22b-instruct-v0.1""","""gpt-4o-2024-08-06""","""4708a0e330c341d09d7d875b3194d5…",0,1


In [5]:
from rank_comparia.elo import ELORanker
from rank_comparia.ranker import Match, MatchScore
import random


def compute_match_score(score_a: int, score_b: int) -> MatchScore:
    """Turn the two per-model scores of a match into its outcome.

    Args:
        score_a: Score obtained by model A.
        score_b: Score obtained by model B.

    Returns:
        ``MatchScore.B`` when B scored strictly more than A, ``MatchScore.A``
        when A scored strictly more than B, ``MatchScore.Draw`` otherwise.
    """
    margin_for_b = score_b - score_a
    if margin_for_b > 0:
        return MatchScore.B
    if margin_for_b < 0:
        return MatchScore.A
    return MatchScore.Draw


def get_shuffled_results(matches: list[Match], model_names: list[str], seed: int = 0):
    """Rank the models with ELO on a seeded random permutation of the matches.

    Args:
        matches: Matches to replay; the list passed in is not reordered,
            a shuffled copy is scored instead.
        model_names: Names registered as players before scoring.
        seed: Seed of the permutation; equal seeds give equal orderings.

    Returns:
        The ``players`` attribute of the ``ELORanker`` built for this run.
    """
    # A dedicated generator yields the same permutation as seeding the
    # module-level generator with `seed`, but leaves the global `random`
    # state untouched for the rest of the notebook.
    rng = random.Random(seed)
    ranker_shuffle = ELORanker(K=40)
    matches_shuffle = rng.sample(matches, k=len(matches))
    ranker_shuffle.add_players(model_names)
    ranker_shuffle.compute_scores(matches=matches_shuffle)
    return ranker_shuffle.players

In [6]:
# Every model that appears on either side of at least one match.
model_names = {*matches["model_a_name"].unique(), *matches["model_b_name"].unique()}
# NOTE: `matches` is rebound here from the score table to a list of `Match`
# objects, so this cell cannot be re-run on its own; rebuild the table first.
matches = [
    Match(
        row["model_a_name"],
        row["model_b_name"],
        compute_match_score(row["score_a"], row["score_b"]),
    )
    for row in matches.to_dicts()
]

In [7]:
# Replay the matches through an ELO ranker in a fixed, seeded random order.
ranker = ELORanker(K=40)

# A dedicated generator seeded with 1337 yields the same permutation as seeding
# the global `random` module, without altering global RNG state. The permutation
# gets its own name so that `matches` is left untouched and re-running this cell
# replays exactly the same order.
shuffled_matches = random.Random(1337).sample(matches, k=len(matches))
ranker.add_players(model_names)  # type: ignore
ranker.compute_scores(matches=shuffled_matches)
ranker.get_scores()

{'gemini-2.0-flash-001': 1258.9655833200165,
 'deepseek-v3-0324': 1177.1091683714053,
 'deepseek-v3-chat': 1159.9438645364053,
 'gemma-3-27b': 1158.247500469015,
 'command-a': 1157.1991483691977,
 'gemma-3-12b': 1141.1285927642145,
 'claude-3-7-sonnet': 1140.5209524446682,
 'gpt-4.1-mini': 1130.9506425045497,
 'llama-3.1-nemotron-70b-instruct': 1091.9081381237697,
 'gemini-2.0-flash-exp': 1076.808884411677,
 'gemini-1.5-pro-002': 1073.939447114147,
 'qwq-32b': 1062.3915259252617,
 'grok-3-mini-beta': 1062.0263937587404,
 'gemma-3-4b': 1058.852285307133,
 'mistral-small-3.1-24b': 1041.5666417376424,
 'deepseek-r1': 1041.0442108000136,
 'llama-4-scout': 1031.917419216266,
 'llama-3.1-8b': 1028.4494332999684,
 'llama-3.1-405b': 1020.942776220401,
 'hermes-3-llama-3.1-405b': 1011.016273848348,
 'llama-3.1-70b': 1006.2484033416458,
 'mistral-large-2411': 1001.1414437303283,
 'gpt-4o-mini-2024-07-18': 995.8648045943313,
 'mistral-saba': 993.2139637662532,
 'gpt-4.1-nano': 986.1600880739813,


## Calcul d'un score de frugalité

Le score de frugalité est calculé à partir de données de consommation présentes dans le jeu de données `comparia-conversations`.

### Calcul du nombre de matchs et du nombre total de tokens générés par modèle

In [8]:
import polars as pl
from rank_comparia.frugality import get_n_match, get_models_output_tokens

# Rename the model columns (presumably to the names the frugality helpers expect).
# Only columns that still carry the old name are renamed, so re-running this cell
# after `reactions` has already been renamed does not fail.
column_renames = {"model_a_name": "model_a", "model_b_name": "model_b"}
reactions = reactions.rename({old: new for old, new in column_renames.items() if old in reactions.columns})
number_by_model = get_n_match(reactions)
total_tokens = get_models_output_tokens(reactions)

# One row per model: match count alongside the total number of output tokens.
number_by_model = number_by_model.join(total_tokens, on="model_name")

number_by_model

model_name,n_match,total_output_tokens
str,u32,f64
"""aya-expanse-8b""",965,961445.0
"""c4ai-command-r-08-2024""",2571,5.944341e6
"""chocolatine-2-14b-instruct-v2.…",1098,511086.0
"""claude-3-5-sonnet-v2""",3537,2.818225e6
"""claude-3-7-sonnet""",557,2.162331e6
…,…,…
"""phi-3.5-mini-instruct""",786,851005.0
"""phi-4""",2927,3.086417e6
"""qwen2.5-7b-instruct""",756,870902.0
"""qwen2.5-coder-32b-instruct""",3222,4.787749e6


### Calcul du score de frugalité

Calcul du score énergétique. Le paramètre `mean` permet de moyenner les scores (`True` : le score est moyenné ; `False` : il ne l'est pas).  
Si l'on choisit de moyenner, la consommation est moyennée à la fois par token et par nombre de matchs.

In [9]:
from rank_comparia.frugality import calculate_frugality_score

# Compute the per-model consumption figures from the reactions and the per-model counts.
# NOTE(review): the displayed result contains a `total_output_tokens_right` column, which
# looks like a duplicate produced by a join — confirm whether `number_by_model` should
# already carry `total_output_tokens` when passed here.
frugal_scores = calculate_frugality_score(reactions, number_by_model)

frugal_scores

model_name,total_output_tokens,conso_all_conv,n_match,total_output_tokens_right,mean_conso_per_match,mean_conso_per_token
str,f64,f64,u32,f64,f64,f64
"""aya-expanse-8b""",961445.0,3.62259,965,961445.0,0.003754,0.000004
"""c4ai-command-r-08-2024""",5.944341e6,44.921088,2571,5.944341e6,0.017472,0.000008
"""chocolatine-2-14b-instruct-v2.…",511086.0,1.853976,1098,511086.0,0.001689,0.000004
"""claude-3-5-sonnet-v2""",2.818225e6,378.314297,3537,2.818225e6,0.106959,0.000134
"""claude-3-7-sonnet""",2.162331e6,290.26807,557,2.162331e6,0.521128,0.000134
…,…,…,…,…,…,…
"""phi-3.5-mini-instruct""",851005.0,2.609332,786,851005.0,0.00332,0.000003
"""phi-4""",3.086417e6,14.228012,2927,3.086417e6,0.004861,0.000005
"""qwen2.5-7b-instruct""",870902.0,3.159217,756,870902.0,0.004179,0.000004
"""qwen2.5-coder-32b-instruct""",4.787749e6,34.16509,3222,4.787749e6,0.010604,0.000007


In [10]:
# Tabulate the ratings held by the ranker, one row per model, highest rating first.
elo_scores = pl.DataFrame(
    {
        "model_name": ranker.players.keys(),
        "elo_score": ranker.players.values(),
    },
    strict=False,
).sort(by="elo_score", descending=True)

elo_scores

model_name,elo_score
str,f64
"""gemini-2.0-flash-001""",1258.965583
"""deepseek-v3-0324""",1177.109168
"""deepseek-v3-chat""",1159.943865
"""gemma-3-27b""",1158.2475
"""command-a""",1157.199148
…,…
"""claude-3-5-sonnet-v2""",860.092331
"""c4ai-command-r-08-2024""",846.079343
"""gemma-2-9b-it""",845.842519
"""mistral-nemo-2407""",771.322046


## Création du graphique de frugalité

### Chargement des informations concernant les modèles du comparateur

In [11]:
from pathlib import Path

# Model metadata is read from `data/models_data.json`, located in the parent of the
# current working directory (so the result depends on where the kernel was started).
info_model = pl.read_json(source=Path(".").resolve().parent / "data" / "models_data.json")

In [12]:
# Combine model metadata, ELO ratings and consumption figures on `model_name`.
# NOTE(review): no `how=` is given, so the library's default join type applies; with an
# inner join, a model missing from any of the three tables is dropped — confirm intended.
final_df = info_model.join(elo_scores, on="model_name").join(frugal_scores, on="model_name")

final_df

name,model_name,organization,license,elo_score,total_output_tokens,conso_all_conv,n_match,total_output_tokens_right,mean_conso_per_match,mean_conso_per_token
str,str,str,str,f64,f64,f64,u32,f64,f64,f64
"""Aya-Expanse-8B""","""aya-expanse-8b""","""Cohere""","""CC-BY-NC-4.0""",941.580344,961445.0,3.62259,965,961445.0,0.003754,0.000004
"""Command R (08-2024)""","""c4ai-command-r-08-2024""","""Cohere""","""CC-BY-NC-4.0""",846.079343,5.944341e6,44.921088,2571,5.944341e6,0.017472,0.000008
"""Chocolatine-2-14b Instruct""","""chocolatine-2-14b-instruct-v2.…","""jpacifico (individual)""","""Apache 2.0""",767.874411,511086.0,1.853976,1098,511086.0,0.001689,0.000004
"""Claude 3.5 Sonnet V2""","""claude-3-5-sonnet-v2""","""Anthropic""","""Proprietary""",860.092331,2.818225e6,378.314297,3537,2.818225e6,0.106959,0.000134
"""Command A""","""command-a""","""Cohere""","""CC-BY-NC-4.0""",1157.199148,1.03253e6,18.815316,760,1.03253e6,0.024757,0.000018
…,…,…,…,…,…,…,…,…,…,…
"""Phi-3.5 Mini Instruct""","""phi-3.5-mini-instruct""","""Microsoft""","""MIT""",951.666106,851005.0,2.609332,786,851005.0,0.00332,0.000003
"""Phi 4""","""phi-4""","""Microsoft""","""MIT""",980.025146,3.086417e6,14.228012,2927,3.086417e6,0.004861,0.000005
"""Qwen2.5-7B""","""qwen2.5-7b-instruct""","""Alibaba""","""Apache 2.0""",930.475634,870902.0,3.159217,756,870902.0,0.004179,0.000004
"""Qwen2.5-Coder-32B-Instruct""","""qwen2.5-coder-32b-instruct""","""Alibaba""","""Apache 2.0""",946.851164,4.787749e6,34.16509,3222,4.787749e6,0.010604,0.000007


In [13]:
from rank_comparia.utils import save_data

# Save the combined table, under the name "all_info_for_chart_drawing", in the `data`
# directory next to the current working directory's parent.
save_path = Path(".").resolve().parent / "data"
save_data(final_df, "all_info_for_chart_drawing", save_path)

### Génération du graphique de frugalité

Les paramètres possibles :  
- `log` : Ajuster l'échelle du graphique en linéaire (`log = False`) ou en log (`log = True`) ; 
- `mean` : Utiliser les consommations moyennées (`mean = True`) ou non (`mean = False`) ;  
- `scale` : choix du moyennage si `mean = True`. `token` si on utilise le moyennage par token, `match` si on utilise le moyennage par nombre de matchs ;  
- `save` : Enregistrement du graphique au format html.

In [14]:
from rank_comparia.plot import draw_frugality_chart

# The chart helper is given the ELO rating under the column name `median`.
# The rename goes into a separate frame so that `final_df` stays as displayed and
# saved above, and so that this cell can be re-run without failing on a missing
# `elo_score` column.
chart_df = final_df.rename({"elo_score": "median"})
draw_frugality_chart(chart_df, title="consommation selon classement", log=True, scale="token")