In [1]:
import os
from dotenv import load_dotenv
# Load variables from a .env file into the process environment (python-dotenv).
# NOTE(review): presumably this must run before `config` is imported in the
# next cell so that the client can find its credentials — confirm in config.py.
load_dotenv()

True

In [2]:
#!/usr/bin/env python3
import os
import pandas as pd
from google import genai
from processor import load_data, generate_score_for_student
from config import client, MODEL_NAME;

# Token-usage records appended by the patched generate_content wrapper,
# one dict per API call.
usage_log = []

# Keep a handle on the pristine SDK method. The handle is also stashed on the
# client.models object so that re-running this cell on a live kernel picks up
# the pristine method again instead of capturing the wrapper that an earlier
# run installed (which would nest wrappers and log each call more than once).
_original_generate = getattr(
    client.models, "_unpatched_generate_content", client.models.generate_content
)
client.models._unpatched_generate_content = _original_generate

def _patched_generate_content(*args, **kwargs):
    """Call the original ``generate_content`` and record its token usage.

    All positional and keyword arguments are forwarded unchanged to
    ``_original_generate``, and its response is returned unchanged.

    When the response carries usage metadata, one dict with the keys
    ``prompt_tokens``, ``completion_tokens`` and ``total_tokens`` is appended
    to the module-level ``usage_log``.

    The SDK declares ``usage_metadata`` and each of its count fields as
    optional. A response without metadata is therefore passed through without
    logging (instrumentation must not turn a successful call into an
    ``AttributeError``), and a count that is ``None`` is recorded as 0 so the
    logged values are always usable in arithmetic.
    """
    resp = _original_generate(*args, **kwargs)

    usage = getattr(resp, "usage_metadata", None)
    if usage is not None:
        usage_log.append({
            "prompt_tokens":     usage.prompt_token_count or 0,
            "completion_tokens": usage.candidates_token_count or 0,
            "total_tokens":      usage.total_token_count or 0,
        })
    return resp

# Swap the SDK method for the logging wrapper so every judging call is recorded.
setattr(client.models, "generate_content", _patched_generate_content)

# Load the profiles table and the grouped question/answer sets.
profiles_df, qa_groups = load_data(
    answers_path="data/finetune_personalized_answers.csv",
    profiles_path="data/langchain_structured_profiles.csv",
)

# Pilot inputs: the first profile row (as a plain dict) and the first QA group.
sample_profile = profiles_df.iloc[0].to_dict()
sample_qas     = qa_groups[0]  # expected: list of 4 {question, rag_answer} dicts

# Empty the log first so that entry 0 is the pilot call made just below.
del usage_log[:]
_ = generate_score_for_student(sample_profile, sample_qas)

# Fail loudly if the wrapper never fired.
if len(usage_log) == 0:
    raise RuntimeError("No token usage recorded—check that generate_content was patched")

# Token counts of the first recorded call are the per-batch pilot figures.
pilot = usage_log[0]
n_score_prompt, n_score_out, n_score_total = (
    pilot["prompt_tokens"],
    pilot["completion_tokens"],
    pilot["total_tokens"],
)

print("Pilot judging token usage (per 4-QA batch):")
print(f"  prompt_tokens     = {n_score_prompt}")
print(f"  completion_tokens = {n_score_out}")
print(f"  total_tokens      = {n_score_total}")

# --- Extrapolate the pilot measurement to the full judging run ---------------
N_profiles        = 1000                     # profiles assumed for the full run
QAs_per_student   = 40                       # QAs assumed per student
group_size        = len(sample_qas)          # QAs per API call (4 in the pilot)

if group_size == 0:
    raise ValueError("sample_qas is empty; cannot derive the number of QAs per API call")

# Ceiling division: when QAs_per_student is not a multiple of group_size, the
# trailing partial group still needs its own API call. Plain floor division
# would leave it out of the budget.
calls_per_student = (QAs_per_student + group_size - 1) // group_size  # = 10 for group_size 4
total_calls       = N_profiles * calls_per_student  # = 10 000 for group_size 4

# Every call is assumed to cost the same as the single pilot call.
estimated_total      = total_calls * n_score_total
estimated_prompt     = total_calls * n_score_prompt
estimated_completion = total_calls * n_score_out

print(f"\nTotal scoring calls needed: {total_calls} "
      f"({calls_per_student} calls per student)")
print(f"Estimated total tokens needed for judging: {estimated_total:,}")
print(f"  → Prompt-only budget:     {estimated_prompt:,}")
print(f"  → Completion-only budget: {estimated_completion:,}")


Pilot judging token usage (per 4-QA batch):
  prompt_tokens     = 2080
  completion_tokens = 110
  total_tokens      = 2190

Total scoring calls needed: 10000 (10 calls per student)
Estimated total tokens needed for judging: 21,900,000
  → Prompt-only budget:     20,800,000
  → Completion-only budget: 1,100,000
