In [3]:
from tqdm import tqdm
from openai import OpenAI
import os
from dotenv import load_dotenv
from huggingface_hub import login
import requests
import json

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Load secrets from a local .env file into the process environment.
load_dotenv()
# Authenticate with the Hugging Face Hub using the token from the environment.
login(os.getenv('HF_TOKEN'))

# Resolve the OpenRouter key explicitly and fail fast when it is missing:
# the OpenAI SDK falls back to the OPENAI_API_KEY environment variable when
# api_key is None, which would send an unrelated credential to openrouter.ai.
openrouter_api_key = os.getenv('OPENROUTER_API_KEY')
if not openrouter_api_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; add it to the environment or the .env file."
    )

# OpenAI-compatible client pointed at the OpenRouter endpoint.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=openrouter_api_key,
    # api_key=os.getenv("GITHUB_TOKEN"),
)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [17]:
# Text whose tone the classifier should label.
user_input = "Um — do you know where the world’s largest ice sheet is located today?"

# Classification prompt, assembled line by line. Most lines deliberately end
# with two spaces (markdown hard line breaks); they are kept inside the string
# literals so editors that strip trailing whitespace cannot alter the prompt.
classifier_input = "\n".join(
    [
        "Please assess what personality best fits the following text. The categories are:  ",
        "- Formal  ",
        "- Casual  ",
        "- Confident  ",
        "- Hesitant  ",
        "- Analytical  ",
        "- Emotional  ",
        "- Optimistic  ",
        "- Pessimistic  ",
        "",
        "<text>  ",
        user_input,
        "</text>  ",
        "",
        "Please respond with a single word.",
    ]
)

### Simple Classifier

In [21]:
# Send the classification prompt as a single user turn; only the plain text
# answer is needed here, so no logprobs are requested.
completion = client.chat.completions.create(
    messages=[{"role": "user", "content": classifier_input}],
    model="google/gemini-2.5-flash",
)

In [22]:
# Print the text of the first choice in the model's reply.
print(completion.choices[0].message.content)

Hesitant


### Classifier with logprobs

In [4]:
# Same classification request, but also asking for token log-probabilities.
completion = client.chat.completions.create(
    model='google/gemini-2.5-flash',
    messages=[
        {
        "role": "user",
        "content": classifier_input,
        }
    ],
    logprobs=True,
    top_logprobs=5,
    # OpenRouter routing option, passed through the SDK's extra_body: only route
    # to providers that support every parameter in this request. Without it a
    # provider may silently ignore `logprobs` (the recorded choice below shows
    # logprobs=None), which breaks the cells that read the logprobs afterwards.
    extra_body={"provider": {"require_parameters": True}},
)

In [5]:
# Print the text of the first choice in the model's reply.
print(completion.choices[0].message.content)

Hesitant


In [6]:
# Display the whole first choice (message, finish reason and its logprobs field).
completion.choices[0]

Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Hesitant', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None, reasoning=None), native_finish_reason='STOP')

In [43]:
# The logprobs field can be None when the model/provider ignores the request
# parameter (the recorded choice above shows logprobs=None); fail with a clear
# message instead of an AttributeError on None.
choice_logprobs = completion.choices[0].logprobs
if choice_logprobs is None:
    raise RuntimeError(
        "The completion carries no logprobs (logprobs=None): the selected "
        "model/provider did not return them, so token probabilities are unavailable."
    )

# Per-token logprob entries of the reply.
content = choice_logprobs.content

In [51]:
# Show the raw attribute dictionary of the first token's logprob entry.
vars(content[0])

{'token': 'H',
 'bytes': [72],
 'logprob': -4.320199877838604e-07,
 'top_logprobs': [TopLogprob(token='H', bytes=[72], logprob=-4.320199877838604e-07),
  TopLogprob(token='Cur', bytes=[67, 117, 114], logprob=-15.25),
  TopLogprob(token='Cas', bytes=[67, 97, 115], logprob=-15.75),
  TopLogprob(token=' hesitant', bytes=[32, 104, 101, 115, 105, 116, 97, 110, 116], logprob=-17.375),
  TopLogprob(token='P', bytes=[80], logprob=-17.75)]}

In [49]:
# Display the top alternatives (with their logprobs) recorded for the first token.
content[0].top_logprobs

[TopLogprob(token='H', bytes=[72], logprob=-4.320199877838604e-07),
 TopLogprob(token='Cur', bytes=[67, 117, 114], logprob=-15.25),
 TopLogprob(token='Cas', bytes=[67, 97, 115], logprob=-15.75),
 TopLogprob(token=' hesitant', bytes=[32, 104, 101, 115, 105, 116, 97, 110, 116], logprob=-17.375),
 TopLogprob(token='P', bytes=[80], logprob=-17.75)]

### Classifier with logprobs – using GitHub Models

In [None]:
# Judge model -- compare the logprobs returned by different models to determine the most likely persona

def get_model_completion(
    model_input: str,
    model: str = 'openai/gpt-4.1-mini',
    top_logprobs: int = 5,
    timeout: float = 30,
):
    """Request a chat completion with token logprobs from the GitHub Models API.

    Args:
        model_input: Text sent as the single user message.
        model: Model identifier passed in the request payload.
        top_logprobs: Number of alternative tokens (with logprobs) requested
            per generated token.
        timeout: Timeout in seconds passed to the HTTP request.

    Returns:
        A ``(completion, resp)`` tuple: the parsed JSON body and the raw HTTP
        response. HTTP error responses with a JSON body are returned as-is
        (not raised), so callers can still inspect the body and headers.

    Raises:
        RuntimeError: If the ``GITHUB_TOKEN`` environment variable is not set.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://models.github.ai/inference/chat/completions"
    github_token = os.getenv("GITHUB_TOKEN")
    if not github_token:
        # Without this check the request would be sent with "Bearer None".
        raise RuntimeError("GITHUB_TOKEN environment variable is not set")

    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {github_token}",
        "Content-Type": "application/json",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": model_input
            }
        ],
        "logprobs": True,
        "top_logprobs": top_logprobs,
    }

    resp = requests.post(url, json=payload, headers=headers, timeout=timeout)
    try:
        completion = resp.json()
    except ValueError as exc:
        # Surface the HTTP status and a body excerpt instead of a bare decode error.
        raise ValueError(
            f"GitHub Models returned a non-JSON body (HTTP {resp.status_code}): "
            f"{resp.text[:200]!r}"
        ) from exc

    return completion, resp


def print_rate_limits(response):
    """Print the rate-limit headers carried by an HTTP response.

    Args:
        response: Object exposing a ``headers`` mapping, e.g. the raw response
            returned by ``get_model_completion``.

    A header that is absent is reported as ``unknown`` instead of raising
    ``KeyError``.
    """
    headers = response.headers
    # (label, header name) pairs, printed in this order.
    report = (
        ("total rate limit requests per hour", "x-ratelimit-limit-requests"),
        ("rate limit requests remaining this hour", "x-ratelimit-remaining-requests"),
        ("total rate limit tokens per hour", "x-ratelimit-limit-tokens"),
        ("rate limit tokens remaining this hour", "x-ratelimit-remaining-tokens"),
    )
    for label, header_name in report:
        # Outer double / inner single quotes: reusing the same quote type inside
        # an f-string expression is a SyntaxError before Python 3.12.
        print(f"{label}: {headers.get(header_name, 'unknown')}")


In [19]:
# Send the classifier prompt to the GitHub Models endpoint with the function's
# default model; keep both the parsed JSON body and the raw HTTP response.
completion, response = get_model_completion(model_input=classifier_input)

In [20]:
# Pull the message of the first choice out of the parsed JSON body and display it.
output = completion['choices'][0]['message']
output

{'annotations': [],
 'content': 'Hesitant',
 'refusal': None,
 'role': 'assistant'}

In [21]:
# Per-token logprob entries for the first choice's reply.
all_logprobs = completion['choices'][0]['logprobs']['content']
# Disabled check: the recorded output shows the first token is only 'H' while
# the reply is 'Hesitant', suggesting the reply spans several tokens.
# assert len(all_logprobs) == 1   # ie. the model should respond with a single token

# Display the top alternatives (with logprobs) recorded for the first token.
all_logprobs[0]['top_logprobs']

[{'bytes': [72], 'logprob': -9.088346359931165e-07, 'token': 'H'},
 {'bytes': [67, 97, 115], 'logprob': -14.500000953674316, 'token': 'Cas'},
 {'bytes': [104, 101, 115], 'logprob': -14.750000953674316, 'token': 'hes'},
 {'bytes': [32, 104, 101, 115, 105, 116, 97, 110, 116],
  'logprob': -20.375,
  'token': ' hesitant'},
 {'bytes': [32, 72, 101, 115], 'logprob': -20.5, 'token': ' Hes'}]

### Getting Output from 2 Models -- Meta, OpenAI

In [1]:
# Registry of the two GitHub-hosted models to query: display name -> model id
# sent in the request payload. Insertion order is the order they are called in.
MODELS = dict(
    [
        ("Meta Llama 3.1 8B", "meta/Meta-Llama-3.1-8B-Instruct"),
        ("OpenAI gpt-4.1-mini", "gpt-4.1-mini"),
        # Disabled entry: ("Google Gemma 3 4B", "gemma-3-4b-instruct"),
    ]
)

def get_github_model_response(model_name, model_id, prompt, max_tokens=100):
    """
    Get a response from a GitHub-hosted model using the direct HTTP API.

    Args:
        model_name: Human-readable name echoed back in the result.
        model_id: Model identifier sent in the request payload.
        prompt: Text sent as the single user message.
        max_tokens: Upper bound on generated tokens, sent in the payload.

    Returns:
        A dict that always contains "model", "model_id" and "status".
        On success ("status" == "success") it also has "response" with the
        reply text; otherwise it has "error" with a message. "status" is
        "rate_limited" for HTTP 429, "auth_error" for HTTP 401 or a missing
        GITHUB_TOKEN, and "error" for everything else (other HTTP errors,
        malformed bodies, network failures).

    This function never raises; every failure is reported through the dict.
    """
    url = "https://models.github.ai/inference/chat/completions"
    github_token = os.getenv("GITHUB_TOKEN")

    def _failure(status, error_msg):
        # Every non-success outcome shares this result shape.
        return {
            "model": model_name,
            "model_id": model_id,
            "status": status,
            "error": error_msg,
        }

    if not github_token:
        # Avoid sending a request authenticated as "Bearer None".
        return _failure("auth_error", "GITHUB_TOKEN environment variable is not set")

    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {github_token}",
        "Content-Type": "application/json",
        "X-GitHub-Api-Version": "2022-11-28",
    }

    payload = {
        "model": model_id,
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "max_tokens": max_tokens,
    }

    try:
        resp = requests.post(url, json=payload, headers=headers, timeout=30)

        # Error responses are not guaranteed to be JSON; keep the HTTP status
        # usable even when the body cannot be parsed.
        try:
            completion = resp.json()
        except ValueError:
            completion = None

        if resp.status_code == 200:
            if completion is None:
                return _failure("error", "HTTP 200 response body was not valid JSON")
            message = completion['choices'][0]['message']['content']
            return {
                "model": model_name,
                "model_id": model_id,
                "status": "success",
                "response": message,
            }

        # The "error" field may be an object with a "message" or a bare string.
        error_field = completion.get('error') if isinstance(completion, dict) else None
        if isinstance(error_field, dict):
            error_msg = error_field.get('message', f'HTTP {resp.status_code}')
        elif error_field:
            error_msg = str(error_field)
        else:
            error_msg = f'HTTP {resp.status_code}'

        # requests.post does not raise on HTTP error codes, so the status is
        # classified from the response itself rather than from exception text.
        status = {429: "rate_limited", 401: "auth_error"}.get(resp.status_code, "error")
        return _failure(status, error_msg)

    except Exception as e:
        # Network failures, timeouts, or an unexpected body shape.
        return _failure("error", str(e))


def get_all_github_model_responses(prompt, max_tokens=100):
    """
    Query every model registered in MODELS, one after another.

    Prints a progress line before each call and a one-line outcome after it
    (the reply is stripped and cut to 100 characters for the preview).
    Returns the list of result dicts in the order the models were called.
    """
    collected = []

    for display_name, identifier in MODELS.items():
        print(f"\nCalling {display_name}...")
        result = get_github_model_response(display_name, identifier, prompt, max_tokens)
        collected.append(result)

        # Failure: report the status and message, then move on to the next model.
        if result["status"] != "success":
            print(f"✗ {display_name}: {result['status']} - {result.get('error', 'Unknown error')}")
            continue

        raw_text = result['response']
        preview = raw_text.strip() if raw_text else "(empty)"
        print(f"✓ {display_name}: {preview[:100]}")

    return collected

In [4]:
# Send one free-form prompt to every registered model.
user_input = "I'm not sure about this decision."
responses = get_all_github_model_responses(user_input, max_tokens=100)

# Report each model's outcome in full: name, status, then either the
# complete reply or the error message.
for resp in responses:
    print(f"\n{resp['model']}:")
    print(f"  Status: {resp['status']}")
    print(
        f"  Response: {resp['response']}"
        if resp['status'] == 'success'
        else f"  Error: {resp.get('error', 'N/A')}"
    )


Calling Meta Llama 3.1 8B...
✓ Meta Llama 3.1 8B: It can be tough to make decisions, especially when you're not sure about the outcome. Would you like

Calling OpenAI gpt-4.1-mini...
✓ OpenAI gpt-4.1-mini: It sounds like you’re feeling uncertain. If you’d like, you can share more about the decision you’re

Meta Llama 3.1 8B:
  Status: success
  Response: It can be tough to make decisions, especially when you're not sure about the outcome. Would you like to talk more about what's on your mind and what's making you unsure? Sometimes sharing your thoughts and feelings with someone (or in this case, a conversational AI) can help you clarify your thoughts and gain a new perspective. What's going on?

OpenAI gpt-4.1-mini:
  Status: success
  Response: It sounds like you’re feeling uncertain. If you’d like, you can share more about the decision you’re facing, and I can help you think through the options.
