In [1]:
from openai import AsyncOpenAI

client = AsyncOpenAI()
await client.chat.completions.create(
    model="o1-mini",
    messages=[{"role": "user", "content": "Develop an exhaustive rubric and rank all the presidents of the United States from best to worst."}],
)

ChatCompletion(id='chatcmpl-Avax6KV8klyDUhaylq4PeO1KiIE6s', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Ranking all the Presidents of the United States from best to worst is a complex and inherently subjective task. To approach this systematically, I have developed an exhaustive rubric that evaluates each president based on multiple key criteria. This rubric aims to provide a balanced and comprehensive assessment of each president's performance and legacy.\n\n**Ranking Rubric for U.S. Presidents**\n\nThe following criteria are used to evaluate each president, along with their respective weightings to reflect their importance:\n\n1. **Leadership and Vision (20 points)**\n   - Ability to provide clear direction and inspire the nation.\n   - Articulation of a compelling and effective vision for the country’s future.\n\n2. **Economic Management (15 points)**\n   - Success in promoting economic growth and stability.\n   - Effectivenes

In [2]:
# Scratch arithmetic: `/` is true division, so the displayed result is the float 27.0.
# NOTE(review): the meaning of 1728 and 64 is not stated anywhere in this notebook.
1728 / 64

27.0

In [6]:
import torch

class Attention(torch.nn.Module):
    def __init__(self, dim: int) -> None:
        super().__init__()
        self.wq = torch.nn.Linear(dim, dim, bias=False)
        self.wk = torch.nn.Linear(dim, dim, bias=False)
        self.wv = torch.nn.Linear(dim, dim, bias=False)

    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
        Q = self.wq(x)  # [batch_size, seq_len, dim]
        K = self.wk(x)  # [batch_size, seq_len, dim]
        V = self.wv(x)  # [batch_size, seq_len, dim]

        # Compute scaled dot-product attention
        attention = Q @ K.transpose(-2, -1)  # [batch_size, seq_len, seq_len]
        attention /= Q.shape[-1] ** 0.5  # Scale by sqrt of dimension
        
        if mask is not None:  # Check for mask presence
            attention += mask.to(attention.dtype) * -1e9
        
        attention = torch.softmax(attention, dim=-1)  # Apply softmax along seq_len

        # Weighted sum of values
        output = attention @ V  # [batch_size, seq_len, dim]
        return output


# Test the updated Attention module

# Seed the RNG so the random input and the layer weights are the same on every
# Restart & Run All.
torch.manual_seed(0)

# Create a random input tensor
batch_size = 2
seq_len = 5
dim = 8
x = torch.randn(batch_size, seq_len, dim)

# Instantiate the Attention module
attention_module = Attention(dim=dim)

# Test 1: Without Mask
output_no_mask = attention_module(x)

# Test 2: With Mask (causal mask example)
# Pass a 0/1 indicator (1 = future position to hide). Attention.forward turns
# marked positions into a large negative score itself, so no -1e9 scaling
# belongs here.
mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).unsqueeze(0).repeat(batch_size, 1, 1)
output_with_mask = attention_module(x, mask=mask)

# Behavioural checks (shapes alone cannot reveal a wrongly applied mask):
# - the first position can only attend to itself, so its output is its own value;
# - the last position sees every token, so masking must not change it.
assert torch.allclose(output_with_mask[:, 0], attention_module.wv(x)[:, 0], atol=1e-6)
assert torch.allclose(output_with_mask[:, -1], output_no_mask[:, -1], atol=1e-6)

# Output shapes
output_shapes = {
    "without_mask": output_no_mask.shape,
    "with_mask": output_with_mask.shape
}
output_shapes


{'without_mask': torch.Size([2, 5, 8]), 'with_mask': torch.Size([2, 5, 8])}