In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader, Dataset
import torch
import pandas as pd
import re


device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(device)
datapath = "../data/24.csv"
instructions = "Using only basic arithmetic operations (BEDMAS, excluding exponents), solve the following puzzle using the numbers provided to get the result 24"

mps


In [4]:
# student model
student_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-large")
student_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2-large")



In [19]:
student_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
student_model.resize_token_embeddings(len(student_tokenizer))

Embedding(50258, 1280)

In [5]:
df = pd.read_csv(datapath)


In [6]:
puzzles = df['Puzzles'].tolist()

In [7]:
class Game24Dataset(Dataset):
    def __init__(self, puzzles, tokenizer, instructions, max_length=512):
        self.puzzles = puzzles
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.instructions = instructions

    def __len__(self):
        return len(self.puzzles)

    def __getitem__(self, idx):
        puzzle = self.puzzles[idx]
        prompt = f"{self.instructions}: {puzzle}"
        encoding = self.tokenizer(
            prompt,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': input_ids
        }


In [14]:
# prototype of a loss function

def enhanced_loss_function(outputs, puzzles, tokenizer):
    total_loss = 0
    batch_size = len(puzzles)
    
    for i in range(batch_size):
        solution = tokenizer.decode(outputs[i], skip_special_tokens=True)
        puzzle = puzzles[i]
        
        # Check for correctness
        correct, partial_loss = evaluate_solution(solution, puzzle)
        
        total_loss += partial_loss if not correct else 0
    
    return total_loss / batch_size

def evaluate_solution(solution, puzzle):
   # Extract numbers and operators from the solution
    numbers = re.findall(r'\d+', solution)
    operators_and_brackets = re.findall(r'[+\-*/()]', solution)
    
    # Ensure the numbers used are exactly the ones in the puzzle
    puzzle_numbers = sorted(puzzle.split())
    solution_numbers = sorted(numbers)
    
    # Calculate a partial loss based on the incorrect use of numbers
    partial_loss = len(set(puzzle_numbers) - set(solution_numbers)) / len(puzzle_numbers)
    
    if puzzle_numbers != solution_numbers:
        return False, 1 + partial_loss
    
    # Check if valid operators and brackets are used
    valid_operators = set('+-*/')
    valid_brackets = set('()')
    invalid_chars = [char for char in operators_and_brackets if char not in valid_operators and char not in valid_brackets]
    
    if invalid_chars:
        partial_loss += 0.5  # Arbitrary penalty for invalid operators or brackets
    
    # Check for balanced brackets
    if not are_brackets_balanced(solution):
        partial_loss += 0.5  # Arbitrary penalty for unbalanced brackets
    
    # Evaluate the expression
    try:
        result = eval(solution)
        if result == 24:
            return True, 0
        else:
            return False, abs(24 - result) / 24 + partial_loss
    except Exception as e:
        print(f"Error evaluating solution: {e}")
        return False, 1 + partial_loss 

def are_brackets_balanced(expression):
    stack = []
    brackets = {'(': ')'}
    
    for char in expression:
        if char in brackets.keys():
            stack.append(char)
        elif char in brackets.values():
            if not stack or brackets[stack.pop()] != char:
                return False
    return not stack

In [8]:
dataset = Game24Dataset(puzzles, student_tokenizer, instructions)

In [9]:
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [15]:
student_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)
)

In [16]:
def evaluate_model(model, dataloader, tokenizer):
    total_correct = 0
    total_puzzles = 0
    
    for batch in dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        
        with torch.no_grad():
            # Generate solutions from the model
            generated_outputs = model.generate(input_ids, attention_mask=attention_mask)
        
        # Decode puzzles for evaluation
        puzzles = [tokenizer.decode(ids, skip_special_tokens=True).replace(instructions, "").strip() for ids in input_ids]
        
        for i in range(len(generated_outputs)):
            solution = tokenizer.decode(generated_outputs[i], skip_special_tokens=True)
            puzzle = puzzles[i]
            correct, _ = evaluate_solution(solution, puzzle)
            total_correct += int(correct)
            total_puzzles += 1
    
    accuracy = total_correct / total_puzzles if total_puzzles > 0 else 0
    return accuracy

In [21]:
accuracy = evaluate_model(student_model, dataloader, student_tokenizer)
print(f"Model accuracy: {accuracy * 100:.2f}%")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
