In [34]:
from transformers import AutoTokenizer
import codecs
import re

n_digits = 8

# List of specific problems to skip
problems_to_skip = [
    "1000 + 1000 = 2000",
    "520 + 890 = 1410",
    "100 + 200 = 300",
    "1000 + 100 = 1100"
]

# Load the tokenizer for the model
model_name = "google/gemma-2-2b-it"  # Update with the actual model you're using
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [20]:
def generate_expected_tokens(problem):
    """
    Generate the expected tokens based on the addition problem and the values of x and y.
    Example: For x = 6, y = 1, and the problem "532901 + 5 = 532906", the expected tokens are:
    ['<bos>', '5', '3', '2', '9', '0', '1', '_+', '_', '5', '_=', '_', '5', '3', '2', '9', '0', '6']
    """
    try:
        # Split the problem into the x_term and the rest, ensuring it has the format "x_term + y_term = result"
        x_y_part, result_part = problem.split(" = ")
        x_term, y_term = x_y_part.split(" + ")
        
        # Expected token pattern (every digit separated, and with the necessary symbols)
        expected_tokens = ['<bos>'] + list(x_term) + ['▁+', '▁'] + list(y_term) + ['▁=', '▁'] + list(result_part)
    except ValueError as e:
        print(f"Error parsing problem: {problem}, error: {e}")
        return []

    return expected_tokens


In [30]:
# Function to sanitize tokens for comparison
def sanitize_tokens(tokens):
    """
    Cleans up the token list to remove any unwanted characters or whitespace issues.
    """
    return [token.strip() for token in tokens if token.strip()]

In [31]:
def should_skip_problem(x, y, problem):
    # Skip if the problem is in the list of specific problems to skip
    if problem in problems_to_skip:
        return True
    
    # Skip if the problem doesn't follow the format (only digits, '+', '=', and spaces)
    # Regex: Only allow digits, +, =, and spaces
    if not re.match(r'^\d+ \+ \d+ = \d+$', problem):
        return True

    # Ensure that the problem is an x-digit by y-digit problem
    try:
        x_term, y_term, result = re.split(r' \+ | = ', problem)
        if len(x_term) != x or len(y_term) != y:
            return True
    except ValueError:
        return True

    return False

In [32]:
# Function to tokenize a list of problems and check if the tokenization matches the expected format
def tokenize_problems_and_check(x, y, file_path):
    with codecs.open(file_path, 'r', 'utf-8', 'ignore') as file:
        problems = file.readlines()

    # Create an output folder and file path
    output_folder = f"{max(x, y)}_problems"
    output_file_path = f"{output_folder}/{x}_{y}_tokens.txt"

    passed_count = 0
    total_count = 0

    with open(output_file_path, 'w') as output_file:
        for problem in problems:
            problem = problem.strip()

            if problem:
                # Skip problems that do not match the format or specific problems
                if should_skip_problem(x, y, problem):
                    output_file.write(f"Skipping problem: {problem}\n")
                    continue
                
                total_count += 1
                # Tokenize the problem
                tokenized_output = tokenizer(problem, return_tensors="pt")

                # Convert the token IDs back to tokens
                tokens = tokenizer.convert_ids_to_tokens(tokenized_output['input_ids'][0])

                # Generate the expected tokens based on the values of x and y
                expected_tokens = generate_expected_tokens(problem)

                if not expected_tokens:
                    output_file.write(f"Problem: {problem}\nError: Problem format is incorrect.\n\n")
                    continue

                # Sanitize both actual and expected tokens for comparison
                actual_tokens = sanitize_tokens(tokens)
                expected_tokens = sanitize_tokens(expected_tokens)

                # Compare actual tokens to expected tokens
                if actual_tokens == expected_tokens:
                    passed_count += 1

                # Write the results to the output file
                output_file.write(f"accessing: {file_path}\n")
                output_file.write(f"Problem: {problem}\n")
                output_file.write(f"Tokens: {tokens}\n")
                output_file.write(f"Expected Tokens: {expected_tokens}\n")
                output_file.write(f"Tokenization match: {'PASS' if actual_tokens == expected_tokens else 'FAIL'}\n\n")

    # Calculate the percentage of problems that passed the tokenization check
    percentage_passed = (passed_count / total_count) * 100 if total_count > 0 else 0
    print(f"Percentage of correctly tokenized problems for x = {x} and y = {y}: {percentage_passed:.2f}%")

    return percentage_passed

In [35]:

for i in range(1, n_digits+1):
    for j in range(0, i+1):
        if j == 1:
            continue
        if j == 0:
            j += 1
        
        folder = f"{max(i, j)}_results"
        x_file_path = f"{folder}/{i}_by_{j}_results/{i}_by_{j}_at_1.0_results.pkl"
        y_file_path = f"{folder}/{j}_by_{i}_results/{j}_by_{i}_at_1.0_results.pkl"

        # Call the function to tokenize and retrieve the tokens from the file
        tokenize_problems_and_check(i, j, x_file_path)
        tokenize_problems_and_check(j, i, y_file_path)


Percentage of correctly tokenized problems for x = 1 and y = 1: 100.00%
Percentage of correctly tokenized problems for x = 1 and y = 1: 100.00%
Percentage of correctly tokenized problems for x = 2 and y = 1: 100.00%
Percentage of correctly tokenized problems for x = 1 and y = 2: 100.00%
Percentage of correctly tokenized problems for x = 2 and y = 2: 100.00%
Percentage of correctly tokenized problems for x = 2 and y = 2: 100.00%
Percentage of correctly tokenized problems for x = 3 and y = 1: 100.00%
Percentage of correctly tokenized problems for x = 1 and y = 3: 100.00%
Percentage of correctly tokenized problems for x = 3 and y = 2: 100.00%
Percentage of correctly tokenized problems for x = 2 and y = 3: 100.00%
Percentage of correctly tokenized problems for x = 3 and y = 3: 100.00%
Percentage of correctly tokenized problems for x = 3 and y = 3: 100.00%
Percentage of correctly tokenized problems for x = 4 and y = 1: 100.00%
Percentage of correctly tokenized problems for x = 1 and y = 4: 