## Setup

In [117]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from tqdm import tqdm
import csv
import json
from time import sleep
import os
import matplotlib.pyplot as plt
from pathlib import Path, PosixPath

In [137]:
def generate_response(
    model,
    prompt,
    system_prompt="You're a helpful assistant",
    suffix_prompt='',
    model_config=None
):
    """
    Generate a chat completion for ``prompt`` and extract the final answer.

    Uses the notebook-level ``tokenizer`` to build the chat-formatted input
    and to decode the output.

    Args:
        model: Loaded causal language model object; its ``.device`` and
            ``.generate`` are used (this is not a model name string).
        prompt (str): Question text.
        system_prompt (str): Instruction text placed before the question.
        suffix_prompt (str): Text appended after the question.
        model_config (dict | None): Keyword arguments forwarded to
            ``model.generate``. ``None`` means no extra arguments.
    Returns:
        tuple: ``(response_text, final_answer)``.
            ``response_text`` (str) is the decoded sequence returned by
            ``generate`` with special tokens skipped.
            ``final_answer`` is a ``float`` when the text after the last
            ``"#### "`` parses as a number (commas and dollar signs removed),
            otherwise that text as a stripped ``str``.
    """
    # Passing **None would raise TypeError, so fall back to "no extra arguments".
    generation_kwargs = model_config if model_config is not None else {}

    # Named input_ids (not `input`) so the builtin is not shadowed.
    input_ids = tokenizer.apply_chat_template(
        [
            # The original note says the system role is not supported here,
            # so the system prompt is prepended to the user turn instead.
            { "role": "user", "content": "\n".join([system_prompt, prompt, suffix_prompt]) },
        ],
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    output_ids = model.generate(
        input_ids,
        **generation_kwargs
    )
    response_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Everything after the LAST "#### " marker is treated as the answer.
    # NOTE(review): the split runs over the whole decoded text; if that text
    # also contains the prompt and the suffix prompt mentions "#### ", a
    # response without the marker yields a non-numeric string here.
    final_answer = response_text.split("#### ")[-1].strip()

    try:
        cleaned_answer = final_answer.replace(",", "")    # remove thousands separators
        cleaned_answer = cleaned_answer.replace("$", "")  # remove dollar signs
        final_answer = float(cleaned_answer)
    except ValueError:
        # Not a number: keep the raw string so the caller can inspect it.
        pass

    return (response_text, final_answer)

In [87]:
def save_results(filepath, results, model, model_config, system_prompt, suffix_prompt):
    """
    Write evaluation results plus the run configuration to a JSON file.

    The file holds two top-level keys: "config" (model, generation settings
    and the two prompt strings) and "results" (the ``results`` argument as
    given). The target file is overwritten if it exists.

    Args:
        filepath: Destination path passed to ``open``.
        results: JSON-serializable results payload.
        model: JSON-serializable model identifier stored under "config".
        model_config: Generation settings stored under "config".
        system_prompt (str): System prompt used for the run.
        suffix_prompt (str): Suffix prompt used for the run.
    """
    payload = {
        "config": {
            "model": model,
            "model_config": model_config,
            "prompt_config": {
                "system_prompt": system_prompt,
                "suffix_prompt": suffix_prompt,
            },
        },
        "results": results,
    }

    with open(filepath, "w") as out_file:
        json.dump(payload, out_file, indent=4)

    print(f"Results saved to {filepath}")

## Sample Inference

In [4]:
# Checkpoint used for the sample inference below; the tokenizer and the model
# are both loaded from the same identifier.
model_name = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map="auto" leaves device placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [75]:
# Prompt pieces for the sample question. They are joined with newlines into a
# single user turn by the inference cells below.
system_prompt = "You are a helpful assistant. Do not use or generate code in your responses."
question = "What is 546588 * 493181?"
# The "#### " marker is what generate_response later splits on to find the answer.
suffix_prompt = "Also append the final numerical answer on a new line starting with '#### '"

In [70]:
# Build the chat-formatted prompt for the single sample question.
# Named prompt_ids rather than `input` so the builtin input() is not shadowed
# for the rest of the kernel session.
prompt_ids = tokenizer.apply_chat_template(
    [
        # The original note says the system role is not supported here,
        # so the system prompt is prepended to the user turn instead.
        { "role": "user", "content": "\n".join([system_prompt, question, suffix_prompt]) },
    ],
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

# Show the exact text the model receives, including template markers.
print("Prompt:", tokenizer.decode(prompt_ids[0]))
print("="*100)

# Only a length limit is passed; no sampling arguments are set in this cell.
response = model.generate(
    prompt_ids,
    max_new_tokens=256
)

print("Response:", tokenizer.decode(response[0], skip_special_tokens=True))

Prompt: <bos><start_of_turn>user
You are a helpful assistant. Do not use or generate code in your responses.
What is 546588 * 493181?
Also append the final numerical answer on a new line starting with '#### '<end_of_turn>
<start_of_turn>model

Response: user
You are a helpful assistant. Do not use or generate code in your responses.
What is 546588 * 493181?
Also append the final numerical answer on a new line starting with '#### '
model
To calculate 546588 * 493181, you would need to perform the multiplication.  

The result of this calculation is a very large number. 

#### 27,699,999,999 



In [71]:
# Alternative experiment: skip the chat template and feed a plain "Q: ... A: " prompt.
# NOTE: this overwrites the notebook-level suffix_prompt with "A: "; the later
# sample-call cell assigns the "#### " suffix again before it is used.
# input_ids = tokenizer("\n".join([system_prompt, question, suffix_prompt]), return_tensors="pt").to(model.device)
# suffix_prompt = "Show working and give the numerical answer at the end in the format ####:{}"
suffix_prompt = "A: "
# Calling the tokenizer directly returns a mapping (indexed by 'input_ids' below),
# which is why it is unpacked with ** into generate().
input_ids = tokenizer("Q: " + question + "\n" + suffix_prompt, return_tensors="pt").to(model.device)
print("Prompt:", tokenizer.decode(input_ids['input_ids'][0]))

print("="*100)

response = model.generate(**input_ids, max_new_tokens = 256)
print("Response:", tokenizer.decode(response[0], skip_special_tokens=True))

Prompt: <bos>Q: What is 546588 * 493181?
A: 
Response: Q: What is 546588 * 493181?
A: 26,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,9


In [80]:
# Keyword arguments forwarded to model.generate() by generate_response.
model_config = {
    "max_new_tokens": 256,   # upper bound on newly generated tokens (prompt tokens not counted)
    "do_sample": True,       # required for temperature/top_p to take effect; without it decoding is greedy
    "temperature": 0.7,      # sampling temperature (must be > 0); lower is more deterministic
    "top_p": 0.9             # nucleus sampling: keep the smallest token set with cumulative probability >= top_p
}

In [81]:
def generate_response(
    model,
    prompt,
    system_prompt="You're a helpful assistant",
    suffix_prompt='',
    model_config=None
):
    """
    Generate a chat completion for ``prompt`` and extract the final answer.

    Uses the notebook-level ``tokenizer`` to build the chat-formatted input
    and to decode the output.

    Args:
        model: Loaded causal language model object; its ``.device`` and
            ``.generate`` are used (this is not a model name string).
        prompt (str): Question text.
        system_prompt (str): Instruction text placed before the question.
        suffix_prompt (str): Text appended after the question.
        model_config (dict | None): Keyword arguments forwarded to
            ``model.generate``. ``None`` means no extra arguments.
    Returns:
        tuple: ``(response_text, final_answer)``.
            ``response_text`` (str) is the decoded sequence returned by
            ``generate`` with special tokens skipped.
            ``final_answer`` is a ``float`` when the text after the last
            ``"#### "`` parses as a number (commas and dollar signs removed),
            otherwise that text as a stripped ``str``.
    """
    # Passing **None would raise TypeError, so fall back to "no extra arguments".
    generation_kwargs = model_config if model_config is not None else {}

    # Named input_ids (not `input`) so the builtin is not shadowed.
    input_ids = tokenizer.apply_chat_template(
        [
            # The original note says the system role is not supported here,
            # so the system prompt is prepended to the user turn instead.
            # The `prompt` argument is used (not the notebook-level `question`
            # variable), so every call asks the question it was given.
            { "role": "user", "content": "\n".join([system_prompt, prompt, suffix_prompt]) },
        ],
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    output_ids = model.generate(
        input_ids,
        **generation_kwargs
    )
    response_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Everything after the LAST "#### " marker is treated as the answer.
    # NOTE(review): the split runs over the whole decoded text; if that text
    # also contains the prompt and the suffix prompt mentions "#### ", a
    # response without the marker yields a non-numeric string here.
    final_answer = response_text.split("#### ")[-1].strip()

    try:
        cleaned_answer = final_answer.replace(",", "")    # remove thousands separators
        cleaned_answer = cleaned_answer.replace("$", "")  # remove dollar signs
        final_answer = float(cleaned_answer)
    except ValueError:
        # Not a number: keep the raw string so the caller can inspect it.
        pass

    return (response_text, final_answer)

In [84]:
# Set the prompt pieces again: an earlier cell overwrote suffix_prompt with "A: ".
system_prompt = "You are a helpful assistant. Do not use or generate code in your responses."
question = "What is 546588 * 493181?"
suffix_prompt = "Also append the final numerical answer on a new line starting with '#### '"

# Smoke-test generate_response on one question before running it over a dataset.
response_text, final_answer = generate_response(model, question, system_prompt=system_prompt, suffix_prompt=suffix_prompt, model_config=model_config)
print(f"response_text: {response_text}")
print(f"final_answer: {final_answer}")

response_text: user
You are a helpful assistant. Do not use or generate code in your responses.
What is 546588 * 493181?
Also append the final numerical answer on a new line starting with '#### '
model
To calculate 546588 * 493181, you would need to perform the multiplication.  

The result of this calculation is a very large number. 

#### 27,699,999,999 

final_answer: 27699999999.0


## Arithmetic Length Generalization Dataset (ALGD) Inference

In [108]:
model_name = "google/gemma-2b-it"
# Loads the tokenizer and model again for this section. Comment these two lines
# out if they are already loaded in the current kernel.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [133]:
# Prompt pieces used for every dataset question; evaluate_file captures these
# as default argument values when its cell is run.
system_prompt = "You are a helpful assistant. Do not use or generate code in your responses."
# The "#### " marker is what generate_response splits on to find the answer.
suffix_prompt = "Also append the final numerical answer on a new line starting with '#### '"

In [145]:
# Keyword arguments forwarded to model.generate() for the dataset evaluation.
model_config = {
    "max_new_tokens": 512,   # upper bound on newly generated tokens (prompt tokens not counted)
    "temperature": 0.7,      # sampling temperature (must be > 0); lower is more deterministic
    "do_sample": True,       # enables sampling so temperature/top_p take effect
    "top_p": 0.9             # nucleus sampling: keep the smallest token set with cumulative probability >= top_p
}

In [146]:
def evaluate_file(
    file_path: Path,
    results_dir: Path = None,
    model=model,
    system_prompt=system_prompt,
    suffix_prompt=suffix_prompt,
    model_config=model_config
):
    """
    Run the model on every question in a CSV file and record exact-match accuracy.

    The CSV must contain "Question" and "Answer" columns. Each answer is
    converted to ``float`` and compared for equality with the value returned
    by ``generate_response``. A prediction that could not be parsed as a
    number stays a string and therefore counts as incorrect.

    The defaults for ``model``, ``system_prompt``, ``suffix_prompt`` and
    ``model_config`` are captured from the notebook globals when this cell is
    run. Re-run the cell after changing those globals, or pass them explicitly.

    Args:
        file_path: Path (or path string) of the input CSV.
        results_dir: Directory for the results CSV; created if missing.
            When omitted, the file is written to the current working directory.
        model: Loaded model object passed through to ``generate_response``.
        system_prompt (str): Instruction text placed before each question.
        suffix_prompt (str): Text appended after each question.
        model_config (dict): Keyword arguments for ``model.generate``.

    Side effects:
        Prints the accuracy and the output location, and writes
        ``<input stem>_evaluation_results.csv`` with one row per question.
    """
    # Accept plain strings as well as Path objects (.name/.stem are used below).
    file_path = Path(file_path)

    data = pd.read_csv(file_path)

    correct_count = 0
    results = []

    for _, row in tqdm(data.iterrows(), total=len(data)):
        question = row["Question"]
        # Compare as floats, matching the float that generate_response returns.
        correct_answer = float(row["Answer"])

        response_text, predicted_final_answer = generate_response(
            model,
            question,
            system_prompt=system_prompt,
            suffix_prompt=suffix_prompt,
            model_config=model_config
        )

        # Exact match only; an unparsed (string) prediction compares unequal.
        is_correct = predicted_final_answer == correct_answer

        if is_correct:
            correct_count += 1

        results.append({
            "Question": question,
            "Response Text": response_text,
            "Predicted Answer": predicted_final_answer,
            "Correct Answer": correct_answer,
            "Is Correct": is_correct
        })

    # Guard against an empty CSV, which would otherwise divide by zero.
    total_questions = len(data)
    accuracy = (correct_count / total_questions) * 100 if total_questions else 0.0
    print(f"Accuracy for {file_path.name}: {accuracy:.2f}%")

    output_file = Path(file_path.stem + "_evaluation_results.csv")
    if results_dir:
        results_dir = Path(results_dir)
        # Create the target folder up front so the first run for a new
        # model/operation does not fail when opening the output file.
        results_dir.mkdir(parents=True, exist_ok=True)
        output_file = results_dir / output_file

    # Explicit UTF-8: the response text is free-form model output.
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["Question", "Response Text", "Predicted Answer", "Correct Answer", "Is Correct"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)

    print(f"Results for {file_path} saved to {output_file.name}")

### Connect Google Drive

In [118]:
# Colab-only: mount Google Drive so the dataset and results folders are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Project folder on the mounted Drive; the evaluation CSVs are read from <root_dir>/dataset.
root_dir = Path('/content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval')
dataset_dir = root_dir / 'dataset'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Addition

In [147]:
# Addition files add/1-d-add.csv through add/7-d-add.csv under the dataset folder.
file_paths = [dataset_dir / f'add/{digits}-d-add.csv' for digits in range(1, 8)]

# Results folder is named after the part of model_name following the first '/'.
results_dir = root_dir / model_name.split('/')[1]

In [148]:
# Evaluate each addition file in turn; every call prints its accuracy and
# writes a per-file results CSV into results_dir.
for file_path in file_paths:
    evaluate_file(file_path, results_dir=results_dir)

100%|██████████| 100/100 [01:37<00:00,  1.03it/s]


Accuracy for 1-d-add.csv: 93.00%
Results for /content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval/dataset/add/1-d-add.csv saved to 1-d-add_evaluation_results.csv


100%|██████████| 100/100 [01:46<00:00,  1.07s/it]


Accuracy for 2-d-add.csv: 89.00%
Results for /content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval/dataset/add/2-d-add.csv saved to 2-d-add_evaluation_results.csv


100%|██████████| 100/100 [02:01<00:00,  1.22s/it]


Accuracy for 3-d-add.csv: 90.00%
Results for /content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval/dataset/add/3-d-add.csv saved to 3-d-add_evaluation_results.csv


100%|██████████| 100/100 [02:24<00:00,  1.45s/it]


Accuracy for 4-d-add.csv: 66.00%
Results for /content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval/dataset/add/4-d-add.csv saved to 4-d-add_evaluation_results.csv


100%|██████████| 100/100 [02:44<00:00,  1.64s/it]


Accuracy for 5-d-add.csv: 52.00%
Results for /content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval/dataset/add/5-d-add.csv saved to 5-d-add_evaluation_results.csv


100%|██████████| 100/100 [03:09<00:00,  1.89s/it]


Accuracy for 6-d-add.csv: 29.00%
Results for /content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval/dataset/add/6-d-add.csv saved to 6-d-add_evaluation_results.csv


100%|██████████| 100/100 [03:18<00:00,  1.98s/it]

Accuracy for 7-d-add.csv: 24.00%
Results for /content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval/dataset/add/7-d-add.csv saved to 7-d-add_evaluation_results.csv





### Multiplication

In [152]:
# Multiplication files multiply/1-d-MUL.csv through multiply/7-d-MUL.csv under the dataset folder.
file_paths = [dataset_dir / f'multiply/{digits}-d-MUL.csv' for digits in range(1, 8)]

# Results go into a 'multiply' subfolder of the per-model results folder.
results_dir = root_dir / model_name.split('/')[1] / 'multiply'

In [154]:
# Evaluate each multiplication file in turn; every call prints its accuracy and
# writes a per-file results CSV into results_dir.
for file_path in file_paths:
    evaluate_file(file_path, results_dir=results_dir)

100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


Accuracy for 1-d-MUL.csv: 99.00%
Results for /content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval/dataset/multiply/1-d-MUL.csv saved to 1-d-MUL_evaluation_results.csv


100%|██████████| 100/100 [01:50<00:00,  1.11s/it]


Accuracy for 2-d-MUL.csv: 27.00%
Results for /content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval/dataset/multiply/2-d-MUL.csv saved to 2-d-MUL_evaluation_results.csv


100%|██████████| 100/100 [02:10<00:00,  1.31s/it]


Accuracy for 3-d-MUL.csv: 1.00%
Results for /content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval/dataset/multiply/3-d-MUL.csv saved to 3-d-MUL_evaluation_results.csv


100%|██████████| 100/100 [02:18<00:00,  1.38s/it]


Accuracy for 4-d-MUL.csv: 0.00%
Results for /content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval/dataset/multiply/4-d-MUL.csv saved to 4-d-MUL_evaluation_results.csv


100%|██████████| 100/100 [02:47<00:00,  1.67s/it]


Accuracy for 5-d-MUL.csv: 0.00%
Results for /content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval/dataset/multiply/5-d-MUL.csv saved to 5-d-MUL_evaluation_results.csv


100%|██████████| 100/100 [03:17<00:00,  1.97s/it]


Accuracy for 6-d-MUL.csv: 0.00%
Results for /content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval/dataset/multiply/6-d-MUL.csv saved to 6-d-MUL_evaluation_results.csv


100%|██████████| 100/100 [03:28<00:00,  2.08s/it]

Accuracy for 7-d-MUL.csv: 0.00%
Results for /content/drive/MyDrive/academic/fall24/natural-language-processing/project/arithmetic-ood-interp/ALGD-eval/dataset/multiply/7-d-MUL.csv saved to 7-d-MUL_evaluation_results.csv





## Plotting results

### Accuracy

### Digit Match plots

In [None]:
# Load one saved evaluation-results CSV.
# NOTE(review): hard-coded, machine-specific absolute path; edit before running elsewhere.
df = pd.read_csv('/Users/ashish/Documents/academic/fall24/nlp/project/arithmetic-ood/algd/llama-3.1-70B/multiply/7-d-MUL_evaluation_results.csv')

# Function to compare digits up to the length of the shorter number
def compare_digits(predicted, correct):
    """
    Compare two numbers digit by digit from the left.

    Both values are reduced to their integer digit strings and compared
    position by position up to the length of the shorter one.

    Args:
        predicted: Predicted answer. May be a number or a numeric string such
            as "123.0", which is what a CSV column holds once any row in it
            contains unparsed text.
        correct: Reference answer, same accepted forms.
    Returns:
        list[bool]: One entry per compared position, True where the digits
        agree. Empty when either value cannot be read as a finite number
        (NaN, infinity, free text, None).
    """
    def as_digit_string(value):
        # Try int() first so exact integers keep full precision.
        try:
            return str(int(value))
        except (TypeError, ValueError, OverflowError):
            pass
        # Fall back to float() for strings like "123.0"; NaN/inf/text still fail.
        try:
            return str(int(float(value)))
        except (TypeError, ValueError, OverflowError):
            return None

    predicted_str = as_digit_string(predicted)
    correct_str = as_digit_string(correct)
    if predicted_str is None or correct_str is None:
        # Nothing comparable: report no positions instead of raising mid-loop.
        return []

    # zip stops at the shorter string, i.e. at min(len(...)) positions.
    return [p_digit == c_digit for p_digit, c_digit in zip(predicted_str, correct_str)]

# For every row, store one Digit_<k>_Match column per compared digit position
for row_label, record in df.iterrows():
    matches = compare_digits(record['Predicted Answer'], record['Correct Answer'])
    for position, is_match in enumerate(matches, start=1):
        df.at[row_label, f'Digit_{position}_Match'] = is_match

# Write the augmented frame to a CSV in the current working directory
df.to_csv('llama-3.1-70b-Instruct-7d-mul_digit_matches.csv', index=False)

In [None]:
# Load one saved evaluation-results CSV.
# NOTE(review): hard-coded, machine-specific absolute path; edit before running elsewhere.
results_csv = '/Users/ashish/Documents/academic/fall24/nlp/project/arithmetic-ood/algd/gpt-4o-mini/Multiplication/7-d-MUL_evaluation_results.csv'
df = pd.read_csv(results_csv)

# Function to compare digits up to the length of the shorter number
def compare_digits(predicted, correct):
    """
    Compare two numbers digit by digit from the left.

    Both values are reduced to their integer digit strings and compared
    position by position up to the length of the shorter one.

    Args:
        predicted: Predicted answer. May be a number or a numeric string such
            as "123.0", which is what a CSV column holds once any row in it
            contains unparsed text.
        correct: Reference answer, same accepted forms.
    Returns:
        list[bool]: One entry per compared position, True where the digits
        agree. Empty when either value cannot be read as a finite number
        (NaN, infinity, free text, None).
    """
    def as_digit_string(value):
        # Try int() first so exact integers keep full precision.
        try:
            return str(int(value))
        except (TypeError, ValueError, OverflowError):
            pass
        # Fall back to float() for strings like "123.0"; NaN/inf/text still fail.
        try:
            return str(int(float(value)))
        except (TypeError, ValueError, OverflowError):
            return None

    predicted_str = as_digit_string(predicted)
    correct_str = as_digit_string(correct)
    if predicted_str is None or correct_str is None:
        # Nothing comparable: report no positions instead of raising mid-loop.
        return []

    # zip stops at the shorter string, i.e. at min(len(...)) positions.
    return [p_digit == c_digit for p_digit, c_digit in zip(predicted_str, correct_str)]

# For every row, store one Digit_<k>_Match column per compared digit position
for row_label, record in df.iterrows():
    matches = compare_digits(record['GPT-4 Answer'], record['Correct Answer'])
    for position, is_match in enumerate(matches, start=1):
        df.at[row_label, f'Digit_{position}_Match'] = is_match

# Write the augmented frame into the same folder as the input results file
outfile = os.path.join(os.path.dirname(results_csv), 'gpt-4o-mini-7d-mul-digit_matches.csv')
df.to_csv(outfile, index=False)