In [46]:
import json
import random
import re
from typing import List, Dict

import torch
from tqdm import tqdm
from transformers import PegasusForConditionalGeneration, PegasusTokenizer



In [47]:
# Sanity check: the cell displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [48]:
# Ask PyTorch to release cached, currently unused GPU memory before the model is loaded.
torch.cuda.empty_cache()


In [49]:
# Load the Pegasus paraphrasing checkpoint and its tokenizer, then place the
# model on the GPU when one is available (CPU otherwise).
model_name = "tuner007/pegasus_paraphrase"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)



Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
# Prompt templates for scalar arithmetic, keyed by operation name.
# `{a}` and `{b}` are filled in with the operands via `str.format`;
# the "sqrt" templates only use `{a}`.
basic_templates = {
    "add": [
        "Add {a} and {b}.",
        "What is the result of adding {a} and {b}?",
        "Can you add {a} to {b}?",
        "Please calculate the sum of {a} and {b}.",
        "{a} + {b} equals what?",
        "I need the result of {a} plus {b}.",
        "Hey, what do you get if you add {a} and {b}?",
    ],
    "subtract": [
        "Subtract {b} from {a}.",
        "What is the result of subtracting {b} from {a}?",
        "Can you subtract {b} from {a}?",
        "Please calculate {a} minus {b}.",
        "How much is {a} minus {b}?",
    ],
    "multiply": [
        "Multiply {a} by {b}.",
        "What is the result of multiplying {a} by {b}?",
        "Can you calculate the product of {a} and {b}?",
        "What do you get when you multiply {a} and {b}?",
        "Please compute {a} times {b}.",
    ],
    "divide": [
        "Divide {a} by {b}.",
        "What is the result of dividing {a} by {b}?",
        "Can you compute {a} divided by {b}?",
        "How much is {a} over {b}?",
        "Please perform the division of {a} and {b}.",
    ],
    "sqrt": [
        "What is the square root of {a}?",
        "Find the square root of {a}.",
        "Please calculate sqrt({a}).",
        "Compute the square root of {a}.",
        "I need the root of {a}.",
    ],
    "power": [
        "What is {a} raised to the power of {b}?",
        "Calculate {a} to the power of {b}.",
        "Compute {a} raised to {b}.",
        "Please find the power of {a} and {b}.",
        "What do you get if you raise {a} to the {b}th power?",
    ],
}

# Prompt templates for matrix operations, keyed by operation name.
# `{m1}` / `{m2}` are filled in with the matrix operands via `str.format`;
# "inverse" and "determinant" templates only use `{m1}`.
matrix_templates = {
    "add": [
        "Add the matrices {m1} and {m2}.",
        "What is the sum of matrix {m1} and matrix {m2}?",
        "Please compute matrix addition for {m1} and {m2}.",
        "Can you add matrix {m1} with {m2}?",
        "Calculate {m1} plus {m2}.",
        "Find the result of adding matrices {m1} and {m2}.",
    ],
    "subtract": [
        "Subtract matrix {m2} from matrix {m1}.",
        "What is the result of subtracting {m2} from {m1}?",
        "Please compute matrix subtraction for {m1} and {m2}.",
        "Can you subtract matrix {m2} from {m1}?",
        "Calculate {m1} minus {m2}.",
        "What is {m1} less {m2}?",
    ],
    "multiply": [
        "Multiply matrix {m1} with {m2}.",
        "Compute the matrix product of {m1} and {m2}.",
        "Please perform matrix multiplication on {m1} and {m2}.",
        "What do you get when you multiply matrix {m1} with {m2}?",
        "Calculate {m1} times {m2}.",
    ],
    "inverse": [
        "Find the inverse of matrix {m1}.",
        "Please compute the inverse of {m1}.",
        "What is the inverse of matrix {m1}?",
        "Can you calculate the matrix inverse of {m1}?",
        "Calculate inverse({m1}).",
    ],
    "determinant": [
        "Find the determinant of matrix {m1}.",
        "Please compute the determinant of {m1}.",
        "What is the determinant of matrix {m1}?",
        "Calculate determinant of {m1}.",
    ],
}


In [51]:
def paraphrase_template(template: str, num_return_sequences=5) -> List[str]:
    """Generate paraphrases of ``template`` that keep its format placeholders intact.

    The template is run through the module-level Pegasus ``model`` and
    ``tokenizer`` using beam search. A candidate is kept only when it can be
    passed to ``str.format`` with the same arguments as the original template:
    it must contain exactly the same set of ``{name}`` placeholders as
    ``template`` and no other curly braces.

    Args:
        template: Prompt template containing ``{name}`` placeholders
            (e.g. ``{a}``, ``{b}``, ``{m1}``, ``{m2}``).
        num_return_sequences: Number of candidates requested from ``generate``.

    Returns:
        The accepted paraphrases in generation order, or ``[template]`` when
        no candidate preserves the placeholders.
    """
    placeholder_re = re.compile(r"\{\w+\}")
    required = set(placeholder_re.findall(template))

    # Beam search cannot return more sequences than it has beams, so widen the
    # beam when the caller asks for more than the default of 5 candidates.
    num_beams = max(5, num_return_sequences)

    batch = tokenizer([template], truncation=True, padding="longest", return_tensors="pt").to(device)
    translated = model.generate(
        **batch,
        max_length=60,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )
    decoded = tokenizer.batch_decode(translated, skip_special_tokens=True)

    filtered = []
    for phr in decoded:
        # Check the placeholders themselves rather than bare letters: a
        # paraphrase that lost or invented a placeholder would yield a prompt
        # that no longer matches the generated output.
        found = set(placeholder_re.findall(phr))
        # Any brace left over after removing well-formed placeholders would
        # make `str.format` raise later on.
        leftover = placeholder_re.sub("", phr)
        if found == required and "{" not in leftover and "}" not in leftover:
            filtered.append(phr)

    return filtered or [template]  # fallback if no valid paraphrase


def generate_matrix(rows=2, cols=2) -> List[List[int]]:
    """Return a ``rows`` x ``cols`` nested list of random integers in [1, 10].

    Values are drawn row by row, left to right, from the ``random`` module.
    """
    matrix = []
    for _ in range(rows):
        row = []
        for _ in range(cols):
            row.append(random.randint(1, 10))
        matrix.append(row)
    return matrix

def matrix_to_xml(matrix: List[List[int]]) -> str:
    """Render a matrix as newline-separated ``<row>`` elements.

    Each row becomes ``<row>v1 v2 ...</row>`` with values separated by a
    single space; an empty matrix yields an empty string.
    """
    xml_rows = []
    for row in matrix:
        cells = " ".join(str(value) for value in row)
        xml_rows.append(f"<row>{cells}</row>")
    return "\n".join(xml_rows)

# def generate_samples(num_samples=1000) -> List[Dict[str, str]]:
#     dataset = []
#     paraphrased_templates = {"basic": {}, "matrix": {}}

#     for op, templates in basic_templates.items():
#         paraphrased_templates["basic"][op] = []
#         for tmpl in templates:
#             paraphrased_templates["basic"][op].extend(paraphrase_template(tmpl))

#     for op, templates in matrix_templates.items():
#         paraphrased_templates["matrix"][op] = []
#         for tmpl in templates:
#             paraphrased_templates["matrix"][op].extend(paraphrase_template(tmpl))

#     for _ in tqdm(range(num_samples)):
#         if random.random() < 0.8:
#             op = random.choice(list(paraphrased_templates["basic"].keys()))
#             a = random.randint(1, 100)
#             b = random.randint(1, 100)
#             phr = random.choice(paraphrased_templates["basic"][op])
#             prompt = phr.format(a=a, b=b)
#             if " a " in prompt or " b " in prompt:
#                 continue  # skip this sample

#             xml = f"<calc>\n  <expression>\n    <type>{'unary' if op == 'sqrt' else 'binary'}</type>\n    <operation>{op}</operation>\n    <operands>\n"
#             if op == "sqrt":
#                 xml += f"      <operand>{a}</operand>\n"
#             else:
#                 xml += f"      <operand>{a}</operand>\n      <operand>{b}</operand>\n"
#             xml += "    </operands>\n  </expression>\n</calc>"

#         else:
#             op = random.choice(list(paraphrased_templates["matrix"].keys()))
#             m1 = generate_matrix()
#             m2 = generate_matrix() if op != "inverse" else None
#             phr = random.choice(paraphrased_templates["matrix"][op])
#             prompt = phr.format(m1=str(m1), m2=str(m2) if m2 else "")

#             xml = f"<calc>\n  <expression>\n    <type>matrix</type>\n    <operation>{op}</operation>\n    <operands>\n"
#             xml += f"      <matrix>\n{matrix_to_xml(m1)}\n      </matrix>\n"
#             if m2:
#                 xml += f"      <matrix>\n{matrix_to_xml(m2)}\n      </matrix>\n"
#             xml += "    </operands>\n  </expression>\n</calc>"

#         dataset.append({"prompt": prompt, "output": xml})

#     return dataset

def generate_basic_samples(num_samples=800) -> List[Dict[str, str]]:
    """Build ``num_samples`` prompt/output pairs for scalar arithmetic.

    Every template in ``basic_templates`` is expanded once through
    ``paraphrase_template``. Each sample then picks a random operation, two
    random integers in [1, 100] and a random phrase for that operation.

    Returns:
        A list of ``{"prompt": ..., "output": ...}`` dicts, where ``output`` is
        a call-like string such as ``add(3, 4)`` or ``sqrt(9)``.
    """
    # Expand the templates up front so the paraphraser runs once per template
    # instead of once per sample.
    phrases_by_op = {
        op: [variant for tmpl in templates for variant in paraphrase_template(tmpl)]
        for op, templates in basic_templates.items()
    }
    operations = list(phrases_by_op)

    samples = []
    for _ in tqdm(range(num_samples), desc="Generating basic samples"):
        op = random.choice(operations)
        a = random.randint(1, 100)
        b = random.randint(1, 100)
        phrase = random.choice(phrases_by_op[op])

        # "sqrt" is the only unary operation; every other one (including
        # "power") is rendered as `<op>(a, b)`.
        call = f"sqrt({a})" if op == "sqrt" else f"{op}({a}, {b})"

        samples.append({"prompt": phrase.format(a=a, b=b), "output": call})

    return samples

def matrix_to_str(matrix: List[List[int]]) -> str:
    """Return the matrix formatted as a nested-list literal, e.g. ``[[1, 2], [3, 4]]``."""
    text = str(matrix)
    return text

def generate_matrix_samples(num_samples=200) -> List[Dict[str, str]]:
    """Build ``num_samples`` prompt/output pairs for matrix operations.

    Every template in ``matrix_templates`` is expanded once through
    ``paraphrase_template``. Each sample picks a random operation, draws one
    or two matrices with ``generate_matrix()`` and fills a random phrase.

    Returns:
        A list of ``{"prompt": ..., "output": ...}`` dicts, where ``output`` is
        a call-like string such as ``mat_add([[1, 2], [3, 4]], [[5, 6], [7, 8]])``.
    """
    # Function name emitted in the output for each operation.
    op_func_map = {
        "add": "mat_add",
        "subtract": "mat_subtract",
        "multiply": "mat_mul",
        "inverse": "mat_inverse",
        "determinant": "mat_determinant",
    }
    # Operations that take a single matrix operand.
    unary_ops = ("inverse", "determinant")

    # Expand the templates up front so the paraphraser runs once per template.
    phrases_by_op = {
        op: [variant for tmpl in templates for variant in paraphrase_template(tmpl)]
        for op, templates in matrix_templates.items()
    }
    operations = list(phrases_by_op)

    samples = []
    for _ in tqdm(range(num_samples), desc="Generating matrix samples"):
        op = random.choice(operations)
        m1 = generate_matrix()
        # NOTE(review): the matrices are unconstrained random integers, so an
        # "inverse" sample can use a singular matrix - confirm this is acceptable.
        m2 = None if op in unary_ops else generate_matrix()
        phrase = random.choice(phrases_by_op[op])

        # Single-operand templates have no {m2}; the extra keyword is ignored.
        prompt = phrase.format(m1=str(m1), m2="" if m2 is None else str(m2))

        operands = [m1] if m2 is None else [m1, m2]
        arguments = ", ".join(matrix_to_str(operand) for operand in operands)
        output = f"{op_func_map[op]}({arguments})"

        samples.append({"prompt": prompt, "output": output})

    return samples

In [52]:

def generate_full_dataset(samples=500, matrix_samples=None, seed=None) -> List[Dict[str, str]]:
    """Generate the combined scalar + matrix dataset.

    Args:
        samples: Number of scalar (basic) samples to generate. Also used as the
            number of matrix samples when ``matrix_samples`` is not given.
        matrix_samples: Number of matrix samples. ``None`` (the default) means
            "same as ``samples``".
        seed: When not ``None``, Python's ``random`` module is seeded with this
            value before any sample is drawn, so repeated calls make the same
            random choices. ``None`` leaves the random state untouched.

    Returns:
        All basic samples followed by all matrix samples, each a
        ``{"prompt": ..., "output": ...}`` dict.
    """
    if seed is not None:
        random.seed(seed)
    if matrix_samples is None:
        matrix_samples = samples

    basic_data = generate_basic_samples(samples)
    matrix_data = generate_matrix_samples(matrix_samples)
    return basic_data + matrix_data

In [53]:
# Build the dataset, then persist it as JSON Lines: one JSON object per line.
final_dataset = generate_full_dataset(50000)
with open("./nlp_to_math_func.jsonl", "w") as f:
    for item in final_dataset:
        f.write(json.dumps(item) + "\n")

print("saved to ./nlp_to_math_func.jsonl")


Generating basic samples: 100%|██████████| 50000/50000 [00:00<00:00, 399029.61it/s]
Generating matrix samples: 100%|██████████| 50000/50000 [00:00<00:00, 133237.36it/s]


saved to ./nlp_to_math_func.jsonl
