In [22]:
pwd

'/workspace/lm-evaluation-harness'

In [3]:
from datasets import load_dataset

dataset = load_dataset("large-traversaal/math500_urdu")
# Load the dataset to check its struct
print("\nFirst sample:")
print(dataset['test'][0])

README.md: 0.00B [00:00, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/463k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]


First sample:
{'problem': 'Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\\theta),$ where $r > 0$ and $0 \\le \\theta < 2 \\pi.$', 'solution': 'We have that $r = \\sqrt{0^2 + 3^2} = 3.$  Also, if we draw the line connecting the origin and $(0,3),$ this line makes an angle of $\\frac{\\pi}{2}$ with the positive $x$-axis.\n\n[asy]\nunitsize(0.8 cm);\n\ndraw((-0.5,0)--(3.5,0));\ndraw((0,-0.5)--(0,3.5));\ndraw(arc((0,0),3,0,90),red,Arrow(6));\n\ndot((0,3), red);\nlabel("$(0,3)$", (0,3), W);\ndot((3,0), red);\n[/asy]\n\nTherefore, the polar coordinates are $\\boxed{\\left( 3, \\frac{\\pi}{2} \\right)}.$', 'answer': '\\left( 3, \\frac{\\pi}{2} \\right)', 'urdu_problem': 'نقطہ $(0,3)$ کو مستطیلی احداثیات سے قطبی احداثیات میں تبدیل کریں۔ اپنا جواب $(r,\\theta),$ کی شکل میں درج کریں، جہاں $r > 0$ اور $0 \\le \\theta < 2 \\pi.$', 'urdu_solution': 'ہمارے پاس ہے کہ $r = \\sqrt{0^2 + 3^2} = 3.$ اس کے علاوہ، اگر ہم مبدا اور $(0,3),$ کو

# minerva_math500_ur task

In [3]:
import os

# Create directory structure
os.makedirs('lm_eval/tasks/minerva_math_urdu/direct', exist_ok=True)

# 1. Create utils_urdu.py
utils_urdu_content = """import logging
import re
import signal
from importlib.metadata import version
from typing import Dict, List, Optional

import datasets


eval_logger = logging.getLogger(__name__)


try:
    import antlr4
    import sympy
    from math_verify import parse, verify
    from sympy.parsing.latex import parse_latex

    assert version("antlr4-python3-runtime").startswith("4.11")
except (ModuleNotFoundError, AssertionError) as e:
    raise type(e)(
        "`sympy`, `math_verify` and `antlr4-python3-runtime==4.11` are required for generating translation task prompt templates. "
        "Please install the required packages via pip install lm-eval[math] or pip install -e .[math]"
    ) from e


def doc_to_text(doc: dict) -> str:
    return "مسئلہ:" + "\\n" + doc["urdu_problem"] + "\\n\\n" + "مرحلہ وار حل:"


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc: dict) -> dict:
        out_doc = {
            "urdu_problem": doc["urdu_problem"],
            "urdu_solution": doc["urdu_solution"],
            "urdu_answer": normalize_final_answer(
                remove_boxed(last_boxed_only_string(doc["urdu_solution"]))
            ),
        }
        if getattr(doc, "few_shot", None) is not None:
            out_doc["few_shot"] = True
        return out_doc

    return dataset.map(_process_doc)


def process_results(doc: dict, results: list[str]) -> dict[str, int]:
    candidates = results[0]

    unnormalized_answer = get_unnormalized_answer(candidates)
    answer = normalize_final_answer(unnormalized_answer)

    if is_equiv(answer, doc["urdu_answer"]):
        retval = 1
    else:
        retval = 0

    # math_verify
    _mvres = verify(
        gold=parse(doc["urdu_solution"]),
        target=parse(candidates),
    )
    mathval = 1 if _mvres else 0

    res = {
        "exact_match": retval,
        "math_verify": mathval,
    }
    return res


def last_boxed_only_string(string: str) -> Optional[str]:
    idx = string.rfind("\\\\boxed")
    if "\\\\boxed " in string:
        return "\\\\boxed " + string.split("\\\\boxed ")[-1].split("$")[0]
    if idx < 0:
        idx = string.rfind("\\\\fbox")
        if idx < 0:
            return None

    i = idx
    right_brace_idx = None
    num_left_braces_open = 0
    while i < len(string):
        if string[i] == "{":
            num_left_braces_open += 1
        if string[i] == "}":
            num_left_braces_open -= 1
            if num_left_braces_open == 0:
                right_brace_idx = i
                break
        i += 1

    if right_brace_idx is None:
        retval = None
    else:
        retval = string[idx : right_brace_idx + 1]

    return retval


def remove_boxed(s: str) -> str:
    if "\\\\boxed " in s:
        left = "\\\\boxed "
        assert s[: len(left)] == left
        return s[len(left) :]

    left = "\\\\boxed{"

    assert s[: len(left)] == left
    assert s[-1] == "}"

    return s[len(left) : -1]


class timeout:
    def __init__(self, seconds=1, error_message="Timeout"):
        self.seconds = seconds
        self.error_message = error_message

    def handle_timeout(self, signum, frame):
        raise TimeoutError(self.error_message)

    def __enter__(self):
        signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)

    def __exit__(self, type, value, traceback):
        signal.alarm(0)


def is_equiv(x1: str, x2: str) -> bool:
    \"\"\"
    x1 and x2 are normalized latex string
    \"\"\"
    try:
        with timeout(seconds=5):
            try:
                parsed_x1 = parse_latex(x1)
                parsed_x2 = parse_latex(x2)
            except (
                sympy.parsing.latex.errors.LaTeXParsingError,
                sympy.SympifyError,
                TypeError,
            ):
                eval_logger.debug(f"couldn't parse one of {x1} or {x2}")
                return False

            try:
                diff = parsed_x1 - parsed_x2
            except TypeError:
                eval_logger.debug(f"couldn't subtract {x1} and {x2}")
                return False

            try:
                if sympy.simplify(diff) == 0:
                    return True
                else:
                    return False
            except ValueError:
                eval_logger.debug(
                    f"Had some trouble simplifying when comparing {x1} and {x2}"
                )
    except TimeoutError:
        eval_logger.debug(f"Timed out comparing {x1} and {x2}")
        return False
    except ImportError as e:
        eval_logger.error(e)
        raise
    except Exception as e:
        eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}")
        return False


def get_unnormalized_answer(text: str) -> str:
    INVALID_ANSWER = "[invalidanswer]"
    end_seq = "I hope it is correct."
    text += end_seq
    match = re.search(
        r"Final Answer: The final answer is(.*?). I hope it is correct.",
        text,
    )
    if match:
        return match.group(1).strip()
    else:
        return INVALID_ANSWER


SUBSTITUTIONS = [
    ("an ", ""),
    ("a ", ""),
    (".$", "$"),
    ("\\\\$", ""),
    (r"\\\\ ", ""),
    (" ", ""),
    ("mbox", "text"),
    (",\\\\text{and}", ","),
    ("\\\\text{and}", ","),
    ("\\\\text{m}", "\\\\text{}"),
]
REMOVED_EXPRESSIONS = [
    "square",
    "ways",
    "integers",
    "dollars",
    "mph",
    "inches",
    "ft",
    "hours",
    "km",
    "units",
    "\\\\ldots",
    "sue",
    "points",
    "feet",
    "minutes",
    "digits",
    "cents",
    "degrees",
    "cm",
    "gm",
    "pounds",
    "meters",
    "meals",
    "edges",
    "students",
    "childrentickets",
    "multiples",
    "\\\\text{s}",
    "\\\\text{.}",
    "\\\\text{\\ns}",
    "\\\\text{}^2",
    "\\\\text{}^3",
    "\\\\text{\\n}",
    "\\\\text{}",
    r"\\mathrm{th}",
    r"^\\circ",
    r"^{\\circ}",
    r"\\;",
    r",\\!",
    "{,}",
    '"',
    "\\\\dots",
]


def normalize_final_answer(final_answer: str) -> str:
    \"\"\"
    Normalize a final answer to a quantitative reasoning question.

    Copied character for character from appendix D of Lewkowycz et al. (2022)
    \"\"\"
    final_answer = final_answer.split("=")[-1]

    for before, after in SUBSTITUTIONS:
        final_answer = final_answer.replace(before, after)
    for expr in REMOVED_EXPRESSIONS:
        final_answer = final_answer.replace(expr, "")

    # Extract answer that is in LaTeX math, is bold,
    # is surrounded by a box, etc.
    final_answer = re.sub(r"(.*?)(\\$)(.*?)(\\$)(.*)", "$\\\\3$", final_answer)
    final_answer = re.sub(r"(\\\\text\\{)(.*?)(\\})", "\\\\2", final_answer)
    final_answer = re.sub(r"(\\\\textbf\\{)(.*?)(\\})", "\\\\2", final_answer)
    final_answer = re.sub(r"(\\\\overline\\{)(.*?)(\\})", "\\\\2", final_answer)
    final_answer = re.sub(r"(\\\\boxed\\{)(.*)(\})", "\\\\2", final_answer)

    # Normalize shorthand TeX:
    #  \\fracab -> \\frac{a}{b}
    #  \\frac{abc}{bef} -> \\frac{abc}{bef}
    #  \\fracabc -> \\frac{a}{b}c
    #  \\sqrta -> \\sqrt{a}
    #  \\sqrtab -> sqrt{a}b
    final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\\\2}{\\\\3}", final_answer)
    final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\\\2}", final_answer)
    final_answer = final_answer.replace("$", "")

    # Normalize 100,000 -> 100000
    if final_answer.replace(",", "").isdigit():
        final_answer = final_answer.replace(",", "")

    return final_answer
"""

with open('lm_eval/tasks/minerva_math_urdu/direct/utils_urdu.py', 'w', encoding='utf-8') as f:
    f.write(utils_urdu_content)
print(" utils_urdu.py created")

# 2. Create minerva_math500_ur.yaml
task_config = """task: minerva_math500_ur
dataset_path: large-traversaal/math500_urdu
dataset_name: default
output_type: generate_until
training_split: null
test_split: test
process_docs: !function utils_urdu.process_docs
doc_to_text: !function utils_urdu.doc_to_text
process_results: !function utils_urdu.process_results
doc_to_target: "{{urdu_answer if few_shot is undefined else urdu_solution}}"
generation_kwargs:
  until:
    - "مسئلہ:"
  do_sample: false
  temperature: 0
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
  - metric: math_verify
    aggregation: mean
    higher_is_better: true
num_fewshot: 0
metadata:
  version: 1.0
"""

with open('lm_eval/tasks/minerva_math_urdu/direct/minerva_math500_ur.yaml', 'w', encoding='utf-8') as f:
    f.write(task_config)
print(" minerva_math500_ur.yaml created")

# 3. Create __init__.py (empty file to make it a package)
with open('lm_eval/tasks/minerva_math_urdu/__init__.py', 'w') as f:
    f.write("")
print("__init__.py created")

with open('lm_eval/tasks/minerva_math_urdu/direct/__init__.py', 'w') as f:
    f.write("")
print(" direct/__init__.py created")

print("\nAll files created successfully!")
print("\nDirectory structure:")
print("lm_eval/tasks/minerva_math_urdu/")
print("├── __init__.py")
print("└── direct/")
print("    ├── __init__.py")
print("    ├── utils_urdu.py")
print("    └── minerva_math500_ur.yaml")
print("\n Prompt now uses:")
print("   - مسئلہ: (Problem)")
print("   - مرحلہ وار حل: (Step-by-step Solution)")

 utils_urdu.py created
 minerva_math500_ur.yaml created
__init__.py created
 direct/__init__.py created

All files created successfully!

Directory structure:
lm_eval/tasks/minerva_math_urdu/
├── __init__.py
└── direct/
    ├── __init__.py
    ├── utils_urdu.py
    └── minerva_math500_ur.yaml

 Prompt now uses:
   - مسئلہ: (Problem)
   - مرحلہ وار حل: (Step-by-step Solution)


# mgsm_direct_ur task

In [6]:
# Create directory
!mkdir -p lm_eval/tasks/mgsm_urdu_large/direct

base_config = """# Base config for MGSM Urdu 
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
tag: mgsm_direct
dataset_path: large-traversaal/mgsm_urdu_cleaned
dataset_name: null  # Overridden by language-specific config.
output_type: generate_until
training_split: train
test_split: test
target_delimiter: ""
generation_kwargs:
  until:
    - "\n\n"
    - "\n"
  do_sample: false
  temperature: 0.0
filter_list:
  - name: remove_whitespace
    filter:
      - function: remove_whitespace
      - function: take_first
  - filter:
    - function: regex
      group_select: -1
      regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
    - function: take_first
    name: flexible-extract
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
metadata:
  version: 3.0
"""

with open('lm_eval/tasks/mgsm_urdu_large/direct/direct_yaml', 'w') as f:
    f.write(base_config)

print("✅ direct_yaml created")

✅ direct_yaml created


In [7]:
#urdu yml
task_config = """# MGSM Urdu
dataset_name: null
doc_to_target: '{% if urdu_answer is not none %}{{urdu_answer[16:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if urdu_answer is not none %}{{urdu_question+"\\nجواب:"}}{% else %}{{"سوال: "+urdu_question+"\\nجواب:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'سوال:'
    - </s>
    - <|im_end|>
include: direct_yaml
task: mgsm_direct_ur
"""

with open('lm_eval/tasks/mgsm_urdu_large/direct/mgsm_direct_ur.yaml', 'w') as f:
    f.write(task_config)

print("✅ mgsm_direct_ur.yaml created")

✅ mgsm_direct_ur.yaml created


In [8]:
!ls -la lm_eval/tasks/mgsm_urdu_large/direct

total 1956
drwxrwxrwx 2 root root 1000134 Jan 16 14:54 .
drwxrwxrwx 3 root root 1000134 Jan 16 14:53 ..
-rw-rw-rw- 1 root root     951 Jan 16 14:53 direct_yaml
-rw-rw-rw- 1 root root     425 Jan 16 14:54 mgsm_direct_ur.yaml


In [9]:
# Create __init__.py files for proper module discovery
!touch lm_eval/tasks/mgsm_urdu_large/__init__.py
!touch lm_eval/tasks/mgsm_urdu_large/direct/__init__.py
print("✅ __init__.py files created")

✅ __init__.py files created


# mgsm_cot_ur task

In [10]:
# Create directory
!mkdir -p lm_eval/tasks/mgsm_urdu_large/cot

base_config = r"""# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
tag: mgsm_cot_native
dataset_path: large-traversaal/mgsm_urdu_cleaned
dataset_name: null  # Overridden by language-specific config.
output_type: generate_until
training_split: train
test_split: test
# target_delimiter: ""
generation_kwargs:
  until:
    - "\n\n"
    - "\n"
  do_sample: false
  temperature: 0.0
target_delimiter: " "
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
filter_list:
  - name: "get-answer"
    filter:
      - function: "regex"
        regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
      - function: "take_first"
metadata:
  version: 4.0
"""

with open('lm_eval/tasks/mgsm_urdu_large/cot/cot_yaml', 'w') as f:
    f.write(base_config)

print("✅ cot_yaml created")

✅ cot_yaml created


In [19]:
task_config_ur=r"""
# Generated by utils.py
doc_to_target: '{% if urdu_answer is not none %}{{urdu_answer[16:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if urdu_answer is not none %}{{urdu_question+"\nمرحلہ وار جواب:"}}{% else %}{{"سوال: "+urdu_question+"\nمرحلہ وار جواب:"}}{% endif %}'
filter_list:
- filter:
  - function: regex
    regex_pattern: جواب (\-?[0-9\.\,]+) ہے
  - function: take_first
  name: strict-match
- filter:
  - function: regex
    group_select: -1
    regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
  - function: take_first
  name: flexible-extract
generation_kwargs:
  do_sample: false
  until:
  - 'سوال:'
  - </s>
  - <|im_end|>
include: cot_yaml
task: mgsm_cot_ur
"""

with open('lm_eval/tasks/mgsm_urdu_large/cot/mgsm_cot_ur.yaml', 'w') as f:
    f.write(task_config_ur)

In [20]:
!ls -la lm_eval/tasks/mgsm_urdu_large/cot

total 1956
drwxrwxrwx 2 root root 1000153 Jan 16 15:04 .
drwxrwxrwx 4 root root 1000287 Jan 16 15:03 ..
-rw-rw-rw- 1 root root       0 Jan 16 15:36 __init__.py
-rw-rw-rw- 1 root root     838 Jan 16 15:03 cot_yaml
-rw-rw-rw- 1 root root     729 Jan 16 15:48 mgsm_cot_ur.yaml


In [21]:
# Create __init__.py files for proper module discovery
!touch lm_eval/tasks/mgsm_urdu_large/__init__.py
!touch lm_eval/tasks/mgsm_urdu_large/cot/__init__.py
print("✅ __init__.py files created")

✅ __init__.py files created


# openbookqa_urdu

In [23]:
!mkdir -p lm_eval/tasks/openbookqa_urdu/direct

task_config="""
task: openbookqa_urdu
dataset_path: large-traversaal/openbookqa_urdu_cleaned
output_type: multiple_choice
training_split: train
test_split: test
doc_to_text: "{{urdu_question_stem}}"
doc_to_target: "{{urdu_choices.label.index(answerKey)}}"
doc_to_choice: "{{urdu_choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: urdu_question_stem
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
"""

with open('lm_eval/tasks/openbookqa_urdu/direct/openbookqa_urdu.yaml', 'w') as f:
    f.write(task_config)

print("✅ openbookqa_urdu.yaml created")

✅ openbookqa_urdu.yaml created


In [24]:
!ls -la lm_eval/tasks/openbookqa_urdu/direct

total 105
drwxrwxrwx 2 root root 52900 Jan 16 15:55 .
drwxrwxrwx 3 root root 52900 Jan 16 15:55 ..
-rw-rw-rw- 1 root root   529 Jan 16 15:55 openbookqa_urdu.yaml


In [25]:
# Create __init__.py files for proper module discovery
!touch lm_eval/tasks/openbookqa_urdu/__init__.py
!touch lm_eval/tasks/openbookqa_urdu/direct/__init__.py
print("✅ __init__.py files created")

✅ __init__.py files created


# commonsense_qa_urdu

In [26]:
!mkdir -p lm_eval/tasks/commonsense_qa_urdu/direct

task_config="""
task: commonsense_qa_urdu
dataset_path: large-traversaal/commonsenseqa_urdu_cleaned
training_split: train
validation_split: validation
output_type: multiple_choice
doc_to_text: "سوال: {{urdu_question.strip() }}\\nA. {{urdu_choices['text'][0]}}\\nB. {{urdu_choices['text'][1]}}\\nC. {{urdu_choices['text'][2]}}\\nD. {{urdu_choices['text'][3]}}\\nE. {{urdu_choices['text'][4]}}\\nجواب:"
doc_to_target: answerKey
doc_to_choice: ['A', 'B', 'C', 'D', 'E']
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
"""

with open('lm_eval/tasks/commonsense_qa_urdu/direct/commonsense_qa_urdu.yaml', 'w') as f:
    f.write(task_config)

print("✅ commonsense_qa_urdu.yaml created")

✅ commonsense_qa_urdu.yaml created


In [27]:
!ls -la lm_eval/tasks/commonsense_qa_urdu/direct

total 119
drwxrwxrwx 2 root root 60200 Jan 16 15:57 .
drwxrwxrwx 3 root root 60200 Jan 16 15:57 ..
-rw-rw-rw- 1 root root   602 Jan 16 15:57 commonsense_qa_urdu.yaml


In [28]:
# Create __init__.py files for proper module discovery
!touch lm_eval/tasks/commonsense_qa_urdu/__init__.py
!touch lm_eval/tasks/commonsense_qa_urdu/direct/__init__.py
print("✅ __init__.py files created")

✅ __init__.py files created
