In [3]:
import random
from misc import companies, currencies
import re
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from langchain.schema import HumanMessage
import os
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

def load_jsonl(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return data


In [4]:
import re

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def convert_to_float(text):
    # Remove non-numeric characters and convert to float
    # Extract numbers and handle special characters
    text = text.replace('~', '').strip()  # Remove approximate symbol
    numbers = re.findall(r'[\d.]+', text)
    if not numbers:
        return None
    
    try:
        number = float(numbers[0])
    except:
        return None

    # Handle multipliers

    if 'billion' in text.lower():
        number *= 1000000000
    elif 'million' in text.lower():
        number *= 1000000
    elif 'thousand' in text.lower():
        number *= 1000

    return number

def extract_steps(text):
    # Find all instances of "Step" at the beginning of a line or after newline
    steps = re.finditer(r'(?:^|\n)\s*Step\s+(\d+)', text)
    step_starts = [step.span()[0] for step in steps]
    step_spans = [(step_starts[idx], step_starts[idx+1]) for idx in range(len(step_starts) - 1)]
    # Add the last step span
    step_spans.append((step_starts[-1], len(text)))
    step_strings = [text[start:end].strip() for start, end in step_spans]
    # Remove leading "Step X" from each step
    step_strings = [re.sub(r'(?i)Step\s+\d+\s*:', '', step).strip() for step in step_strings]
    # Find final answer
    # Find the last step
    last_step = step_strings[-1]
    # Find the last '=' and extract everything after it until the end
    final_answer_match = re.search(r'=\s*([^=]+)$', last_step)
    final_answer = final_answer_match.group(1).strip() if final_answer_match else ''
    final_answer = final_answer.split('\n')[0]
    final_answer = convert_to_float(final_answer)
    # Remove extra spaces
    step_strings = [re.sub(r'\s+', ' ', step) for step in step_strings]
    return step_strings, final_answer


def eval(gt, pred):
    
    gt_steps, gt_final_answer = extract_steps(gt)
    pred_steps, pred_final_answer = extract_steps(pred)

    gt_steps_embeddings = model.encode(gt_steps)
    pred_steps_embeddings = model.encode(pred_steps)

    similarity_matrix = cosine_similarity(gt_steps_embeddings, pred_steps_embeddings)
    max_similarities_backward = np.max(similarity_matrix, axis=1)
    max_similarities_forward = np.max(similarity_matrix, axis=0)
    binarized_similarity_backward = max_similarities_backward > 0.7
    binarized_similarity_forward = max_similarities_forward > 0.7
    recall = float(np.sum(binarized_similarity_backward) / len(gt_steps))
    precision = float(np.sum(binarized_similarity_forward) / len(pred_steps))

    # Check final answer match
    if gt_final_answer is None or pred_final_answer is None:
        final_answer_match = 0
    else:
        final_answer_match = int(abs(gt_final_answer - pred_final_answer)/gt_final_answer < 0.05)

    return recall, precision, final_answer_match

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [49]:
# def convert_to_float(text):
#     # Remove non-numeric characters and convert to float
#     # Extract numbers and handle special characters
#     text = text.replace('~', '').strip()  # Remove approximate symbol
#     text = text.replace('$', '').replace(',', '')  # Remove dollar sign and commas
#     numbers = re.findall(r'[\d.]+', text)
#     if not numbers:
#         return None
    
#     try:
#         number = float(numbers[0])
#     except:
#         return None

#     # Handle multipliers

#     if 'billion' in text.lower():
#         number *= 1000000000
#     elif 'million' in text.lower():
#         number *= 1000000
#     elif 'thousand' in text.lower():
#         number *= 1000

#     return number

# def extract_final_answer(step):
#     final_answer_match = re.search(r'=\s*([^=]+)$', step)
#     final_answer = final_answer_match.group(1).strip() if final_answer_match else ''
#     final_answer = final_answer.split('\n')[0]
#     final_answer = convert_to_float(final_answer)
#     return final_answer


def convert_to_float(text):
    # Remove non-numeric characters and convert to float
    # Extract numbers and handle special characters
    text = text.replace('~', '').strip()  # Remove approximate symbol
    text = text.replace('$', '').replace(',', '')  # Remove dollar sign and commas
    numbers = re.findall(r'[\d.]+', text)
    if not numbers:
        return None
    
    try:
        number = float(numbers[0])
    except:
        return None

    # Handle multipliers

    if 'billion' in text.lower():
        number *= 1000000000
    elif 'million' in text.lower():
        number *= 1000000
    elif 'thousand' in text.lower():
        number *= 1000

    return number

def extract_final_answer(step):
    # Try capturing explicit "Answer: $" pattern
    answer_match = re.search(r'\*\*Answer:\*\*\s*.*?\$?([\d,]+\.\d{2})', step)
    if answer_match:
        return convert_to_float(answer_match.group(1))

    # Fallback: last valid dollar-format number after =
    final_answer_match = re.search(r'=\s*\$?([\d,]+\.\d{2})', step)
    if final_answer_match:
        return convert_to_float(final_answer_match.group(1))

    return None

for line in open('../human_eval/gemma_3_27b_instruct.jsonl').readlines():
    data = json.loads(line)
    print(data)
    generation = data['generation']
    solution = data['solution']
    gt_final_answer = extract_final_answer(generation)
    pred_final_answer = extract_final_answer(solution)
    print(solution)
    print(gt_final_answer)
    print(pred_final_answer)
    # print(pred_final_answer)
    
    break

{'seed': 2763771162, 'id': '5', 'level': 'Advanced', 'question': 'A systemic risk event causes equities to lose 29.5% and bonds to lose 15.5%. If a portfolio is 73% equities and 27% bonds worth $1,274,342, what is the portfolio value after the event?', 'solution': 'Step 1: Calculate equity value:\n  73% of 1,274,342 = 930,269.66.\nStep 2: Calculate bond value:\n  27% of 1,274,342 = 344,072.34.\nStep 3: Calculate losses:\n  Equities lose 930,269.66 * 0.29 = 274,429.55, Bonds lose 344,072.34 * 0.15 = 53,331.21.\nStep 4: Subtract losses:\n  The portfolio value = 1,274,342 - 274,429.55 - 53,331.21 = 946,581.24.', 'topic': 'risk_management', 'subtopic': 'scenario_plan', 'generation': "Here's a step-by-step solution to calculate the portfolio value after the systemic risk event:\n\n**Step 1: Calculate the initial value of the equity portion of the portfolio.**\n\n*   Portfolio value = $1,274,342\n*   Equity allocation = 73%\n*   Initial equity value = $1,274,342 * 0.73 = $930,269.66\n\n**Ste

In [31]:
import re
import json

def add_dollar_signs(text):
    # Match candidate numbers
    pattern = re.compile(r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?|\b\d+(?:\.\d+)?')

    def is_money(match):
        span = match.span()
        start, end = span
        number = match.group()

        # Check what comes before and after the number
        before = text[max(0, start-8):start]
        after = text[end:end+2]

        # Exclude if already has a dollar sign
        if before.endswith('$'):
            return False
        # Exclude step numbers
        if "Step " in before:
            return False
        # Exclude percentages
        if after.startswith('%'):
            return False
        # Exclude multipliers
        if before.strip().endswith('*') or before.strip().endswith('/'):
            return False

        return True

    # Replace only valid monetary numbers
    result = []
    last_idx = 0
    for match in pattern.finditer(text):
        if is_money(match):
            result.append(text[last_idx:match.start()] + '$' + match.group())
        else:
            result.append(text[last_idx:match.end()])
        last_idx = match.end()
    result.append(text[last_idx:])
    return ''.join(result)

def process_entry(entry):
    if "solution" in entry and isinstance(entry["solution"], str):
        entry["solution"] = add_dollar_signs(entry["solution"])
    return entry

input_file = "../human_eval/gemma_3_27b_instruct.jsonl"
output_file = "../human_eval/gemma_3_27b_instruct_new.jsonl"

with open(input_file, 'r', encoding='utf-8') as fin, open(output_file, 'w', encoding='utf-8') as fout:
    for line in fin:
        data = json.loads(line)
        updated = process_entry(data)
        fout.write(json.dumps(updated, ensure_ascii=False) + '\n')





In [22]:
def convert_to_float(text):
    # Remove non-numeric characters and convert to float
    # Extract numbers and handle special characters
    text = text.replace('~', '').strip()  # Remove approximate symbol
    text = text.replace('$', '').replace(',', '')  # Remove dollar sign and commas
    numbers = re.findall(r'[\d.]+', text)
    if not numbers:
        return None
    
    try:
        number = float(numbers[0])
    except:
        return None

    # Handle multipliers

    if 'billion' in text.lower():
        number *= 1000000000
    elif 'million' in text.lower():
        number *= 1000000
    elif 'thousand' in text.lower():
        number *= 1000

    return number

print(convert_to_float('$946,681.24'))

946681.24


In [5]:
model_results = {
    model: load_jsonl(f'../results/ci/{model}') 
            for model in os.listdir(f'../results/ci/') if model.endswith('.jsonl')
    }

for model in os.listdir('../results/ci'):
    if model.endswith('.jsonl'):
        with open(f'../results/ci/{model.split(".")[0]}_evals.jsonl', 'w') as f_eval:
            with open(f'../results/ci/{model}', 'r') as f_pred:
                for line in f_pred:
                    json_line = json.loads(line)
                    recall, precision, final_answer_match = eval(json_line['solution'], json_line['generation'])
                    json_line['recall'] = recall
                    json_line['precision'] = precision
                    json_line['final_answer_match'] = final_answer_match
                    f_eval.write(json.dumps(json_line) + '\n')
                

TypeError: encode() argument 'encoding' must be str, not list