In [1]:
# ✅ Import Required Libraries
import os
import json
import torch
import gc
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ✅ Clear GPU memory before starting fresh
gc.collect()
torch.cuda.empty_cache()

print("\n✅ Environment Reset & Libraries Loaded!")



✅ Environment Reset & Libraries Loaded!


In [2]:
# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Load the filtered 200 shortest JSON files
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

print(f"\n✅ Loaded {len(filtered_files)} filtered JSON files for evaluation.")



✅ Loaded 200 filtered JSON files for evaluation.


In [3]:
# ✅ Function to construct a well-formatted prompt from JSON files
def construct_fixed_prompt(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)

    # ✅ Extract training & test examples
    training_examples = data.get("train", [])
    test_examples = data.get("test", [])

    if not test_examples or not training_examples:
        return None  # Skip if missing data

    test_input = test_examples[0]["input"]
    
    # ✅ Build a strong prompt
    prompt = "Below are some training examples (Input-Output pairs). Use them to generate only the final output matrix for the given test input.\n\n"

    for ex in training_examples:
        prompt += f"Input: {ex['input']}\n"
        prompt += f"Output: {ex['output']}\n\n"

    prompt += f"Test Input Matrix:\n{test_input}\n\n"
    prompt += "**Provide ONLY the final output matrix for the test input. Do NOT include any other text.**"

    return prompt

print("\n✅ `construct_fixed_prompt` function defined successfully!")



✅ `construct_fixed_prompt` function defined successfully!


In [4]:
import re

# ✅ Function to extract matrix after instruction text and stop at first `]]`
def extract_final_matrix(generated_text):
    instruction_text = "Provide ONLY the final output matrix for the test input. Do NOT include any other text."
    
    # ✅ Find where the instruction appears
    start_idx = generated_text.find(instruction_text)
    if start_idx == -1:
        return None  # ✅ Return None instead of an alert

    # ✅ Extract everything after the instruction
    output_after_instruction = generated_text[start_idx + len(instruction_text):].strip()

    # ✅ Use regex to find the first valid matrix **after the instruction text**
    matrix_match = re.search(r"\[\[.*?\]\]", output_after_instruction, re.DOTALL)
    
    if matrix_match:
        return matrix_match.group(0)  # ✅ Return the extracted matrix
    else:
        return None  # ✅ Return None if no valid matrix is found

print("\n✅ `extract_final_matrix` function updated: Now returns `None` when no matrix is found.")



✅ `extract_final_matrix` function updated: Now returns `None` when no matrix is found.


In [5]:
import numpy as np
import json
import os
import torch
import gc

# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Load the 200 filtered JSON files
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

# ✅ Use all 200 files
num_samples = 200
test_files = filtered_files[:num_samples]

# ✅ Define model & assigned GPU
model_name = "phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1"
device = "cuda:0"

# ✅ Function to load ground truth matrix
def load_ground_truth_matrix(json_file):
    file_path = os.path.join(dataset_folder, json_file)
    with open(file_path, "r") as f:
        data = json.load(f)

    test_examples = data.get("test", [])
    if not test_examples:
        return None

    return test_examples[0]["output"]

# ✅ Function to compare matrices
def compare_matrices(predicted_matrix, ground_truth):
    if predicted_matrix is None:
        return "⚠️ Invalid Output"

    try:
        predicted_np = np.array(eval(predicted_matrix), dtype=int)
        ground_truth_np = np.array(ground_truth, dtype=int)

        if np.array_equal(predicted_np, ground_truth_np):
            return "✅ Correct"
        else:
            return "❌ Incorrect"
    except Exception as e:
        return "⚠️ Error in matrix comparison: " + str(e)

# ✅ Function to run inference for 200 examples (same logic as 10-example version)
def run_inference_for_200_examples(model_name, device):
    print(f"\n🚀 Running Model: {model_name} on {device} for {num_samples} datapoints")

    # ✅ Load Model on Assigned GPU
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map={"" : device}
    )

    # ✅ Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    results = []

    for idx, file in enumerate(test_files):
        print(f"\n🚀 Running inference {idx + 1}/{num_samples} on {model_name}: {file}")

        # ✅ Construct file path
        file_path = os.path.join(dataset_folder, file)

        # ✅ Construct prompt
        test_prompt = construct_fixed_prompt(file_path)

        # ✅ Tokenize the prompt
        inputs = tokenizer(test_prompt, return_tensors="pt").to(device)

        # ✅ Remove `token_type_ids` if present
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # ✅ Run inference with safe parameters
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1000,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                repetition_penalty=1.2,
            )

        # ✅ Decode full output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # ✅ Extract output matrix
        extracted_matrix = extract_final_matrix(generated_text)

        # ✅ Load ground truth matrix
        ground_truth_matrix = load_ground_truth_matrix(file)

        # ✅ Compare extracted matrix with ground truth
        result = compare_matrices(extracted_matrix, ground_truth_matrix)

        # ✅ Store the result
        results.append({
            "file": file,
            "extracted_matrix": extracted_matrix,
            "ground_truth": ground_truth_matrix,
            "comparison_result": result
        })

        print(f"\n📊 **Comparison Result:** {result}")

        print("\n" + "-" * 100 + "\n")

        # ✅ Clear memory after every 50 examples
        if (idx + 1) % 50 == 0:
            save_path = f"./{model_name.replace('/', '_')}_results_batch_{idx + 1}.json"
            with open(save_path, "w") as f:
                json.dump(results, f, indent=4)

            print(f"\n💾 Saved batch {idx + 1} results for {model_name}")

            # ✅ Clear memory
            gc.collect()
            torch.cuda.empty_cache()

    print(f"\n✅ Completed inference for {model_name}!")

# ✅ Run inference on all 200 examples
print("\n🚀 Running Model on CUDA:0 for 200 Examples...\n")
run_inference_for_200_examples(model_name, device)

print("\n✅ **Completed inference for 200 examples!** 🚀")



🚀 Running Model on CUDA:0 for 200 Examples...


🚀 Running Model: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1 on cuda:0 for 200 datapoints


2025-03-10 01:55:07.923845: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-10 01:55:08.169534: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-10 01:55:08.169563: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-10 01:55:08.169587: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-10 01:55:08.616425: I tensorflow/core/platform/cpu_feature_g

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


🚀 Running inference 1/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running inference 2/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/00576224.json

📊 **Comparison Result:** ✅ Correct

----------------------------------------------------------------------------------------------------


🚀 Running inference 3/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/e633a9e5.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running i

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.10 GiB. GPU 0 has a total capacty of 31.73 GiB of which 1.56 GiB is free. Process 539717 has 7.63 GiB memory in use. Process 559429 has 13.25 GiB memory in use. Including non-PyTorch memory, this process has 9.28 GiB memory in use. Of the allocated memory 6.74 GiB is allocated by PyTorch, and 2.16 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [6]:
import numpy as np
import json
import os
import torch
import gc

# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Load the 200 filtered JSON files
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

# ✅ Resume from Datapoint 32
num_samples = 200
start_from = 32
test_files = filtered_files[start_from:num_samples]  # ✅ Skip first 31 files

# ✅ Define model & assigned GPUs
model_name = "phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1"
device_map = "auto"  # ✅ Auto-distributes across both GPUs

# ✅ Function to load ground truth matrix
def load_ground_truth_matrix(json_file):
    file_path = os.path.join(dataset_folder, json_file)
    with open(file_path, "r") as f:
        data = json.load(f)

    test_examples = data.get("test", [])
    if not test_examples:
        return None

    return test_examples[0]["output"]

# ✅ Function to compare matrices
def compare_matrices(predicted_matrix, ground_truth):
    if predicted_matrix is None:
        return "⚠️ Invalid Output"

    try:
        predicted_np = np.array(eval(predicted_matrix), dtype=int)
        ground_truth_np = np.array(ground_truth, dtype=int)

        if np.array_equal(predicted_np, ground_truth_np):
            return "✅ Correct"
        else:
            return "❌ Incorrect"
    except Exception as e:
        return "⚠️ Error in matrix comparison: " + str(e)

# ✅ Function to run inference for remaining examples across both GPUs
def run_inference_on_both_gpus(model_name):
    print(f"\n🚀 Running Model: {model_name} across both GPUs for {num_samples - start_from} remaining datapoints")

    # ✅ Load Model across both GPUs
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"  # ✅ Distributes model across GPUs
    )

    # ✅ Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    results = []

    for idx, file in enumerate(test_files):
        print(f"\n🚀 Running inference {start_from + idx + 1}/{num_samples} on {model_name}: {file}")

        # ✅ Construct file path
        file_path = os.path.join(dataset_folder, file)

        # ✅ Construct prompt
        test_prompt = construct_fixed_prompt(file_path)

        # ✅ Tokenize the prompt
        inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")  # ✅ Auto-assigns to available GPU

        # ✅ Remove `token_type_ids` if present
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # ✅ Run inference with safe parameters
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1000,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                repetition_penalty=1.2,
            )

        # ✅ Decode full output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # ✅ Extract output matrix
        extracted_matrix = extract_final_matrix(generated_text)

        # ✅ Load ground truth matrix
        ground_truth_matrix = load_ground_truth_matrix(file)

        # ✅ Compare extracted matrix with ground truth
        result = compare_matrices(extracted_matrix, ground_truth_matrix)

        # ✅ Store the result
        results.append({
            "file": file,
            "extracted_matrix": extracted_matrix,
            "ground_truth": ground_truth_matrix,
            "comparison_result": result
        })

        print(f"\n📊 **Comparison Result:** {result}")

        print("\n" + "-" * 100 + "\n")

        # ✅ Save every 50 datapoints to prevent progress loss
        if (start_from + idx + 1) % 50 == 0:
            save_path = f"./{model_name.replace('/', '_')}_results_batch_{start_from + idx + 1}.json"
            with open(save_path, "w") as f:
                json.dump(results, f, indent=4)

            print(f"\n💾 Saved batch {start_from + idx + 1} results for {model_name}")

            # ✅ Clear memory
            gc.collect()
            torch.cuda.empty_cache()

    print(f"\n✅ Completed inference for {model_name} across both GPUs!")

# ✅ Run inference for remaining 168 datapoints using both GPUs
print("\n🚀 Running Model across GPUs for remaining 168 Examples...\n")
run_inference_on_both_gpus(model_name)

print("\n✅ **Completed inference for all 200 examples!** 🚀")



🚀 Running Model across GPUs for remaining 168 Examples...


🚀 Running Model: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1 across both GPUs for 168 remaining datapoints


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



🚀 Running inference 33/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/bc4146bd.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running inference 34/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/4cd1b7b2.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running inference 35/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/e133d23d.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Runn

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.16 GiB. GPU 0 has a total capacty of 31.73 GiB of which 103.44 MiB is free. Process 539717 has 7.63 GiB memory in use. Process 559429 has 13.25 GiB memory in use. Including non-PyTorch memory, this process has 10.74 GiB memory in use. Of the allocated memory 9.21 GiB is allocated by PyTorch, and 1.16 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [7]:
import numpy as np
import json
import os
import torch
import gc

# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Load the 200 filtered JSON files
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

# ✅ Resume from Datapoint 32
num_samples = 200
start_from = 70
test_files = filtered_files[start_from:num_samples]  # ✅ Skip first 31 files

# ✅ Define model & assigned GPUs
model_name = "phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1"
device_map = "auto"  # ✅ Auto-distributes across both GPUs

# ✅ Function to load ground truth matrix
def load_ground_truth_matrix(json_file):
    file_path = os.path.join(dataset_folder, json_file)
    with open(file_path, "r") as f:
        data = json.load(f)

    test_examples = data.get("test", [])
    if not test_examples:
        return None

    return test_examples[0]["output"]

# ✅ Function to compare matrices
def compare_matrices(predicted_matrix, ground_truth):
    if predicted_matrix is None:
        return "⚠️ Invalid Output"

    try:
        predicted_np = np.array(eval(predicted_matrix), dtype=int)
        ground_truth_np = np.array(ground_truth, dtype=int)

        if np.array_equal(predicted_np, ground_truth_np):
            return "✅ Correct"
        else:
            return "❌ Incorrect"
    except Exception as e:
        return "⚠️ Error in matrix comparison: " + str(e)

# ✅ Function to run inference for remaining examples across both GPUs
def run_inference_on_both_gpus(model_name):
    print(f"\n🚀 Running Model: {model_name} across both GPUs for {num_samples - start_from} remaining datapoints")

    # ✅ Load Model across both GPUs
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"  # ✅ Distributes model across GPUs
    )

    # ✅ Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    results = []

    for idx, file in enumerate(test_files):
        print(f"\n🚀 Running inference {start_from + idx + 1}/{num_samples} on {model_name}: {file}")

        # ✅ Construct file path
        file_path = os.path.join(dataset_folder, file)

        # ✅ Construct prompt
        test_prompt = construct_fixed_prompt(file_path)

        # ✅ Tokenize the prompt
        inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")  # ✅ Auto-assigns to available GPU

        # ✅ Remove `token_type_ids` if present
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # ✅ Run inference with safe parameters
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1000,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                repetition_penalty=1.2,
            )

        # ✅ Decode full output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # ✅ Extract output matrix
        extracted_matrix = extract_final_matrix(generated_text)

        # ✅ Load ground truth matrix
        ground_truth_matrix = load_ground_truth_matrix(file)

        # ✅ Compare extracted matrix with ground truth
        result = compare_matrices(extracted_matrix, ground_truth_matrix)

        # ✅ Store the result
        results.append({
            "file": file,
            "extracted_matrix": extracted_matrix,
            "ground_truth": ground_truth_matrix,
            "comparison_result": result
        })

        print(f"\n📊 **Comparison Result:** {result}")

        print("\n" + "-" * 100 + "\n")

        # ✅ Save every 50 datapoints to prevent progress loss
        if (start_from + idx + 1) % 50 == 0:
            save_path = f"./{model_name.replace('/', '_')}_results_batch_{start_from + idx + 1}.json"
            with open(save_path, "w") as f:
                json.dump(results, f, indent=4)

            print(f"\n💾 Saved batch {start_from + idx + 1} results for {model_name}")

            # ✅ Clear memory
            gc.collect()
            torch.cuda.empty_cache()

    print(f"\n✅ Completed inference for {model_name} across both GPUs!")

# ✅ Run inference for remaining 168 datapoints using both GPUs
print("\n🚀 Running Model across GPUs for remaining 168 Examples...\n")
run_inference_on_both_gpus(model_name)

print("\n✅ **Completed inference for all 200 examples!** 🚀")



🚀 Running Model across GPUs for remaining 168 Examples...


🚀 Running Model: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1 across both GPUs for 130 remaining datapoints


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


🚀 Running inference 71/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/e7dd8335.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running inference 72/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/c8b7cc0f.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running inference 73/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/332efdb3.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Runn

OutOfMemoryError: CUDA out of memory. Tried to allocate 6.78 GiB. GPU 1 has a total capacty of 31.73 GiB of which 3.94 GiB is free. Process 539717 has 6.58 GiB memory in use. Process 559429 has 396.00 MiB memory in use. Including non-PyTorch memory, this process has 20.81 GiB memory in use. Of the allocated memory 13.60 GiB is allocated by PyTorch, and 6.84 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
import numpy as np
import json
import os
import torch
import gc

# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Load the 200 filtered JSON files
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

# ✅ Resume from Datapoint 32
num_samples = 200
start_from = 87
test_files = filtered_files[start_from:num_samples]  # ✅ Skip first 31 files

# ✅ Define model & assigned GPUs
model_name = "phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1"
device_map = "auto"  # ✅ Auto-distributes across both GPUs

# ✅ Function to load ground truth matrix
def load_ground_truth_matrix(json_file):
    file_path = os.path.join(dataset_folder, json_file)
    with open(file_path, "r") as f:
        data = json.load(f)

    test_examples = data.get("test", [])
    if not test_examples:
        return None

    return test_examples[0]["output"]

# ✅ Function to compare matrices
def compare_matrices(predicted_matrix, ground_truth):
    if predicted_matrix is None:
        return "⚠️ Invalid Output"

    try:
        predicted_np = np.array(eval(predicted_matrix), dtype=int)
        ground_truth_np = np.array(ground_truth, dtype=int)

        if np.array_equal(predicted_np, ground_truth_np):
            return "✅ Correct"
        else:
            return "❌ Incorrect"
    except Exception as e:
        return "⚠️ Error in matrix comparison: " + str(e)

# ✅ Function to run inference for remaining examples across both GPUs
def run_inference_on_both_gpus(model_name):
    print(f"\n🚀 Running Model: {model_name} across both GPUs for {num_samples - start_from} remaining datapoints")

    # ✅ Load Model across both GPUs
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"  # ✅ Distributes model across GPUs
    )

    # ✅ Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    results = []

    for idx, file in enumerate(test_files):
        print(f"\n🚀 Running inference {start_from + idx + 1}/{num_samples} on {model_name}: {file}")

        # ✅ Construct file path
        file_path = os.path.join(dataset_folder, file)

        # ✅ Construct prompt
        test_prompt = construct_fixed_prompt(file_path)

        # ✅ Tokenize the prompt
        inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")  # ✅ Auto-assigns to available GPU

        # ✅ Remove `token_type_ids` if present
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # ✅ Run inference with safe parameters
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1000,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                repetition_penalty=1.2,
            )

        # ✅ Decode full output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # ✅ Extract output matrix
        extracted_matrix = extract_final_matrix(generated_text)

        # ✅ Load ground truth matrix
        ground_truth_matrix = load_ground_truth_matrix(file)

        # ✅ Compare extracted matrix with ground truth
        result = compare_matrices(extracted_matrix, ground_truth_matrix)

        # ✅ Store the result
        results.append({
            "file": file,
            "extracted_matrix": extracted_matrix,
            "ground_truth": ground_truth_matrix,
            "comparison_result": result
        })

        print(f"\n📊 **Comparison Result:** {result}")

        print("\n" + "-" * 100 + "\n")

        # ✅ Save every 50 datapoints to prevent progress loss
        if (start_from + idx + 1) % 50 == 0:
            save_path = f"./{model_name.replace('/', '_')}_results_batch_{start_from + idx + 1}.json"
            with open(save_path, "w") as f:
                json.dump(results, f, indent=4)

            print(f"\n💾 Saved batch {start_from + idx + 1} results for {model_name}")

            # ✅ Clear memory
            gc.collect()
            torch.cuda.empty_cache()

    print(f"\n✅ Completed inference for {model_name} across both GPUs!")

# ✅ Run inference for remaining 168 datapoints using both GPUs
print("\n🚀 Running Model across GPUs for remaining 168 Examples...\n")
run_inference_on_both_gpus(model_name)

print("\n✅ **Completed inference for all 200 examples!** 🚀")



🚀 Running Model across GPUs for remaining 168 Examples...


🚀 Running Model: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1 across both GPUs for 113 remaining datapoints


2025-03-10 03:47:34.124815: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-10 03:47:34.736442: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-10 03:47:34.736478: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-10 03:47:34.736506: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-10 03:47:35.054276: I tensorflow/core/platform/cpu_feature_g

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


🚀 Running inference 88/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/50a16a69.json

📊 **Comparison Result:** ✅ Correct

----------------------------------------------------------------------------------------------------


🚀 Running inference 89/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/1c0d0a4b.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running inference 90/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/195ba7dc.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Runnin

In [5]:
import numpy as np
import json
import os
import torch
import gc

# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Load the 200 filtered JSON files
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

# ✅ Resume from Datapoint 32
num_samples = 200
start_from = 101
test_files = filtered_files[start_from:num_samples]  # ✅ Skip first 31 files

# ✅ Define model & assigned GPUs
model_name = "phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1"
device_map = "auto"  # ✅ Auto-distributes across both GPUs

# ✅ Function to load ground truth matrix
def load_ground_truth_matrix(json_file):
    file_path = os.path.join(dataset_folder, json_file)
    with open(file_path, "r") as f:
        data = json.load(f)

    test_examples = data.get("test", [])
    if not test_examples:
        return None

    return test_examples[0]["output"]

# ✅ Function to compare matrices
def compare_matrices(predicted_matrix, ground_truth):
    if predicted_matrix is None:
        return "⚠️ Invalid Output"

    try:
        predicted_np = np.array(eval(predicted_matrix), dtype=int)
        ground_truth_np = np.array(ground_truth, dtype=int)

        if np.array_equal(predicted_np, ground_truth_np):
            return "✅ Correct"
        else:
            return "❌ Incorrect"
    except Exception as e:
        return "⚠️ Error in matrix comparison: " + str(e)

# ✅ Function to run inference for remaining examples across both GPUs
def run_inference_on_both_gpus(model_name):
    print(f"\n🚀 Running Model: {model_name} across both GPUs for {num_samples - start_from} remaining datapoints")

    # ✅ Load Model across both GPUs
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"  # ✅ Distributes model across GPUs
    )

    # ✅ Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    results = []

    for idx, file in enumerate(test_files):
        print(f"\n🚀 Running inference {start_from + idx + 1}/{num_samples} on {model_name}: {file}")

        # ✅ Construct file path
        file_path = os.path.join(dataset_folder, file)

        # ✅ Construct prompt
        test_prompt = construct_fixed_prompt(file_path)

        # ✅ Tokenize the prompt
        inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")  # ✅ Auto-assigns to available GPU

        # ✅ Remove `token_type_ids` if present
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # ✅ Run inference with safe parameters
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1000,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                repetition_penalty=1.2,
            )

        # ✅ Decode full output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # ✅ Extract output matrix
        extracted_matrix = extract_final_matrix(generated_text)

        # ✅ Load ground truth matrix
        ground_truth_matrix = load_ground_truth_matrix(file)

        # ✅ Compare extracted matrix with ground truth
        result = compare_matrices(extracted_matrix, ground_truth_matrix)

        # ✅ Store the result
        results.append({
            "file": file,
            "extracted_matrix": extracted_matrix,
            "ground_truth": ground_truth_matrix,
            "comparison_result": result
        })

        print(f"\n📊 **Comparison Result:** {result}")

        print("\n" + "-" * 100 + "\n")

        # ✅ Save every 50 datapoints to prevent progress loss
        if (start_from + idx + 1) % 50 == 0:
            save_path = f"./{model_name.replace('/', '_')}_results_batch_{start_from + idx + 1}.json"
            with open(save_path, "w") as f:
                json.dump(results, f, indent=4)

            print(f"\n💾 Saved batch {start_from + idx + 1} results for {model_name}")

            # ✅ Clear memory
            gc.collect()
            torch.cuda.empty_cache()

    print(f"\n✅ Completed inference for {model_name} across both GPUs!")

# ✅ Run inference for remaining 168 datapoints using both GPUs
print("\n🚀 Running Model across GPUs for remaining 168 Examples...\n")
run_inference_on_both_gpus(model_name)

print("\n✅ **Completed inference for all 200 examples!** 🚀")



🚀 Running Model across GPUs for remaining 168 Examples...


🚀 Running Model: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1 across both GPUs for 99 remaining datapoints


2025-03-10 04:26:54.485391: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-10 04:26:55.216206: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-10 04:26:55.216243: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-10 04:26:55.216273: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-10 04:26:55.258253: I tensorflow/core/platform/cpu_feature_g

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


🚀 Running inference 102/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/4852f2fa.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running inference 103/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/55783887.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running inference 104/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/72207abc.json

📊 **Comparison Result:** ✅ Correct

----------------------------------------------------------------------------------------------------


🚀 Run

phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings

In [None]:
import numpy as np
import json
import os
import torch
import gc

# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Load the 200 filtered JSON files
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

# ✅ Resume from Datapoint 32
num_samples = 200
start_from = 0
test_files = filtered_files[start_from:num_samples]  # ✅ Skip first 31 files

# ✅ Define model & assigned GPUs
model_name = "phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings"
device_map = "auto"  # ✅ Auto-distributes across both GPUs

# ✅ Function to load ground truth matrix
def load_ground_truth_matrix(json_file):
    file_path = os.path.join(dataset_folder, json_file)
    with open(file_path, "r") as f:
        data = json.load(f)

    test_examples = data.get("test", [])
    if not test_examples:
        return None

    return test_examples[0]["output"]

# ✅ Function to compare matrices
def compare_matrices(predicted_matrix, ground_truth):
    if predicted_matrix is None:
        return "⚠️ Invalid Output"

    try:
        predicted_np = np.array(eval(predicted_matrix), dtype=int)
        ground_truth_np = np.array(ground_truth, dtype=int)

        if np.array_equal(predicted_np, ground_truth_np):
            return "✅ Correct"
        else:
            return "❌ Incorrect"
    except Exception as e:
        return "⚠️ Error in matrix comparison: " + str(e)

# ✅ Function to run inference for remaining examples across both GPUs
def run_inference_on_both_gpus(model_name):
    print(f"\n🚀 Running Model: {model_name} across both GPUs for {num_samples - start_from} remaining datapoints")

    # ✅ Load Model across both GPUs
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"  # ✅ Distributes model across GPUs
    )

    # ✅ Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    results = []

    for idx, file in enumerate(test_files):
        print(f"\n🚀 Running inference {start_from + idx + 1}/{num_samples} on {model_name}: {file}")

        # ✅ Construct file path
        file_path = os.path.join(dataset_folder, file)

        # ✅ Construct prompt
        test_prompt = construct_fixed_prompt(file_path)

        # ✅ Tokenize the prompt
        inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")  # ✅ Auto-assigns to available GPU

        # ✅ Remove `token_type_ids` if present
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # ✅ Run inference with safe parameters
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1000,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                repetition_penalty=1.2,
            )

        # ✅ Decode full output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # ✅ Extract output matrix
        extracted_matrix = extract_final_matrix(generated_text)

        # ✅ Load ground truth matrix
        ground_truth_matrix = load_ground_truth_matrix(file)

        # ✅ Compare extracted matrix with ground truth
        result = compare_matrices(extracted_matrix, ground_truth_matrix)

        # ✅ Store the result
        results.append({
            "file": file,
            "extracted_matrix": extracted_matrix,
            "ground_truth": ground_truth_matrix,
            "comparison_result": result
        })

        print(f"\n📊 **Comparison Result:** {result}")

        print("\n" + "-" * 100 + "\n")

        # ✅ Save every 50 datapoints to prevent progress loss
        if (start_from + idx + 1) % 50 == 0:
            save_path = f"./{model_name.replace('/', '_')}_results_batch_{start_from + idx + 1}.json"
            with open(save_path, "w") as f:
                json.dump(results, f, indent=4)

            print(f"\n💾 Saved batch {start_from + idx + 1} results for {model_name}")

            # ✅ Clear memory
            gc.collect()
            torch.cuda.empty_cache()

    print(f"\n✅ Completed inference for {model_name} across both GPUs!")

# ✅ Run inference for remaining 168 datapoints using both GPUs
print("\n🚀 Running Model across GPUs for remaining 168 Examples...\n")
run_inference_on_both_gpus(model_name)

print("\n✅ **Completed inference for all 200 examples!** 🚀")



🚀 Running Model across GPUs for remaining 168 Examples...


🚀 Running Model: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings across both GPUs for 200 remaining datapoints


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



🚀 Running inference 1/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running inference 2/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/00576224.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running inference 3/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/e633a9e5.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------

In [5]:
import numpy as np
import json
import os
import torch
import gc

# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Load the 200 filtered JSON files
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

# ✅ Resume from Datapoint 32
num_samples = 200
start_from = 132
test_files = filtered_files[start_from:num_samples]  # ✅ Skip first 31 files

# ✅ Define model & assigned GPUs
model_name = "phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings"
device_map = "auto"  # ✅ Auto-distributes across both GPUs

# ✅ Function to load ground truth matrix
def load_ground_truth_matrix(json_file):
    file_path = os.path.join(dataset_folder, json_file)
    with open(file_path, "r") as f:
        data = json.load(f)

    test_examples = data.get("test", [])
    if not test_examples:
        return None

    return test_examples[0]["output"]

# ✅ Function to compare matrices
def compare_matrices(predicted_matrix, ground_truth):
    if predicted_matrix is None:
        return "⚠️ Invalid Output"

    try:
        predicted_np = np.array(eval(predicted_matrix), dtype=int)
        ground_truth_np = np.array(ground_truth, dtype=int)

        if np.array_equal(predicted_np, ground_truth_np):
            return "✅ Correct"
        else:
            return "❌ Incorrect"
    except Exception as e:
        return "⚠️ Error in matrix comparison: " + str(e)

# ✅ Function to run inference for remaining examples across both GPUs
def run_inference_on_both_gpus(model_name):
    print(f"\n🚀 Running Model: {model_name} across both GPUs for {num_samples - start_from} remaining datapoints")

    # ✅ Load Model across both GPUs
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"  # ✅ Distributes model across GPUs
    )

    # ✅ Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    results = []

    for idx, file in enumerate(test_files):
        print(f"\n🚀 Running inference {start_from + idx + 1}/{num_samples} on {model_name}: {file}")

        # ✅ Construct file path
        file_path = os.path.join(dataset_folder, file)

        # ✅ Construct prompt
        test_prompt = construct_fixed_prompt(file_path)

        # ✅ Tokenize the prompt
        inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")  # ✅ Auto-assigns to available GPU

        # ✅ Remove `token_type_ids` if present
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # ✅ Run inference with safe parameters
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1000,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                repetition_penalty=1.2,
            )

        # ✅ Decode full output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # ✅ Extract output matrix
        extracted_matrix = extract_final_matrix(generated_text)

        # ✅ Load ground truth matrix
        ground_truth_matrix = load_ground_truth_matrix(file)

        # ✅ Compare extracted matrix with ground truth
        result = compare_matrices(extracted_matrix, ground_truth_matrix)

        # ✅ Store the result
        results.append({
            "file": file,
            "extracted_matrix": extracted_matrix,
            "ground_truth": ground_truth_matrix,
            "comparison_result": result
        })

        print(f"\n📊 **Comparison Result:** {result}")

        print("\n" + "-" * 100 + "\n")

        # ✅ Save every 50 datapoints to prevent progress loss
        if (start_from + idx + 1) % 50 == 0:
            save_path = f"./{model_name.replace('/', '_')}_results_batch_{start_from + idx + 1}.json"
            with open(save_path, "w") as f:
                json.dump(results, f, indent=4)

            print(f"\n💾 Saved batch {start_from + idx + 1} results for {model_name}")

            # ✅ Clear memory
            gc.collect()
            torch.cuda.empty_cache()

    print(f"\n✅ Completed inference for {model_name} across both GPUs!")

# ✅ Run inference for remaining 168 datapoints using both GPUs
print("\n🚀 Running Model across GPUs for remaining 168 Examples...\n")
run_inference_on_both_gpus(model_name)

print("\n✅ **Completed inference for all 200 examples!** 🚀")



🚀 Running Model across GPUs for remaining 168 Examples...


🚀 Running Model: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings across both GPUs for 68 remaining datapoints


2025-03-10 17:18:29.903067: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-10 17:18:30.667636: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-10 17:18:30.667671: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-10 17:18:30.667701: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-10 17:18:31.098305: I tensorflow/core/platform/cpu_feature_g

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


🚀 Running inference 133/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/da2b0fe3.json

📊 **Comparison Result:** ⚠️ Invalid Output

----------------------------------------------------------------------------------------------------


🚀 Running inference 134/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/6df30ad6.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running inference 135/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/f3cdc58f.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------

In [6]:
import numpy as np
import json
import os
import torch
import gc

# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Load the 200 filtered JSON files
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

# ✅ Resume from Datapoint 32
num_samples = 200
start_from = 0
test_files = filtered_files[start_from:num_samples]  # ✅ Skip first 31 files

# ✅ Define model & assigned GPUs
model_name = "phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.01"
device_map = "auto"  # ✅ Auto-distributes across both GPUs

# ✅ Function to load ground truth matrix
def load_ground_truth_matrix(json_file):
    file_path = os.path.join(dataset_folder, json_file)
    with open(file_path, "r") as f:
        data = json.load(f)

    test_examples = data.get("test", [])
    if not test_examples:
        return None

    return test_examples[0]["output"]

# ✅ Function to compare matrices
def compare_matrices(predicted_matrix, ground_truth):
    if predicted_matrix is None:
        return "⚠️ Invalid Output"

    try:
        predicted_np = np.array(eval(predicted_matrix), dtype=int)
        ground_truth_np = np.array(ground_truth, dtype=int)

        if np.array_equal(predicted_np, ground_truth_np):
            return "✅ Correct"
        else:
            return "❌ Incorrect"
    except Exception as e:
        return "⚠️ Error in matrix comparison: " + str(e)

# ✅ Function to run inference for remaining examples across both GPUs
def run_inference_on_both_gpus(model_name):
    print(f"\n🚀 Running Model: {model_name} across both GPUs for {num_samples - start_from} remaining datapoints")

    # ✅ Load Model across both GPUs
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"  # ✅ Distributes model across GPUs
    )

    # ✅ Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    results = []

    for idx, file in enumerate(test_files):
        print(f"\n🚀 Running inference {start_from + idx + 1}/{num_samples} on {model_name}: {file}")

        # ✅ Construct file path
        file_path = os.path.join(dataset_folder, file)

        # ✅ Construct prompt
        test_prompt = construct_fixed_prompt(file_path)

        # ✅ Tokenize the prompt
        inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")  # ✅ Auto-assigns to available GPU

        # ✅ Remove `token_type_ids` if present
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # ✅ Run inference with safe parameters
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1000,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                repetition_penalty=1.2,
            )

        # ✅ Decode full output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # ✅ Extract output matrix
        extracted_matrix = extract_final_matrix(generated_text)

        # ✅ Load ground truth matrix
        ground_truth_matrix = load_ground_truth_matrix(file)

        # ✅ Compare extracted matrix with ground truth
        result = compare_matrices(extracted_matrix, ground_truth_matrix)

        # ✅ Store the result
        results.append({
            "file": file,
            "extracted_matrix": extracted_matrix,
            "ground_truth": ground_truth_matrix,
            "comparison_result": result
        })

        print(f"\n📊 **Comparison Result:** {result}")

        print("\n" + "-" * 100 + "\n")

        # ✅ Save every 50 datapoints to prevent progress loss
        if (start_from + idx + 1) % 50 == 0:
            save_path = f"./{model_name.replace('/', '_')}_results_batch_{start_from + idx + 1}.json"
            with open(save_path, "w") as f:
                json.dump(results, f, indent=4)

            print(f"\n💾 Saved batch {start_from + idx + 1} results for {model_name}")

            # ✅ Clear memory
            gc.collect()
            torch.cuda.empty_cache()

    print(f"\n✅ Completed inference for {model_name} across both GPUs!")

# ✅ Run inference for remaining 168 datapoints using both GPUs
print("\n🚀 Running Model across GPUs for remaining 168 Examples...\n")
run_inference_on_both_gpus(model_name)

print("\n✅ **Completed inference for all 200 examples!** 🚀")



🚀 Running Model across GPUs for remaining 168 Examples...


🚀 Running Model: phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.01 across both GPUs for 200 remaining datapoints


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


🚀 Running inference 1/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.01: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json

📊 **Comparison Result:** ⚠️ Error in matrix comparison: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (14,) + inhomogeneous part.

----------------------------------------------------------------------------------------------------


🚀 Running inference 2/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.01: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/00576224.json

📊 **Comparison Result:** ✅ Correct

----------------------------------------------------------------------------------------------------


🚀 Running inference 3/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.01: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluat

In [None]:
import numpy as np
import json
import os
import torch
import gc

# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Load the 200 filtered JSON files
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

# ✅ Resume from Datapoint 32
num_samples = 200
start_from = 0
test_files = filtered_files[start_from:num_samples]  # ✅ Skip first 31 files

# ✅ Define model & assigned GPUs
model_name = "phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings"
device_map = "auto"  # ✅ Auto-distributes across both GPUs

# ✅ Function to load ground truth matrix
def load_ground_truth_matrix(json_file):
    file_path = os.path.join(dataset_folder, json_file)
    with open(file_path, "r") as f:
        data = json.load(f)

    test_examples = data.get("test", [])
    if not test_examples:
        return None

    return test_examples[0]["output"]

# ✅ Function to compare matrices
def compare_matrices(predicted_matrix, ground_truth):
    if predicted_matrix is None:
        return "⚠️ Invalid Output"

    try:
        predicted_np = np.array(eval(predicted_matrix), dtype=int)
        ground_truth_np = np.array(ground_truth, dtype=int)

        if np.array_equal(predicted_np, ground_truth_np):
            return "✅ Correct"
        else:
            return "❌ Incorrect"
    except Exception as e:
        return "⚠️ Error in matrix comparison: " + str(e)

# ✅ Function to run inference for remaining examples across both GPUs
def run_inference_on_both_gpus(model_name):
    print(f"\n🚀 Running Model: {model_name} across both GPUs for {num_samples - start_from} remaining datapoints")

    # ✅ Load Model across both GPUs
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"  # ✅ Distributes model across GPUs
    )

    # ✅ Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    results = []

    for idx, file in enumerate(test_files):
        print(f"\n🚀 Running inference {start_from + idx + 1}/{num_samples} on {model_name}: {file}")

        # ✅ Construct file path
        file_path = os.path.join(dataset_folder, file)

        # ✅ Construct prompt
        test_prompt = construct_fixed_prompt(file_path)

        # ✅ Tokenize the prompt
        inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")  # ✅ Auto-assigns to available GPU

        # ✅ Remove `token_type_ids` if present
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # ✅ Run inference with safe parameters
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1000,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                repetition_penalty=1.2,
            )

        # ✅ Decode full output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # ✅ Extract output matrix
        extracted_matrix = extract_final_matrix(generated_text)

        # ✅ Load ground truth matrix
        ground_truth_matrix = load_ground_truth_matrix(file)

        # ✅ Compare extracted matrix with ground truth
        result = compare_matrices(extracted_matrix, ground_truth_matrix)

        # ✅ Store the result
        results.append({
            "file": file,
            "extracted_matrix": extracted_matrix,
            "ground_truth": ground_truth_matrix,
            "comparison_result": result
        })

        print(f"\n📊 **Comparison Result:** {result}")

        print("\n" + "-" * 100 + "\n")

        # ✅ Save every 50 datapoints to prevent progress loss
        if (start_from + idx + 1) % 50 == 0:
            save_path = f"./{model_name.replace('/', '_')}_results_batch_{start_from + idx + 1}.json"
            with open(save_path, "w") as f:
                json.dump(results, f, indent=4)

            print(f"\n💾 Saved batch {start_from + idx + 1} results for {model_name}")

            # ✅ Clear memory
            gc.collect()
            torch.cuda.empty_cache()

    print(f"\n✅ Completed inference for {model_name} across both GPUs!")

# ✅ Run inference for remaining 168 datapoints using both GPUs
print("\n🚀 Running Model across GPUs for remaining 168 Examples...\n")
run_inference_on_both_gpus(model_name)

print("\n✅ **Completed inference for all 200 examples!** 🚀")



🚀 Running Model across GPUs for remaining 168 Examples...


🚀 Running Model: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings across both GPUs for 200 remaining datapoints


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


🚀 Running inference 1/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running inference 2/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/00576224.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running inference 3/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/e633a9e5.json

📊 **Comparison Result:** ❌ Incorrect

-------------------------------------------------------

In [9]:
import numpy as np
import json
import os
import torch
import gc

# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Load the 200 filtered JSON files
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

# ✅ Resume from Datapoint 32
num_samples = 200
start_from = 29
test_files = filtered_files[start_from:num_samples]  # ✅ Skip first 31 files

# ✅ Define model & assigned GPUs
model_name = "phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings"
device_map = "auto"  # ✅ Auto-distributes across both GPUs

# ✅ Function to load ground truth matrix
def load_ground_truth_matrix(json_file):
    file_path = os.path.join(dataset_folder, json_file)
    with open(file_path, "r") as f:
        data = json.load(f)

    test_examples = data.get("test", [])
    if not test_examples:
        return None

    return test_examples[0]["output"]

# ✅ Function to compare matrices
def compare_matrices(predicted_matrix, ground_truth):
    if predicted_matrix is None:
        return "⚠️ Invalid Output"

    try:
        predicted_np = np.array(eval(predicted_matrix), dtype=int)
        ground_truth_np = np.array(ground_truth, dtype=int)

        if np.array_equal(predicted_np, ground_truth_np):
            return "✅ Correct"
        else:
            return "❌ Incorrect"
    except Exception as e:
        return "⚠️ Error in matrix comparison: " + str(e)

# ✅ Function to run inference for remaining examples across both GPUs
def run_inference_on_both_gpus(model_name):
    print(f"\n🚀 Running Model: {model_name} across both GPUs for {num_samples - start_from} remaining datapoints")

    # ✅ Load Model across both GPUs
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"  # ✅ Distributes model across GPUs
    )

    # ✅ Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    results = []

    for idx, file in enumerate(test_files):
        print(f"\n🚀 Running inference {start_from + idx + 1}/{num_samples} on {model_name}: {file}")

        # ✅ Construct file path
        file_path = os.path.join(dataset_folder, file)

        # ✅ Construct prompt
        test_prompt = construct_fixed_prompt(file_path)

        # ✅ Tokenize the prompt
        inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")  # ✅ Auto-assigns to available GPU

        # ✅ Remove `token_type_ids` if present
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # ✅ Run inference with safe parameters
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1000,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                repetition_penalty=1.2,
            )

        # ✅ Decode full output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # ✅ Extract output matrix
        extracted_matrix = extract_final_matrix(generated_text)

        # ✅ Load ground truth matrix
        ground_truth_matrix = load_ground_truth_matrix(file)

        # ✅ Compare extracted matrix with ground truth
        result = compare_matrices(extracted_matrix, ground_truth_matrix)

        # ✅ Store the result
        results.append({
            "file": file,
            "extracted_matrix": extracted_matrix,
            "ground_truth": ground_truth_matrix,
            "comparison_result": result
        })

        print(f"\n📊 **Comparison Result:** {result}")

        print("\n" + "-" * 100 + "\n")

        # ✅ Save every 50 datapoints to prevent progress loss
        if (start_from + idx + 1) % 50 == 0:
            save_path = f"./{model_name.replace('/', '_')}_results_batch_{start_from + idx + 1}.json"
            with open(save_path, "w") as f:
                json.dump(results, f, indent=4)

            print(f"\n💾 Saved batch {start_from + idx + 1} results for {model_name}")

            # ✅ Clear memory
            gc.collect()
            torch.cuda.empty_cache()

    print(f"\n✅ Completed inference for {model_name} across both GPUs!")

# ✅ Run inference for remaining 168 datapoints using both GPUs
print("\n🚀 Running Model across GPUs for remaining 168 Examples...\n")
run_inference_on_both_gpus(model_name)

print("\n✅ **Completed inference for all 200 examples!** 🚀")



🚀 Running Model across GPUs for remaining 168 Examples...


🚀 Running Model: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings across both GPUs for 171 remaining datapoints


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


🚀 Running inference 30/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/7953d61e.json

📊 **Comparison Result:** ❌ Incorrect

----------------------------------------------------------------------------------------------------


🚀 Running inference 31/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/66e6c45b.json

📊 **Comparison Result:** ✅ Correct

----------------------------------------------------------------------------------------------------


🚀 Running inference 32/200 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/5b6cbef5.json

📊 **Comparison Result:** ❌ Incorrect

------------------------------------------------------

In [3]:
import json
import os
import pandas as pd
import glob

# ✅ Define the six models
models = [
    "phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.1_WithoutReasonings",
    "phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings",
    "phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings",
    "phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.1",
    "phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1",
    "phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.01"
]

# ✅ Initialize results dictionary
all_model_results = {}

# ✅ Load results for each model
for model in models:
    model_key = model.replace("/", "_")  # Replace slashes to match saved file names
    result_files = sorted(glob.glob(f"./{model_key}_results_batch_*.json"))  # Get all saved batches
    
    model_results = []
    
    for file in result_files:
        with open(file, "r") as f:
            batch_results = json.load(f)
            model_results.extend(batch_results)  # Append batch results
    
    # ✅ Store model results
    all_model_results[model] = model_results

# ✅ Display Results for Each Model
for model, results in all_model_results.items():
    print(f"\n🚀 **Results for {model}**\n")
    
    correct_count = 0
    invalid_count = 0
    total_count = len(results)

    

    # ✅ Compute Adjusted Accuracy
    valid_cases = 200 - invalid_count  # Ignore invalid cases
    accuracy = (correct_count / valid_cases) * 100 if valid_cases > 0 else 0

    print(f"\n📊 **Overall Adjusted Accuracy for {model}: {accuracy:.2f}%** (excluding invalid cases)")
    print(f"📊 **Total Invalid Cases: {invalid_count} / 200**")
    print("\n" + "="*100 + "\n")

# ✅ Summary Table of Results
summary_data = []
for model, results in all_model_results.items():
    correct_count = sum(1 for r in results if r["comparison_result"] == "✅ Correct")
    invalid_count = sum(1 for r in results if r["comparison_result"] == "⚠️ Invalid Output")
    valid_cases = 200 - invalid_count

    accuracy = (correct_count / valid_cases) * 100 if valid_cases > 0 else 0

    summary_data.append([model, accuracy, invalid_count])

# ✅ Convert to DataFrame for Display
summary_df = pd.DataFrame(summary_data, columns=["Model", "Adjusted Accuracy (%)", "Invalid Cases"])
import ace_tools as tools
tools.display_dataframe_to_user(name="Model Performance Summary", dataframe=summary_df)



🚀 **Results for phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.1_WithoutReasonings**


📊 **Overall Adjusted Accuracy for phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.1_WithoutReasonings: 0.00%** (excluding invalid cases)
📊 **Total Invalid Cases: 0 / 200**



🚀 **Results for phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings**


📊 **Overall Adjusted Accuracy for phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings: 0.00%** (excluding invalid cases)
📊 **Total Invalid Cases: 0 / 200**



🚀 **Results for phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings**


📊 **Overall Adjusted Accuracy for phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings: 0.00%** (excluding invalid cases)
📊 **Total Invalid Cases: 0 / 200**



🚀 **Results for phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.1**


📊 **Overall A

ModuleNotFoundError: No module named 'ace_tools'

In [1]:
import os
import gc
import torch
import json
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ✅ Prevent Memory Fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# ✅ Function to Clear GPU Memory
def clear_gpu_memory():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    print("\n✅ GPU memory cleared.")

print("✅ Libraries loaded and GPU management set up.")


✅ Libraries loaded and GPU management set up.


In [2]:
from glob import glob

# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Load the filtered 200 shortest JSON files
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

print(f"\n✅ Loaded {len(filtered_files)} filtered JSON files for evaluation.")



✅ Loaded 200 filtered JSON files for evaluation.


In [3]:
# ✅ Clear GPU Memory Before Loading Model
clear_gpu_memory()

# ✅ Define model name
model_name = "phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.1"

print(f"\n🚀 Loading new model: {model_name}")

# ✅ Set 4-bit quantization settings for efficient GPU usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

# ✅ Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="balanced"  # ✅ Distributes model across 2 GPUs automatically
)

print("\n✅ Model successfully loaded across both GPUs.")



✅ GPU memory cleared.

🚀 Loading new model: phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.1


tokenizer_config.json:   0%|          | 0.00/51.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

2025-03-09 02:17:39.269337: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-09 02:17:39.999118: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-09 02:17:39.999156: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-09 02:17:39.999183: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-09 02:17:40.469879: I tensorflow/core/platform/cpu_feature_g

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]


✅ Model successfully loaded across both GPUs.


In [4]:
# ✅ Function to construct a well-formatted prompt from JSON files
def construct_fixed_prompt(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)

    # ✅ Extract training & test examples
    training_examples = data.get("train", [])
    test_examples = data.get("test", [])

    if not test_examples or not training_examples:
        return None  # Skip if missing data

    test_input = test_examples[0]["input"]
    
    # ✅ Build an improved prompt that explicitly asks for only the output matrix
    prompt = "Below are some training examples (Input-Output pairs). Use them to generate only the final output matrix for the given test input.\n\n"

    for ex in training_examples:
        prompt += f"Input: {ex['input']}\n"
        prompt += f"Output: {ex['output']}\n\n"

    prompt += f"Test Input Matrix:\n{test_input}\n\n"
    prompt += "**Provide ONLY the final output matrix for the test input. Do NOT include any other text.**"

    return prompt

print("\n✅ Function to construct prompt is ready.")



✅ Function to construct prompt is ready.


In [5]:
import re

# ✅ Function to extract output matrix after instruction text
def extract_final_matrix(generated_text):
    instruction_text = "Provide ONLY the final output matrix for the test input. Do NOT include any other text."
    
    # ✅ Find the position where the instruction appears
    start_idx = generated_text.find(instruction_text)
    if start_idx == -1:
        return "⚠️ Instruction text not found in output."

    # ✅ Extract everything after the instruction
    output_after_instruction = generated_text[start_idx + len(instruction_text):].strip()

    # ✅ Use regex to find the first valid matrix
    matrix_match = re.search(r"\[\[.*\]\]", output_after_instruction, re.DOTALL)
    
    if matrix_match:
        return matrix_match.group(0)  # ✅ Return the matrix
    else:
        return "⚠️ No valid matrix found in extracted output."

print("\n✅ Function to extract output matrix is ready.")



✅ Function to extract output matrix is ready.


In [6]:
# ✅ Select the first 10 files from the filtered shortest files
num_samples = 10  # Running on 10 examples first
test_files = filtered_files[:num_samples]

# ✅ Dictionary to store results
comparison_results = {}

# ✅ Function to load ground truth matrix from JSON
def load_ground_truth_matrix(file):
    file_path = os.path.join(dataset_folder, file)
    with open(file_path, "r") as f:
        data = json.load(f)
    
    test_examples = data.get("test", [])
    if not test_examples:
        return None  # No test data found

    return test_examples[0].get("output", None)  # Return the first test output matrix

# ✅ Function to compare matrices element-wise
def compare_matrices(extracted_matrix_text, ground_truth):
    if ground_truth is None:
        return "⚠️ No ground truth matrix found in JSON."

    try:
        # ✅ Convert extracted matrix string to a Python list
        extracted_matrix = eval(extracted_matrix_text)  # Convert string representation to list

        # ✅ Convert to numpy arrays for comparison
        extracted_np = np.array(extracted_matrix)
        ground_truth_np = np.array(ground_truth)

        # ✅ Check if matrices are the same
        if extracted_np.shape != ground_truth_np.shape:
            return f"❌ Shape Mismatch! Extracted: {extracted_np.shape}, Ground Truth: {ground_truth_np.shape}"

        # ✅ Check element-wise equality
        is_equal = np.array_equal(extracted_np, ground_truth_np)

        if is_equal:
            return "✅ The extracted matrix matches the ground truth!"
        else:
            return "❌ The extracted matrix does NOT match the ground truth."

    except Exception as e:
        return f"⚠️ Error in matrix comparison: {e}"

# ✅ Run inference and comparison for each file
for idx, file in enumerate(test_files):
    print(f"\n🚀 Running inference {idx + 1}/{num_samples}: {file}")

    # ✅ Construct file path
    file_path = os.path.join(dataset_folder, file)
    
    # ✅ Construct prompt
    test_prompt = construct_fixed_prompt(file_path)

    # ✅ Tokenize the prompt
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,  
            do_sample=True,  
            temperature=0.7,  
            top_k=50,  
            top_p=0.9,  
            repetition_penalty=1.2,  
        )

    # ✅ Decode full output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # ✅ Extract output matrix
    extracted_matrix = extract_final_matrix(generated_text)

    # ✅ Load ground truth matrix
    ground_truth_matrix = load_ground_truth_matrix(file)

    # ✅ Compare matrices
    comparison_result = compare_matrices(extracted_matrix, ground_truth_matrix)

    # ✅ Store results
    comparison_results[file] = {
        "extracted_matrix": extracted_matrix,
        "ground_truth": ground_truth_matrix,
        "comparison": comparison_result
    }

    # ✅ Print results for verification
    print("\n🔍 **Extracted Output Matrix:**\n")
    print(extracted_matrix)
    print("\n🎯 **Ground Truth Matrix:**\n")
    print(np.array(ground_truth_matrix) if ground_truth_matrix else "⚠️ No ground truth available")
    print("\n📊 **Comparison Result:**", comparison_result)
    print("\n" + "-" * 100 + "\n")

# ✅ Summary of correct vs incorrect predictions
num_correct = sum(1 for r in comparison_results.values() if "matches" in r["comparison"])
num_incorrect = num_samples - num_correct

print("\n📊 **Summary of First 10 Examples:**")
print(f"✅ {num_correct}/{num_samples} matrices matched the ground truth.")
print(f"❌ {num_incorrect}/{num_samples} matrices did NOT match the ground truth.")



🚀 Running inference 1/10: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json

🔍 **Extracted Output Matrix:**

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2], [4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2], [4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2], [4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0]]

🎯 **Ground Truth Matrix:**

[[0 2 0 0 0 2 0 0 0 2 0 0]
 [0 0 4 0 0 0 4 0 0 0 4 0]
 [0 0 0 2 0 0 0 2 0 0 0 2]
 [4 0 0 0 4 0 0 0 4 0 0 0]
 [0 2 0 0 0 2 0 0 0 2 0 0]
 [0 0 4 0 0 0 4 0 0 0 4 0]
 [0 0 0 2 0 0 0 2 0 0 0 2]
 [4 0 0 0 4 0 0 0 4 0 0 0]
 [0 2 0 0 0 2 0 0 0 2 0 0]
 [0 0 4 0 0 0 4 0 0 0 4 0]
 [0 0 0 2 0 0 0 2 0 0 0 2]
 [4 0 0 0 4 0 0 0 4 0 0 0]]

📊 **Comparison Result:** ❌ The extract

In [8]:
import json

# ✅ Set batch save settings
num_samples = 200
batch_size = 50  # ✅ Save progress every 50 examples

# ✅ Dictionary to store all results
all_results = {}

# ✅ Function to save results periodically
def save_results(batch_index):
    save_path = f"./{model_name}_results_batch_{batch_index}.json"
    with open(save_path, "w") as f:
        json.dump(all_results, f, indent=4)
    print(f"\n💾 **Saved results for {batch_index} examples at:** {save_path}")

# ✅ Run inference and comparison for each file
for idx, file in enumerate(filtered_files[:num_samples]):
    # ✅ Construct file path
    file_path = os.path.join(dataset_folder, file)
    
    # ✅ Construct prompt
    test_prompt = construct_fixed_prompt(file_path)

    # ✅ Tokenize the prompt
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,  
            do_sample=True,  
            temperature=0.7,  
            top_k=50,  
            top_p=0.9,  
            repetition_penalty=1.2,  
        )

    # ✅ Decode full output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # ✅ Extract output matrix
    extracted_matrix = extract_final_matrix(generated_text)

    # ✅ Load ground truth matrix
    ground_truth_matrix = load_ground_truth_matrix(file)

    # ✅ Compare matrices
    comparison_result = compare_matrices(extracted_matrix, ground_truth_matrix)

    # ✅ Store results
    all_results[file] = {
        "model_name": model_name,
        "extracted_matrix": extracted_matrix,
        "ground_truth": ground_truth_matrix,
        "comparison": comparison_result
    }

    # ✅ Display live progress with symbols
    if "matches" in comparison_result:
        print(f"✅ {idx + 1}/{num_samples} - Correct")
    elif "❌" in comparison_result:
        print(f"❌ {idx + 1}/{num_samples} - Incorrect")
    elif "⚠️" in comparison_result:
        print(f"⚠️ {idx + 1}/{num_samples} - Invalid Output")

    # ✅ Save results every 50 examples
    if (idx + 1) % batch_size == 0:
        save_results(idx + 1)

# ✅ Final save for remaining results
save_results(num_samples)
print("\n✅ **Completed inference & comparison for all 200 examples!** 🚀")


⚠️ 1/200 - Invalid Output




⚠️ 2/200 - Invalid Output
❌ 3/200 - Incorrect
⚠️ 4/200 - Invalid Output
❌ 5/200 - Incorrect
❌ 6/200 - Incorrect
❌ 7/200 - Incorrect
❌ 8/200 - Incorrect
❌ 9/200 - Incorrect
⚠️ 10/200 - Invalid Output
⚠️ 11/200 - Invalid Output
⚠️ 12/200 - Invalid Output
⚠️ 13/200 - Invalid Output
❌ 14/200 - Incorrect
⚠️ 15/200 - Invalid Output
⚠️ 16/200 - Invalid Output
⚠️ 17/200 - Invalid Output
⚠️ 18/200 - Invalid Output
⚠️ 19/200 - Invalid Output


KeyboardInterrupt: 

In [11]:
# ✅ Select one example to inspect
test_file = filtered_files[2]  # Change index if you want a different file
file_path = os.path.join(dataset_folder, test_file)

print(f"\n🚀 Running inference for: {test_file}")

# ✅ Construct prompt
test_prompt = construct_fixed_prompt(file_path)

# ✅ Tokenize the prompt
inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

# ✅ Remove `token_type_ids` if present
if "token_type_ids" in inputs:
    inputs.pop("token_type_ids")

# ✅ Run inference with NO stopping conditions
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=1000,  # ✅ Allow long generation without cutting off
        do_sample=True,  
        temperature=0.7,  
        top_k=50,  
        top_p=0.9,  
        repetition_penalty=1.2,  # ✅ Reduces repetitive patterns
    )

# ✅ Decode full output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

# ✅ Print entire model output exactly as generated
print("\n🚀 **Full Generated Output (Unfiltered):**\n")
print(generated_text)



🚀 Running inference for: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/e633a9e5.json

🚀 **Full Generated Output (Unfiltered):**

<|begin_of_text|>Below are some training examples (Input-Output pairs). Use them to generate only the final output matrix for the given test input.

Input: [[6, 5, 5], [5, 1, 7], [4, 5, 2]]
Output: [[6, 6, 5, 5, 5], [6, 6, 5, 5, 5], [5, 5, 1, 7, 7], [4, 4, 5, 2, 2], [4, 4, 5, 2, 2]]

Input: [[1, 3, 5], [1, 2, 8], [8, 3, 8]]
Output: [[1, 1, 3, 5, 5], [1, 1, 3, 5, 5], [1, 1, 2, 8, 8], [8, 8, 3, 8, 8], [8, 8, 3, 8, 8]]

Input: [[2, 3, 7], [2, 1, 6], [1, 5, 7]]
Output: [[2, 2, 3, 7, 7], [2, 2, 3, 7, 7], [2, 2, 1, 6, 6], [1, 1, 5, 7, 7], [1, 1, 5, 7, 7]]

Test Input Matrix:
[[1, 2, 5], [7, 3, 6], [7, 6, 5]]

**Provide ONLY the final output matrix for the test input. Do NOT include any other text.** 
[[1, 1, 2, 5, 5], [1, 1, 2, 5, 5], [7, 7, 3, 6, 6], [7, 7, 3, 6, 6]] 
[/column] [/column] [/row] [/table] [/column] [/columns] [/div] [/

In [12]:
import re

# ✅ Function to extract matrix after the instruction text and stop at `]]`
def extract_final_matrix(generated_text):
    instruction_text = "Provide ONLY the final output matrix for the test input. Do NOT include any other text."
    
    # ✅ Find where the instruction appears
    start_idx = generated_text.find(instruction_text)
    if start_idx == -1:
        return "⚠️ Instruction text not found in output."

    # ✅ Extract everything after the instruction
    output_after_instruction = generated_text[start_idx + len(instruction_text):].strip()

    # ✅ Find first matrix **only after instruction text**
    matrix_match = re.search(r"\[\[.*?\]\]", output_after_instruction, re.DOTALL)
    
    if matrix_match:
        return matrix_match.group(0)  # ✅ Return the extracted matrix
    else:
        return "⚠️ No valid matrix found after instruction."

print("\n✅ **Updated matrix extraction function is ready!**")



✅ **Updated matrix extraction function is ready!**


In [13]:
# ✅ Select the first 5 files from the filtered dataset
num_samples = 5  # Running on 5 examples first
test_files = filtered_files[:num_samples]

# ✅ Dictionary to store results
comparison_results = {}

# ✅ Run inference and comparison for each file
for idx, file in enumerate(test_files):
    print(f"\n🚀 Running inference {idx + 1}/{num_samples}: {file}")

    # ✅ Construct file path
    file_path = os.path.join(dataset_folder, file)
    
    # ✅ Construct prompt
    test_prompt = construct_fixed_prompt(file_path)

    # ✅ Tokenize the prompt
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,  
            do_sample=True,  
            temperature=0.7,  
            top_k=50,  
            top_p=0.9,  
            repetition_penalty=1.2,  
        )

    # ✅ Decode full output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # ✅ Extract output matrix using updated method
    extracted_matrix = extract_final_matrix(generated_text)

    # ✅ Load ground truth matrix
    ground_truth_matrix = load_ground_truth_matrix(file)

    # ✅ Compare matrices
    comparison_result = compare_matrices(extracted_matrix, ground_truth_matrix)

    # ✅ Store results
    comparison_results[file] = {
        "extracted_matrix": extracted_matrix,
        "ground_truth": ground_truth_matrix,
        "comparison": comparison_result
    }

    # ✅ Print results for verification
    print("\n🔍 **Extracted Output Matrix:**\n")
    print(extracted_matrix)
    print("\n🎯 **Ground Truth Matrix:**\n")
    print(np.array(ground_truth_matrix) if ground_truth_matrix else "⚠️ No ground truth available")
    print("\n📊 **Comparison Result:**", comparison_result)
    print("\n" + "-" * 100 + "\n")

# ✅ Summary of correct vs incorrect predictions
num_correct = sum(1 for r in comparison_results.values() if "matches" in r["comparison"])
num_incorrect = num_samples - num_correct

print("\n📊 **Summary of First 5 Examples:**")
print(f"✅ {num_correct}/{num_samples} matrices matched the ground truth.")
print(f"❌ {num_incorrect}/{num_samples} matrices did NOT match the ground truth.")



🚀 Running inference 1/5: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json

🔍 **Extracted Output Matrix:**

⚠️ No valid matrix found after instruction.

🎯 **Ground Truth Matrix:**

[[0 2 0 0 0 2 0 0 0 2 0 0]
 [0 0 4 0 0 0 4 0 0 0 4 0]
 [0 0 0 2 0 0 0 2 0 0 0 2]
 [4 0 0 0 4 0 0 0 4 0 0 0]
 [0 2 0 0 0 2 0 0 0 2 0 0]
 [0 0 4 0 0 0 4 0 0 0 4 0]
 [0 0 0 2 0 0 0 2 0 0 0 2]
 [4 0 0 0 4 0 0 0 4 0 0 0]
 [0 2 0 0 0 2 0 0 0 2 0 0]
 [0 0 4 0 0 0 4 0 0 0 4 0]
 [0 0 0 2 0 0 0 2 0 0 0 2]
 [4 0 0 0 4 0 0 0 4 0 0 0]]

📊 **Comparison Result:** ⚠️ Error in matrix comparison: invalid character '⚠' (U+26A0) (<string>, line 1)

----------------------------------------------------------------------------------------------------


🚀 Running inference 2/5: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/00576224.json

🔍 **Extracted Output Matrix:**

[[3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8], [2, 3, 2, 3, 2, 3], [8, 7, 8, 7, 8, 7], [3, 2, 3, 2, 

In [14]:
import json

# ✅ Set batch save settings
num_samples = 200
batch_size = 50  # ✅ Save progress every 50 examples

# ✅ Dictionary to store all results
all_results = {}

# ✅ Function to save results periodically
def save_results(batch_index):
    save_path = f"./{model_name}_results_batch_{batch_index}.json"
    with open(save_path, "w") as f:
        json.dump(all_results, f, indent=4)
    print(f"\n💾 **Saved results for {batch_index} examples at:** {save_path}")

# ✅ Run inference and comparison for each file
for idx, file in enumerate(filtered_files[:num_samples]):
    print(f"\n🚀 Running inference {idx + 1}/{num_samples}: {file}")

    # ✅ Construct file path
    file_path = os.path.join(dataset_folder, file)
    
    # ✅ Construct prompt
    test_prompt = construct_fixed_prompt(file_path)

    # ✅ Tokenize the prompt
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,  
            do_sample=True,  
            temperature=0.7,  
            top_k=50,  
            top_p=0.9,  
            repetition_penalty=1.2,  
        )

    # ✅ Decode full output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # ✅ Extract output matrix using updated method
    extracted_matrix = extract_final_matrix(generated_text)

    # ✅ Load ground truth matrix
    ground_truth_matrix = load_ground_truth_matrix(file)

    # ✅ Compare matrices
    comparison_result = compare_matrices(extracted_matrix, ground_truth_matrix)

    # ✅ Store results
    all_results[file] = {
        "model_name": model_name,
        "extracted_matrix": extracted_matrix,
        "ground_truth": ground_truth_matrix,
        "comparison": comparison_result
    }

    # ✅ Display live progress with symbols
    if "matches" in comparison_result:
        print(f"✅ {idx + 1}/{num_samples} - Correct")
    elif "❌" in comparison_result:
        print(f"❌ {idx + 1}/{num_samples} - Incorrect")
    elif "⚠️" in comparison_result:
        print(f"⚠️ {idx + 1}/{num_samples} - Invalid Output")

    # ✅ Save results every 50 examples
    if (idx + 1) % batch_size == 0:
        save_results(idx + 1)

# ✅ Final save for remaining results
save_results(num_samples)
print("\n✅ **Completed inference & comparison for all 200 examples!** 🚀")



🚀 Running inference 1/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json
❌ 1/200 - Incorrect

🚀 Running inference 2/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/00576224.json
✅ 2/200 - Correct

🚀 Running inference 3/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/e633a9e5.json
❌ 3/200 - Incorrect

🚀 Running inference 4/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/c48954c1.json
⚠️ 4/200 - Invalid Output

🚀 Running inference 5/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/a59b95c0.json
❌ 5/200 - Incorrect

🚀 Running inference 6/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/8e2edd66.json
❌ 6/200 - Incorrect

🚀 Running inference 7/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/ad7e01d0.json
❌ 7/200 - Incorrect

🚀 Running inference 8/200: /p

FileNotFoundError: [Errno 2] No such file or directory: './phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.1_results_batch_50.json'

In [15]:
import json
import os

# ✅ Ensure save directory exists
save_dir = "./model_results"
os.makedirs(save_dir, exist_ok=True)  # ✅ Creates the directory if it does not exist

# ✅ Set batch save settings
start_idx = 50  # ✅ Start from 51st example
num_samples = 200  # ✅ Total dataset size
batch_size = 50  # ✅ Save progress every 50 examples

# ✅ Dictionary to store all results
all_results = {}

# ✅ Function to save results periodically
def save_results(batch_index):
    save_path = os.path.join(save_dir, f"{model_name}_results_batch_{batch_index}.json")
    with open(save_path, "w") as f:
        json.dump(all_results, f, indent=4)
    print(f"\n💾 **Saved results for {batch_index} examples at:** {save_path}")

# ✅ Run inference and comparison for the remaining 150 files
for idx, file in enumerate(filtered_files[start_idx:num_samples]):
    actual_idx = start_idx + idx + 1  # ✅ Adjust index to reflect the actual dataset position
    print(f"\n🚀 Running inference {actual_idx}/{num_samples}: {file}")

    # ✅ Construct file path
    file_path = os.path.join(dataset_folder, file)
    
    # ✅ Construct prompt
    test_prompt = construct_fixed_prompt(file_path)

    # ✅ Tokenize the prompt
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,  
            do_sample=True,  
            temperature=0.7,  
            top_k=50,  
            top_p=0.9,  
            repetition_penalty=1.2,  
        )

    # ✅ Decode full output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # ✅ Extract output matrix using updated method
    extracted_matrix = extract_final_matrix(generated_text)

    # ✅ Load ground truth matrix
    ground_truth_matrix = load_ground_truth_matrix(file)

    # ✅ Compare matrices
    comparison_result = compare_matrices(extracted_matrix, ground_truth_matrix)

    # ✅ Store results
    all_results[file] = {
        "model_name": model_name,
        "extracted_matrix": extracted_matrix,
        "ground_truth": ground_truth_matrix,
        "comparison": comparison_result
    }

    # ✅ Display live progress with symbols
    if "matches" in comparison_result:
        print(f"✅ {actual_idx}/{num_samples} - Correct")
    elif "❌" in comparison_result:
        print(f"❌ {actual_idx}/{num_samples} - Incorrect")
    elif "⚠️" in comparison_result:
        print(f"⚠️ {actual_idx}/{num_samples} - Invalid Output")

    # ✅ Save results every 50 examples
    if actual_idx % batch_size == 0:
        save_results(actual_idx)

# ✅ Final save for remaining results
save_results(num_samples)
print("\n✅ **Completed inference & comparison for the remaining 150 examples!** 🚀")



🚀 Running inference 51/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/c074846d.json
❌ 51/200 - Incorrect

🚀 Running inference 52/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/cad67732.json
⚠️ 52/200 - Invalid Output

🚀 Running inference 53/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/fb791726.json
❌ 53/200 - Incorrect

🚀 Running inference 54/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/8ba14f53.json
⚠️ 54/200 - Invalid Output

🚀 Running inference 55/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/e5790162.json
❌ 55/200 - Incorrect

🚀 Running inference 56/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/bbb1b8b6.json
❌ 56/200 - Incorrect

🚀 Running inference 57/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/2a5f8217.json
❌ 57/200 - Incorrect

🚀 Runni

FileNotFoundError: [Errno 2] No such file or directory: './model_results/phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.1_results_batch_100.json'

In [None]:
import json
import os

# ✅ Set batch processing settings
start_idx = 100  # ✅ Start from 101st example
num_samples = 200  # ✅ Total dataset size

# ✅ Dictionary to store results
all_results = {}

# ✅ Run inference and comparison for the remaining files
for idx, file in enumerate(filtered_files[start_idx:num_samples]):
    actual_idx = start_idx + idx + 1  # ✅ Adjust index to reflect actual dataset position
    print(f"\n🚀 Running inference {actual_idx}/{num_samples}: {file}")

    # ✅ Construct file path
    file_path = os.path.join(dataset_folder, file)
    
    # ✅ Construct prompt
    test_prompt = construct_fixed_prompt(file_path)

    # ✅ Tokenize the prompt
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,  
            do_sample=True,  
            temperature=0.7,  
            top_k=50,  
            top_p=0.9,  
            repetition_penalty=1.2,  
        )

    # ✅ Decode full output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # ✅ Extract output matrix using updated method
    extracted_matrix = extract_final_matrix(generated_text)

    # ✅ Load ground truth matrix
    ground_truth_matrix = load_ground_truth_matrix(file)

    # ✅ Compare matrices
    comparison_result = compare_matrices(extracted_matrix, ground_truth_matrix)

    # ✅ Store results
    all_results[file] = {
        "model_name": model_name,
        "extracted_matrix": extracted_matrix,
        "ground_truth": ground_truth_matrix,
        "comparison": comparison_result
    }

    # ✅ Display live progress with symbols
    if "matches" in comparison_result:
        print(f"✅ {actual_idx}/{num_samples} - Correct")
    elif "❌" in comparison_result:
        print(f"❌ {actual_idx}/{num_samples} - Incorrect")
    elif "⚠️" in comparison_result:
        print(f"⚠️ {actual_idx}/{num_samples} - Invalid Output")

print("\n✅ **Completed inference & comparison for the remaining 100 examples!** 🚀")



🚀 Running inference 101/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/8597cfd7.json
❌ 101/200 - Incorrect

🚀 Running inference 102/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/4852f2fa.json
❌ 102/200 - Incorrect

🚀 Running inference 103/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/55783887.json
❌ 103/200 - Incorrect

🚀 Running inference 104/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/72207abc.json
⚠️ 104/200 - Invalid Output

🚀 Running inference 105/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/d37a1ef5.json
❌ 105/200 - Incorrect

🚀 Running inference 106/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/af24b4cc.json
⚠️ 106/200 - Invalid Output

🚀 Running inference 107/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/c87289bb.json
❌ 107/200 - Inco

In [2]:
import os
import json
import torch
import gc
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ✅ Clear GPU memory before starting
gc.collect()
torch.cuda.empty_cache()

# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Load the filtered 200 shortest JSON files
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

print(f"\n✅ Loaded {len(filtered_files)} filtered JSON files for evaluation.")



✅ Loaded 200 filtered JSON files for evaluation.


In [17]:
import torch

# ✅ Select the first 5 files to inspect raw model outputs
num_samples = 5
test_files = filtered_files[:num_samples]

# ✅ Function to run inference and print raw model output
def run_debug_inference(model_name, device):
    print(f"\n🚀 Loading Model: {model_name} on {device}")

    # ✅ Load Model on Assigned GPU
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map={"" : device}  # ✅ Load on assigned GPU
    )

    # ✅ Run inference for 5 examples
    for idx, file in enumerate(test_files):
        print(f"\n🚀 Running inference {idx + 1}/{num_samples} on {model_name}: {file}")

        # ✅ Construct file path
        file_path = os.path.join(dataset_folder, file)

        # ✅ Construct prompt
        test_prompt = construct_fixed_prompt(file_path)

        # ✅ Tokenize the prompt
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        inputs = tokenizer(test_prompt, return_tensors="pt").to(device)

        # ✅ Remove `token_type_ids` if present
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # ✅ Run inference
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1000,  
                do_sample=True,  
                temperature=0.7,  
                top_k=50,  
                top_p=0.9,  
                repetition_penalty=1.2,  
            )

        # ✅ Decode full output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # ✅ Print the full raw output exactly as generated
        print("\n🚀 **Full Raw Model Output:**\n")
        print(generated_text)
        print("\n" + "-" * 100 + "\n")

# ✅ Run inference for both models
run_debug_inference("phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings", "cuda:0")
run_debug_inference("phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1", "cuda:1")



🚀 Loading Model: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings on cuda:0


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


🚀 Running inference 1/5 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [4]:
# ✅ Function to construct a well-formatted prompt from JSON files
def construct_fixed_prompt(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)

    # ✅ Extract training & test examples
    training_examples = data.get("train", [])
    test_examples = data.get("test", [])

    if not test_examples or not training_examples:
        return None  # Skip if missing data

    test_input = test_examples[0]["input"]
    
    # ✅ Build an improved prompt that explicitly asks for only the output matrix
    prompt = "Below are some training examples (Input-Output pairs). Use them to generate only the final output matrix for the given test input.\n\n"

    for ex in training_examples:
        prompt += f"Input: {ex['input']}\n"
        prompt += f"Output: {ex['output']}\n\n"

    prompt += f"Test Input Matrix:\n{test_input}\n\n"
    prompt += "**Provide ONLY the final output matrix for the test input. Do NOT include any other text.**"

    return prompt

print("\n✅ Function `construct_fixed_prompt` is now defined and ready!")



✅ Function `construct_fixed_prompt` is now defined and ready!


In [6]:
import re

# ✅ Function to extract matrix after instruction text and stop at first `]]`
def extract_final_matrix(generated_text):
    instruction_text = "Provide ONLY the final output matrix for the test input. Do NOT include any other text."
    
    # ✅ Find where the instruction appears
    start_idx = generated_text.find(instruction_text)
    if start_idx == -1:
        return "⚠️ Instruction text not found in output."

    # ✅ Extract everything after the instruction
    output_after_instruction = generated_text[start_idx + len(instruction_text):].strip()

    # ✅ Use regex to find the first valid matrix **after the instruction text**
    matrix_match = re.search(r"\[\[.*?\]\]", output_after_instruction, re.DOTALL)
    
    if matrix_match:
        return matrix_match.group(0)  # ✅ Return the extracted matrix
    else:
        return "⚠️ No valid matrix found in extracted output."

print("\n✅ **Updated matrix extraction function is ready!**")



✅ **Updated matrix extraction function is ready!**


In [10]:
import torch

# ✅ Select the first 5 files to inspect
num_samples = 5
test_files = filtered_files[:num_samples]

# ✅ Function to run inference, extract matrix, and compare with ground truth
def run_debug_inference(model_name, device):
    print(f"\n🚀 Loading Model: {model_name} on {device}")

    # ✅ Load Model on Assigned GPU
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map={"" : device}  # ✅ Load on assigned GPU
    )

    # ✅ Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # ✅ Run inference for 5 examples
    for idx, file in enumerate(test_files):
        print(f"\n🚀 Running inference {idx + 1}/{num_samples} on {model_name}: {file}")

        # ✅ Construct file path
        file_path = os.path.join(dataset_folder, file)

        # ✅ Construct prompt
        test_prompt = construct_fixed_prompt(file_path)

        # ✅ Tokenize the prompt and move to correct data type
        inputs = tokenizer(test_prompt, return_tensors="pt").to(device)
        
        # ✅ Ensure inputs are in `torch.long` (required for embedding layers)
        inputs = {k: v.to(dtype=torch.long) for k, v in inputs.items()}

        # ✅ Remove `token_type_ids` if present
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # ✅ Run inference with safe parameters
        with torch.no_grad():
            logits = model(**inputs).logits[:, -1, :]
            
            # ✅ Sanitize probabilities
            probs = torch.softmax(logits, dim=-1)
            probs = torch.nan_to_num(probs, nan=0.0, posinf=1.0, neginf=0.0)
            probs = probs / probs.sum(dim=-1, keepdim=True)  # ✅ Normalize

            outputs = model.generate(
                **inputs,
                max_new_tokens=700,  # ✅ Prevents excessive token generation
                do_sample=True,  
                temperature=0.8,  # ✅ Balanced randomness
                top_k=50,  # ✅ Ensure `top_k` is reasonable
                top_p=0.9,  # ✅ Avoid extreme filtering
                repetition_penalty=1.1,  # ✅ Reduce excessive repetition
            )

        # ✅ Decode full output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # ✅ Extract output matrix
        extracted_matrix = extract_final_matrix(generated_text)

        # ✅ Load ground truth matrix
        ground_truth_matrix = load_ground_truth_matrix(file)

        # ✅ Print Results
        print("\n🚀 **Full Raw Model Output:**\n")
        print(generated_text)

        print("\n🔍 **Extracted Output Matrix:**\n")
        print(extracted_matrix)

        print("\n🎯 **Ground Truth Matrix:**\n")
        print(np.array(ground_truth_matrix) if ground_truth_matrix else "⚠️ No ground truth available")

        print("\n" + "-" * 100 + "\n")

# ✅ Run inference for both models in parallel
from threading import Thread

thread_1 = Thread(target=run_debug_inference, args=("phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings", "cuda:0"))
thread_2 = Thread(target=run_debug_inference, args=("phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1", "cuda:1"))

# ✅ Start both threads
thread_1.start()
thread_2.start()

# ✅ Wait for both to complete
thread_1.join()
thread_2.join()

print("\n✅ **Completed inference for both models on 5 examples!** 🚀")



🚀 Loading Model: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings on cuda:0

🚀 Loading Model: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1 on cuda:1


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


🚀 Running inference 1/5 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json


Exception in thread Thread-13:
Traceback (most recent call last):
  File "/usr/lib64/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/usr/lib64/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/scratch/slurm_tmpdir/job_25372695/ipykernel_539717/429513468.py", line 57, in run_debug_inference
  File "/opt/bwhpc/common/jupyter/tensorflow/2023-10-10/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/pfs/data5/home/ma/ma_ma/ma_abthomas/llama-env/lib64/python3.9/site-packages/transformers/generation/utils.py", line 2215, in generate
    result = self._sample(
  File "/pfs/data5/home/ma/ma_ma/ma_abthomas/llama-env/lib64/python3.9/site-packages/transformers/generation/utils.py", line 3249, in _sample
    next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
RuntimeError: probability tensor contains either `inf`, `nan` or element <


🚀 Running inference 1/5 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json


Exception in thread Thread-14:
Traceback (most recent call last):
  File "/usr/lib64/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/usr/lib64/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/scratch/slurm_tmpdir/job_25372695/ipykernel_539717/429513468.py", line 57, in run_debug_inference
  File "/opt/bwhpc/common/jupyter/tensorflow/2023-10-10/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/pfs/data5/home/ma/ma_ma/ma_abthomas/llama-env/lib64/python3.9/site-packages/transformers/generation/utils.py", line 2215, in generate
    result = self._sample(
  File "/pfs/data5/home/ma/ma_ma/ma_abthomas/llama-env/lib64/python3.9/site-packages/transformers/generation/utils.py", line 3249, in _sample
    next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
RuntimeError: probability tensor contains either `inf`, `nan` or element <


✅ **Completed inference for both models on 5 examples!** 🚀


In [15]:
import torch

# ✅ Select the first 5 files to inspect
num_samples = 5
test_files = filtered_files[:num_samples]

# ✅ Function to run inference, extract matrix, and compare with ground truth
def run_debug_inference(model_name):
    print(f"\n🚀 Loading Model: {model_name} across multiple GPUs (Auto-Device Mapping)")

    # ✅ Load Model with Auto GPU Distribution
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"  # ✅ Auto-distributes across GPUs
    )

    # ✅ Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # ✅ Run inference for 5 examples
    for idx, file in enumerate(test_files):
        print(f"\n🚀 Running inference {idx + 1}/{num_samples} on {model_name}")

        # ✅ Construct file path
        file_path = os.path.join(dataset_folder, file)

        # ✅ Construct prompt
        test_prompt = construct_fixed_prompt(file_path)

        # ✅ Tokenize the prompt and move to correct data type
        inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

        # ✅ Ensure inputs are in `torch.long` (required for embedding layers)
        inputs = {k: v.to(dtype=torch.long) for k, v in inputs.items()}

        # ✅ Remove `token_type_ids` if present
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # ✅ Run inference with logit stabilization
        with torch.no_grad():
            logits = model(**inputs).logits[:, -1, :]

            # ✅ Clamp logits to prevent extreme values
            logits = torch.clamp(logits, min=-10, max=10)

            # ✅ Normalize logits before softmax
            logits = logits - logits.max(dim=-1, keepdim=True)[0]

            # ✅ Apply softmax safely
            probs = torch.softmax(logits, dim=-1) + 1e-9

            # ✅ Normalize probabilities again to avoid underflow
            probs = probs / probs.sum(dim=-1, keepdim=True)

            # ✅ Verify if NaN still occurs
            if torch.isnan(probs).any():
                print("❌ ERROR: NaN detected even after normalization.")
                print(f"⚠️ Logits Dump (First 5 Values): {logits[:, :5]}")
                print(f"⚠️ Probabilities Dump (First 5 Values): {probs[:, :5]}")
                continue  # Skip this sample

            # ✅ Run safe inference with stabilized params
            try:
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=600,  # ✅ Reduce generation length to avoid instability
                    do_sample=True,  
                    temperature=0.6,  # ✅ Lower temperature to prevent extreme token probabilities
                    top_k=50,  # ✅ Ensure `top_k` is reasonable
                    top_p=0.85,  # ✅ Lower `top_p` to avoid extreme sampling
                    repetition_penalty=1.05,  # ✅ Reduce repetition control
                )
            except RuntimeError as e:
                print("❌ ERROR: Failed to generate tokens due to NaN values in logits.")
                print(f"⚠️ Logits Dump (First 5 Values): {logits[:, :5]}")
                print(f"⚠️ Probabilities Dump (First 5 Values): {probs[:, :5]}")
                continue  # Skip this sample

        # ✅ Decode full output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # ✅ Extract output matrix
        extracted_matrix = extract_final_matrix(generated_text)

        # ✅ Load ground truth matrix
        ground_truth_matrix = load_ground_truth_matrix(file)

        # ✅ Print Results
        print("\n🚀 **Full Raw Model Output:**\n")
        print(generated_text)

        print("\n🔍 **Extracted Output Matrix:**\n")
        print(extracted_matrix)

        print("\n🎯 **Ground Truth Matrix:**\n")
        print(np.array(ground_truth_matrix) if ground_truth_matrix else "⚠️ No ground truth available")

        print("\n" + "-" * 100 + "\n")

# ✅ Run model inference across both GPUs
print("\n🚀 Running Model across CUDA:0 and CUDA:1...\n")
run_debug_inference("phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings")

print("\n✅ **Completed inference for model using multi-GPU!** 🚀")



🚀 Running Model across CUDA:0 and CUDA:1...


🚀 Loading Model: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings across multiple GPUs (Auto-Device Mapping)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



🚀 Running inference 1/5 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings
❌ ERROR: Failed to generate tokens due to NaN values in logits.
⚠️ Logits Dump (First 5 Values): tensor([[-8.2266, -4.2539, -6.5312, -8.6641, -6.4062]], device='cuda:1')
⚠️ Probabilities Dump (First 5 Values): tensor([[1.1802e-05, 6.2752e-04, 6.4373e-05, 7.6294e-06, 7.2896e-05]],
       device='cuda:1')

🚀 Running inference 2/5 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings
❌ ERROR: Failed to generate tokens due to NaN values in logits.
⚠️ Logits Dump (First 5 Values): tensor([[-7.6953, -3.9023, -6.4648, -7.6719, -6.3516]], device='cuda:1')
⚠️ Probabilities Dump (First 5 Values): tensor([[1.5676e-05, 6.9523e-04, 5.3585e-05, 1.6034e-05, 6.0022e-05]],
       device='cuda:1')

🚀 Running inference 3/5 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs5_wd0.1_WithoutReasonings
❌ ERROR: Failed to generate tokens due to NaN va

In [16]:
import torch
from threading import Thread

# ✅ Select the first 5 files to inspect
num_samples = 5
test_files = filtered_files[:num_samples]

# ✅ Function to run inference for a given model on a specific GPU
def run_model_inference(model_name, device):
    print(f"\n🚀 Loading Model: {model_name} on {device}")

    # ✅ Load Model on Assigned GPU
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map={"" : device}  # ✅ Load on assigned GPU
    )

    # ✅ Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # ✅ Run inference for 5 examples
    for idx, file in enumerate(test_files):
        print(f"\n🚀 Running inference {idx + 1}/{num_samples} on {model_name}: {file}")

        # ✅ Construct file path
        file_path = os.path.join(dataset_folder, file)

        # ✅ Construct prompt
        test_prompt = construct_fixed_prompt(file_path)

        # ✅ Tokenize the prompt and move to correct data type
        inputs = tokenizer(test_prompt, return_tensors="pt").to(device)

        # ✅ Ensure inputs are in `torch.long` (required for embedding layers)
        inputs = {k: v.to(dtype=torch.long) for k, v in inputs.items()}

        # ✅ Remove `token_type_ids` if present
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # ✅ Run inference with logit stabilization
        with torch.no_grad():
            logits = model(**inputs).logits[:, -1, :]

            # ✅ Clamp logits to prevent extreme values
            logits = torch.clamp(logits, min=-10, max=10)

            # ✅ Normalize logits before softmax
            logits = logits - logits.max(dim=-1, keepdim=True)[0]

            # ✅ Apply softmax safely
            probs = torch.softmax(logits, dim=-1) + 1e-9

            # ✅ Normalize probabilities again to avoid underflow
            probs = probs / probs.sum(dim=-1, keepdim=True)

            # ✅ Verify if NaN still occurs
            if torch.isnan(probs).any():
                print("❌ ERROR: NaN detected even after normalization.")
                print(f"⚠️ Logits Dump (First 5 Values): {logits[:, :5]}")
                print(f"⚠️ Probabilities Dump (First 5 Values): {probs[:, :5]}")
                continue  # Skip this sample

            # ✅ Run safe inference with stabilized params
            try:
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=600,  # ✅ Reduce generation length to avoid instability
                    do_sample=True,  
                    temperature=0.7,  # ✅ Balanced temperature
                    top_k=50,  # ✅ Ensure `top_k` is reasonable
                    top_p=0.9,  # ✅ Avoid extreme filtering
                    repetition_penalty=1.1,  # ✅ Reduce excessive repetition
                )
            except RuntimeError as e:
                print("❌ ERROR: Failed to generate tokens due to NaN values in logits.")
                print(f"⚠️ Logits Dump (First 5 Values): {logits[:, :5]}")
                print(f"⚠️ Probabilities Dump (First 5 Values): {probs[:, :5]}")
                continue  # Skip this sample

        # ✅ Decode full output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # ✅ Extract output matrix
        extracted_matrix = extract_final_matrix(generated_text)

        # ✅ Load ground truth matrix
        ground_truth_matrix = load_ground_truth_matrix(file)

        # ✅ Print Results
        print("\n🚀 **Full Raw Model Output:**\n")
        print(generated_text)

        print("\n🔍 **Extracted Output Matrix:**\n")
        print(extracted_matrix)

        print("\n🎯 **Ground Truth Matrix:**\n")
        print(np.array(ground_truth_matrix) if ground_truth_matrix else "⚠️ No ground truth available")

        print("\n" + "-" * 100 + "\n")

# ✅ Run two models in parallel on separate GPUs
thread_1 = Thread(target=run_model_inference, args=("phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings", "cuda:0"))
thread_2 = Thread(target=run_model_inference, args=("phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.01", "cuda:1"))

# ✅ Start both threads
thread_1.start()
thread_2.start()

# ✅ Wait for both threads to finish
thread_1.join()
thread_2.join()

print("\n✅ **Completed inference for both models in parallel!** 🚀")



🚀 Loading Model: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings on cuda:0
🚀 Loading Model: phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.01 on cuda:1



config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]


🚀 Running inference 1/5 on phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.01: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json
❌ ERROR: Failed to generate tokens due to NaN values in logits.
⚠️ Logits Dump (First 5 Values): tensor([[-3.9102, -3.4062, -2.3672, -4.8438, -5.3477]], device='cuda:1')
⚠️ Probabilities Dump (First 5 Values): tensor([[3.6478e-04, 6.0368e-04, 1.7061e-03, 1.4341e-04, 8.6665e-05]],
       device='cuda:1')

🚀 Running inference 2/5 on phogen/FineLlama-3.1-8B_instruct_eval_lr5e-05_batch4_epochs1_wd0.01: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/00576224.json
❌ ERROR: Failed to generate tokens due to NaN values in logits.
⚠️ Logits Dump (First 5 Values): tensor([[-5.2930, -3.9609, -3.3086, -5.2617, -5.7031]], device='cuda:1')
⚠️ Probabilities Dump (First 5 Values): tensor([[9.8228e-05, 3.7217e-04, 7.1430e-04, 1.0133e-04, 6.5148e-05]],
       device='cuda:1')

🚀 Running infer

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]


🚀 Running inference 1/5 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json
❌ ERROR: Failed to generate tokens due to NaN values in logits.
⚠️ Logits Dump (First 5 Values): tensor([[-5.4883, -3.7539, -3.2852, -6.0078, -4.7617]], device='cuda:0')
⚠️ Probabilities Dump (First 5 Values): tensor([[5.6684e-05, 3.2115e-04, 5.1308e-04, 3.3677e-05, 1.1718e-04]],
       device='cuda:0')

🚀 Running inference 2/5 on phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.01_WithoutReasonings: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/00576224.json
❌ ERROR: Failed to generate tokens due to NaN values in logits.
⚠️ Logits Dump (First 5 Values): tensor([[-5.1953, -3.4180, -2.6602, -5.3867, -4.7500]], device='cuda:0')
⚠️ Probabilities Dump (First 5 Values): tensor([[7.0214e-05, 4.1533e-04, 8.8596e-04, 5.7995e-05, 1.0961e-04]],
  

In [1]:
import os
import gc
import torch
import json
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ✅ Prevent Memory Fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# ✅ Function to Clear GPU Memory
def clear_gpu_memory():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    print("\n✅ GPU memory cleared.")

print("✅ Libraries loaded and GPU management set up.")


✅ Libraries loaded and GPU management set up.


In [2]:
import os
import json
import numpy as np
from glob import glob

# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Get list of all JSON files
json_files = glob(os.path.join(dataset_folder, "*.json"))
print(f"\n✅ Found {len(json_files)} JSON files in dataset.")

# ✅ Function to calculate matrix size from JSON
def get_matrix_size(json_path):
    try:
        with open(json_path, "r") as file:
            data = json.load(file)
        test_examples = data.get("test", [])
        if not test_examples:
            return float('inf')  # Ignore files with no test data
        
        test_output = test_examples[0].get("output", [])
        return sum(len(row) for row in test_output)  # Total number of elements in matrix
    except:
        return float('inf')  # Ignore corrupted files

# ✅ Scan all files and get their sizes
file_sizes = [(file, get_matrix_size(file)) for file in json_files]

# ✅ Sort by matrix size (smallest first)
file_sizes.sort(key=lambda x: x[1])

# ✅ Select the 200 smallest files
smallest_files = [file for file, size in file_sizes[:200]]

# ✅ Save the filtered list to a file
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")
with open(filtered_files_path, "w") as f:
    json.dump(smallest_files, f)

print(f"\n✅ Selected 200 smallest JSON files. Saved list to {filtered_files_path}")



✅ Found 400 JSON files in dataset.

✅ Selected 200 smallest JSON files. Saved list to /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/filtered_200_files.json


In [3]:
# ✅ Load the filtered list of files
filtered_files_path = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/filtered_200_files.json"

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

print(f"\n✅ Loaded {len(filtered_files)} filtered JSON files.")

# ✅ Function to print matrix size for verification
def print_matrix_sizes(files, num_samples=5):
    print(f"\n✅ Displaying {num_samples} sample files:")
    for file in files[:num_samples]:
        with open(file, "r") as f:
            data = json.load(f)
        test_examples = data.get("test", [])
        test_output = test_examples[0].get("output", []) if test_examples else []
        matrix_size = sum(len(row) for row in test_output)
        print(f"📂 {os.path.basename(file)} - Matrix Size: {matrix_size} elements")

# ✅ Print first 5 files for verification
print_matrix_sizes(filtered_files, num_samples=5)



✅ Loaded 200 filtered JSON files.

✅ Displaying 5 sample files:
📂 642d658d.json - Matrix Size: 1 elements
📂 1a2e2828.json - Matrix Size: 1 elements
📂 e872b94a.json - Matrix Size: 3 elements
📂 be03b35f.json - Matrix Size: 4 elements
📂 8597cfd7.json - Matrix Size: 4 elements


In [4]:
# ✅ List of specific files to check (modify if needed)
selected_files = [
    "642d658d.json",
    "1a2e2828.json",
    "e872b94a.json",
    "be03b35f.json",
    "8597cfd7.json"
]

# ✅ Display contents of selected files
def view_json_files(files):
    for file in files:
        file_path = os.path.join("/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/", file)
        with open(file_path, "r") as f:
            data = json.load(f)

        print(f"\n📂 File: {file}")
        print("🔹 Test Input Matrix:")
        print(np.array(data.get("test", [])[0].get("input", [])))

        print("\n🔹 Expected Output Matrix:")
        print(np.array(data.get("test", [])[0].get("output", [])))

        print("-" * 50)

# ✅ View selected JSON files
view_json_files(selected_files)



📂 File: 642d658d.json
🔹 Test Input Matrix:
[[0 0 9 9 0 9 0 9 0 6 0 9 0 0 9 9 0 9 0 0 9 0]
 [0 0 9 9 9 9 3 9 9 9 0 9 0 0 9 9 0 6 0 9 9 0]
 [9 9 2 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 0 9]
 [9 2 4 2 9 9 9 0 9 9 0 9 0 3 9 9 9 1 9 9 2 9]
 [9 9 2 9 9 9 6 9 9 9 6 9 9 9 2 0 9 9 9 9 9 9]
 [9 9 9 9 9 0 9 9 0 0 9 9 9 9 0 9 9 9 9 9 9 9]
 [0 0 9 9 9 9 0 9 9 9 0 9 3 0 9 9 0 9 0 9 9 0]
 [9 9 9 0 9 9 9 3 9 9 9 9 0 9 9 9 9 0 9 9 9 9]
 [6 9 9 0 9 9 3 4 3 9 9 9 9 9 9 9 9 9 9 9 9 9]
 [9 9 9 0 9 9 9 3 9 3 9 9 9 9 9 9 6 9 9 0 0 3]
 [0 0 0 1 9 9 0 9 9 9 0 9 0 0 9 6 4 6 0 9 9 0]
 [9 9 9 9 9 9 9 0 9 9 9 0 2 9 9 9 6 9 9 0 9 1]
 [0 0 9 9 9 9 0 9 9 9 0 9 0 0 9 9 0 9 0 9 0 0]
 [0 0 9 2 9 9 3 9 0 6 0 9 0 0 9 9 0 9 0 9 9 0]
 [9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 0 9 9 9 9]
 [9 9 9 9 9 9 9 9 9 9 9 9 9 0 0 9 0 9 9 9 2 9]
 [0 6 3 9 9 9 0 9 9 0 0 9 0 3 9 9 0 2 0 0 9 0]
 [9 9 9 9 9 9 0 3 9 0 9 9 9 9 9 9 9 9 9 9 9 9]
 [0 0 9 9 9 9 0 9 9 2 0 9 0 0 9 9 0 9 0 9 9 0]
 [9 9 9 9 0 9 9 9 9 9 9 6 0 9 9 9 9 9 9 9 6 9]
 [9 9 9 9 1 9 9 

In [5]:
import os
import json
import numpy as np
from glob import glob

# ✅ Define dataset folder
dataset_folder = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/"

# ✅ Get list of all JSON files
json_files = glob(os.path.join(dataset_folder, "*.json"))
print(f"\n✅ Found {len(json_files)} JSON files in dataset.")

# ✅ Function to calculate the size of `train[0]` input matrix
def get_train_input_size(json_path):
    try:
        with open(json_path, "r") as file:
            data = json.load(file)

        # ✅ Get the first training example
        train_examples = data.get("train", [])
        if not train_examples:
            return float('inf')  # Ignore files with no training data

        # ✅ Get the input matrix from the first training example
        train_input = train_examples[0].get("input", [])

        # ✅ Calculate total number of elements in the input matrix
        train_input_size = sum(len(row) for row in train_input)

        return train_input_size

    except Exception as e:
        print(f"⚠️ Error processing {json_path}: {e}")
        return float('inf')  # Ignore corrupted files

# ✅ Scan all files and get their `train[0]` input matrix sizes
file_sizes = [(file, get_train_input_size(file)) for file in json_files]

# ✅ Sort by training input matrix size (smallest first)
file_sizes.sort(key=lambda x: x[1])

# ✅ Select the 200 smallest files
smallest_files = [file for file, size in file_sizes[:200]]

# ✅ Save the filtered list to a file
filtered_files_path = os.path.join(dataset_folder, "filtered_200_files.json")
with open(filtered_files_path, "w") as f:
    json.dump(smallest_files, f)

print(f"\n✅ Selected 200 smallest JSON files based on `train[0]` input matrix size. Saved list to {filtered_files_path}")



✅ Found 401 JSON files in dataset.
⚠️ Error processing /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/filtered_200_files.json: 'list' object has no attribute 'get'

✅ Selected 200 smallest JSON files based on `train[0]` input matrix size. Saved list to /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/filtered_200_files.json


In [6]:
# ✅ Load the filtered list of files
filtered_files_path = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/filtered_200_files.json"

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

print(f"\n✅ Loaded {len(filtered_files)} filtered JSON files.")

# ✅ Select a few files to display (modify the number if needed)
num_samples = 5  # Change this to see more examples
selected_files = filtered_files[:num_samples]  # Take first `num_samples` files

# ✅ Display contents of selected files
def view_selected_json_files(files):
    for file in files:
        file_path = os.path.join("/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/", file)
        with open(file_path, "r") as f:
            data = json.load(f)

        # ✅ Extract training[0] input matrix
        train_input = data.get("train", [])[0].get("input", [])

        print(f"\n📂 File: {file}")
        print("🔹 Training Input Matrix:")
        print(np.array(train_input))  # Print as a numpy array for readability
        print("-" * 50)

# ✅ View selected JSON files
view_selected_json_files(selected_files)



✅ Loaded 200 filtered JSON files.

📂 File: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json
🔹 Training Input Matrix:
[[0 0]
 [0 7]]
--------------------------------------------------

📂 File: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/00576224.json
🔹 Training Input Matrix:
[[8 6]
 [6 4]]
--------------------------------------------------

📂 File: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/e633a9e5.json
🔹 Training Input Matrix:
[[6 5 5]
 [5 1 7]
 [4 5 2]]
--------------------------------------------------

📂 File: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/c48954c1.json
🔹 Training Input Matrix:
[[7 6 7]
 [2 7 6]
 [1 2 7]]
--------------------------------------------------

📂 File: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/a59b95c0.json
🔹 Training Input Matrix:
[[9 7 9]
 [9 6 7]
 [7 6 6]]
---------------------------

In [7]:
# ✅ Function to Clear GPU Memory (Run before and after model loading)
def clear_gpu_memory():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    print("\n✅ GPU memory cleared.")

# ✅ Clear memory before loading the model
clear_gpu_memory()

# ✅ Define model path
model_id = "phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.1_WithoutReasonings"

print(f"\n✅ Loading model in 4-bit quantization across 2 GPUs: {model_id}")

# ✅ Set 4-bit quantization settings
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

# ✅ Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="balanced"  # ✅ Distributes model across 2 GPUs automatically
)

print("\n✅ Model successfully loaded across both GPUs.")



✅ GPU memory cleared.

✅ Loading model in 4-bit quantization across 2 GPUs: phogen/FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.1_WithoutReasonings


2025-03-08 02:53:51.534439: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 02:53:52.433216: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-08 02:53:52.433260: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-08 02:53:52.433300: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 02:53:52.677701: I tensorflow/core/platform/cpu_feature_g

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


✅ Model successfully loaded across both GPUs.


In [8]:
# ✅ Load the filtered list of files
filtered_files_path = "/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/filtered_200_files.json"

with open(filtered_files_path, "r") as f:
    filtered_files = json.load(f)

print(f"\n✅ Loaded {len(filtered_files)} filtered JSON files.")

# ✅ Select a few files for inference (modify the number if needed)
num_samples = 5  # Change this to run on more files
selected_files = filtered_files[:num_samples]  # Take first `num_samples` files

# ✅ Function to construct prompts from JSON files
def construct_prompt(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)

    # ✅ Extract training & test examples
    training_examples = data.get("train", [])
    test_examples = data.get("test", [])

    if not test_examples or not training_examples:
        return None  # Skip if missing data

    test_input = test_examples[0]["input"]
    
    # ✅ Build prompt using training examples
    prompt = "Below are some training examples (Input-Output pairs):\n\n"
    for ex in training_examples:
        prompt += f"Input: {ex['input']}\n"
        prompt += f"Output: {ex['output']}\n\n"

    prompt += f"Test Input Matrix:\n{test_input}\n\n"
    prompt += "Based on the above training examples, provide only the final output matrix for the test input matrix."

    return prompt

# ✅ Construct prompts for selected files
prompts = []
for file in selected_files:
    file_path = os.path.join("/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/", file)
    prompt = construct_prompt(file_path)
    if prompt:
        prompts.append((file, prompt))

print(f"\n✅ Prepared {len(prompts)} prompts for inference.")



✅ Loaded 200 filtered JSON files.

✅ Prepared 5 prompts for inference.


In [10]:
# ✅ Function to run inference
def run_inference(file, prompt):
    print(f"\n🚀 Running inference for: {file}")

    # ✅ Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present to prevent errors
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,  # ✅ Ensures enough space for matrix generation
            do_sample=True,  # ✅ Enables controlled randomness
            temperature=0.7,  # ✅ Balanced randomness for better predictions
            top_k=50,  
            top_p=0.9,  
            eos_token_id=tokenizer.eos_token_id  # ✅ Stops at EOS token
        )

    # ✅ Decode output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # ✅ Extract only the matrix part from the output
    final_output = generated_text.split("]]")[0] + "]]" if "]]" in generated_text else generated_text

    print("\n✅ Generated Output:")
    print(final_output)

    return final_output

# ✅ Run inference on all selected prompts
results = {}
for file, prompt in prompts:
    results[file] = run_inference(file, prompt)

print("\n✅ Inference completed for all selected files.")



🚀 Running inference for: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json

✅ Generated Output:
Below are some training examples (Input-Output pairs):

Input: [[0, 0], [0, 7]]

🚀 Running inference for: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/00576224.json

✅ Generated Output:
Below are some training examples (Input-Output pairs):

Input: [[8, 6], [6, 4]]

🚀 Running inference for: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/e633a9e5.json

✅ Generated Output:
Below are some training examples (Input-Output pairs):

Input: [[6, 5, 5], [5, 1, 7], [4, 5, 2]]

🚀 Running inference for: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/c48954c1.json

✅ Generated Output:
Below are some training examples (Input-Output pairs):

Input: [[7, 6, 7], [2, 7, 6], [1, 2, 7]]

🚀 Running inference for: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_data

KeyboardInterrupt: 

In [11]:
# ✅ Function to construct a better prompt for inference
def construct_fixed_prompt(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)

    # ✅ Extract training & test examples
    training_examples = data.get("train", [])
    test_examples = data.get("test", [])

    if not test_examples or not training_examples:
        return None  # Skip if missing data

    test_input = test_examples[0]["input"]
    
    # ✅ Build an improved prompt that explicitly asks for only the output matrix
    prompt = "Below are some training examples (Input-Output pairs). Use them to generate only the final output matrix for the given test input.\n\n"

    for ex in training_examples:
        prompt += f"Input: {ex['input']}\n"
        prompt += f"Output: {ex['output']}\n\n"

    prompt += f"Test Input Matrix:\n{test_input}\n\n"
    prompt += "**Provide ONLY the final output matrix for the test input. Do NOT include any other text.**"

    return prompt

# ✅ Reconstruct prompts using the improved structure
prompts = []
for file in selected_files:
    file_path = os.path.join("/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/", file)
    prompt = construct_fixed_prompt(file_path)
    if prompt:
        prompts.append((file, prompt))

print(f"\n✅ Updated prompts for better inference.")



✅ Updated prompts for better inference.


In [12]:
# ✅ Select one file for testing (Modify index if needed)
test_file = selected_files[0]
file_path = os.path.join("/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/", test_file)

# ✅ Construct the improved prompt for this file
test_prompt = construct_fixed_prompt(file_path)

print("\n🚀 Running inference for one datapoint:", test_file)
print("\n✅ Constructed Prompt:\n")
print(test_prompt)

# ✅ Tokenize the prompt
inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

# ✅ Remove `token_type_ids` if present to prevent errors
if "token_type_ids" in inputs:
    inputs.pop("token_type_ids")

# ✅ Run inference for the selected datapoint
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=500,  # ✅ Ensures enough space for matrix generation
        do_sample=True,  # ✅ Enables controlled randomness
        temperature=0.7,  # ✅ Balanced randomness for better predictions
        top_k=50,  
        top_p=0.9,  
        eos_token_id=tokenizer.eos_token_id  # ✅ Stops at EOS token
    )

# ✅ Decode output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# ✅ Extract only the matrix part from the output
final_output = generated_text.split("]]")[0] + "]]" if "]]" in generated_text else generated_text

print("\n✅ Generated Output:")
print(final_output)



🚀 Running inference for one datapoint: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json

✅ Constructed Prompt:

Below are some training examples (Input-Output pairs). Use them to generate only the final output matrix for the given test input.

Input: [[0, 0], [0, 7]]
Output: [[2, 0, 2, 0, 2, 0], [0, 7, 0, 7, 0, 7], [2, 0, 2, 0, 2, 0], [0, 7, 0, 7, 0, 7], [2, 0, 2, 0, 2, 0], [0, 7, 0, 7, 0, 7]]

Input: [[0, 0, 0], [0, 0, 6], [6, 0, 0]]
Output: [[0, 2, 0, 0, 2, 0, 0, 2, 0], [0, 0, 6, 0, 0, 6, 0, 0, 6], [6, 0, 0, 6, 0, 0, 6, 0, 0], [0, 2, 0, 0, 2, 0, 0, 2, 0], [0, 0, 6, 0, 0, 6, 0, 0, 6], [6, 0, 0, 6, 0, 0, 6, 0, 0], [0, 2, 0, 0, 2, 0, 0, 2, 0], [0, 0, 6, 0, 0, 6, 0, 0, 6], [6, 0, 0, 6, 0, 0, 6, 0, 0]]

Input: [[0, 0, 0, 0, 0], [0, 8, 0, 0, 0], [0, 8, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]
Output: [[2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0], [2, 8, 0, 0, 0, 2, 8, 0, 0, 0, 2, 8, 0, 0, 0], [0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0

In [14]:
# ✅ Function to run inference and extract complete matrix
def run_fixed_inference(file, prompt):
    print(f"\n🚀 Running inference for: {file}")

    # ✅ Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present to prevent errors
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference with increased token limit
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=800,  # ✅ Increased from 500 to 800 to avoid truncation
            do_sample=True,  # ✅ Enables controlled randomness
            temperature=0.7,  # ✅ Balanced randomness
            top_k=50,  
            top_p=0.9,  
            eos_token_id=tokenizer.eos_token_id  # ✅ Stops at EOS token
        )

    # ✅ Decode output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # ✅ Extract only the **last** matrix
    matrices = generated_text.split("[[")  # Find all matrices
    if len(matrices) > 1:
        final_output = "[[" + matrices[-1]  # Take the last one
        final_output = final_output.split("]]")[0] + "]]" if "]]" in final_output else final_output
    else:
        final_output = "⚠️ No valid matrix found."

    print("\n✅ Generated Output:")
    print(final_output)

    return final_output

# ✅ Run inference on one test example
test_file = filtered_files[0]  # Use the shortest file
file_path = os.path.join("/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/", test_file)
test_prompt = construct_fixed_prompt(file_path)

run_fixed_inference(test_file, test_prompt)



🚀 Running inference for: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json

✅ Generated Output:
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 


'[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, '

In [15]:
# ✅ Select the first 10 files from the filtered shortest files
num_samples = 10  # Number of examples to test
test_files = filtered_files[:num_samples]

# ✅ Store results
results = {}

# ✅ Function to check if the matrix is complete
def is_matrix_complete(matrix_text):
    return matrix_text.count("[[") == matrix_text.count("]]")  # Ensure equal open/close brackets

# ✅ Run inference for each selected example
for idx, test_file in enumerate(test_files):
    print(f"\n🚀 Running inference for example {idx + 1}/{num_samples}: {test_file}")

    # ✅ Construct prompt
    file_path = os.path.join("/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/", test_file)
    test_prompt = construct_fixed_prompt(file_path)

    # ✅ Tokenize the prompt
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=800,  # ✅ Increased to allow full generation
            do_sample=True,  
            temperature=0.7,  
            top_k=50,  
            top_p=0.9,  
            eos_token_id=tokenizer.eos_token_id  
        )

    # ✅ Decode output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # ✅ Extract only the **last** matrix
    matrices = generated_text.split("[[")  # Find all matrices
    if len(matrices) > 1:
        final_output = "[[" + matrices[-1]  # Take the last one
        final_output = final_output.split("]]")[0] + "]]" if "]]" in final_output else final_output
    else:
        final_output = "⚠️ No valid matrix found."

    # ✅ Check if matrix is complete
    is_complete = is_matrix_complete(final_output)
    results[test_file] = {"output": final_output, "complete": is_complete}

    print("\n✅ Generated Output:")
    print(final_output)
    print(f"🔍 **Matrix Complete:** {'✅ YES' if is_complete else '❌ NO'}")
    print("-" * 50)

# ✅ Count complete vs incomplete matrices
num_complete = sum(1 for r in results.values() if r["complete"])
num_incomplete = num_samples - num_complete

print(f"\n📊 **Summary of Results:**")
print(f"✅ {num_complete}/{num_samples} matrices were **complete**.")
print(f"❌ {num_incomplete}/{num_samples} matrices were **incomplete**.")



🚀 Running inference for example 1/10: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json

✅ Generated Output:
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
🔍 **Matrix Complete:** ❌ NO
--------------------------------------------------

🚀 Running inference for example 2/10: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/00576224.json

✅ Generated Output:
[[3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8], [2, 3, 2
🔍 **Matrix Complete:** ❌ NO
--------------------------------------------------

🚀 Running inference for example 3/10: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/e633a9e5.json

✅ Generated Output:
[[1, 1, 2, 5, 5], [1, 1, 2, 5, 5], [1, 1, 2, 5, 5], [7, 7, 3, 6, 6], [7, 7, 3, 6, 6], [7, 7, 3, 6, 6], [7, 7, 6, 5, 5], [7, 7, 6, 5, 5], [7, 7, 6, 5, 5]]
🔍

In [16]:
# ✅ Select specific test files
test_files = ["59341089.json", "c48954c1.json"]

# ✅ Function to run inference and print full output
def run_debug_inference(file):
    print(f"\n🚀 Running inference for: {file}")

    # ✅ Construct file path
    file_path = os.path.join("/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/", file)
    
    # ✅ Construct prompt
    test_prompt = construct_fixed_prompt(file_path)

    # ✅ Tokenize the prompt
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference with default settings
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=800,  # ✅ Ensures space for full matrix
            do_sample=True,  
            temperature=0.7,  
            top_k=50,  
            top_p=0.9,  
            eos_token_id=tokenizer.eos_token_id  
        )

    # ✅ Decode full output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # ✅ Print entire model output for analysis
    print("\n🚀 **Full Generated Output:**\n")
    print(generated_text)
    print("\n" + "-" * 100 + "\n")

# ✅ Run inference on both selected examples
for file in test_files:
    run_debug_inference(file)



🚀 Running inference for: 59341089.json

🚀 **Full Generated Output:**

<|begin_of_text|>Below are some training examples (Input-Output pairs). Use them to generate only the final output matrix for the given test input.

Input: [[7, 5, 7], [5, 5, 7], [7, 7, 5]]
Output: [[7, 5, 7, 7, 5, 7, 7, 5, 7, 7, 5, 7], [7, 5, 5, 5, 5, 7, 7, 5, 5, 5, 5, 7], [5, 7, 7, 7, 7, 5, 5, 7, 7, 7, 7, 5]]

Input: [[7, 7, 8], [5, 8, 8], [5, 8, 8]]
Output: [[8, 7, 7, 7, 7, 8, 8, 7, 7, 7, 7, 8], [8, 8, 5, 5, 8, 8, 8, 8, 5, 5, 8, 8], [8, 8, 5, 5, 8, 8, 8, 8, 5, 5, 8, 8]]

Input: [[8, 8, 8], [5, 5, 7], [5, 7, 8]]
Output: [[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], [7, 5, 5, 5, 5, 7, 7, 5, 5, 5, 5, 7], [8, 7, 5, 5, 7, 8, 8, 7, 5, 5, 7, 8]]

Input: [[8, 8, 7], [7, 5, 5], [5, 7, 8]]
Output: [[7, 8, 8, 8, 8, 7, 7, 8, 8, 8, 8, 7], [5, 5, 7, 7, 5, 5, 5, 5, 7, 7, 5, 5], [8, 7, 5, 5, 7, 8, 8, 7, 5, 5, 7, 8]]

Test Input Matrix:
[[8, 5, 7], [5, 7, 5], [8, 8, 5]]

**Provide ONLY the final output matrix for the test input. Do NOT 

In [17]:
# ✅ Select specific test files
test_files = ["59341089.json", "c48954c1.json"]

# ✅ Function to run inference and print raw output
def run_unfiltered_inference(file):
    print(f"\n🚀 Running inference for: {file}")

    # ✅ Construct file path
    file_path = os.path.join("/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/", file)
    
    # ✅ Construct prompt
    test_prompt = construct_fixed_prompt(file_path)

    # ✅ Tokenize the prompt
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference with NO stopping conditions
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,  # ✅ Allow long generation without cutting off
            do_sample=True,  
            temperature=0.7,  
            top_k=50,  
            top_p=0.9,  
            repetition_penalty=1.2,  # ✅ Reduces repetitive patterns
            # ❌ Removed `eos_token_id` to allow full output
        )

    # ✅ Decode full output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # ✅ Print entire model output exactly as generated
    print("\n🚀 **Full Generated Output (Unfiltered):**\n")
    print(generated_text)
    print("\n" + "-" * 100 + "\n")

# ✅ Run inference on both selected examples
for file in test_files:
    run_unfiltered_inference(file)



🚀 Running inference for: 59341089.json

🚀 **Full Generated Output (Unfiltered):**

<|begin_of_text|>Below are some training examples (Input-Output pairs). Use them to generate only the final output matrix for the given test input.

Input: [[7, 5, 7], [5, 5, 7], [7, 7, 5]]
Output: [[7, 5, 7, 7, 5, 7, 7, 5, 7, 7, 5, 7], [7, 5, 5, 5, 5, 7, 7, 5, 5, 5, 5, 7], [5, 7, 7, 7, 7, 5, 5, 7, 7, 7, 7, 5]]

Input: [[7, 7, 8], [5, 8, 8], [5, 8, 8]]
Output: [[8, 7, 7, 7, 7, 8, 8, 7, 7, 7, 7, 8], [8, 8, 5, 5, 8, 8, 8, 8, 5, 5, 8, 8], [8, 8, 5, 5, 8, 8, 8, 8, 5, 5, 8, 8]]

Input: [[8, 8, 8], [5, 5, 7], [5, 7, 8]]
Output: [[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], [7, 5, 5, 5, 5, 7, 7, 5, 5, 5, 5, 7], [8, 7, 5, 5, 7, 8, 8, 7, 5, 5, 7, 8]]

Input: [[8, 8, 7], [7, 5, 5], [5, 7, 8]]
Output: [[7, 8, 8, 8, 8, 7, 7, 8, 8, 8, 8, 7], [5, 5, 7, 7, 5, 5, 5, 5, 7, 7, 5, 5], [8, 7, 5, 5, 7, 8, 8, 7, 5, 5, 7, 8]]

Test Input Matrix:
[[8, 5, 7], [5, 7, 5], [8, 8, 5]]

**Provide ONLY the final output matrix for the test i

In [18]:
import re

# ✅ Function to extract output matrix after instruction text
def extract_final_matrix(generated_text):
    instruction_text = "Provide ONLY the final output matrix for the test input. Do NOT include any other text."
    
    # ✅ Find the position where the instruction appears
    start_idx = generated_text.find(instruction_text)
    if start_idx == -1:
        return "⚠️ Instruction text not found in output."

    # ✅ Extract everything after the instruction
    output_after_instruction = generated_text[start_idx + len(instruction_text):].strip()

    # ✅ Use regex to find the first valid matrix
    matrix_match = re.search(r"\[\[.*\]\]", output_after_instruction, re.DOTALL)
    
    if matrix_match:
        return matrix_match.group(0)  # ✅ Return the matrix
    else:
        return "⚠️ No valid matrix found in extracted output."

# ✅ Function to run inference and extract only the matrix
def run_extraction_inference(file):
    print(f"\n🚀 Running inference for: {file}")

    # ✅ Construct file path
    file_path = os.path.join("/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/", file)
    
    # ✅ Construct prompt
    test_prompt = construct_fixed_prompt(file_path)

    # ✅ Tokenize the prompt
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference with NO stopping conditions
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,  # ✅ Allow long generation without cutting off
            do_sample=True,  
            temperature=0.7,  
            top_k=50,  
            top_p=0.9,  
            repetition_penalty=1.2,  # ✅ Reduces repetitive patterns
        )

    # ✅ Decode full output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # ✅ Extract output matrix using our function
    extracted_matrix = extract_final_matrix(generated_text)

    print("\n🚀 **Full Generated Output (Unfiltered):**\n")
    print(generated_text)
    print("\n🔍 **Extracted Output Matrix:**\n")
    print(extracted_matrix)
    print("\n" + "-" * 100 + "\n")

# ✅ Run inference on both selected examples
test_files = ["59341089.json", "c48954c1.json"]
for file in test_files:
    run_extraction_inference(file)



🚀 Running inference for: 59341089.json

🚀 **Full Generated Output (Unfiltered):**

<|begin_of_text|>Below are some training examples (Input-Output pairs). Use them to generate only the final output matrix for the given test input.

Input: [[7, 5, 7], [5, 5, 7], [7, 7, 5]]
Output: [[7, 5, 7, 7, 5, 7, 7, 5, 7, 7, 5, 7], [7, 5, 5, 5, 5, 7, 7, 5, 5, 5, 5, 7], [5, 7, 7, 7, 7, 5, 5, 7, 7, 7, 7, 5]]

Input: [[7, 7, 8], [5, 8, 8], [5, 8, 8]]
Output: [[8, 7, 7, 7, 7, 8, 8, 7, 7, 7, 7, 8], [8, 8, 5, 5, 8, 8, 8, 8, 5, 5, 8, 8], [8, 8, 5, 5, 8, 8, 8, 8, 5, 5, 8, 8]]

Input: [[8, 8, 8], [5, 5, 7], [5, 7, 8]]
Output: [[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], [7, 5, 5, 5, 5, 7, 7, 5, 5, 5, 5, 7], [8, 7, 5, 5, 7, 8, 8, 7, 5, 5, 7, 8]]

Input: [[8, 8, 7], [7, 5, 5], [5, 7, 8]]
Output: [[7, 8, 8, 8, 8, 7, 7, 8, 8, 8, 8, 7], [5, 5, 7, 7, 5, 5, 5, 5, 7, 7, 5, 5], [8, 7, 5, 5, 7, 8, 8, 7, 5, 5, 7, 8]]

Test Input Matrix:
[[8, 5, 7], [5, 7, 5], [8, 8, 5]]

**Provide ONLY the final output matrix for the test i

In [19]:
import numpy as np

# ✅ Function to load ground truth matrix from JSON
def load_ground_truth_matrix(file):
    file_path = os.path.join("/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/", file)
    with open(file_path, "r") as f:
        data = json.load(f)
    
    test_examples = data.get("test", [])
    if not test_examples:
        return None  # No test data found

    return test_examples[0].get("output", None)  # Return the first test output matrix

# ✅ Function to compare matrices element-wise
def compare_matrices(extracted_matrix_text, ground_truth):
    if ground_truth is None:
        return "⚠️ No ground truth matrix found in JSON."

    try:
        # ✅ Convert extracted matrix string to a Python list
        extracted_matrix = eval(extracted_matrix_text)  # Convert string representation to list

        # ✅ Convert to numpy arrays for comparison
        extracted_np = np.array(extracted_matrix)
        ground_truth_np = np.array(ground_truth)

        # ✅ Check if matrices are the same
        if extracted_np.shape != ground_truth_np.shape:
            return f"❌ Shape Mismatch! Extracted: {extracted_np.shape}, Ground Truth: {ground_truth_np.shape}"

        # ✅ Check element-wise equality
        is_equal = np.array_equal(extracted_np, ground_truth_np)

        if is_equal:
            return "✅ The extracted matrix matches the ground truth!"
        else:
            return "❌ The extracted matrix does NOT match the ground truth."

    except Exception as e:
        return f"⚠️ Error in matrix comparison: {e}"

# ✅ Function to run inference, extract the matrix, load ground truth, and compare
def run_comparison_inference(file):
    print(f"\n🚀 Running inference for: {file}")

    # ✅ Construct file path
    file_path = os.path.join("/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/", file)
    
    # ✅ Construct prompt
    test_prompt = construct_fixed_prompt(file_path)

    # ✅ Tokenize the prompt
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference with NO stopping conditions
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,  # ✅ Allow long generation without cutting off
            do_sample=True,  
            temperature=0.7,  
            top_k=50,  
            top_p=0.9,  
            repetition_penalty=1.2,  # ✅ Reduces repetitive patterns
        )

    # ✅ Decode full output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # ✅ Extract output matrix
    extracted_matrix = extract_final_matrix(generated_text)

    # ✅ Load ground truth matrix
    ground_truth_matrix = load_ground_truth_matrix(file)

    # ✅ Compare matrices
    comparison_result = compare_matrices(extracted_matrix, ground_truth_matrix)

    # ✅ Print results
    print("\n🚀 **Full Generated Output (Unfiltered):**\n")
    print(generated_text)
    print("\n🔍 **Extracted Output Matrix:**\n")
    print(extracted_matrix)
    print("\n🎯 **Ground Truth Matrix:**\n")
    print(np.array(ground_truth_matrix) if ground_truth_matrix else "⚠️ No ground truth available")
    print("\n📊 **Comparison Result:**", comparison_result)
    print("\n" + "-" * 100 + "\n")

# ✅ Run inference and comparison for both selected examples
test_files = ["59341089.json", "c48954c1.json"]
for file in test_files:
    run_comparison_inference(file)



🚀 Running inference for: 59341089.json

🚀 **Full Generated Output (Unfiltered):**

<|begin_of_text|>Below are some training examples (Input-Output pairs). Use them to generate only the final output matrix for the given test input.

Input: [[7, 5, 7], [5, 5, 7], [7, 7, 5]]
Output: [[7, 5, 7, 7, 5, 7, 7, 5, 7, 7, 5, 7], [7, 5, 5, 5, 5, 7, 7, 5, 5, 5, 5, 7], [5, 7, 7, 7, 7, 5, 5, 7, 7, 7, 7, 5]]

Input: [[7, 7, 8], [5, 8, 8], [5, 8, 8]]
Output: [[8, 7, 7, 7, 7, 8, 8, 7, 7, 7, 7, 8], [8, 8, 5, 5, 8, 8, 8, 8, 5, 5, 8, 8], [8, 8, 5, 5, 8, 8, 8, 8, 5, 5, 8, 8]]

Input: [[8, 8, 8], [5, 5, 7], [5, 7, 8]]
Output: [[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], [7, 5, 5, 5, 5, 7, 7, 5, 5, 5, 5, 7], [8, 7, 5, 5, 7, 8, 8, 7, 5, 5, 7, 8]]

Input: [[8, 8, 7], [7, 5, 5], [5, 7, 8]]
Output: [[7, 8, 8, 8, 8, 7, 7, 8, 8, 8, 8, 7], [5, 5, 7, 7, 5, 5, 5, 5, 7, 7, 5, 5], [8, 7, 5, 5, 7, 8, 8, 7, 5, 5, 7, 8]]

Test Input Matrix:
[[8, 5, 7], [5, 7, 5], [8, 8, 5]]

**Provide ONLY the final output matrix for the test i

In [20]:
# ✅ Select the first 20 files from the filtered shortest files
num_samples = 20  # First batch to test
test_files = filtered_files[:num_samples]

# ✅ Dictionary to store results
comparison_results = {}

# ✅ Run inference and comparison for each file
for idx, file in enumerate(test_files):
    print(f"\n🚀 Running inference {idx + 1}/{num_samples}: {file}")

    # ✅ Construct file path
    file_path = os.path.join("/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/", file)
    
    # ✅ Construct prompt
    test_prompt = construct_fixed_prompt(file_path)

    # ✅ Tokenize the prompt
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference with NO stopping conditions
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,  # ✅ Allow long generation without cutting off
            do_sample=True,  
            temperature=0.7,  
            top_k=50,  
            top_p=0.9,  
            repetition_penalty=1.2,  # ✅ Reduces repetitive patterns
        )

    # ✅ Decode full output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # ✅ Extract output matrix
    extracted_matrix = extract_final_matrix(generated_text)

    # ✅ Load ground truth matrix
    ground_truth_matrix = load_ground_truth_matrix(file)

    # ✅ Compare matrices
    comparison_result = compare_matrices(extracted_matrix, ground_truth_matrix)

    # ✅ Store results
    comparison_results[file] = {
        "extracted_matrix": extracted_matrix,
        "ground_truth": ground_truth_matrix,
        "comparison": comparison_result
    }

    # ✅ Print results
    print("\n🔍 **Extracted Output Matrix:**\n")
    print(extracted_matrix)
    print("\n🎯 **Ground Truth Matrix:**\n")
    print(np.array(ground_truth_matrix) if ground_truth_matrix else "⚠️ No ground truth available")
    print("\n📊 **Comparison Result:**", comparison_result)
    print("\n" + "-" * 100 + "\n")

# ✅ Summary of correct vs incorrect predictions
num_correct = sum(1 for r in comparison_results.values() if "matches" in r["comparison"])
num_incorrect = num_samples - num_correct

print("\n📊 **Summary of First 20 Examples:**")
print(f"✅ {num_correct}/{num_samples} matrices matched the ground truth.")
print(f"❌ {num_incorrect}/{num_samples} matrices did NOT match the ground truth.")



🚀 Running inference 1/20: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json

🔍 **Extracted Output Matrix:**

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0]]

🎯 **Ground Truth Matrix:**

[[0 2 0 0 0 2 0 0 0 2 0 0]
 [0 0 4 0 0 0 4 0 0 0 4 0]
 [0 0 0 2 0 0 0 2 0 0 0 2]
 [4 0 0 0 4 0 0 0 4 0 0 0]
 [0 2 0 0 0 2 0 0 0 2 0 0]
 [0 0 4 0 0 0 4 0 0 0 4 0]
 [0 0 0 2 0 0 0 2 0 0 0 2]
 [4 0 0 0 4 0 0 0 4 

In [21]:
import json

# ✅ Model name (for saving results)
model_name = "FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.1_WithoutReasonings"

# ✅ Select all 200 filtered examples
num_samples = 200
batch_size = 50  # ✅ Save progress every 50 examples
test_files = filtered_files[:num_samples]

# ✅ Dictionary to store all results
all_results = {}

# ✅ Function to save results periodically
def save_results(batch_index):
    save_path = f"./{model_name}_results_batch_{batch_index}.json"
    with open(save_path, "w") as f:
        json.dump(all_results, f, indent=4)
    print(f"\n💾 **Saved results for {batch_index} examples at:** {save_path}")

# ✅ Run inference and comparison for each file
for idx, file in enumerate(test_files):
    print(f"\n🚀 Running inference {idx + 1}/{num_samples}: {file}")

    # ✅ Construct file path
    file_path = os.path.join("/pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/", file)
    
    # ✅ Construct prompt
    test_prompt = construct_fixed_prompt(file_path)

    # ✅ Tokenize the prompt
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

    # ✅ Remove `token_type_ids` if present
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # ✅ Run inference with NO stopping conditions
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,  # ✅ Allow long generation without cutting off
            do_sample=True,  
            temperature=0.7,  
            top_k=50,  
            top_p=0.9,  
            repetition_penalty=1.2,  # ✅ Reduces repetitive patterns
        )

    # ✅ Decode full output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # ✅ Extract output matrix
    extracted_matrix = extract_final_matrix(generated_text)

    # ✅ Load ground truth matrix
    ground_truth_matrix = load_ground_truth_matrix(file)

    # ✅ Compare matrices
    comparison_result = compare_matrices(extracted_matrix, ground_truth_matrix)

    # ✅ Store results
    all_results[file] = {
        "model_name": model_name,
        "extracted_matrix": extracted_matrix,
        "ground_truth": ground_truth_matrix,
        "comparison": comparison_result
    }

    # ✅ Print progress summary
    print("\n📊 **Comparison Result:**", comparison_result)
    print("\n" + "-" * 100 + "\n")

    # ✅ Save results every 50 examples
    if (idx + 1) % batch_size == 0:
        save_results(idx + 1)

# ✅ Final save for remaining results
save_results(num_samples)
print("\n✅ **Completed inference & comparison for all 200 examples!** 🚀")



🚀 Running inference 1/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/310f3251.json

📊 **Comparison Result:** ❌ The extracted matrix does NOT match the ground truth.

----------------------------------------------------------------------------------------------------


🚀 Running inference 2/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/00576224.json

📊 **Comparison Result:** ❌ Shape Mismatch! Extracted: (4, 4), Ground Truth: (6, 6)

----------------------------------------------------------------------------------------------------


🚀 Running inference 3/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/evaluation_dataset/e633a9e5.json

📊 **Comparison Result:** ❌ The extracted matrix does NOT match the ground truth.

----------------------------------------------------------------------------------------------------


🚀 Running inference 4/200: /pfs/data5/home/ma/ma_ma/ma_abthomas/json_files/training/eval

In [22]:
import json

# ✅ Model name used for saving results
model_name = "FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.1_WithoutReasonings"

# ✅ Paths to saved result files
result_files = [
    f"./{model_name}_results_batch_50.json",
    f"./{model_name}_results_batch_100.json",
    f"./{model_name}_results_batch_150.json",
    f"./{model_name}_results_batch_200.json"
]

# ✅ Load all results into one dictionary
all_results = {}
for file in result_files:
    try:
        with open(file, "r") as f:
            batch_results = json.load(f)
            all_results.update(batch_results)
        print(f"✅ Loaded results from: {file}")
    except FileNotFoundError:
        print(f"⚠️ Warning: Could not find {file}. Skipping...")

# ✅ Summary of Correct vs Incorrect Predictions
num_correct = sum(1 for r in all_results.values() if "matches" in r["comparison"])
num_incorrect = len(all_results) - num_correct

print("\n📊 **Summary of Model Performance:**")
print(f"✅ {num_correct}/{len(all_results)} matrices matched the ground truth.")
print(f"❌ {num_incorrect}/{len(all_results)} matrices did NOT match the ground truth.")

# ✅ Function to display detailed results for specific examples
def view_result(example_file):
    if example_file not in all_results:
        print("⚠️ Example not found in saved results.")
        return

    result = all_results[example_file]
    
    print(f"\n📂 **Example: {example_file}**")
    print(f"\n🎯 **Ground Truth Matrix:**")
    print(np.array(result["ground_truth"]) if result["ground_truth"] else "⚠️ No ground truth available")

    print("\n🔍 **Extracted Output Matrix:**")
    print(result["extracted_matrix"])

    print("\n📊 **Comparison Result:**", result["comparison"])
    print("\n" + "-" * 100 + "\n")

# ✅ View random example from results (modify filename as needed)
example_file = "59341089.json"  # Change this to any filename you want to inspect
view_result(example_file)


✅ Loaded results from: ./FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.1_WithoutReasonings_results_batch_50.json
✅ Loaded results from: ./FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.1_WithoutReasonings_results_batch_100.json
✅ Loaded results from: ./FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.1_WithoutReasonings_results_batch_150.json
✅ Loaded results from: ./FineLlama-3.1-8B_instruct_eval_lr0.0005_batch4_epochs1_wd0.1_WithoutReasonings_results_batch_200.json

📊 **Summary of Model Performance:**
✅ 2/200 matrices matched the ground truth.
❌ 198/200 matrices did NOT match the ground truth.
⚠️ Example not found in saved results.


In [23]:
# ✅ Filter valid results (exclude errors)
valid_results = {
    file: result
    for file, result in all_results.items()
    if "matches" in result["comparison"] or "does NOT match" in result["comparison"]
}

# ✅ Count correct vs incorrect results
num_correct = sum(1 for r in valid_results.values() if "matches" in r["comparison"])
num_invalid = len(all_results) - len(valid_results)
num_total_valid = len(valid_results)

# ✅ Compute accuracy percentage
accuracy = (num_correct / num_total_valid) * 100 if num_total_valid > 0 else 0

# ✅ Print Updated Summary
print("\n📊 **Updated Model Performance (Excluding Invalid Cases):**")
print(f"✅ {num_correct}/{num_total_valid} valid matrices matched the ground truth.")
print(f"❌ {num_total_valid - num_correct}/{num_total_valid} valid matrices did NOT match the ground truth.")
print(f"⚠️ {num_invalid} cases were excluded due to syntax errors.")
print(f"\n🎯 **Final Accuracy:** {accuracy:.2f}%")



📊 **Updated Model Performance (Excluding Invalid Cases):**
✅ 2/86 valid matrices matched the ground truth.
❌ 84/86 valid matrices did NOT match the ground truth.
⚠️ 114 cases were excluded due to syntax errors.

🎯 **Final Accuracy:** 2.33%


In [24]:
# ✅ Counters for each category
num_correct = 0
num_incorrect = 0
num_invalid = 0

# ✅ Filter valid and invalid results based on symbols
valid_results = {}

for file, result in all_results.items():
    comparison_text = result["comparison"]

    if "⚠️" in comparison_text:  # If comparison contains an error symbol
        num_invalid += 1
    elif "❌" in comparison_text:  # If prediction was incorrect but valid
        num_incorrect += 1
        valid_results[file] = result  # Still a valid case
    elif "✅" in comparison_text:  # If prediction was correct
        num_correct += 1
        valid_results[file] = result  # Still a valid case

# ✅ Compute valid case count
num_total_valid = num_correct + num_incorrect

# ✅ Compute accuracy percentage
accuracy = (num_correct / num_total_valid) * 100 if num_total_valid > 0 else 0

# ✅ Print Updated Summary
print("\n📊 **Updated Model Performance (Categorized Correctly):**")
print(f"✅ {num_correct}/{num_total_valid} valid matrices matched the ground truth.")
print(f"❌ {num_incorrect}/{num_total_valid} valid matrices did NOT match the ground truth.")
print(f"⚠️ {num_invalid}/{len(all_results)} cases had invalid matrix comparisons.")
print(f"\n🎯 **Final Accuracy (Excluding Invalid Cases):** {accuracy:.2f}%")



📊 **Updated Model Performance (Categorized Correctly):**
✅ 2/140 valid matrices matched the ground truth.
❌ 138/140 valid matrices did NOT match the ground truth.
⚠️ 60/200 cases had invalid matrix comparisons.

🎯 **Final Accuracy (Excluding Invalid Cases):** 1.43%
