# Evaluate Generated Code
This notebook loads the code generated by an LLM and uses the modified human-eval package to evaluate how well the code was generated by computing the Pass@1 score. Pass@5 and Pass@10 scores can also be computed.
* [Original package](https://github.com/openai/human-eval) - evaluates code generation models only on the [HumanEval](https://huggingface.co/datasets/openai/openai_humaneval) dataset.
* [My modifications](https://github.com/agnedil/code-generation/tree/main/modified-openai-human-eval-code) - I made an extensive addition to enable the evaluation on three more datasets: [MBPP](https://huggingface.co/datasets/google-research-datasets/mbpp), [LBPP](https://huggingface.co/datasets/CohereForAI/lbpp), and [Big Code Benchmark](https://huggingface.co/datasets/bigcode/bigcodebench)

__This version walks over the entire directory with all models and evaluates them in bulk, populating the dataframe with pass@1 scores for all models / prompt types / cleaning modes__.

The models should be evaluated in a specific order. This is required to populate the dataframe with results in the correct order. The order of prompts should be as follows:
* prompt_basic
* prompt
* prompt_full

Within each prompt, the order of cleaning modes should be as follows:
* raw
* cleaned_partially
* cleaned_fully
* cleaned_fully_light

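The evaluation loop in the "Clean and Evaluate Generated Code" section enforces both orders: it iterates over the prompt keys in the order listed above and, for each prompt, appends the four pass@1 scores in the cleaning-mode order listed above.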

In [1]:
import os
import numpy as np
import pandas as pd
from human_eval.evaluation import evaluate_functional_correctness
from human_eval.data import stream_jsonl, write_jsonl
from datasets import load_dataset
from collections import defaultdict
from helpers import get_code_in_fences, clean_code, clean_code_light, read_problems

# show up to 100 columns and 100 rows when a DataFrame is displayed
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
# IPython autoreload extension, mode 2: reload imported modules before running
# each cell, so that edits to local modules (e.g. helpers) are picked up
%load_ext autoreload
%autoreload 2

## 1. Load dataset

In [2]:
# choose the benchmark by its position in `dataset_names`
dataset_names = ['human_eval', 'big_code', 'lbpp', 'mbpp']
idx = 2
dataset_name = dataset_names[idx]
# the evaluation `mode` is the same identifier as the selected dataset
mode = dataset_name
print('Dataset:      ', dataset_name, '\nCleaning mode:', mode)

Dataset:       lbpp 
Cleaning mode: lbpp


In [3]:
# Read the benchmark tasks for 'human_eval' and 'big_code' only: their prompts
# supply the function signature used later when cleaning completions.
# Nothing is loaded for 'lbpp' and 'mbpp', so `tasks` is not created for them.
if dataset_name == 'human_eval':
    # `load_dataset` is already imported in the import cell at the top
    dataset = load_dataset("openai/openai_humaneval")
    tasks   = {i['task_id']:i for i in dataset['test']}
elif dataset_name == 'big_code':
    file = 'data/Big_Code_Bench_Test.jsonl.gz'
    tasks = read_problems(file)

if dataset_name in ['human_eval', 'big_code',]:
    print(f'Type of tasks: {type(tasks)}\n')
    # preview only the first two tasks, printing every field of each
    for k, v in list(tasks.items())[:2]:
        print(f'task_id: {k}', type(k))
        for k2, v2 in v.items():
            print(f'\n{k2}:\n{v2}')
        print('\n' + '='*100 + '\n')

In [4]:
# two example task ids per benchmark that provides prompts; other datasets
# have no entry here and therefore print nothing
sample_task_ids = {
    'human_eval': ['HumanEval/1', 'HumanEval/10'],
    'big_code':   ['BigCodeBench/1', 'BigCodeBench/10'],
}
for task_id in sample_task_ids.get(dataset_name, []):
    print(task_id, '\n', tasks[task_id]['prompt'], sep='')

## 2. Get filepaths

In [5]:
# prompt family used in the filenames: complete_code OR complete_task
objectives = ['code', 'task']
idx        = 1
objective  = objectives[idx]
print('Objective: ', objective)

# filename fragments identifying the three prompt variants; the medium key is
# the common stem of the other two
prompt_medium_key = 'complete_' + objective + '_prompt'
prompt_basic_key  = prompt_medium_key + '_basic'
prompt_full_key   = prompt_medium_key + '_full'

# models to be evaluated (commented-out entries are currently excluded)
model_names = [
    'phixtral-2x2',
    'Solar-10.7B',
    'Llama-3.1-8B',
    'codegemma-7b-it',
    'deepseek-coder-6.7b',
    'OpenCodeInterpreter-DS-6.7B',
    'Artigenz-Coder-DS-6.7B',
    'CodeQwen1.5-7B-Chat',
    'Nxcode-CQ-7B-orpo',
    #'phixtral-4x2',
    #'mistral_7b',
    #'mistral_3b',
    #'mistral_8B',
    #'nemo',
    #'codestral',
]
num_models = len(model_names)
print('Num models:', num_models)

Objective:  task
Num models: 9


In [6]:
# group raw completion files by model
wdir = 'logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087'

# Suffixes of files that the evaluation cell writes next to the raw completions
# (cleaned copies and per-sample results). They also end in '.jsonl', so they
# must be skipped here; otherwise re-running this cell after an evaluation
# would pick them up as if they were model completions.
DERIVED_SUFFIXES = (
    '_cleaned_partially.jsonl',
    '_cleaned_fully.jsonl',
    '_cleaned_fully_light.jsonl',
    '_results.jsonl',
)


def is_raw_completion_file(filename):
    """Return True for a '.jsonl' file that is not a derived (cleaned/results) file."""
    return filename.endswith('.jsonl') and not filename.endswith(DERIVED_SUFFIXES)


jsonl_files = defaultdict(list)

for root, dirs, files in os.walk(wdir):
    # sorted() makes the per-directory listing independent of filesystem order
    for filename in sorted(files):
        if not is_raw_completion_file(filename):
            continue
        # exactly one configured model name must occur in the filename
        matches = [name for name in model_names if name in filename]
        assert len(matches)==1, f'None or too many model names in file {filename}'
        jsonl_files[matches[0]].append(os.path.join(root, filename))

print("Models with JSONL files found:")
print(len(jsonl_files))
for model, files in jsonl_files.items():
    print(model)
    print('Num files:', len(files))
    print('\n'.join(files), '\n')

Models with JSONL files found:
9
phixtral-2x2
Num files: 3
logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250620_110343_5411.jsonl
logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_temperature1.0_topP0.87_completions_20250620_110455_4025.jsonl
logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250620_110509_7186.jsonl 

Nxcode-CQ-7B-orpo
Num files: 3
logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030726_5547.jsonl
logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030

In [7]:
# convert to dict of dict: model name -> {prompt key -> completions file}
prompt_dict = dict()
for model_name, files in jsonl_files.items():
    result = dict()
    for file in files:
        # The medium key is a substring of the basic and full keys, so the
        # longer keys are tested first and the medium key is matched last.
        if prompt_basic_key in file:
            key = prompt_basic_key
        elif prompt_full_key in file:
            key = prompt_full_key
        elif prompt_medium_key in file:
            key = prompt_medium_key
        else:
            # do not silently file an unrelated name under the medium prompt
            print(f'WARNING: skipping {file} - no known prompt key in its name')
            continue
        # two files for one prompt type would otherwise overwrite each other
        assert key not in result, (
            f'More than one file for {key} of {model_name}: {result[key]} and {file}'
        )
        result[key] = file
    # report incomplete models here; the evaluation cell indexes all three keys
    missing = [key for key in (prompt_basic_key, prompt_medium_key, prompt_full_key)
               if key not in result]
    if missing:
        print(f'WARNING: {model_name} has no file for {missing}')
    prompt_dict[ model_name ] = result

for model_name, prompt_file_pair in prompt_dict.items():
    print(model_name)
    print(len(prompt_file_pair))
    for prompt, file in prompt_file_pair.items():
        print(f'\t{prompt}: {file}', '\n')

phixtral-2x2
3
	complete_task_prompt_basic: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250620_110343_5411.jsonl 

	complete_task_prompt: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_temperature1.0_topP0.87_completions_20250620_110455_4025.jsonl 

	complete_task_prompt_full: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250620_110509_7186.jsonl 

Nxcode-CQ-7B-orpo
3
	complete_task_prompt_full: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030726_5547.jsonl 

	complete_task_prompt: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcod

## 3. Clean and Evaluate Generated Code

```
:param mode:
       'human_eval' - using the HumanEval dataset with or without func header;
       'mbpp' - using the MBPP dataset;
       'lbpp' - using the LBPP dataset;
       'big_code' - using the Big Code Benchmark dataset.
:param k:
        [1] - pass@1 score
        [1,5] - pass@1 and pass@5 scores, etc.
```

In [8]:
# Clean every completions file in three ways, evaluate the raw file and the
# three cleaned files, and collect the pass@1 scores per model.
# final_res: model name -> flat list of pass@1 scores, four per prompt
# (raw, cleaned_partially, cleaned_fully, cleaned_fully_light), with prompts
# in the order basic, medium, full.
final_res = defaultdict(list)

# enforce the order of prompts
prompt_keys = [prompt_basic_key, prompt_medium_key, prompt_full_key]

# enforce the order of cleaning modes: (file-name suffix, label used in the log)
cleaning_modes = [
    ('cleaned_partially',   'PARTIALLY CLEANED'),
    ('cleaned_fully',       'FULLY CLEANED'),
    ('cleaned_fully_light', 'FULLY CLEANED LIGHT'),
]

for model_name, prompt_file_pair in prompt_dict.items():
    print('\nModel:', model_name)

    for prompt_key in prompt_keys:
        print('\nPrompt key:', prompt_key)

        # load file with results for cleaning
        file_name = prompt_file_pair[ prompt_key ]
        results   = list(stream_jsonl(file_name))
        print(f'\nFile: {file_name}\nNumber of completions: {len(results)}')

        # apply different degrees of cleaning
        cleaned = {suffix: [] for suffix, _label in cleaning_modes}
        for item in results:
            task_id    = item['task_id']
            completion = item['completion']

            # no signature if lbpp and mbpp
            if dataset_name in ['lbpp', 'mbpp']:
                signature = None
            else:
                signature = tasks[task_id]['prompt']

            cleaned_completions = {
                'cleaned_partially':   get_code_in_fences(completion),
                'cleaned_fully':       clean_code(completion, signature),
                'cleaned_fully_light': clean_code_light(completion, signature),
            }
            for suffix, code in cleaned_completions.items():
                cleaned[suffix].append( {'task_id': task_id, 'completion': code} )

        # save cleaned files; splitext removes only the final '.jsonl' extension,
        # whereas str.replace would also strip '.jsonl' occurring elsewhere in the path
        stem = os.path.splitext(file_name)[0]
        files_to_score = [('RAW', file_name)]
        for suffix, label in cleaning_modes:
            cleaned_file = f'{stem}_{suffix}.jsonl'
            write_jsonl(cleaned_file, cleaned[suffix])
            files_to_score.append( (label, cleaned_file) )

        # evaluate raw file first, then the cleaned files in the enforced order
        local_scores = []
        for label, sample_file in files_to_score:
            print(f'\nEVALUATING {label} FILE')
            scores = evaluate_functional_correctness(
                sample_file,
                k=[1],
                mode=mode, )
            local_scores.append( scores['pass@1'] )
        final_res[model_name].extend( local_scores )

        print('\nResults for:', file_name)
        print('\nRaw score')
        print('Partially cleaned score')
        print('Fully cleaned score')
        print('Fully light cleaned score\n')
        print(*local_scores, sep='\n')
        print('\n', '='*75, '\n', sep='')
    print('Interim  results:', *final_res.items(), sep='\n')
    print('\n', '='*100, '\n', sep='')


Model: phixtral-2x2

Prompt key: complete_task_prompt_basic

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250620_110343_5411.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 16140.37it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.56it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250620_110343_5411.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 42242.91it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 6246.80it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 12.81it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250620_110343_5411_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 50201.50it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 4344.43it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 12.87it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250620_110343_5411_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 51393.79it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 4438.59it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 12.91it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250620_110343_5411_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 48258.33it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250620_110343_5411.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.024691358024691357
0.12962962962962962
0.12345679012345678
0.12962962962962962



Prompt key: complete_task_prompt

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_temperature1.0_topP0.87_completions_20250620_110455_4025.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 7505.13it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.20it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_temperature1.0_topP0.87_completions_20250620_110455_4025.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 45274.34it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 5281.72it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.61it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_temperature1.0_topP0.87_completions_20250620_110455_4025_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 51608.48it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 4309.00it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 13.48it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_temperature1.0_topP0.87_completions_20250620_110455_4025_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 51231.04it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 5334.25it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 13.12it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_temperature1.0_topP0.87_completions_20250620_110455_4025_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 48841.09it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_temperature1.0_topP0.87_completions_20250620_110455_4025.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.012345679012345678
0.12345679012345678
0.12962962962962962
0.13580246913580246



Prompt key: complete_task_prompt_full

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250620_110509_7186.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 4928.39it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.40it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250620_110509_7186.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 47314.06it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 5208.00it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 12.20it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250620_110509_7186_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 57325.34it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 6910.59it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 12.47it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250620_110509_7186_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 47803.38it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 5967.55it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 11.97it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250620_110509_7186_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 46510.87it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/phixtral-2x2_8/phixtral-2x2_8_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250620_110509_7186.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.012345679012345678
0.08641975308641975
0.11728395061728394
0.12345679012345678


Interim  results:
('phixtral-2x2', [0.024691358024691357, 0.12962962962962962, 0.12345679012345678, 0.12962962962962962, 0.012345679012345678, 0.12345679012345678, 0.12962962962962962, 0.13580246913580246, 0.012345679012345678, 0.08641975308641975, 0.11728395061728394, 0.12345679012345678])



Model: Nxcode-CQ-7B-orpo

Prompt key: complete_task_prompt_basic

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030642_7246.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 6880.43it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.16it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030642_7246.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 42809.81it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 4915.31it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 12.43it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030642_7246_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 55617.36it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 3496.04it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 13.48it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030642_7246_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 58104.78it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 6033.15it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 12.71it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030642_7246_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 53063.43it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030642_7246.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.006172839506172839
0.19753086419753085
0.20987654320987653
0.20987654320987653



Prompt key: complete_task_prompt

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030705_0898.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 4886.88it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.37it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030705_0898.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 49198.27it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 10323.11it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 12.43it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030705_0898_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 55440.38it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 38134.32it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 12.36it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030705_0898_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 54580.87it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 43690.67it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 12.40it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030705_0898_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 49862.57it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030705_0898.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.012345679012345678
0.21604938271604937
0.20987654320987653
0.21604938271604937



Prompt key: complete_task_prompt_full

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030726_5547.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 4798.13it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.38it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030726_5547.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 56783.99it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 7029.71it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 12.45it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030726_5547_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 56510.08it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 6975.87it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 11.80it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030726_5547_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 58464.74it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 4992.60it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 11.87it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030726_5547_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 52187.19it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nxcode-CQ-7B/Nxcode-CQ-7B-orpo_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030726_5547.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.018518518518518517
0.15432098765432098
0.19753086419753085
0.19753086419753085


Interim  results:
('phixtral-2x2', [0.024691358024691357, 0.12962962962962962, 0.12345679012345678, 0.12962962962962962, 0.012345679012345678, 0.12345679012345678, 0.12962962962962962, 0.13580246913580246, 0.012345679012345678, 0.08641975308641975, 0.11728395061728394, 0.12345679012345678])
('Nxcode-CQ-7B-orpo', [0.006172839506172839, 0.19753086419753085, 0.20987654320987653, 0.20987654320987653, 0.012345679012345678, 0.21604938271604937, 0.20987654320987653, 0.21604938271604937, 0.018518518518518517, 0.15432098765432098, 0.19753086419753085, 0.19753086419753085])



Model: Artigenz-Coder-DS-6.7B

Prompt key: complete_task_pro

162it [00:00, 1345.07it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.48it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030434_5546.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 39735.51it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 10291.06it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:15<00:00, 10.59it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030434_5546_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 47931.52it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 4246.07it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:15<00:00, 10.25it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030434_5546_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 48855.14it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 4266.25it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:15<00:00, 10.71it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030434_5546_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 45988.31it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030434_5546.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.024691358024691357
0.2777777777777778
0.2345679012345679
0.2777777777777778



Prompt key: complete_task_prompt

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030500_9121.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 11076.15it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.45it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030500_9121.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 39754.11it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 6141.50it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 12.75it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030500_9121_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 48097.77it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 5716.52it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 13.20it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030500_9121_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 48299.49it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 5207.52it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 12.81it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030500_9121_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 49586.02it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030500_9121.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.24074074074074073
0.22839506172839505
0.24691358024691357



Prompt key: complete_task_prompt_full

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030507_7872.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 7593.88it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.46it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030507_7872.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 44358.09it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 4317.60it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.84it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030507_7872_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 58144.55it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 7859.50it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.76it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030507_7872_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 59666.07it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 8168.95it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.82it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030507_7872_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 53150.60it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Artigenz-Coder-DS-6.7B/Artigenz-Coder-DS-6.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030507_7872.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.24074074074074073
0.2222222222222222
0.24074074074074073


Interim  results:
('phixtral-2x2', [0.024691358024691357, 0.12962962962962962, 0.12345679012345678, 0.12962962962962962, 0.012345679012345678, 0.12345679012345678, 0.12962962962962962, 0.13580246913580246, 0.012345679012345678, 0.08641975308641975, 0.11728395061728394, 0.12345679012345678])
('Nxcode-CQ-7B-orpo', [0.006172839506172839, 0.19753086419753085, 0.20987654320987653, 0.20987654320987653, 0.012345679012345678, 0.21604938271604937, 0.20987654320987653, 0.21604938271604937, 0.018518518518518517, 0.15432098765432098, 0.19753086419753085, 0.19753086419753085])
('Artigenz-Coder-DS-6.7B', [0.024691358024691357, 0.27777777777777

162it [00:00, 39793.69it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.49it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005046_4507.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 46741.23it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 7268.92it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.64it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005046_4507_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 45453.02it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 11381.91it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.54it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005046_4507_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 46847.58it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 5919.15it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.63it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005046_4507_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 43511.61it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005046_4507.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.11728395061728394
0.1419753086419753
0.14814814814814814



Prompt key: complete_task_prompt

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_004929_5516.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 6511.02it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.46it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_004929_5516.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 51219.45it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 5978.05it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.72it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_004929_5516_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 51592.81it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 5205.96it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.69it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_004929_5516_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 51038.63it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 11676.27it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.67it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_004929_5516_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 51188.58it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_004929_5516.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.1111111111111111
0.17901234567901234
0.17901234567901234



Prompt key: complete_task_prompt_full

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_004945_3624.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 12162.19it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.47it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_004945_3624.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 59167.30it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 7448.29it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 13.44it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_004945_3624_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 52534.19it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 12514.55it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 13.46it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_004945_3624_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 55571.87it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 5555.55it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 13.42it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_004945_3624_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 54502.07it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/codegemma-7b-it/codegemma-7b-it_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_004945_3624.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.13580246913580246
0.16666666666666666
0.16666666666666666


Interim  results:
('phixtral-2x2', [0.024691358024691357, 0.12962962962962962, 0.12345679012345678, 0.12962962962962962, 0.012345679012345678, 0.12345679012345678, 0.12962962962962962, 0.13580246913580246, 0.012345679012345678, 0.08641975308641975, 0.11728395061728394, 0.12345679012345678])
('Nxcode-CQ-7B-orpo', [0.006172839506172839, 0.19753086419753085, 0.20987654320987653, 0.20987654320987653, 0.012345679012345678, 0.21604938271604937, 0.20987654320987653, 0.21604938271604937, 0.018518518518518517, 0.15432098765432098, 0.19753086419753085, 0.19753086419753085])
('Artigenz-Coder-DS-6.7B', [0.024691358024691357, 0.2777777777777778, 0.2345679

162it [00:00, 6925.74it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.50it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250620_110633_1799.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 32824.99it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 4515.91it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.48it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250620_110633_1799_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 43865.54it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 10582.60it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 14.44it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250620_110633_1799_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 36876.00it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 5983.84it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.40it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250620_110633_1799_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 42823.30it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250620_110633_1799.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.2222222222222222
0.17901234567901234
0.2222222222222222



Prompt key: complete_task_prompt

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_temperature1.0_topP0.87_completions_20250620_110703_7061.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 7290.06it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.46it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_temperature1.0_topP0.87_completions_20250620_110703_7061.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 36331.80it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 5376.89it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:15<00:00, 10.77it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_temperature1.0_topP0.87_completions_20250620_110703_7061_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 35983.54it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 6945.06it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:14<00:00, 11.47it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_temperature1.0_topP0.87_completions_20250620_110703_7061_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 39444.87it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 11555.15it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:15<00:00, 10.77it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_temperature1.0_topP0.87_completions_20250620_110703_7061_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 37368.82it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_temperature1.0_topP0.87_completions_20250620_110703_7061.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.16049382716049382
0.19135802469135801
0.21604938271604937



Prompt key: complete_task_prompt_full

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250620_110711_1287.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 52578.91it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.50it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250620_110711_1287.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 64289.64it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 7095.55it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 14.52it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250620_110711_1287_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 59105.54it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 7755.70it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 14.44it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250620_110711_1287_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 54580.87it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 4309.76it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 14.40it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250620_110711_1287_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 50492.48it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Llama-3.1-8B/Meta-Llama-3.1-8B-Instruct_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250620_110711_1287.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.2037037037037037
0.2345679012345679
0.2345679012345679


Interim  results:
('phixtral-2x2', [0.024691358024691357, 0.12962962962962962, 0.12345679012345678, 0.12962962962962962, 0.012345679012345678, 0.12345679012345678, 0.12962962962962962, 0.13580246913580246, 0.012345679012345678, 0.08641975308641975, 0.11728395061728394, 0.12345679012345678])
('Nxcode-CQ-7B-orpo', [0.006172839506172839, 0.19753086419753085, 0.20987654320987653, 0.20987654320987653, 0.012345679012345678, 0.21604938271604937, 0.20987654320987653, 0.21604938271604937, 0.018518518518518517, 0.15432098765432098, 0.19753086419753085, 0.19753086419753085])
('Artigenz-Coder-DS-6.7B', [0.024691358024691357, 0.2777777777777778, 0.23

162it [00:00, 5102.83it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.42it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030558_0798.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 45377.14it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 47999.24it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 12.00it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030558_0798_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 55422.29it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 4774.66it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 12.86it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030558_0798_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 55156.85it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 10270.06it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 12.18it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030558_0798_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 62423.27it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_030558_0798.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.024691358024691357
0.19753086419753085
0.2037037037037037
0.2037037037037037



Prompt key: complete_task_prompt

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030618_0182.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 5398.33it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.49it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030618_0182.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 50108.94it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 12870.35it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:14<00:00, 11.41it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030618_0182_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 57060.57it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 11858.66it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 14.34it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030618_0182_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 52255.42it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 7429.96it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:14<00:00, 11.49it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030618_0182_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 54673.10it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_030618_0182.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.20987654320987653
0.2222222222222222
0.2222222222222222



Prompt key: complete_task_prompt_full

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030618_5304.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 49705.72it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.55it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030618_5304.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 58951.70it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 11448.65it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 12.98it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030618_5304_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 57773.76it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 7425.74it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 12.92it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030618_5304_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 56477.20it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 11079.22it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 12.77it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030618_5304_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 52721.70it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/CodeQwen1.5-7B-Chat/CodeQwen1.5-7B-Chat_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_030618_5304.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.16049382716049382
0.18518518518518517
0.18518518518518517


Interim  results:
('phixtral-2x2', [0.024691358024691357, 0.12962962962962962, 0.12345679012345678, 0.12962962962962962, 0.012345679012345678, 0.12345679012345678, 0.12962962962962962, 0.13580246913580246, 0.012345679012345678, 0.08641975308641975, 0.11728395061728394, 0.12345679012345678])
('Nxcode-CQ-7B-orpo', [0.006172839506172839, 0.19753086419753085, 0.20987654320987653, 0.20987654320987653, 0.012345679012345678, 0.21604938271604937, 0.20987654320987653, 0.21604938271604937, 0.018518518518518517, 0.15432098765432098, 0.19753086419753085, 0.19753086419753085])
('Artigenz-Coder-DS-6.7B', [0.024691358024691357, 0.2777777777777778, 0

162it [00:00, 6572.81it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 14.52it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250624_022840_4744.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 43372.73it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 10920.91it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:15<00:00, 10.14it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250624_022840_4744_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 51987.55it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 4305.86it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:15<00:00, 10.13it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250624_022840_4744_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 46860.50it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 4145.56it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:15<00:00, 10.33it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250624_022840_4744_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 44427.70it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250624_022840_4744.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.024691358024691357
0.05555555555555555
0.09259259259259259
0.09259259259259259



Prompt key: complete_task_prompt

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250624_022858_4894.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 5351.82it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.52it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250624_022858_4894.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 45985.20it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 4707.38it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 11.57it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250624_022858_4894_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 52869.38it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 4658.36it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 11.60it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250624_022858_4894_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 56627.82it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 5435.12it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:13<00:00, 11.66it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250624_022858_4894_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 51428.80it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250624_022858_4894.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.04938271604938271
0.07407407407407407
0.07407407407407407



Prompt key: complete_task_prompt_full

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250624_022934_3062.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 5902.75it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.50it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250624_022934_3062.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 61647.36it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 4576.93it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.69it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250624_022934_3062_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 57980.82it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 7232.48it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.76it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250624_022934_3062_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 59865.84it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 7744.21it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.78it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250624_022934_3062_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 61736.98it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/Nous-Hermes-2-Solar-10.7B/Nous-Hermes-2-Solar-10.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250624_022934_3062.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.037037037037037035
0.08641975308641975
0.08641975308641975


Interim  results:
('phixtral-2x2', [0.024691358024691357, 0.12962962962962962, 0.12345679012345678, 0.12962962962962962, 0.012345679012345678, 0.12345679012345678, 0.12962962962962962, 0.13580246913580246, 0.012345679012345678, 0.08641975308641975, 0.11728395061728394, 0.12345679012345678])
('Nxcode-CQ-7B-orpo', [0.006172839506172839, 0.19753086419753085, 0.20987654320987653, 0.20987654320987653, 0.012345679012345678, 0.21604938271604937, 0.20987654320987653, 0.21604938271604937, 0.018518518518518517, 0.15432098765432098, 0.19753086419753085, 0.19753086419753085])
('Artigenz-Coder-DS-6.7B', [0.024691358024691357, 0.277777

162it [00:00, 11170.10it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.42it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005353_7884.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 41098.24it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 5345.71it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 14.51it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005353_7884_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 55143.42it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 4325.51it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 14.31it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005353_7884_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 58757.98it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 10548.27it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 14.46it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005353_7884_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 52578.91it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005353_7884.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.006172839506172839
0.2716049382716049
0.2716049382716049
0.2716049382716049



Prompt key: complete_task_prompt

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_005425_3382.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 7041.58it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.46it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_005425_3382.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 45851.76it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 8050.20it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 14.19it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_005425_3382_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 52175.17it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 6973.72it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.02it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_005425_3382_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 54933.89it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 10890.80it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 14.20it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_005425_3382_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 58931.24it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_005425_3382.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.2716049382716049
0.25925925925925924
0.2716049382716049



Prompt key: complete_task_prompt_full

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_005453_2758.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 4233.95it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.49it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_005453_2758.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 55767.99it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 8340.62it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.62it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_005453_2758_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 58040.25it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 6173.53it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 13.37it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_005453_2758_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 62641.95it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 4502.74it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:12<00:00, 13.49it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_005453_2758_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 57383.43it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/deepseek-coder-6.7b/deepseek-coder-6.7b-instruct_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_005453_2758.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.21604938271604937
0.22839506172839505
0.22839506172839505


Interim  results:
('phixtral-2x2', [0.024691358024691357, 0.12962962962962962, 0.12345679012345678, 0.12962962962962962, 0.012345679012345678, 0.12345679012345678, 0.12962962962962962, 0.13580246913580246, 0.012345679012345678, 0.08641975308641975, 0.11728395061728394, 0.12345679012345678])
('Nxcode-CQ-7B-orpo', [0.006172839506172839, 0.19753086419753085, 0.20987654320987653, 0.20987654320987653, 0.012345679012345678, 0.21604938271604937, 0.20987654320987653, 0.21604938271604937, 0.018518518518518517, 0.15432098765432098, 0.19753086419753085, 0.19753086419753085])
('Artigenz-Coder-DS-6.7B', [0.024691358024691357, 0.2777777777

162it [00:00, 43913.74it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.39it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005525_4427.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 39959.85it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 5699.98it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [16:17<00:00,  6.03s/it]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005525_4427_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 60242.69it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 5542.64it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.73it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005525_4427_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 56727.10it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 11222.50it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.77it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005525_4427_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 56016.26it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_basic_temperature1.0_topP0.87_completions_20250621_005525_4427.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.024691358024691357
0.2777777777777778
0.2654320987654321
0.2777777777777778



Prompt key: complete_task_prompt

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_005551_7730.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 5443.18it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.75it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_005551_7730.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 45175.00it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 11985.84it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [16:46<00:00,  6.22s/it]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_005551_7730_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 56651.43it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 13316.82it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:14<00:00, 10.87it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_005551_7730_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 59582.36it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 53067.58it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:14<00:00, 10.92it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_005551_7730_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 57582.82it/s]



Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_temperature1.0_topP0.87_completions_20250621_005551_7730.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.2654320987654321
0.2654320987654321
0.2654320987654321



Prompt key: complete_task_prompt_full

File: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_005626_7242.jsonl
Number of completions: 162

EVALUATING RAW FILE
Reading samples...


162it [00:00, 4707.54it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:10<00:00, 15.72it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_005626_7242.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 47619.12it/s]



EVALUATING PARTIALLY CLEANED FILE
Reading samples...


162it [00:00, 13704.39it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [15:10<00:00,  5.62s/it]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_005626_7242_cleaned_partially.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 56684.51it/s]



EVALUATING FULLY CLEANED FILE
Reading samples...


162it [00:00, 49031.41it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 13.64it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_005626_7242_cleaned_fully.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 63207.19it/s]



EVALUATING FULLY CLEANED LIGHT FILE
Reading samples...


162it [00:00, 6423.80it/s]


Running test suites...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:11<00:00, 14.65it/s]


Writing results to logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_005626_7242_cleaned_fully_light.jsonl_results.jsonl...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 62480.67it/s]


Results for: logs_results/final-round-one-model-top-p_experiment/LBPP/top-p_087/OpenCodeInterpreter-DS-6.7B/OpenCodeInterpreter-DS-6.7B_complete_task_prompt_full_temperature1.0_topP0.87_completions_20250621_005626_7242.jsonl

Raw score
Partially cleaned score
Fully cleaned score
Fully light cleaned score

0.0
0.2716049382716049
0.29012345679012347
0.2839506172839506


Interim  results:
('phixtral-2x2', [0.024691358024691357, 0.12962962962962962, 0.12345679012345678, 0.12962962962962962, 0.012345679012345678, 0.12345679012345678, 0.12962962962962962, 0.13580246913580246, 0.012345679012345678, 0.08641975308641975, 0.11728395061728394, 0.12345679012345678])
('Nxcode-CQ-7B-orpo', [0.006172839506172839, 0.19753086419753085, 0.20987654320987653, 0.20987654320987653, 0.012345679012345678, 0.21604938271604937, 0.20987654320987653, 0.21604938271604937, 0.018518518518518517, 0.15432098765432098, 0.19753086419753085, 0.19753086419753085])
('Artigenz-Coder-DS-6.7B', [0.024691358024691357, 0.27777




In [9]:
# Assemble the collected scores into one table (one column per model) and save it as CSV.
# Only models that actually have an entry in final_res are kept, in model_names order.
cols   = [c for c in model_names if c in final_res]
df_res = pd.DataFrame(final_res)[cols]
results_file = 'results/indiv_model_results_lbpp_top-p_087_20250623.csv'
# to_csv does not create missing folders, so make sure the target directory exists first.
os.makedirs(os.path.dirname(results_file), exist_ok=True)
# index=False: the CSV holds only the score columns, without the row index.
df_res.to_csv(results_file, index=False)
df_res

Unnamed: 0,phixtral-2x2,Solar-10.7B,Llama-3.1-8B,codegemma-7b-it,deepseek-coder-6.7b,OpenCodeInterpreter-DS-6.7B,Artigenz-Coder-DS-6.7B,CodeQwen1.5-7B-Chat,Nxcode-CQ-7B-orpo
0,0.024691,0.024691,0.0,0.0,0.006173,0.024691,0.024691,0.024691,0.006173
1,0.12963,0.055556,0.222222,0.117284,0.271605,0.277778,0.277778,0.197531,0.197531
2,0.123457,0.092593,0.179012,0.141975,0.271605,0.265432,0.234568,0.203704,0.209877
3,0.12963,0.092593,0.222222,0.148148,0.271605,0.277778,0.277778,0.203704,0.209877
4,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012346
5,0.123457,0.049383,0.160494,0.111111,0.271605,0.265432,0.240741,0.209877,0.216049
6,0.12963,0.074074,0.191358,0.179012,0.259259,0.265432,0.228395,0.222222,0.209877
7,0.135802,0.074074,0.216049,0.179012,0.271605,0.265432,0.246914,0.222222,0.216049
8,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018519
9,0.08642,0.037037,0.203704,0.135802,0.216049,0.271605,0.240741,0.160494,0.154321


In [10]:
# This name is never assigned, so evaluating it raises NameError (see the recorded output).
# NOTE(review): presumably a deliberate guard that halts "Run All" before the debug section — confirm.
unknown_variable

NameError: name 'unknown_variable' is not defined

## 5. Debug errors

In [9]:
# Read two completion files into memory for debugging and compare their sizes:
# results1 comes from the prompt_basic run, results2 from the prompt run.
file_name = 'logs/final-round-one-model/BigCode/Nous-Hermes-2-Solar-10.7B/BigCode_Nous-Hermes-2-Solar-10.7B_complete_code_prompt_basic_temperature1.0_topP1.0_completions_20250402_030815_4464.jsonl'
results1 = [record for record in stream_jsonl(file_name)]
file_name = 'logs/final-round-one-model/BigCode/Nous-Hermes-2-Solar-10.7B/BigCode_Nous-Hermes-2-Solar-10.7B_complete_code_prompt_temperature1.0_topP1.0_completions_20250402_031004_3447.jsonl'
results2 = [record for record in stream_jsonl(file_name)]
# Record counts of both files, displayed as the cell output.
len(results1), len(results2)

(500, 500)

In [12]:
# Which task ids appear in results2 but not in results1?
keys1 = [i['task_id'] for i in results1]
keys2 = [i['task_id'] for i in results2]
# Look keys up in a set so each membership test is O(1) instead of a scan over keys1.
keys1_lookup = set(keys1)
[key for key in keys2 if key not in keys1_lookup]

['BigCodeBench/1138']

In [16]:
# Placeholder record for the task id reported as missing from results1.
one_res = dict(task_id='BigCodeBench/1138', completion='completion')

In [17]:
# Add the placeholder record only when its task id is not already in results1,
# so re-running this cell cannot append the same record twice.
if all(rec['task_id'] != one_res['task_id'] for rec in results1):
    results1.append( one_res )
# Current number of records, displayed as the cell output.
len(results1)

500

In [15]:
# Set the task id on the final record of results1, then display that record.
# NOTE(review): this relabels whichever record is currently last — presumably meant to run
# only right after the placeholder record was appended; confirm before re-running.
results1[-1].update(task_id='BigCodeBench/1138')
results1[-1]

{'task_id': 'BigCodeBench/1138', 'completion': 'completion'}

In [16]:
# Write the records in results1 to the prompt_basic completions file.
# This is the same path results1 was read from at the start of this debug section.
# NOTE(review): presumably write_jsonl replaces the existing file contents — confirm its write mode.
file_name = 'logs/final-round-one-model/BigCode/Nous-Hermes-2-Solar-10.7B/BigCode_Nous-Hermes-2-Solar-10.7B_complete_code_prompt_basic_temperature1.0_topP1.0_completions_20250402_030815_4464.jsonl'
write_jsonl(file_name, results1)

In [497]:
# Review: print the completion at one position from each of the four result lists,
# one after another, separated by a ruler line.
# The position uses its own name so that the notebook-level `idx` (the dataset selector
# set in Section 1) is not overwritten by this debug cell.
sample_idx = 55
for res in [results, results_cleaned_partially, results_cleaned_fully, results_cleaned_fully_light]:
    print(res[sample_idx]['completion'])
    print('\n', '='*75, '\n', sep='')

def fib(n: int):
    if n <= 0:
        return "Input should be a positive integer."
    elif n == 1:
        return 0
    elif n == 2:
        return 1
    else:
        a, b = 0, 1
        for _ in range(2, n):
            a, b = b, a + b
        return b

# Test cases
print(fib(10))  # Expected output: 34
print(fib(1))   # Expected output: 0
print(fib(8))   # Expected output: 13


def fib(n: int):
    if n <= 0:
        return "Input should be a positive integer."
    elif n == 1:
        return 0
    elif n == 2:
        return 1
    else:
        a, b = 0, 1
        for _ in range(2, n):
            a, b = b, a + b
        return b

# Test cases
print(fib(10))  # Expected output: 34
print(fib(1))   # Expected output: 0
print(fib(8))   # Expected output: 13


from typing import *

def fib(n: int):
    if n <= 0:
        return "Input should be a positive integer."
    elif n == 1:
        return 0
    elif n == 2:
        return 1
    else:
        a, b = 0, 1
        for _ in rang

In [None]:
# Display the first record of `results` for inspection.
results[0]