# Reflection Agent on the Big Code Bench Dataset
This experiment is done using:
* 11 small language models (SLMs)
* 3 types of prompt (no prompt, some instructions, detailed instructions)

In [1]:
# mount Google Drive
from google.colab import drive

drive.mount('/content/drive')             # Mount Google Drive to Colab session
!ls /content/drive/MyDrive                # Verify that your drive is accessible

Mounted at /content/drive
'16 б.gdoc'
'6 фото.     5 лет блондин.JPG'
'Archive (1).zip'
 Archive.zip
'Colab Notebooks'
 futuristic-office.jpeg
'Labwork_Andrew_Nedilko_20250219 (1).pdf'
 Labwork_Andrew_Nedilko_20250219.pdf
'ML System Design Sketch.gdoc'
'Most Asked ML Questions.pdf'
 My_Drawing.gdraw
 rag_design.gdraw
 rental_houses.gsheet
 share
'Statistics in Plain English_Chapter Summaries.zip'
'Statistics in Plain English_WHAT I NEED Variance Significance Effect Size etc.pdf'
'Tanya Passport.JPG'
'лечение травами 3.gdoc'


In [2]:
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/')

from prompts import ( complete_code_prompt_basic, complete_code_prompt, complete_code_prompt_full,
                      complete_task_prompt_basic, complete_task_prompt, complete_task_prompt_full,
                      reflection_prompt_basic, reflection_prompt, reflection_prompt_full)
from helpers import get_tokenizer, get_model, generate_response, clean_code, clean_code_light, is_function, write_jsonl, read_problems

In [3]:
import logging
import sys
import time, datetime
import os
import torch
from datasets import load_dataset
from google.colab import userdata

# get HuggingFace token from secrets
HF_TOKEN = userdata.get('HF_TOKEN')       # Retrieve token from Colab Secrets
os.environ['HF_TOKEN'] = HF_TOKEN         # Set token as env variable (recommended optional)
from huggingface_hub import HfFolder
HfFolder.save_token(HF_TOKEN)             # Store the token locally to avoid re-authentication

In [4]:
# select model, propmt, temperature, top_p
models = [
    ('mlabonne/phixtral-2x2_8', 'phixtral-2x2_8'),
    ('meta-llama/Meta-Llama-3.1-8B-Instruct', 'Meta-Llama-3.1-8B-Instruct'),
    ('google/codegemma-7b-it', 'codegemma-7b-it'),
    ('deepseek-ai/deepseek-coder-6.7b-instruct', 'deepseek-coder-6.7b-instruct'),
    ('m-a-p/OpenCodeInterpreter-DS-6.7B', 'OpenCodeInterpreter-DS-6.7B'),
    ('Artigenz/Artigenz-Coder-DS-6.7B', 'Artigenz-Coder-DS-6.7B'),
    ('Qwen/CodeQwen1.5-7B-Chat', 'CodeQwen1.5-7B-Chat'),
    ('NTQAI/Nxcode-CQ-7B-orpo', 'Nxcode-CQ-7B-orpo'),
    #('mlabonne/phixtral-4x2_8', 'phixtral-4x2_8'),    # discarded
    #('NousResearch/Nous-Hermes-2-SOLAR-10.7B', 'Nous-Hermes-2-Solar-10.7B'),        # discarded
]
# actual prompt & prompt name for log file name
prompts_and_names = [
    (complete_code_prompt_basic, 'complete_code_prompt_basic'),
    (complete_code_prompt,       'complete_code_prompt'),
    (complete_code_prompt_full,  'complete_code_prompt_full'),
    (complete_task_prompt_basic, 'complete_task_prompt_basic'),
    (complete_task_prompt,       'complete_task_prompt'),
    (complete_task_prompt_full,  'complete_task_prompt_full'), ]
# reflection prompts & their names for log file name
reflection_prompts_and_names = [
    ( reflection_prompt_basic, 'reflection_prompt_basic' ),
    ( reflection_prompt, 'reflection_prompt' ),
    ( reflection_prompt_full, 'reflection_prompt_full' ), ]
# temperature and top_k values and labels for file names
temperature_values = [
    (1.0,  'temperature1.0'),
    (0.75, 'temperature0.75'),
    (0.5,  'temperature0.5'), ]
top_p_values = [
    (1.0,  'topP1.0'),
    (0.75, 'topP0.75'),
    (0.5,  'topP0.5'), ]

model_idx            = 6
prompt_idx           = 0
relection_prompt_idx = 0
temperature_idx      = 0
top_p_idx            = 0

model_name, model_nickname     = models[ model_idx ]
my_prompt, my_prompt_label     = prompts_and_names[ prompt_idx ]
my_reflection_prompt, my_reflection_prompt_label = reflection_prompts_and_names[ relection_prompt_idx ]
TEMPERATURE, temperature_label = temperature_values[ temperature_idx ]
TOP_P, top_p_label             = top_p_values[ top_p_idx ]

SPECIAL_MESSAGE = f'Model temperature: {TEMPERATURE}. Model top_p: {TOP_P}. Model top_k: 50. Model dtype: torch.float32'

print(f'Model name:             {model_name}')
print(f'Model nickname:         {model_nickname}')
print(f'Prompt name:            {my_prompt_label}')
print(f'Relfection prompt name: {my_reflection_prompt_label}')
print(f'\nPrompt:\n{my_prompt.strip().format("Starter Code")}\n')
print(f'Temperature: {TEMPERATURE}. Temperature label: {temperature_label}')
print(f'Top p: {TOP_P}. Top p label: {top_p_label}')
print(f'Special message: {SPECIAL_MESSAGE}')

Model name:             Qwen/CodeQwen1.5-7B-Chat
Model nickname:         CodeQwen1.5-7B-Chat
Prompt name:            complete_code_prompt_basic
Relfection prompt name: reflection_prompt_basic

Prompt:
Complete the following Python code:
Starter Code

Temperature: 1.0. Temperature label: temperature1.0
Top p: 1.0. Top p label: topP1.0
Special message: Model temperature: 1.0. Model top_p: 1.0. Model top_k: 50. Model dtype: torch.float32


In [5]:
# save results
time_stamp   = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-2]
results_file = f'/content/drive/My Drive/Colab Notebooks/logs/{model_nickname}_{my_prompt_label}_{temperature_label}_{top_p_label}_completions_{time_stamp}.jsonl'
log_file     = f'/content/drive/My Drive/Colab Notebooks/logs/logs/{model_nickname}_{my_prompt_label}_{temperature_label}_{top_p_label}_log_{time_stamp}.log'

# log results
for handler in logging.root.handlers[:]:            # overwrite any previous handlers with different formats
    logging.root.removeHandler(handler)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s  %(levelname)s  %(message)s',
    handlers=[
        logging.FileHandler(log_file),              # Log to a file
        logging.StreamHandler(sys.stdout)           # Log to console (default - sys.stderr (red background)
    ]
)

# print job config
logging.info('*****     JOB CONFIGURATION     *****')
logging.info(f'MODEL: {model_name}')
logging.info(f'MODEL NICKNAME: {model_nickname}')
logging.info(SPECIAL_MESSAGE)
logging.info(f'GENERATED CODE SAVED IN: {results_file}')
logging.info(f'THIS LOG FILE: {log_file}')
if my_prompt in [complete_code_prompt_basic, complete_code_prompt, complete_code_prompt_full,]:
  logging.info(f'PROMPT:\n{my_prompt.lstrip().format("Starter Code")}')
else:
  logging.info(f'\nPROMPT:\n{my_prompt.format("Task Description", "Test Cases")}')
logging.info('*****     END OF JOB CONFIGURATION     *****')

2025-06-16 05:10:48,849  INFO  *****     JOB CONFIGURATION     *****
2025-06-16 05:10:48,851  INFO  MODEL: Qwen/CodeQwen1.5-7B-Chat
2025-06-16 05:10:48,853  INFO  MODEL NICKNAME: CodeQwen1.5-7B-Chat
2025-06-16 05:10:48,853  INFO  Model temperature: 1.0. Model top_p: 1.0. Model top_k: 50. Model dtype: torch.float32
2025-06-16 05:10:48,854  INFO  GENERATED CODE SAVED IN: /content/drive/My Drive/Colab Notebooks/logs/CodeQwen1.5-7B-Chat_complete_code_prompt_basic_temperature1.0_topP1.0_completions_20250616_051048_7218.jsonl
2025-06-16 05:10:48,855  INFO  THIS LOG FILE: /content/drive/My Drive/Colab Notebooks/logs/logs/CodeQwen1.5-7B-Chat_complete_code_prompt_basic_temperature1.0_topP1.0_log_20250616_051048_7218.log
2025-06-16 05:10:48,856  INFO  PROMPT:
Complete the following Python code:
Starter Code

2025-06-16 05:10:48,857  INFO  *****     END OF JOB CONFIGURATION     *****


In [6]:
# read dataset
file = '/content/drive/My Drive/Colab Notebooks/data/Big_Code_Bench_Test.jsonl.gz'
tasks = read_problems(file)
print(f'Type of tasks: {type(tasks)}\n')

counter = 0
for k,v in tasks.items():
    if counter == 2:
        break
    print(f'task_id: {k}', type(k))
    for k2, v2 in v.items():
        print(f'\n{k2}:\n{v2}')
    print('\n' + '='*100 + '\n')
    counter += 1

Type of tasks: <class 'dict'>

task_id: BigCodeBench/0 <class 'str'>

task_id:
BigCodeBench/0

test:


import unittest
from unittest.mock import patch
from random import seed, shuffle
import itertools
class TestCases(unittest.TestCase):
    def test_default_numbers(self):
        # Test with default number range (1 to 10) to check that the result is a positive float.
        result = task_func()
        self.assertIsInstance(result, float)
        self.assertGreater(result, 0)
    def test_custom_list(self):
        # Test with a custom list of small positive integers to ensure proper handling and positive result.
        result = task_func([1, 2, 3])
        self.assertIsInstance(result, float)
        self.assertGreater(result, 0)
    def test_negative_numbers(self):
        # Test with negative numbers to verify the function handles and returns a positive result.
        result = task_func([-3, -2, -1])
        self.assertIsInstance(result, float)
        self.assertGreater(result, 

In [7]:
torch_dtype = torch.float32
#torch_dtype = torch.bfloat16
tokenizer = get_tokenizer(model_name)
model     = get_model(model_name, torch_dtype=torch_dtype)

tokenizer_config.json:   0%|          | 0.00/972 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.46M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/31.7k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.89G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.71G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

In [8]:
print(f'Model device: {model.device}')
print(f'Model temperature: {model.config.temperature}. Model top_p: {model.config.top_p}. Model top_k: {model.config.top_k}. Model dtype: {model.dtype}')

input_prompt = "What is the capital of California?"
print(input_prompt)
generate_response(input_prompt, tokenizer, model)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Model device: cuda:0
Model temperature: 1.0. Model top_p: 1.0. Model top_k: 50. Model dtype: torch.float32
What is the capital of California?


'The capital of California is Sacramento.'

In [None]:
num_samples_per_task = 1
completions  = []

for task_id, task_body in tasks.items():
    logging.info( task_id )
    #if task_id == 'BigCodeBench/7':
    #        break
    for i in range(num_samples_per_task):
        start_time  = time.time()

        # generate completion
        full_prompt = my_prompt.lstrip().format( task_body['prompt'] )
        try:
            proposed_solution = generate_response( full_prompt, tokenizer, model, temperature=TEMPERATURE )
        except Exception as e:
            proposed_solution = f"Error generating a completion:\n{e}"

        # improve solution
        new_full_prompt = my_reflection_prompt.lstrip().format(full_prompt, proposed_solution)
        try:
            improved_solution = generate_response( new_full_prompt, tokenizer, model, temperature=TEMPERATURE )
        except Exception as e:
            improved_solution = f"Error generating a completion:\n{e}"

        # check if improved solution is a Python function. If not, use the previously proposed solution
        cleaned_improved_solution = clean_code_light(improved_solution).strip()
        if is_function(cleaned_improved_solution):
            final_solution = improved_solution
        else:
            final_solution = proposed_solution

        # save and log the results
        completions.append( {'task_id': task_id, 'completion': final_solution} )
        write_jsonl(results_file, completions)
        logging.info('NEW REFLECTION PROMPT:\n' + new_full_prompt + '\n')
        logging.info('IMPROVED COMPLETION:\n' + final_solution + '\n')
        logging.info(f"Time elapsed: {(time.time() - start_time):.4f} seconds\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    plt.xlabel('Sample values')
    plt.ylabel('Frequency')
    plt.title('Histogram of Generated Samples')
    plt.show()

    return samples

# Example usage
data = task_func(0, 1, 1000)
print(len(data))  # Should print 1000
print(isinstance(data, np.ndarray))  # Should print True
```

2025-06-16 05:36:36,440  INFO  Time elapsed: 26.0320 seconds

2025-06-16 05:36:36,441  INFO  BigCodeBench/117
2025-06-16 05:36:58,712  INFO  NEW REFLECTION PROMPT:
1. Read and understand the REQUIREMENTS and the PROPOSED SOLUTION listed below.
2. Analyze the PROPOSED SOLUTION for any errors, inefficiencies or inconsistencies.
3. Generate ONE optimized version of the PROPOSED SOLUTION.
4. If the PROPOSED SOLUTION is already optimal, return the PROPOSED SOLUTION.

REQUIREMENTS:
Complete the following Python code:
import pandas as pd
import numpy as np
from random import choice, seed as set_seed

def task_func(num_of_students, seed=42, name_

In [None]:
from google.colab import runtime
runtime.unassign()