# EDA (exploratory data analysis) on the agentic traces dataset

TLDR:
- A lot of failed generations with the `generate_agent_traces.py` script, so non-sensical generations so I think the script is not working as expected
- No traces actually led to a working solution that pass the test cases, almost all of the them fail with `Error:\\nReached max steps`. I believe we need to increase the context length in `generate_agent_traces.py`
- The new features `Finish reasons` and `API metadata` are always `[None] * number_of_generations` so I guess this is also buggy in the `generate_agent_traces.py` script
- We thus filter all the non-sensical generations and keep the rest but unfortunealy non a single generated traces actually leads to a working solution that pass the test cases. I think the next steps would be fixing `generate_agent_traces.py` (I recommend to debug the whole pipeline with like 10 traces, instead of using the whole dataset)
- This scripts push the dataset to `baptistecolle/codeforces-agentic-generations`, so we can use it for training our model.

In [1]:
import json
from datasets import Dataset
import pandas as pd

# Read the JSONL file
data = []
with open('../data/codeforces_agentic_generations_backup_20250331_081737.jsonl', 'r') as f:
    for line in f:
        data.append(json.loads(line))

# Convert to pandas DataFrame
df = pd.DataFrame(data)

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)


## Compared the features of the original dataset with the new one

In [2]:
from datasets import load_dataset

original_dataset = load_dataset("open-r1/codeforces-test-cases", split="train")

# view difference in features between original and dataset
original_features = [feature for feature in original_dataset.features]
dataset_features = [feature for feature in dataset.features]

# print(f"original_features: {len(original_features)}")
# print(f"dataset_features: {len(dataset_features)}")

new_features = []

for feature in dataset_features:
    if feature not in original_features:
        new_features.append(feature)

assert len(new_features) == 4

print(f"New features: {', '.join(new_features)}")

New features: generations, final_outputs, finish_reasons, api_metadata


In [3]:
# View all features of the dataset
dataset.features

{'contestId': Value(dtype='string', id=None),
 'index': Value(dtype='string', id=None),
 'name': Value(dtype='string', id=None),
 'type': Value(dtype='string', id=None),
 'rating': Value(dtype='int64', id=None),
 'tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'title': Value(dtype='string', id=None),
 'time-limit': Value(dtype='string', id=None),
 'memory-limit': Value(dtype='string', id=None),
 'problem-description': Value(dtype='string', id=None),
 'input-specification': Value(dtype='string', id=None),
 'output-specification': Value(dtype='string', id=None),
 'demo-input': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'demo-output': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'note': Value(dtype='string', id=None),
 'points': Value(dtype='float64', id=None),
 'test_cases': [{'input': Value(dtype='string', id=None),
   'output': Value(dtype='string', id=None)}],
 'creationTimeSeconds': Value(dtype='int6

In [4]:
dataset[0]

{'contestId': '1',
 'index': 'A',
 'name': 'Theatre Square',
 'type': 'PROGRAMMING',
 'rating': 1000,
 'tags': ['math'],
 'title': 'A. Theatre Square',
 'time-limit': '1',
 'memory-limit': '256',
 'problem-description': "Theatre Square in the capital city of Berland has a rectangular shape with the size *n*<=×<=*m* meters. On the occasion of the city's anniversary, a decision was taken to pave the Square with square granite flagstones. Each flagstone is of the size *a*<=×<=*a*.\n\nWhat is the least number of flagstones needed to pave the Square? It's allowed to cover the surface larger than the Theatre Square, but the Square has to be covered. It's not allowed to break the flagstones. The sides of flagstones should be parallel to the sides of the Square.",
 'input-specification': 'The input contains three positive integer numbers in the first line: *n*,<=<=*m* and *a* (1<=≤<=<=*n*,<=*m*,<=*a*<=≤<=109).',
 'output-specification': 'Write the needed number of flagstones.',
 'demo-input': 

In [5]:
print(f"Generations: {dataset[0]['generations']}")
print(f"Final outputss: {dataset[0]['final_outputs']}")
print(f"Finish reasons: {dataset[0]['finish_reasons']}")
print(f"API metadata: {dataset[0]['api_metadata']}")

Generations: ['    def new_func(*args: Any, **kwargs: Any) -> Any:\n        func_state = state.copy()\n        arg_names = [arg.arg for arg in func_def.args.args]\n        default_values = [\n            evaluate_ast(d, state, static_tools, custom_tools, authorized_imports) for d in func_def.args.defaults\n        ]\n\n        # Apply default values\n        defaults = dict(zip(arg_names[-len(default_values) :], default_values))\n\n        # Set positional arguments\n        for name, value in zip(arg_names, args):\n            func_state[name] = value\n\n        # Set keyword arguments\n        for name, value in kwargs.items():\n            func_state[name] = value\n\n        # Handle variable arguments\n        if func_def.args.vararg:\n            vararg_name = func_def.args.vararg.arg\n            func_state[vararg_name] = args\n\n        if func_def.args.kwarg:\n            kwarg_name = func_def.args.kwarg.arg\n            func_state[kwarg_name] = kwargs\n\n        # Set default 

In [6]:
# Check if all examples have finish_reasons as array of None and same for api_metadata
all_finish_reasons_none = True
all_api_metadata_none = True

for i, example in enumerate(dataset):
    # Check finish_reasons
    if example['finish_reasons'] is not None and any(reason is not None for reason in example['finish_reasons']):
        all_finish_reasons_none = False
        print(f"Example {i} has non-None finish_reasons: {example['finish_reasons']}")
    
    # Check api_metadata
    if example['api_metadata'] is not None and any(metadata is not None for metadata in example['api_metadata']):
        all_api_metadata_none = False
        print(f"Example {i} has non-None api_metadata: {example['api_metadata']}")

print(f"All finish_reasons are None arrays: {all_finish_reasons_none}")
print(f"All api_metadata are None arrays: {all_api_metadata_none}")

# Count total examples checked
print(f"Total examples checked: {len(dataset)}")

assert all_finish_reasons_none, "Not all finish_reasons are None"
assert all_api_metadata_none, "Not all api_metadata are None"

All finish_reasons are None arrays: True
All api_metadata are None arrays: True
Total examples checked: 384


In [7]:
print(dataset[0]["generations"])
print(dataset[0]["final_outputs"])
print(dataset[0]["final_outputs"][0])

['    def new_func(*args: Any, **kwargs: Any) -> Any:\n        func_state = state.copy()\n        arg_names = [arg.arg for arg in func_def.args.args]\n        default_values = [\n            evaluate_ast(d, state, static_tools, custom_tools, authorized_imports) for d in func_def.args.defaults\n        ]\n\n        # Apply default values\n        defaults = dict(zip(arg_names[-len(default_values) :], default_values))\n\n        # Set positional arguments\n        for name, value in zip(arg_names, args):\n            func_state[name] = value\n\n        # Set keyword arguments\n        for name, value in kwargs.items():\n            func_state[name] = value\n\n        # Handle variable arguments\n        if func_def.args.vararg:\n            vararg_name = func_def.args.vararg.arg\n            func_state[vararg_name] = args\n\n        if func_def.args.kwarg:\n            kwarg_name = func_def.args.kwarg.arg\n            func_state[kwarg_name] = kwargs\n\n        # Set default values for ar

In [8]:
print(len(dataset[0]["generations"]))
# print(dataset[0]["generations"][0])


print("Are the first generation and final generations the same?")
print(dataset[0]["generations"][0] == dataset[0]["generations"][4])

5
Are the first generation and final generations the same?
True


## Filtering the dataset to keep proper generations

In [9]:
# This function is generated in the `generate_agent_traces.py` 
# but this is always the same function that is not related to the actual task (non-sensical generations)
# so we filter it out
weird_function = '    def new_func(*args: Any, **kwargs: Any) -> Any:\n        func_state = state.copy()\n        arg_names = [arg.arg for arg in func_def.args.args]\n        default_values = [\n            evaluate_ast(d, state, static_tools, custom_tools, authorized_imports) for d in func_def.args.defaults\n        ]\n\n        # Apply default values\n        defaults = dict(zip(arg_names[-len(default_values) :], default_values))\n\n        # Set positional arguments\n        for name, value in zip(arg_names, args):\n            func_state[name] = value\n\n        # Set keyword arguments\n        for name, value in kwargs.items():\n            func_state[name] = value\n\n        # Handle variable arguments\n        if func_def.args.vararg:\n            vararg_name = func_def.args.vararg.arg\n            func_state[vararg_name] = args\n\n        if func_def.args.kwarg:\n            kwarg_name = func_def.args.kwarg.arg\n            func_state[kwarg_name] = kwargs\n\n        # Set default values for arguments that were not provided\n        for name, value in defaults.items():\n            if name not in func_state:\n                func_state[name] = value\n\n        # Update function state with self and __class__\n        if func_def.args.args and func_def.args.args[0].arg == "self":\n            if args:\n                func_state["self"] = args[0]\n                func_state["__class__"] = args[0].__class__\n\n        result = None\n        try:\n            for stmt in func_def.body:\n                result = evaluate_ast(stmt, func_state, static_tools, custom_tools, authorized_imports)\n        except ReturnException as e:\n            result = e.value\n\n        if func_def.name == "__init__":\n            return None\n\n        return result\n'

number_of_failed_generations = 0

# Filter out generations that are the same as the weird function and keep track of indices
def filter_weird_function(example):
    global number_of_failed_generations
    
    original_length = len(example["generations"])
    valid_indices = [i for i, gen in enumerate(example["generations"]) if gen != weird_function]
    
    # Count failed generations for this example
    failed_count = original_length - len(valid_indices)
    number_of_failed_generations += failed_count
    
    filtered_generations = [example["generations"][i] for i in valid_indices]
    filtered_final_outputs = [example["final_outputs"][i] for i in valid_indices] if valid_indices else []
        
    return {
        "generations": filtered_generations,
        "final_outputs": filtered_final_outputs
    }

dataset = dataset.map(filter_weird_function)

len_dataset = len(dataset)

# filter all examples with no generations
dataset = dataset.filter(lambda x: len(x["generations"]) > 0)

len_dataset_filtered = len(dataset)

print(f"Number of examples filtered out: {len_dataset - len_dataset_filtered}")
print(f"Total number of failed generations: {number_of_failed_generations}")

dataset[0]["generations"][0]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

Filter:   0%|          | 0/384 [00:00<?, ? examples/s]

Number of examples filtered out: 232
Total number of failed generations: 1713


'Okay, I need to solve this Theatre Square problem. Let\'s read the problem carefully.\n\nThe problem says that the Theatre Square is a rectangle of n by m meters, and we need to cover it with a×a square flagstones. The key points are that we can\'t break the flagstones, they must be aligned with the square\'s sides, and we have to cover the entire area, possibly with some extra space. The goal is to find the minimum number of flagstones required.\n\nHmm, so for each dimension (n and m), I need to figure out how many flagstones are needed along that dimension. Since the flagstones can\'t be broken, if the length isn\'t a multiple of a, we\'ll need an extra flagstone to cover the remaining part. For example, if n is 6 and a is 4, then 6/4 is 1.5, but we can\'t use half a flagstone. So we round up to 2. Similarly for the m dimension.\n\nSo the approach is: calculate the number of flagstones along the n direction by taking the ceiling of n/a, do the same for m, then multiply the two numbe

In [10]:
# Check if generations are the same for each example (should not happen but this is a sanity check)
exists_example_with_different_generations = False
examples_with_different_generations = 0
total_examples_with_generations = 0

for i, example in enumerate(dataset):
    # if i > 1:
    #     break
    
    generations = example["generations"]
    
    # Skip if there are no generations
    if not generations or len(generations) == 0:
        continue
    
    total_examples_with_generations += 1
    
    # Group identical generations
    unique_generations = {}
    for idx, gen in enumerate(generations):
        found = False
        for group_key, group_indices in unique_generations.items():
            if generations[group_key] == gen:
                group_indices.append(idx)
                found = True
                break
        if not found:
            unique_generations[idx] = [idx]
    
    # If we have more than one group, generations are different
    if len(unique_generations) > 1:
        examples_with_different_generations += 1
        
        # Format the groups for printing
        groups_str = []
        for indices in unique_generations.values():
            if len(indices) > 1:
                groups_str.append(f"{tuple(indices)}")
            else:
                groups_str.append(f"{indices[0]}")
        
        print(f"Example {i} has different generations: {', '.join(groups_str)}")
        
    if len(unique_generations) == 1 and len(generations) > 1:
        exists_example_with_different_generations = True

print(f"There exists an example with the same generations: {exists_example_with_different_generations}")
print(f"Number of examples with different generations: {examples_with_different_generations}")
print(f"Total examples checked: {len(dataset)}")
print(f"Total examples with generations: {total_examples_with_generations}")
# This is indeed working as expected so that's good


Example 12 has different generations: 0, 1
Example 13 has different generations: 0, 1
Example 17 has different generations: 0, 1
Example 21 has different generations: 0, 1
Example 24 has different generations: 0, 1, 2, 3
Example 28 has different generations: 0, 1
Example 31 has different generations: 0, 1, 2
Example 40 has different generations: 0, 1, 2
Example 42 has different generations: 0, 1
Example 44 has different generations: 0, 1
Example 46 has different generations: 0, 1, 2, 3
Example 51 has different generations: 0, 1
Example 59 has different generations: 0, 1
Example 62 has different generations: 0, 1
Example 66 has different generations: 0, 1, 2
Example 67 has different generations: 0, 1
Example 70 has different generations: 0, 1
Example 71 has different generations: 0, 1
Example 76 has different generations: 0, 1
Example 77 has different generations: 0, 1, 2
Example 78 has different generations: 0, 1
Example 79 has different generations: 0, 1
Example 82 has different gener

In [11]:
# find the first final output where the last value does not contain Error:\nReached max steps
for i, example in enumerate(dataset):
    for j, final_output in enumerate(example["final_outputs"]):
        last_content = final_output[-1]["content"][0]["text"]
        last_content = last_content.replace('\n', '\\n')
        if "Error:\\nReached max steps." not in last_content:
            print(f"Example {i} with generation number {j} has no Error")

# Only 5 examples have a final output that does not contain Error:\nReached max steps. but then the answer is not actually correct :(

Example 51 with generation number 0 has no Error
Example 69 with generation number 0 has no Error
Example 85 with generation number 0 has no Error
Example 115 with generation number 0 has no Error
Example 131 with generation number 0 has no Error


## Upload the dataset to the hub


In [12]:
from datasets import Dataset

# Unroll the final outputs, so that each row contains one final_output, this is the format required for training
# Create a new dataset with expanded rows
expanded_dataset = []

for example in dataset:
    # For each final_output in the example, create a new row
    for final_output in example["final_outputs"]:
        # Create a copy of the example
        new_example = {k: v for k, v in example.items()}
        del new_example["final_outputs"]
        # Replace the list of final_outputs with just this one
        new_example["messages"] = final_output
        # Add to our expanded dataset
        expanded_dataset.append(new_example)

# Convert to Dataset

expanded_dataset = Dataset.from_list(expanded_dataset)
print(f"Original dataset size: {len(dataset)}")
print(f"Expanded dataset size: {len(expanded_dataset)}")

# Use the expanded dataset for the rest of the analysis
expanded_dataset


Original dataset size: 152
Expanded dataset size: 207


Dataset({
    features: ['contestId', 'index', 'name', 'type', 'rating', 'tags', 'title', 'time-limit', 'memory-limit', 'problem-description', 'input-specification', 'output-specification', 'demo-input', 'demo-output', 'note', 'points', 'test_cases', 'creationTimeSeconds', 'relativeTimeSeconds', 'programmingLanguage', 'verdict', 'testset', 'passedTestCount', 'timeConsumedMillis', 'memoryConsumedBytes', 'code', 'prompt', 'response', 'score', 'test_count', 'full_test_set', 'accepted_solutions', 'failed_solutions', 'generations', 'finish_reasons', 'api_metadata', 'messages'],
    num_rows: 207
})

In [13]:
len(expanded_dataset[0]["messages"])


33

In [16]:
# Transform message format from 
# {'content': [{'text': 'You are a helpful assistant.'}], 'role': 'system'}
# to
# {'role': 'system', 'content': 'You are a helpful assistant.'}

def transform_message_format(example):
    transformed_messages = []
    for message in example["messages"]:
        role = message["role"]
        # Extract text content from the list of content items
        content = ""
        for content_item in message["content"]:
            if content_item["type"] == "text":
                content += content_item["text"]
        
        # Create new message format
        transformed_messages.append({
            "role": role,
            "content": content
        })
    
    # Update the example with transformed messages
    example["messages"] = transformed_messages
    return example

# Apply the transformation to the entire dataset
expanded_dataset_formatted = expanded_dataset.map(transform_message_format)

expanded_dataset_formatted[0]["messages"]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

[{'content': 'You are an expert assistant who can solve any task using code blobs. You will be given a task to solve as best you can.\nTo do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.\nTo solve the task, you must plan forward to proceed in a series of steps, in a cycle of \'Thought:\', \'Code:\', and \'Observation:\' sequences.\n\nAt each step, in the \'Thought:\' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.\nThen in the \'Code:\' sequence, you should write the code in simple Python. The code sequence must end with \'<end_code>\' sequence.\nDuring each intermediate step, you can use \'print()\' to save whatever important information you will then need.\nThese print outputs will then appear in the \'Observation:\' field, which will be available as input for the next step.\nIn the end you have to return a final answer using the `final_answer` 

In [17]:
# rename prompt to prompt_original
expanded_dataset_formatted = expanded_dataset_formatted.rename_column("prompt", "prompt_original")


ValueError: Original column name prompt not in the dataset. Current columns in the dataset: ['contestId', 'index', 'name', 'type', 'rating', 'tags', 'title', 'time-limit', 'memory-limit', 'problem-description', 'input-specification', 'output-specification', 'demo-input', 'demo-output', 'note', 'points', 'test_cases', 'creationTimeSeconds', 'relativeTimeSeconds', 'programmingLanguage', 'verdict', 'testset', 'passedTestCount', 'timeConsumedMillis', 'memoryConsumedBytes', 'code', 'prompt_original', 'response', 'score', 'test_count', 'full_test_set', 'accepted_solutions', 'failed_solutions', 'generations', 'finish_reasons', 'api_metadata', 'messages']

In [18]:
expanded_dataset_formatted.push_to_hub("baptistecolle/codeforces-agentic-generations")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/datasets/baptistecolle/codeforces-agentic-generations/commit/aed7e9673e7c1dcadeb1a489e627d750955f3c8b', commit_message='Upload dataset', commit_description='', oid='aed7e9673e7c1dcadeb1a489e627d750955f3c8b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/baptistecolle/codeforces-agentic-generations', endpoint='https://huggingface.co', repo_type='dataset', repo_id='baptistecolle/codeforces-agentic-generations'), pr_revision=None, pr_num=None)