In [20]:
import json
from pathlib import Path
from tqdm import tqdm
import math
import pandas as pd

# Path to the JSONL dump. Each line of the file is parsed as one JSON list of
# sample dicts.
file_path = "/project/flame/asetlur/rc_dump.jsonl"

# Read every line, flatten each sample's nested "metadata" dict into the sample
# itself, and gather all samples into one list that backs the DataFrame.
all_items = []
# Pin the encoding so the load does not depend on the machine's locale default
# (assumes the dump was written as UTF-8, the usual encoding for JSON).
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        # A blank line is not valid JSON and would make json.loads raise;
        # skip it instead of aborting the whole load.
        if not line.strip():
            continue
        items = json.loads(line)
        for item in items:
            # Detach 'metadata' so the nested dict itself never becomes a column.
            metadata = item.pop("metadata", None)
            if isinstance(metadata, dict):
                for k, v in metadata.items():
                    if k not in item:
                        item[k] = v
                    else:
                        # Key collision: keep the item's own value and store the
                        # metadata value under a prefixed name.
                        item[f"metadata_{k}"] = v
            elif metadata is not None:
                # Non-dict metadata cannot be flattened; put it back rather
                # than silently discarding it.
                item["metadata"] = metadata
            all_items.append(item)

df = pd.DataFrame(all_items)

print(f"Loaded {len(df)} rows")
print(f"Dataframe shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(df.head())





Loaded 796 rows
Dataframe shape: (796, 17)
Columns: ['text', 'n_predicted', 'reward', 'logprobs', 'ref_logprobs', 'input_ids', 'labels', 'group_id', 'finished', 'prompt_tokens', 'output_tokens', 'visual_features', 'model_version', 'rollout_index', 'cycle_step', 'turn_type', 'turn_number']
                                                text  n_predicted  reward  \
0  <|im_start|>system\nPlease reason step by step...         1088     0.0   
1  <|im_start|>user\nSummarize the solution progr...          245     0.0   
2  <|im_start|>system\nPlease reason step by step...          510     0.0   
3  <|im_start|>user\nSummarize the solution progr...          227     0.0   
4  <|im_start|>system\nPlease reason step by step...         1147     0.0   

                                            logprobs ref_logprobs  \
0  [-0.03804141283035278, -0.575939416885376, 0.0...           []   
1  [0.0, 0.0, 0.0, 0.0, 0.0, -1.2484334707260132,...           []   
2  [0.0, -0.12844371795654297, 0.0, -0.1

In [21]:
# Bucket the rows of df by their "group_id" value.
grouped = df.groupby("group_id")

# Take the first (key, sub-frame) pair produced by iterating the groups and
# unpack it into its two parts.
first_pair = next(iter(grouped))
first_group_id = first_pair[0]
first_group_df = first_pair[1]

print(f"First group_id: {first_group_id}")
print(first_group_df)



First group_id: train RC scheduler for llms 0,1_0
                                                 text  n_predicted  reward  \
80  <|im_start|>system\nPlease reason step by step...        24092     0.0   
81  <|im_start|>user\nSummarize the solution progr...          871     0.0   
82  <|im_start|>system\nPlease reason step by step...         3778     0.0   
83  <|im_start|>user\nSummarize the solution progr...          652     0.0   

                                             logprobs ref_logprobs  \
80  [0.0, 0.0, -0.6931471824645996, 0.0, 0.0, 0.0,...           []   
81  [-2.8703181743621826, 0.0, -0.1602240949869156...           []   
82  [0.0, -0.1602240949869156, -0.3132617473602295...           []   
83  [0.0, 0.0, 0.0, 0.0, -0.2014133334159851, -1.5...           []   

                                            input_ids  \
80  [151644, 8948, 198, 5501, 2874, 3019, 553, 301...   
81  [151644, 872, 198, 9190, 5612, 551, 279, 6291,...   
82  [151644, 8948, 198, 5501, 2874, 3

In [22]:
# Keep only the rows whose turn_type equals "reasoning".
is_reasoning = df["turn_type"] == "reasoning"
reasoning_df = df[is_reasoning]

# Bucket those rows by cycle_step, then average the reward inside each bucket.
cycle_step_group = reasoning_df.groupby("cycle_step")
reward_per_step = cycle_step_group["reward"]
avg_reward_by_cycle_step = reward_per_step.mean()

print("Average reward by cycle_step (only for turn_type == 'reasoning'):")
print(avg_reward_by_cycle_step)


Average reward by cycle_step (only for turn_type == 'reasoning'):
cycle_step
0    0.135678
2    0.135678
Name: reward, dtype: float64


In [26]:
# Find the first group (in groupby iteration order) where at least one row has
# reward == 1. `grouped` is the groupby object built in an earlier cell.
example_group = None
for group_id, group_df in grouped:
    if (group_df["reward"] == 1).any():
        example_group = group_df
        break

if example_group is not None:
    print(f"Example group_id with reward == 1: {example_group['group_id'].iloc[0]}")
    # Sort by cycle_step to show the turns in order. A stable sort is requested
    # so that rows sharing the same cycle_step keep their original relative
    # order; the default quicksort does not guarantee that.
    example_group_sorted = example_group.sort_values(by="cycle_step", kind="stable")
    for idx, row in example_group_sorted.iterrows():
        # Fall back to the row's index label if the column is absent.
        print(f"Turn cycle_step: {row.get('cycle_step', idx)}")
        print(f"Text: {row.get('text', '')}")
        print(f"Reward: {row.get('reward', '[no reward]')}")
        print("------" * 10)
else:
    print("No group found where any reward == 1.")


Example group_id with reward == 1: train RC scheduler for llms 0,1_10
Turn cycle_step: 0
Text: <|im_start|>system
Please reason step by step, and put your final answer within \boxed{}.<|im_end|>
<|im_start|>user
Solve this problem step by step.

Problem: You are given a problem and a partial solution.

Your task is to carefully study the partial response, identify what reasoning or steps are already provided, and then complete the solution from where it left off. Ensure your continuation is logically consistent and leads to a complete and correct final answer.

**Important**: Show your reasoning step-by-step, and clearly present the final answer using LaTeX-style `\boxed{}` notation.

Problem:
Over all pairs of complex numbers $(x, y)$ satisfying the equations $$x+2y^{2}=x^{4} \quad \text{and} \quad y+2x^{2}=y^{4}$$ compute the minimum possible real part of $x$.

Partial Response:
Let the given equations be
(1) $x+2y^2 = x^4$
(2) $y+2x^2 = y^4$

First, consider the case where $x=y$. Th

In [24]:
# Display the column labels of df (the cell's last expression is shown as its output).
df.columns

Index(['text', 'n_predicted', 'reward', 'logprobs', 'ref_logprobs',
       'input_ids', 'labels', 'group_id', 'finished', 'prompt_tokens',
       'output_tokens', 'visual_features', 'model_version', 'rollout_index',
       'cycle_step', 'turn_type', 'turn_number'],
      dtype='object')