In [1]:
import agi

In [2]:
import agi
import os

client = agi.Client("")

# Define the expected file path
file_path = 'gr_data/task_data_math-olympiads_DeepSeek-R1.jsonl'

# Download reasoning traces and verifications as a .jsonl only if it doesn't exist
if not os.path.exists(file_path):
    client.data.get(task='math-olympiads', model='DeepSeek-R1')
else:
    print(f"'{file_path}' already exists, skipping download.")

'gr_data/task_data_math-olympiads_DeepSeek-R1.jsonl' already exists, skipping download.


In [3]:
from datasets import load_dataset

# Load the downloaded JSONL file into a HuggingFace dataset
dataset = load_dataset('json', data_files='gr_data/task_data_math-olympiads_DeepSeek-R1.jsonl')


In [4]:
dataset["train"]

Dataset({
    features: ['question', 'answer_content', 'reasoning_content', 'reference_answer', 'verifier_score', 'id', 'metadata', 'community_score'],
    num_rows: 11962
})

In [5]:
len(dataset["train"])

11962

In [6]:
dataset["train"][0]["reasoning_content"]

'Alright, so the question is whether there exists a convex quadrilateral where each diagonal divides it into two acute-angled triangles. Hmm, let me think about this step by step. \n\nFirst, let me recall some properties of convex quadrilaterals and their diagonals. In a convex quadrilateral, the two diagonals intersect each other inside the quadrilateral, dividing it into four triangles. The problem is specifically asking about each diagonal creating two acute-angled triangles. That means, for each diagonal, both of the triangles formed by that diagonal must have all their angles acute. \n\nOkay, so let\'s start by considering one diagonal. Suppose we have quadrilateral ABCD with diagonal AC. If AC divides ABCD into triangles ABC and ACD, both of these triangles need to be acute. Similarly, the other diagonal BD should divide the quadrilateral into triangles ABD and BCD, which also need to be acute. \n\nWait, but each triangle has three angles. So for triangle ABC, all angles at A, B,

In [12]:
dataset["train"][0]

{'question': 'Is there a convex quadrilateral such that each diagonal of it divides it into two acute-angled triangles?',
 'answer_content': "No, such a convex quadrilateral does not exist. Here's the reasoning:\n\n1. **Angle Sum Constraints**: In a convex quadrilateral, the sum of all interior angles is 360°. For all angles to be acute (each < 90°), their total would be less than 360°, which is impossible. Thus, at least one angle must be non-acute (≥ 90°).\n\n2. **Impact of Diagonals**: If a diagonal divides the quadrilateral into two triangles, each triangle must have all angles acute. However, if the original quadrilateral has a non-acute angle, this angle will directly appear in one of the triangles formed by a diagonal, violating the requirement for all triangle angles to be acute.\n\n3. **Contradiction**: Even if non-acute angles are split by diagonals, the resulting angles must still sum to the original non-acute angle. Splitting such an angle into two parts cannot make both pa

In [7]:
def count_wait_but(reasoning_content):
    # Convert to lowercase to catch all cases
    text = reasoning_content.lower()
    
    # Count occurrences of "wait" and "but"
    wait_count = text.count("wait")
    but_count = text.count("but")
    
    return wait_count + but_count

# Apply the function to all reasoning contents
wait_but_counts = [count_wait_but(item["reasoning_content"]) for item in dataset["train"]]

print(f"Total number of 'Wait' or 'But' occurrences: {sum(wait_but_counts)}")
print(f"Average number per reasoning: {sum(wait_but_counts)/len(wait_but_counts):.2f}")


Total number of 'Wait' or 'But' occurrences: 849678
Average number per reasoning: 71.03


In [8]:
import re
from collections import Counter

def get_sentence_starters(text, n_words=3):
    # Split into sentences - handle multiple sentence-ending punctuation
    sentences = re.split('[.!?]+', text)
    
    # Get the first n words of each sentence, skip empty sentences
    starters = []
    for sentence in sentences:
        # Clean and split the sentence
        sentence = sentence.strip()
        if sentence:
            words = sentence.split()
            if words:
                # Take up to n words, but don't exceed sentence length
                starter = ' '.join(words[:min(n_words, len(words))])
                starters.append(starter.lower())
    
    return starters

# Get starters from all reasoning contents
all_starters = []
for item in dataset["train"]:
    starters = get_sentence_starters(item["reasoning_content"])
    all_starters.extend(starters)

# Count the most common starters
starter_counts = Counter(all_starters)

# Print top 20 most common starters
print("Top 20 most common sentence starters:")
for starter, count in starter_counts.most_common(50):
    print(f'"{starter}": {count} times')


Top 20 most common sentence starters:
"wait, no": 23213 times
"but the problem": 18528 times
"let me check": 13485 times
"alternatively, maybe the": 12948 times
"for example, if": 12839 times
"let me try": 12244 times
"let me think": 11380 times
"hmm": 10639 times
"5": 10393 times
"wait, but the": 9622 times
"therefore, the answer": 9268 times
"correct": 8954 times
"wait, but in": 8953 times
"but since the": 8874 times
"we need to": 8733 times
"2": 8536 times
"e": 7770 times
"alternatively, consider that": 7749 times
"let's see": 7653 times
"okay, so i": 7363 times
"but this is": 7206 times
"if we can": 7138 times
"wait, but if": 6682 times
"but let me": 6287 times
"3": 5972 times
"alternatively, use the": 5936 times
"let me denote": 5722 times
"however, the problem": 5567 times
"but we need": 5463 times
"but in the": 5324 times
"wait, maybe the": 5228 times
"but according to": 5176 times
"alternatively, think of": 5046 times
"wait, let me": 4998 times
"maybe there's a": 4859 times
"su

In [9]:
from transformers import AutoTokenizer

# Load the tokenizer using AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
# Tokenize a list of words to analyze token patterns
words = ["wait", "but", "alternatively"]
tokenizer.add_special_tokens = False

print("Token analysis:")
for word in words:
    tokens = tokenizer.encode(word)
    print(f"{word}: {tokens} -> {tokenizer.decode(tokens)}")

Token analysis:
wait: [128000, 11748] -> <｜begin▁of▁sentence｜>wait
but: [128000, 8248] -> <｜begin▁of▁sentence｜>but
alternatively: [128000, 41512, 8046] -> <｜begin▁of▁sentence｜>alternatively


In [13]:
import random
import re
from datasets import Dataset

def get_reasoning_prefixes_and_suffixes(text, end_markers=None, k=5):
    """
    Extract prefixes and their corresponding suffixes from reasoning text.
    Prefixes end at the end of a sentence that comes before a marker,
    and suffixes are everything after that point.
    
    Args:
        text (str): The reasoning text to extract prefixes and suffixes from
        end_markers (list): List of markers to look for
        k (int): Maximum number of prefix-suffix pairs to return
    
    Returns:
        tuple: Lists of prefixes, suffixes, and marker positions
    """
    if end_markers is None:
        end_markers = [" Wait", " But", " Hmm", " However"]
    
    prefixes = []
    suffixes = []
    markers = []
    
    # Find all occurrences of markers
    for marker in end_markers:
        # Find all indices of the marker
        marker_indices = [m.start() for m in re.finditer(re.escape(marker), text)]
        
        for idx in marker_indices:
            # Look for the end of the previous sentence
            # Find the last period, exclamation mark, or question mark before the marker
            sentence_end = max(
                text.rfind('.', 0, idx),
                text.rfind('!', 0, idx),
                text.rfind('?', 0, idx)
            )
            
            if sentence_end != -1:
                # Extract the prefix (everything up to and including the sentence end)
                prefix = text[:sentence_end + 1]
                # Extract the suffix (everything after the sentence end)
                suffix = text[sentence_end + 1:]
                
                prefixes.append(prefix)
                suffixes.append(suffix)
                markers.append(marker)
    
    # Return up to k random prefix-suffix pairs
    if len(prefixes) > k:
        selected_indices = random.sample(range(len(prefixes)), k)
        return ([prefixes[i] for i in selected_indices], 
                [suffixes[i] for i in selected_indices],
                [markers[i] for i in selected_indices])
    return prefixes, suffixes, markers

# Extract prefixes and suffixes from all reasoning contents
all_prefixes = []
all_suffixes = []
all_markers = []
prefix_sources = []
questions = []
answers = []

for item in dataset["train"]:
    prefixes, suffixes, markers = get_reasoning_prefixes_and_suffixes(item["reasoning_content"])
    if prefixes:
        all_prefixes.extend(prefixes)
        all_suffixes.extend(suffixes)
        all_markers.extend(markers)
        # Keep track of which item each prefix came from
        prefix_sources.extend([item["id"]] * len(prefixes))
        # Also keep the original question and answer
        questions.extend([item["question"]] * len(prefixes))
        answers.extend([item["answer_content"]] * len(prefixes))

# Create a dataset with the prefixes and suffixes
prefix_dataset = Dataset.from_dict({
    "prefix": all_prefixes,
    "suffix": all_suffixes,
    "marker": all_markers,
    "source_id": prefix_sources,
    "question": questions,
    "answer": answers
})

print(f"Created dataset with {len(prefix_dataset)} reasoning prefix-suffix pairs")
if len(prefix_dataset) > 0:
    print(f"Sample prefix: {prefix_dataset[0]['prefix'][:100]}...")
    print(f"Sample suffix: {prefix_dataset[0]['suffix'][:100]}...")
    print(f"Sample marker: {prefix_dataset[0]['marker']}")
else:
    print("No prefix-suffix pairs found")


Created dataset with 56007 reasoning prefix-suffix pairs
Sample prefix: Alright, so the question is whether there exists a convex quadrilateral where each diagonal divides ...
Sample suffix:  However, in triangle ABC, angle at B is part of the quadrilateral's angle B, which is acute. Simila...
Sample marker:  However


In [15]:
# Upload the dataset to Hugging Face
from huggingface_hub import login
import os
os.environ["HF_TOKEN"] = ""

# Push the dataset to the Hugging Face Hub
prefix_dataset.push_to_hub(
    "avrecum/reasoning_prefixes_suffixes",
    private=True,
    token=os.environ.get("HF_TOKEN", None)
)

print("Dataset uploaded to Hugging Face at avrecum/reasoning_prefixes_suffixes (private)")


Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Dataset uploaded to Hugging Face at avrecum/reasoning_prefixes_suffixes (private)


In [24]:
def extract_reasoning_prefixes(text, min_prefix_length=50, max_prefix_length=500):
    """Extract reasoning prefixes from text by splitting at sentence boundaries."""
    prefixes = []
    suffixes = []
    markers = []
    
    # Skip if text is too short
    if len(text) < min_prefix_length:
        return [], [], []
    
    # Split text into sentences
    # Look for common sentence endings followed by a space and capital letter
    sentence_boundaries = []
    for match in re.finditer(r'[.!?]\s+[A-Z]', text):
        sentence_boundaries.append(match.start() + 1)  # Include the punctuation
    
    # Add potential splitting points
    for i in range(len(sentence_boundaries)):
        split_point = sentence_boundaries[i]
        
        # Only consider if prefix is long enough
        if split_point < min_prefix_length:
            continue
            
        # Skip if prefix is too long
        if split_point > max_prefix_length:
            continue
            
        prefix = text[:split_point]
        suffix = text[split_point+1:].lstrip()  # +1 to skip the space after punctuation
        
        # Skip if suffix starts with excluded phrases
        # Check for phrases that indicate a change in reasoning direction
        excluded_phrases = ["Wait", "But", "Hmm", "However"]
        if any(suffix.lower().lstrip().startswith(phrase.lower()) for phrase in excluded_phrases):
            continue
            
        prefixes.append(prefix)
        suffixes.append(suffix)
        markers.append(split_point)
    
    return prefixes, suffixes, markers

def get_prefixes_without_specific_endings(dataset_split):
    """Extract prefixes that don't end with specific phrases from the dataset."""
    all_prefixes = []
    all_suffixes = []
    all_markers = []
    prefix_sources = []
    questions = []
    answers = []
    
    for i, item in enumerate(dataset_split):
        answer = item["answer_content"]
        item_id = item["id"]
        
        # Extract prefixes and suffixes
        prefixes, suffixes, markers = extract_reasoning_prefixes(answer)
        
        # Add to our collections
        if prefixes:
            all_prefixes.extend(prefixes)
            all_suffixes.extend(suffixes)
            all_markers.extend(markers)
            prefix_sources.extend([item_id] * len(prefixes))
            questions.extend([item["question"]] * len(prefixes))
            answers.extend([answer] * len(prefixes))
    
    # Create a dataset with the filtered prefixes and suffixes
    filtered_dataset = Dataset.from_dict({
        "prefix": all_prefixes,
        "suffix": all_suffixes,
        "marker": all_markers,
        "source_id": prefix_sources,
        "question": questions,
        "answer": answers
    })
    
    return filtered_dataset

# Create the filtered dataset from the original dataset's train split
filtered_prefix_dataset = get_prefixes_without_specific_endings(dataset["train"])

print(f"Created dataset with {len(filtered_prefix_dataset)} reasoning prefix-suffix pairs")
if len(filtered_prefix_dataset) > 0:
    print(f"Sample prefix: {filtered_prefix_dataset[0]['prefix'][:100]}...")
    print(f"Sample suffix: {filtered_prefix_dataset[0]['suffix'][:100]}...")
    print(f"Sample marker: {filtered_prefix_dataset[0]['marker']}")
else:
    print("No prefix-suffix pairs found")

Created dataset with 19397 reasoning prefix-suffix pairs
Sample prefix: No, such a convex quadrilateral does not exist. Here's the reasoning:

1. **Angle Sum Constraints**:...
Sample suffix: For all angles to be acute (each < 90°), their total would be less than 360°, which is impossible. T...
Sample marker: 167


In [25]:
# Upload the filtered dataset to Hugging Face
filtered_prefix_dataset.push_to_hub(
    "avrecum/reasoning_prefixes_suffixes_no_backtracking",
    private=True,
    token=os.environ.get("HF_TOKEN", None)
)

print("Filtered dataset uploaded to Hugging Face at avrecum/reasoning_prefixes_suffixes_filtered (private)")


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/458 [00:00<?, ?B/s]

Filtered dataset uploaded to Hugging Face at avrecum/reasoning_prefixes_suffixes_filtered (private)


In [27]:
# Create a dataset with random splittings of reasoning traces
def create_random_splits(dataset, num_splits_per_item=5, min_prefix_length=50, max_prefix_length=None):
    """
    Create random splits of reasoning traces without regard for sentence boundaries.
    
    Args:
        dataset: The dataset containing reasoning traces
        num_splits_per_item: Number of random splits to create per reasoning trace
        min_prefix_length: Minimum length of prefix in characters
        max_prefix_length: Maximum length of prefix in characters (defaults to None)
    
    Returns:
        A dataset with random prefix-suffix pairs
    """
    import random
    
    all_prefixes = []
    all_suffixes = []
    all_markers = []
    prefix_sources = []
    questions = []
    answers = []
    
    # Define markers that should not start the suffix
    end_markers = [" Wait", " But", " Hmm", " However"]
    
    for item in dataset:
        item_id = item["id"]
        answer = item.get("reasoning_content", "")
        
        if not answer or len(answer) < min_prefix_length + 20:  # Ensure enough content for meaningful splits
            continue
        
        # Determine the valid range for split points
        min_idx = min_prefix_length
        max_idx = len(answer) - 20  # Ensure suffix is at least 20 chars
        
        if max_prefix_length:
            max_idx = min(max_idx, max_prefix_length)
        
        if min_idx >= max_idx:
            continue
        
        # Create random splits
        for _ in range(num_splits_per_item):
            # Choose a random split point
            split_idx = random.randint(min_idx, max_idx)
            
            # Find the nearest space to make the split after a complete word
            space_after = answer.find(" ", split_idx)
            if space_after == -1:  # No space found after split_idx
                space_after = len(answer) - 1
            
            # Check if the suffix starts with any of the end markers
            valid_split = True
            suffix_start = answer[space_after:space_after+10].lower()
            for marker in end_markers:
                if suffix_start.lower().startswith(marker.lower()):
                    valid_split = False
                    break
            
            if not valid_split:
                continue
            
            # Create the prefix and suffix
            prefix = answer[:space_after].strip()
            suffix = answer[space_after:].strip()
            
            # Add to our collections
            all_prefixes.append(prefix)
            all_suffixes.append(suffix)
            all_markers.append("random_split")
            prefix_sources.append(item_id)
            questions.append(item["question"])
            answers.append(answer)
    
    # Create a dataset with the random splits
    random_splits_dataset = Dataset.from_dict({
        "prefix": all_prefixes,
        "suffix": all_suffixes,
        "marker": all_markers,
        "source_id": prefix_sources,
        "question": questions,
        "answer": answers
    })
    
    return random_splits_dataset

# Create the random splits dataset from the original dataset's train split
random_splits_dataset = create_random_splits(
    dataset["train"], 
    num_splits_per_item=5,
    min_prefix_length=100
)

print(f"Created random splits dataset with {len(random_splits_dataset)} prefix-suffix pairs")
if len(random_splits_dataset) > 0:
    print(f"Sample random prefix: {random_splits_dataset[0]['prefix'][-50:]}...")
    print(f"Sample random suffix: {random_splits_dataset[0]['suffix'][:50]}...")

Created random splits dataset with 58663 prefix-suffix pairs
Sample random prefix:  acute angles. 

Therefore, all four angles of the...
Sample random suffix: quadrilateral must be acute, since angles at B and...


In [28]:

# Upload the random splits dataset to Hugging Face
random_splits_dataset.push_to_hub(
    "avrecum/reasoning_prefixes_suffixes_no_backtracking_random",
    private=True,
    token=os.environ.get("HF_TOKEN", None)
)

print("Random splits dataset uploaded to Hugging Face at avrecum/reasoning_random_splits (private)")


Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Random splits dataset uploaded to Hugging Face at avrecum/reasoning_random_splits (private)
