In [1]:
from datasets import load_dataset

initial_dataset = load_dataset('AI-MO/NuminaMath-CoT')

In [2]:
from typing import cast

import torch
from transformer_lens import HookedTransformer  # type: ignore
from transformers import AutoModelForCausalLM, AutoTokenizer  # type: ignore

from model_diffing.scripts.config_common import LLMConfig

def build_llm(
        hf_model_id: str):
    model = AutoModelForCausalLM.from_pretrained(
        hf_model_id
    )
    return model
    


# load models for generating new dataset

r1_distil = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'
qwen_math = 'Qwen/Qwen2.5-Math-1.5B'

# load these as HookedTransformers
r1_model = build_llm(r1_distil)
qwen_model = build_llm(qwen_math)

tokenizer = AutoTokenizer.from_pretrained('deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', padding_side='left')

In [4]:
import torch
from torch.utils.data import DataLoader
from datasets import Dataset

def batch_get_responses(model, problems, batch_size=4, add_think=False):
    responses = []
    
    # Create batches
    for i in range(0, len(problems), batch_size):
        batch_problems = problems[i:i + batch_size]
        
        # Add think token if needed
        if add_think:
            batch_problems = [p + ' <think>\n' for p in batch_problems]
            
        # Tokenize batch
        batch_inputs = tokenizer(
            batch_problems, 
            return_tensors='pt',
            padding=True,
            truncation=True
        ).to('cuda')
        
        # Generate responses
        with torch.no_grad():
            outputs = model.generate(
                **batch_inputs,
                max_new_tokens=528,
                pad_token_id=tokenizer.pad_token_id
            )
            
        # Decode responses
        batch_responses = [
            tokenizer.decode(output, skip_special_tokens=True) 
            for output in outputs
        ]
        responses.extend(batch_responses)
        
        del batch_inputs, outputs
        torch.cuda.empty_cache()
        
    return responses




In [5]:
from tqdm.notebook import tqdm
from datasets import Dataset
import gc
import torch

from tqdm.notebook import tqdm
from datasets import Dataset
import gc
import torch

# Define a function outside the generator to handle a single batch
def process_batch(batch_problems, batch_size):
    try:
        r1_responses = batch_get_responses(r1_model, batch_problems, batch_size=batch_size, add_think=True)
        # Convert to simple Python objects if needed
        r1_responses = [str(r) for r in r1_responses]  # Ensure they're strings
        
        # Clear CUDA cache
        torch.cuda.empty_cache()
        
        # Qwen generation
        qwen_responses = batch_get_responses(qwen_model, batch_problems, batch_size=batch_size)
        # Convert to simple Python objects if needed
        qwen_responses = [str(q) for q in qwen_responses]  # Ensure they're strings
        
        # Clear CUDA cache again
        torch.cuda.empty_cache()
        
        return r1_responses, qwen_responses
    
    except Exception as e:
        print(f"Error processing batch: {e}")
        # Return empty lists if there's an error
        return [], []

'''def gen_model_responses(problems, batch_size=4):
    """Generator that yields alternating R1 and Qwen responses for each problem"""
    
    # Move models to CPU initially
    
    with tqdm(total=len(problems)*2, desc="Generating responses") as pbar:
        for i in range(0, len(problems), batch_size):
            # Get current batch
            batch_problems = problems[i:i + batch_size]
            
            # Process batch and get responses
            r1_responses, qwen_responses = process_batch(batch_problems, batch_size)
            
            if not r1_responses or not qwen_responses:
                print(f"Skipping batch {i} due to error")
                continue
                
            # Yield alternating responses
            for j, (problem, r1_resp, qwen_resp) in enumerate(zip(batch_problems, r1_responses, qwen_responses)):
                # Yield R1 response
                yield {
                    'problem': problem,
                    'text': r1_resp,
                    'model': 'r1'
                }
                pbar.update(1)
                
                # Yield Qwen response
                yield {
                    'problem': problem,
                    'text': qwen_resp,
                    'model': 'qwen'
                }
                pbar.update(1)
            
            # Force garbage collection after each batch
            del r1_responses
            del qwen_responses
            del batch_problems
            gc.collect()
            torch.cuda.empty_cache()
            
            # Add an explicit pause between batches to allow memory cleanup
            import time
            time.sleep(0.5)



problems = initial_dataset['train']['problem'][:7680]
# Create dataset using generator
combined_dataset = Dataset.from_generator(
    lambda: gen_model_responses(problems, batch_size=128)  # Reduced batch size
)'''

'def gen_model_responses(problems, batch_size=4):\n    """Generator that yields alternating R1 and Qwen responses for each problem"""\n    \n    # Move models to CPU initially\n    \n    with tqdm(total=len(problems)*2, desc="Generating responses") as pbar:\n        for i in range(0, len(problems), batch_size):\n            # Get current batch\n            batch_problems = problems[i:i + batch_size]\n            \n            # Process batch and get responses\n            r1_responses, qwen_responses = process_batch(batch_problems, batch_size)\n            \n            if not r1_responses or not qwen_responses:\n                print(f"Skipping batch {i} due to error")\n                continue\n                \n            # Yield alternating responses\n            for j, (problem, r1_resp, qwen_resp) in enumerate(zip(batch_problems, r1_responses, qwen_responses)):\n                # Yield R1 response\n                yield {\n                    \'problem\': problem,\n             

In [None]:
from tqdm.notebook import tqdm
import csv
import gc
import torch
import os

# Define output file
output_file = "model_responses.csv"
problems = initial_dataset['train']['problem'][:7680]

del initial_dataset

# Create CSV file with headers
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['problem', 'text', 'model'])

def process_and_save_batch(batch_problems, batch_index, batch_size):
    """Process a batch and save results directly to CSV"""
    try:
        print(f"Processing batch {batch_index}, problems {batch_index*batch_size} to {batch_index*batch_size + len(batch_problems) - 1}")
        
        # Move R1 model to GPU and generate responses
        r1_model.to('cuda')
        r1_responses = batch_get_responses(r1_model, batch_problems, batch_size=batch_size, add_think=True)
        r1_model.to('cpu')
        torch.cuda.empty_cache()
        
        # Move Qwen model to GPU and generate responses
        qwen_model.to('cuda')
        qwen_responses = batch_get_responses(qwen_model, batch_problems, batch_size=batch_size)
        qwen_model.to('cpu')
        torch.cuda.empty_cache()
        
        # Write results to CSV
        with open(output_file, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            
            for problem, r1_resp, qwen_resp in zip(batch_problems, r1_responses, qwen_responses):
                # Write R1 response
                writer.writerow([problem, r1_resp, 'r1'])
                
                # Write Qwen response
                writer.writerow([problem, qwen_resp, 'qwen'])
        
        # Clean up
        del r1_responses, qwen_responses
        gc.collect()
        
        return True
    except Exception as e:
        print(f"Error processing batch {batch_index}: {e}")
        return False

# Process all problems in batches
batch_size = 16  # Start with a small batch size
total_batches = (len(problems) + batch_size - 1) // batch_size

with tqdm(total=total_batches, desc="Processing batches") as pbar:
    for batch_index in range(total_batches):
        start_idx = batch_index * batch_size
        end_idx = min(start_idx + batch_size, len(problems))
        batch_problems = problems[start_idx:end_idx]
        
        # Process batch and save to CSV
        success = process_and_save_batch(batch_problems, batch_index, batch_size)
        
        if not success:
            print(f"Failed to process batch {batch_index}, skipping to next batch")
        
        # Force garbage collection
        gc.collect()
        torch.cuda.empty_cache()
        
        pbar.update(1)

# Once complete, you can load the CSV back into a dataset if needed
print(f"Complete! All responses saved to {output_file}")
print(f"Loading back into a dataset...")

from datasets import Dataset
import pandas as pd

combined_dataset = Dataset.from_pandas(pd.read_csv(output_file))
print(f"Dataset loaded with {len(combined_dataset)} examples")
# sort by problem


Processing batches:   0%|          | 0/480 [00:00<?, ?it/s]

Processing batch 0, problems 0 to 15
Processing batch 1, problems 16 to 31
Processing batch 2, problems 32 to 47
Processing batch 3, problems 48 to 63
Processing batch 4, problems 64 to 79
Processing batch 5, problems 80 to 95
Processing batch 6, problems 96 to 111
Processing batch 7, problems 112 to 127
Processing batch 8, problems 128 to 143
Processing batch 9, problems 144 to 159
Processing batch 10, problems 160 to 175
Processing batch 11, problems 176 to 191
Processing batch 12, problems 192 to 207
Processing batch 13, problems 208 to 223
Processing batch 14, problems 224 to 239
Processing batch 15, problems 240 to 255
Processing batch 16, problems 256 to 271
Processing batch 17, problems 272 to 287
Processing batch 18, problems 288 to 303
Processing batch 19, problems 304 to 319
Processing batch 20, problems 320 to 335
Processing batch 21, problems 336 to 351
Processing batch 22, problems 352 to 367
Processing batch 23, problems 368 to 383
Processing batch 24, problems 384 to 399

In [None]:
# split test and train 90 10
test_size = 0.1
combined_dataset = combined_dataset.train_test_split(
    test_size=test_size,
    shuffle=True,
    seed=42
)

#combined_dataset = combined_dataset.sort('problem')
combined_dataset.push_to_hub('annasoli/r1qw_numinamath1')

AttributeError: 'DatasetDict' object has no attribute 'train_test_split'

In [5]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
from datasets import load_dataset

# load dataset
dataset = load_dataset('annasoli/r1qw_numinamath')
dataset.push_to_hub('annasoli/r1qw_numinamath1')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/annasoli/r1qw_numinamath1/commit/3ef6b3bd8081ab8b6c02ec8d358603c61b764b2c', commit_message='Upload dataset', commit_description='', oid='3ef6b3bd8081ab8b6c02ec8d358603c61b764b2c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/annasoli/r1qw_numinamath1', endpoint='https://huggingface.co', repo_type='dataset', repo_id='annasoli/r1qw_numinamath1'), pr_revision=None, pr_num=None)