In [None]:
# !pip install -q unsloth # install unsloth
# !pip install -q --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
# !pip uninstall -q peft transformers trl accelerate bitsandbytes -y
# !pip install -q peft transformers trl accelerate bitsandbytes

In [None]:
# !pip install -q -U datasets 

In [None]:
from unsloth import FastLanguageModel
import torch # Import PyTorch
from trl import SFTTrainer # Trainer for supervised fine-tuning (SFT)
from unsloth import is_bfloat16_supported # Checks if the hardware supports bfloat16 precision
# Hugging Face modules
from transformers import TrainingArguments # Defines training hyperparameters
from datasets import load_dataset # Lets you load fine-tuning datasets

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# Load the Magpie reasoning dataset from the Hugging Face Hub.
# The bare `ds` on the last line displays the available splits, columns and row counts.
ds = load_dataset("Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B")
ds

DatasetDict({
    train: Dataset({
        features: ['conversation_id', 'instruction', 'response', 'conversations', 'gen_input_configs', 'gen_response_configs', 'intent', 'knowledge', 'difficulty', 'difficulty_generator', 'input_quality', 'quality_explanation', 'quality_generator', 'task_category', 'other_task_category', 'task_category_generator', 'language'],
        num_rows: 249922
    })
})

In [None]:

def create_dataset(original_dataset, task_category='Math', seed=42):
    """Build a difficulty-balanced subset of the ``train`` split.

    Rows are kept only when they belong to ``task_category``, have one of the
    four known difficulty labels, contain a ``</think>`` marker in ``response``
    and have no missing value in any column. The 'easy', 'medium' and 'hard'
    groups are down-sampled to the size of the smallest of the three; every
    'very hard' row is kept. The result is shuffled and reduced to the columns
    needed for fine-tuning.

    Args:
        original_dataset: Mapping with a ``'train'`` entry exposing
            ``to_pandas()`` (e.g. a Hugging Face ``DatasetDict``).
        task_category: Value of the ``task_category`` column to keep.
        seed: Random state used for the per-difficulty sampling and for the
            final shuffle.

    Returns:
        A ``datasets.Dataset`` with the columns ``instruction``, ``response``,
        ``intent``, ``knowledge`` and ``difficulty``.
    """
    import pandas as pd
    from datasets import Dataset

    main_levels = ['easy', 'medium', 'hard']
    keep_columns = ['instruction', 'response', 'intent', 'knowledge', 'difficulty']

    # Convert to pandas DataFrame for faster processing
    df = original_dataset['train'].to_pandas()

    # Initial filtering
    mask = (
        df['difficulty'].isin(main_levels + ['very hard'])
        & (df['task_category'] == task_category)
        & df['response'].str.contains('</think>', regex=False, na=False)
        & df.notna().all(axis=1)
    )
    filtered_df = df[mask]

    # 'very hard' rows are rare, so all of them are kept instead of being balanced
    very_hard_df = filtered_df[filtered_df['difficulty'] == 'very hard']

    # reindex() makes a difficulty with no rows count as 0 instead of raising
    # KeyError; in that case no balanced rows are sampled at all.
    level_counts = (
        filtered_df['difficulty'].value_counts().reindex(main_levels, fill_value=0)
    )
    min_count = int(level_counts.min())

    # Sample equal numbers from each main difficulty
    balanced_dfs = [
        filtered_df[filtered_df['difficulty'] == level].sample(n=min_count, random_state=seed)
        for level in main_levels
    ]

    # Combine, shuffle and keep only the columns used downstream
    final_df = pd.concat(balanced_dfs + [very_hard_df], ignore_index=True)
    final_df = final_df.sample(frac=1, random_state=seed).reset_index(drop=True)
    final_df = final_df[keep_columns]

    print("Final difficulty distribution:")
    print(final_df['difficulty'].value_counts())

    # Convert back to HuggingFace dataset
    return Dataset.from_pandas(final_df)

# Build the balanced subset; the bare name displays its columns and row count.
filtered_ds = create_dataset(ds)
filtered_ds

Final difficulty distribution:
difficulty
easy         3881
medium       3881
hard         3881
very hard      12
Name: count, dtype: int64


Dataset({
    features: ['instruction', 'response', 'intent', 'knowledge', 'difficulty'],
    num_rows: 11655
})

In [None]:
# The raw dataset is not used after this point (only `filtered_ds` is),
# so clean up its cache files and drop the reference.
ds.cleanup_cache_files()
del ds

In [None]:
# Set parameters
max_seq_length = 4096 # Maximum sequence length in tokens; reused below for model loading and for the trainer
dtype = None # None leaves the dtype choice to Unsloth (auto-detection)
load_in_4bit = True # Enables 4 bit quantization — a memory saving optimization

# Load the DeepSeek R1 distilled model and tokenizer using unsloth — imported using: from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    max_seq_length=max_seq_length, # 4096 tokens, as set above
    dtype=dtype, # None = auto (e.g., FP16 or BF16 depending on hardware support)
    load_in_4bit=load_in_4bit, # Load the model in 4-bit quantization to save memory
)

==((====))==  Unsloth 2025.2.12: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Inference prompt: slot 1 is the question, slot 2 is an optional prefix for the
# model's reasoning (left empty so the model continues right after "<think>").
# The instruction sentence is kept word-for-word identical to the one in
# `train_prompt_style` so inference matches what the model is fine-tuned on.
prompt_style = """Below is an instruction that describes a mathematical task, paired with additional context information to guide the solution.
Write a response that thoroughly solves the given problem.
Before solving, develop a clear step-by-step chain of reasoning to ensure accuracy and logical coherence.

### Instruction:
You are a mathematics expert with advanced knowledge in mathematical reasoning, problem-solving, and proof techniques. You think outloud and consider various aspects before giving any concrete answers.

### Question:
{}

### Response:
<think>{}"""

In [None]:
# Row positions of the 'very hard' examples inside filtered_ds
hard_indices = [i for i, x in enumerate(filtered_ds['difficulty']) if x == 'very hard']

In [None]:
import numpy as np

# Pick one of the 'very hard' examples. Drawing from `hard_indices` itself
# (instead of a random integer between its min and max) guarantees the index
# really points at a 'very hard' row; the fixed seed makes the pick reproducible.
# int() keeps `idx` a plain Python int, as before.
idx = int(np.random.default_rng(42).choice(hard_indices))

# question = filtered_ds[idx]['instruction']
question = '''A snail travels at 1 cm per second for 1 minute, then teleports 10 meters backward every 30 seconds for 3 minutes while a turtle moving at 0.5 cm/s chases it.
How far is the snail from its starting point after 3 minutes?'''
question

'A snail travels at 1 cm per second for 1 minute, then teleports 10 meters backward every 30 seconds for 3 minutes while a turtle moving at 0.5 cm/s chases it. \nHow far is the snail from its starting point after 3 minutes?'

In [None]:
# Enable optimized inference mode for Unsloth models (improves speed and efficiency)
FastLanguageModel.for_inference(model)  # Unsloth has 2x faster inference!

# Format the question using the structured prompt (`prompt_style`) and tokenize it
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")  # Convert input to PyTorch tensor & move to GPU

# Generate a baseline response with the model before any fine-tuning
outputs = model.generate(
    input_ids=inputs.input_ids, # Tokenized input question
    attention_mask=inputs.attention_mask, # Attention mask to handle padding
    max_new_tokens=4096, # Cap the generated continuation at 4096 new tokens
    use_cache=True, # Enable caching for faster inference
)

# Decode the generated output tokens into human-readable text
response = tokenizer.batch_decode(outputs)

In [None]:
# Print only the generated part: the text following the first "### Response:" marker
# (split()[1] stops at the next marker if the model happens to emit another one).
print(response[0].split("### Response:")[1])


<think>
Okay, so I need to figure out how far the snail is from its starting point after 3 minutes. Let me break this down step by step. 

First, the snail starts moving at 1 cm per second for 1 minute. Since 1 minute is 60 seconds, I can calculate how far it goes in that time. 

So, snail's speed is 1 cm/s, time is 60 s. Distance = speed × time. That means the snail moves 1 cm/s × 60 s = 60 cm. 

Wait, but the turtle is moving after the snail. So the snail is moving forward 60 cm in the first minute. Now, the turtle is moving at 0.5 cm/s and is chasing the snail. But the turtle isn't just moving the rest of the time; it's teleporting backward every 30 seconds for 3 minutes. 

Wait, no, let me re-read that. The snail teleports 10 meters backward every 30 seconds for 3 minutes while the turtle is moving. So, the snail is teleporting backward every 30 seconds for 3 minutes, but the turtle is moving while the snail is teleporting? Or is the turtle moving while the snail is teleporting? 


In [None]:
# print(filtered_ds['response'][idx])

In [None]:
EOS_TOKEN = tokenizer.eos_token  # End-of-sequence token; appended to every training text so the model learns when to stop generating
EOS_TOKEN

'<｜end▁of▁sentence｜>'

In [None]:
def split_response(example):
    """Split a raw ``response`` into its reasoning and its final answer.

    The text is cut at the first ``</think>`` tag only, so an answer that
    happens to contain the tag again is kept in full instead of being
    truncated at the second occurrence.

    Args:
        example: Mapping with a ``'response'`` string.

    Returns:
        dict with ``'thinking'`` (text before the first ``</think>``, with any
        ``<think>`` tag removed and whitespace stripped) and ``'response'``
        (stripped text after it, or ``""`` when the tag is absent). The
        ``'response'`` key overrides the original column when used with
        ``Dataset.map``.
    """
    # partition() yields (whole_text, '', '') when the tag is missing, which
    # gives the "everything is thinking, empty answer" fallback.
    reasoning, _, answer = example['response'].partition('</think>')
    return {
        'thinking': reasoning.replace('<think>', '').strip(),
        'response': answer.strip(),
    }

# Apply the transformation exactly once. After the first run `response` no longer
# contains '</think>', so mapping again would move the answer into `thinking` and
# blank out `response`; the guard keeps this cell safe to re-run.
if 'thinking' not in filtered_ds.column_names:
    filtered_ds = filtered_ds.map(split_response)

Map:   0%|          | 0/11655 [00:00<?, ? examples/s]

In [None]:
# # To verify it worked:
# example = filtered_ds[idx]
# print("Thinking:", example['thinking'])
# print("\n" + '-'*200)
# print("Response:", example['response'])

In [None]:
# Training prompt with five positional slots, in order:
# question, intent, required knowledge, reasoning ("thinking"), final answer.
train_prompt_style = """Below is an instruction that describes a mathematical task, paired with additional context information to guide the solution.
Write a response that thoroughly solves the given problem.
Before solving, develop a clear step-by-step chain of reasoning to ensure accuracy and logical coherence.

### Instruction:
You are a mathematics expert with advanced knowledge in mathematical reasoning, problem-solving, and proof techniques. You think outloud and consider various aspects before giving any concrete answers.

### Question:
{}

### Intent:
{}

### Knowledge Required:
{}

### Response:
<think>
{}
</think>
{}"""

In [None]:
def formatting_prompts_func(examples):
    """Render a batch of examples into complete training texts.

    Intended for ``Dataset.map(..., batched=True)``: ``examples`` maps each
    column name to a list of values. Every row is inserted into
    ``train_prompt_style`` (question, intent, knowledge, thinking, response,
    in that order) and terminated with ``EOS_TOKEN``.

    Returns:
        dict with a single ``"text"`` list, one formatted string per row.
    """
    slot_columns = ("instruction", "intent", "knowledge", "thinking", "response")
    # Transpose the column lists into per-row tuples ordered like the template slots.
    rows = zip(*(examples[column] for column in slot_columns))
    return {"text": [train_prompt_style.format(*row) + EOS_TOKEN for row in rows]}


In [None]:
filtered_ds

Dataset({
    features: ['instruction', 'response', 'intent', 'knowledge', 'difficulty', 'thinking'],
    num_rows: 11655
})

In [None]:
# Add the rendered training prompt as a new `text` column (processed in batches)
finetuning_data = filtered_ds.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/11655 [00:00<?, ? examples/s]

In [None]:
# print(finetuning_data['text'][idx])

In [None]:
finetuning_data

Dataset({
    features: ['instruction', 'response', 'intent', 'knowledge', 'difficulty', 'thinking', 'text'],
    num_rows: 11655
})

In [None]:
from sklearn.model_selection import train_test_split
from collections import Counter

def _print_difficulty_distribution(title, difficulties):
    """Print ``title`` followed by one ``difficulty: count`` line per label."""
    print(title)
    for difficulty, count in Counter(difficulties).items():
        print(f"{difficulty}: {count}")


def split_dataset(dataset, test_size=0.1, val_size=0.1, random_state=42):
    """
    Split a dataset into train, test, and validation sets while stratifying by difficulty.
    Only keeps the 'text' column in the resulting datasets.

    Args:
        dataset: HuggingFace dataset with 'difficulty' and 'text' columns
        test_size: proportion of the full data for the test set (0 < test_size < 1)
        val_size: proportion of the full data for the validation set (0 < val_size < 1)
        random_state: random seed for reproducibility

    Returns:
        train_dataset, test_dataset, val_dataset

    Raises:
        ValueError: if the proportions are not strictly between 0 and 1 or
            leave no data for the training set.
    """
    # Fail early with a clear message; otherwise the rescaling below could
    # divide by zero or hand an out-of-range fraction to train_test_split.
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            "test_size and val_size must each be in (0, 1) and sum to less than 1; "
            f"got test_size={test_size}, val_size={val_size}"
        )

    difficulties = list(dataset['difficulty'])
    _print_difficulty_distribution("Original distribution:", difficulties)

    # Create indices for splitting
    indices = list(range(len(dataset)))

    # First split: separate test set
    train_val_indices, test_indices = train_test_split(
        indices,
        test_size=test_size,
        stratify=difficulties,
        random_state=random_state
    )

    # Second split: separate validation set from training set.
    # val_size is relative to the full dataset, so rescale it to the part
    # that remains after the test set was removed.
    adjusted_val_size = val_size / (1 - test_size)
    train_indices, val_indices = train_test_split(
        train_val_indices,
        test_size=adjusted_val_size,
        stratify=[difficulties[i] for i in train_val_indices],
        random_state=random_state
    )

    # Select with all columns first so the distributions can be reported
    train_ds_full = dataset.select(train_indices)
    test_ds_full = dataset.select(test_indices)
    val_ds_full = dataset.select(val_indices)

    _print_difficulty_distribution("\nTrain set distribution:", train_ds_full['difficulty'])
    _print_difficulty_distribution("\nTest set distribution:", test_ds_full['difficulty'])
    _print_difficulty_distribution("\nValidation set distribution:", val_ds_full['difficulty'])

    # Create the final datasets with only the 'text' column
    drop_columns = [col for col in dataset.column_names if col != 'text']
    train_ds = train_ds_full.remove_columns(drop_columns)
    test_ds = test_ds_full.remove_columns(drop_columns)
    val_ds = val_ds_full.remove_columns(drop_columns)

    # Print final sizes
    print(f"\nFinal sizes:")
    print(f"Train set size: {len(train_ds)}")
    print(f"Test set size: {len(test_ds)}")
    print(f"Validation set size: {len(val_ds)}")

    return train_ds, test_ds, val_ds

In [None]:
# 90% train / 5% test / 5% validation, stratified by difficulty
train_ds, test_ds, val_ds = split_dataset(finetuning_data, test_size=0.05, val_size= 0.05, random_state=42)

Original distribution:
easy: 3881
medium: 3881
hard: 3881
very hard: 12

Train set distribution:
medium: 3493
hard: 3493
easy: 3493
very hard: 10

Test set distribution:
hard: 194
easy: 194
medium: 194
very hard: 1

Validation set distribution:
medium: 194
hard: 194
easy: 194
very hard: 1

Final sizes:
Train set size: 10489
Test set size: 583
Validation set size: 583


In [None]:
# Apply LoRA (Low-Rank Adaptation) fine-tuning to the model
model_lora = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank: Determines the size of the trainable adapters (higher = more parameters, lower = more efficiency)
    target_modules=[  # List of transformer layers where LoRA adapters will be applied
        "q_proj",   # Query projection in the self-attention mechanism
        "k_proj",   # Key projection in the self-attention mechanism
        "v_proj",   # Value projection in the self-attention mechanism
        "o_proj",   # Output projection from the attention layer
        "gate_proj",  # Gate projection of the feed-forward block (MLP)
        "up_proj",    # Up projection of the feed-forward block (MLP)
        "down_proj",  # Down projection of the feed-forward block (MLP)
    ],
    lora_alpha=16,  # Scaling factor for LoRA updates (higher values allow more influence from LoRA layers)
    lora_dropout=0,  # Dropout rate for LoRA layers (0 means no dropout)
    bias="none",  # Do not train any bias terms
    use_gradient_checkpointing="unsloth",  # Saves memory by recomputing activations instead of storing them (recommended for long-context fine-tuning)
    random_state=42,  # Sets a seed for reproducibility
    use_rslora=True,  # Rank-Stabilized LoRA is ENABLED: adapter scaling uses lora_alpha / sqrt(r) instead of lora_alpha / r
    loftq_config=None,  # No LoftQ (LoRA-fine-tuning-aware quantization) initialization
)

Unsloth 2025.2.12 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
model.num_parameters(), model_lora.num_parameters()

(1795552768, 1795552768)

In [None]:
# Workaround: `FastLanguageModel.for_inference(model)` was called earlier in the
# notebook, and this cell tries to undo its effect on `generate` before training.
#
# Earlier attempt, kept for reference (superseded by the code below):
# # Remove the inference optimization before training the LoRA model.
# if hasattr(model_lora, "_unwrapped_old_generate"):
#     print('1')
#     delattr(model_lora, "_unwrapped_old_generate")
#     if hasattr(model_lora, "generate") and hasattr(model, "generate"):
#         print('2')
#         model_lora.generate = model.generate  # Restore original generate method if available

# NOTE(review): `model.generate` may still be the inference-patched method at this
# point, so the assignment below might not restore the unpatched `generate` — confirm.
# `FastLanguageModel.for_training(...)` is presumably the supported way to leave
# inference mode; verify against the Unsloth docs before relying on this cell.
if hasattr(model_lora, "_unwrapped_old_generate"):
    try:
        # Only touch the attribute when it currently holds something
        if model_lora._unwrapped_old_generate is not None:
            # Overwrite with None rather than deleting the attribute
            try:
                model_lora._unwrapped_old_generate = None
            except AttributeError:
                print("Could not directly set attribute to None")

        # Point the LoRA wrapper's generate at the base model's generate
        if hasattr(model_lora, "generate") and hasattr(model, "generate"):
            model_lora.generate = model.generate
            print("Successfully restored original generate method")

    except AttributeError as e:
        # Best-effort cleanup: report the problem and continue
        print(f"Warning: Could not fully clean up _unwrapped_old_generate: {e}")

Successfully restored original generate method


In [None]:
model.num_parameters(), model_lora.num_parameters()

(1795552768, 1795552768)

In [None]:
train_ds, test_ds, val_ds

(Dataset({
     features: ['text'],
     num_rows: 10489
 }),
 Dataset({
     features: ['text'],
     num_rows: 583
 }),
 Dataset({
     features: ['text'],
     num_rows: 583
 }))

In [None]:
batch_size = 2  # per-device train batch size (passed to TrainingArguments below)
gradient_steps = 8  # gradient accumulation steps -> effective batch size of 16
steps_per_epoch = len(train_ds)/(batch_size * gradient_steps)  # optimizer steps per epoch (float)
# One third of an epoch, in steps.
# NOTE(review): the divisor 3 is unexplained and the value is not used by the trainer cell — confirm intent.
print(int(steps_per_epoch/3))

218


In [None]:
from transformers import TrainingArguments, EarlyStoppingCallback

# Build the supervised fine-tuning trainer (evaluation and early stopping are currently disabled)
trainer = SFTTrainer(
    model=model_lora,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    # eval_dataset=val_ds,
    dataset_text_field="text",  # column holding the fully rendered prompt
    max_seq_length=max_seq_length,
    dataset_num_proc=2,

    # Define training arguments
    args=TrainingArguments(
        per_device_train_batch_size=batch_size,
        # per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_steps,
        num_train_epochs=2,
        warmup_ratio=0.1,               # Warm up over the first 10% of training steps
        learning_rate=1e-5,
        fp16=not is_bfloat16_supported(),  # exactly one of fp16/bf16 is enabled, depending on hardware
        bf16=is_bfloat16_supported(),
        logging_steps=50,
        # eval_strategy="steps",     # Changed from epoch to steps for more frequent evaluation
        # eval_steps=100,                 # Evaluate every 100 steps
        # save_strategy="steps",
        # save_steps=50,
        # save_total_limit=3,
        # load_best_model_at_end=True,
        # metric_for_best_model="loss",
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",     # Cosine learning-rate decay
        seed=42,
        output_dir="deepseek_model",
        report_to="wandb",
    ),
    # Early stopping needs the evaluation settings above to be re-enabled first
    # callbacks=[
    #     EarlyStoppingCallback(
    #         early_stopping_patience=2,    # Stop if no improvement for 2 evaluations
    #         early_stopping_threshold=0.01  # Minimum change to qualify as an improvement
    #     )
    # ]
)

Applying chat template to train dataset (num_proc=2):   0%|          | 0/10489 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/10489 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/10489 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Run fine-tuning.
# NOTE(review): the trailing 9,232,384 is presumably a trainable-parameter count from an
# earlier configuration (the logged run reports 18,464,768) — confirm or remove.
trainer_stats = trainer.train() #9,232,384

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10,489 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 1,310
 "-____-"     Number of trainable parameters = 18,464,768
[34m[1mwandb[0m: Currently logged in as: [33makashsp7666[0m ([33makashsp7666-abso1ute[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
50,0.6442
100,0.5607
150,0.4221
200,0.37
250,0.3521
300,0.3565
350,0.3414
400,0.3505
450,0.3345
500,0.3539


Step,Training Loss
50,0.6442
100,0.5607
150,0.4221
200,0.37
250,0.3521
300,0.3565
350,0.3414
400,0.3505
450,0.3345
500,0.3539


In [None]:
# Colab-only: mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Save the fine-tuned model and the tokenizer to Google Drive.
# NOTE(review): model_lora is a PEFT-wrapped model, so this presumably stores only the LoRA adapter, not the base weights — confirm.
model_lora.save_pretrained("/content/drive/MyDrive/Deeplora")
tokenizer.save_pretrained("/content/drive/MyDrive/Deeplora")

('/content/drive/MyDrive/Deeplora/tokenizer_config.json',
 '/content/drive/MyDrive/Deeplora/special_tokens_map.json',
 '/content/drive/MyDrive/Deeplora/tokenizer.json')

In [None]:
# Export a q4_k_m-quantized GGUF file to Google Drive (per the logged output, Unsloth
# merges the LoRA weights to 16-bit and converts with llama.cpp; this takes several minutes).
model_lora.save_pretrained_gguf("/content/drive/MyDrive/Deepgguf", tokenizer, quantization_method = "q4_k_m")

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 1.8G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 26.75 out of 50.99 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:00<00:00, 55.74it/s]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: Converting qwen2 model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at /content/drive/MyDrive/Deepgguf into f16 GGUF format.
The output location will be /content/drive/MyDrive/Deepgguf/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: Deepgguf
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:output.weight,             torch.float16 --> F16, shape = {1536, 151936}
INFO:hf-to-gguf:token_embd.weight,        

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Unsloth: Conversion completed! Output location: /content/drive/MyDrive/Deepgguf/unsloth.Q4_K_M.gguf


In [None]:
# Generate with the fine-tuned model on the same question as the baseline run.
# Unsloth's inference mode is intentionally left disabled here:
# FastLanguageModel.for_inference(model_lora)


# Same inference prompt as before, with an empty reasoning prefix
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")


outputs = model_lora.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=4096,  # cap on newly generated tokens
    use_cache=True,
)


# Decode and print only the text after the first "### Response:" marker
response = tokenizer.batch_decode(outputs)
print(response[0].split("### Response:")[1])


<think>
Okay, so I have this problem where a snail is moving and teleporting, and there's a turtle chasing it. I need to figure out how far the snail is from its starting point after 3 minutes. Hmm, let me try to break this down step by step.

First, let me parse the problem. The snail is traveling at 1 cm per second for 1 minute. Then, every 30 seconds, it teleports 10 meters backward. The turtle is moving at 0.5 cm/s and is chasing the snail. So, the snail is moving forward at 1 cm/s, but every 30 seconds, it teleports back 10 meters. The turtle is faster but only chases it for 3 minutes. I need to find the distance between the snail and the turtle after 3 minutes.

Wait, hold on. The problem says the turtle is chasing the snail, but it only chases for 3 minutes. So, does that mean the turtle starts chasing the snail immediately, and after 3 minutes, we need to find where the snail is? Or does the turtle start chasing after some time? Hmm, the problem says the turtle is moving at 0.

In [None]:
# Save the model with the LoRA weights merged in, using the "merged_4bit_forced" method.
# NOTE(review): the directory is named "Deepfloat" although the logged output says the merge is to 4bit — confirm the intended save_method.
model_lora.save_pretrained_merged("/content/drive/MyDrive/Deepfloat", tokenizer, save_method = "merged_4bit_forced",)

Unsloth: Merging 4bit and LoRA weights to 4bit...
This might take 5 minutes...




Done.
Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 10 minutes for Llama-7b... Done.


In [None]:
# model_lora.save_pretrained("/content/drive/MyDrive/Deepseek2")
# # The adapter config will be saved as well
# print("LoRA weights and config saved!")

LoRA weights and config saved!


In [None]:
# Re-point the LoRA wrapper's generate at the base model's generate, then
# format the question using the structured prompt (`prompt_style`) and tokenize it.
# NOTE(review): `question` must hold the prompt intended for this run — the saved output
# below answers a different question than the snail problem defined earlier; confirm kernel state.
model_lora.generate = model.generate
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")  # Convert input to PyTorch tensor & move to GPU

# Generate a response using the model
outputs = model_lora.generate(
    input_ids=inputs.input_ids, # Tokenized input question
    attention_mask=inputs.attention_mask, # Attention mask to handle padding
    max_new_tokens=4096, # Cap the generated continuation at 4096 new tokens
    use_cache=True, # Enable caching for faster inference
)

# Decode the generated output tokens into human-readable text
response = tokenizer.batch_decode(outputs)

In [None]:
print(response[0].split("### Response:")[1])


<think>
Okay, so I have this problem where I need to find f(f(1999)). The function f(x) is defined as the sum of the squares of the digits of x. Let me try to break this down step by step.

First, I need to understand what f(x) does. It takes an integer x, which is made up of one or more digits, and then squares each of those digits and adds them together. So, for example, if x is 23, then f(23) would be 2 squared plus 3 squared, which is 4 plus 9, giving 13. Got it.

Now, I need to compute f(f(1999)). That means I first need to find f(1999), and then take that result and plug it back into f again. So, let's start by finding f(1999).

To find f(1999), I need to break down 1999 into its individual digits. Let me write that out: 1, 9, 9, and 9. Wait, no, hold on. 1999 is a four-digit number, right? So, the digits are 1, 9, 9, and 9. Wait, no, that's not correct. Let me double-check. 1999 is 1 thousand, 9 hundreds, 9 tens, and 9 units. So, the digits are 1, 9, 9, and 9. Wait, no, that's 

In [None]:
filtered_ds['thinking'][idx]

'Okay, so I have this problem here where I need to find f(f(1999)), and f(x) is defined as the sum of the squares of the digits of x. Hmm, let me break this down step by step. \n\nFirst, I need to understand what the function f does. It takes an integer, splits it into its individual digits, squares each digit, and then adds all those squares together. For example, they gave f(23) = 2² + 3² = 4 + 9 = 13. Got it. So, it\'s a straightforward process once I can break down the number into its digits.\n\nAlright, so the problem is asking for f(f(1999)). That means I need to compute f(1999) first, and then take that result and compute f of that. Let me start with the inner function: f(1999).\n\nSo, 1999 is a four-digit number. Let me write down each digit separately. The digits are 1, 9, 9, and 9. Now, I need to square each of these digits and add them up. Let me compute each square:\n\n- 1 squared is 1.\n- 9 squared is 81.\n- 9 squared is 81.\n- 9 squared is 81.\n\nNow, adding them all toge

In [None]:
print(response[0].split("### Response:")[1])  #old response


<think>
Okay, so I need to find f(f(1999)) where f(x) is the sum of the squares of the digits of x. Let me break this down step by step.

First, I need to compute f(1999). To do that, I'll look at each digit in 1999 and square them, then add them up. The number 1999 has four digits: 1, 9, 9, and 9.

So, for the first digit, which is 1: 1 squared is 1.
Next digit is 9: 9 squared is 81.
Same with the next 9: another 81.
And the last digit is also 9: another 81.

Now, adding them up: 1 + 81 + 81 + 81. Let me compute that. 1 + 81 is 82, plus another 81 is 163, and then plus 81 gives 164. So f(1999) is 164.

Wait, let me check that again to make sure I didn't make a mistake. 1 squared is 1, 9 squared is 81, so each 9 contributes 81. There are three 9s, so that's 3 times 81, which is 243. Then the first digit is 1, so total is 1 + 243 = 244. Hmm, I think I made a mistake earlier because I added 1 + 81 + 81 +81 again. Wait, actually, 1999 is four digits, so each digit is 1, 9, 9, 9.

So, 1 s