In [None]:
import time
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig # Optional: for quantization
)
from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training


### Configuration

In [None]:
# --- Configuration ---
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
dataset_id = "HuggingFaceTB/smoltalk"
output_dir = "./smoltalk-finetuned-deepseek-lora"

# LoRA Configuration 
rank_dimension = 6
lora_alpha = 8
lora_dropout = 0.15

peft_config = LoraConfig(
    r=rank_dimension,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)

# Quantization Config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

### Load Tokenizer and Model

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
if tokenizer.pad_token is None:
    print("Tokenizer missing pad token, setting to eos_token.")
    tokenizer.pad_token = tokenizer.eos_token


print(f"Loading model: {model_id}")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="eager"
)


Loading model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B


Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.


### Load and Prepare Dataset

In [None]:
# Load dataset
dataset = load_dataset(dataset_id, "everyday-conversations", split="train")

# 10% for evaluation
eval_split_percentage = 0.1

# Split the dataset
print(f"Splitting dataset into train ({1-eval_split_percentage:.0%}) and eval ({eval_split_percentage:.0%})...")
split_dataset = dataset.train_test_split(test_size=eval_split_percentage, shuffle=True, seed=42) # Seed for reproducibility
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")


Splitting dataset into train (90%) and eval (10%)...
Train dataset size: 2034
Eval dataset size: 226


In [None]:
# Adjust based on VRAM and expected conversation length
max_seq_length = 1024

# Define preprocessing function
def preprocess_conversations(examples):
    all_input_ids = []
    all_attention_masks = []

    for conversation in examples['messages']:
        # Adds special tokens (e.g., <|im_start|>, <|im_end|>)
        formatted_chat = tokenizer.apply_chat_template(
            conversation,
            tokenize=False, # Get the formatted string first
            add_generation_prompt=False # We are training, not prompting
        )

        # Add the EOS token at the end of the conversation
        if not formatted_chat.endswith(tokenizer.eos_token):
             formatted_chat += tokenizer.eos_token

        # Tokenize the fully formatted string
        tokenized_output = tokenizer(
            formatted_chat,
            truncation=True,
            max_length=max_seq_length,
            padding=False # DataCollator will handle padding per batch
        )

        all_input_ids.append(tokenized_output['input_ids'])
        all_attention_masks.append(tokenized_output['attention_mask'])

    return {"input_ids": all_input_ids, "attention_mask": all_attention_masks}

In [6]:
# Preprocess BOTH train and eval datasets
print("Preprocessing train dataset...")
tokenized_train_dataset = train_dataset.map(
    preprocess_conversations, batched=True, remove_columns=train_dataset.column_names
).filter(lambda example: len(example['input_ids']) > 0)

print("Preprocessing eval dataset...")
tokenized_eval_dataset = eval_dataset.map(
    preprocess_conversations, batched=True, remove_columns=eval_dataset.column_names
).filter(lambda example: len(example['input_ids']) > 0)

Preprocessing train dataset...
Preprocessing eval dataset...


Map: 100%|██████████| 226/226 [00:00<00:00, 1335.78 examples/s]
Filter: 100%|██████████| 226/226 [00:00<00:00, 5002.42 examples/s]


In [7]:
print(f"Processed train dataset size: {len(tokenized_train_dataset)}")
print(f"Processed eval dataset size: {len(tokenized_eval_dataset)}")
if len(tokenized_train_dataset) > 0:
     print(f"Sample tokenized train input_ids: {tokenized_train_dataset[0]['input_ids']}")
     print(f"Decoded sample: {tokenizer.decode(tokenized_train_dataset[0]['input_ids'])}")
else:
     print("Warning: Train dataset is empty after preprocessing!")
if len(tokenized_eval_dataset) == 0:
    print("Warning: Eval dataset is empty after preprocessing! Evaluation may not work.")

Processed train dataset size: 2034
Processed eval dataset size: 226
Sample tokenized train input_ids: [151646, 151646, 151644, 13048, 151645, 9707, 0, 2585, 646, 358, 1492, 498, 3351, 30, 151643, 151644, 40, 2776, 6832, 911, 4285, 12645, 304, 2906, 323, 358, 572, 20293, 11, 1128, 374, 264, 27505, 30, 151645, 32, 27505, 374, 264, 4285, 5662, 429, 8609, 601, 11893, 476, 3271, 8811, 6171, 553, 1667, 264, 3619, 476, 23418, 429, 41330, 2412, 2163, 264, 8356, 1459, 11, 2598, 264, 5599, 5082, 372, 13, 151643, 151644, 4792, 3643, 5530, 13, 2055, 11, 1246, 1558, 432, 1281, 2513, 8661, 311, 11893, 30, 151645, 32, 27505, 3643, 2513, 8661, 311, 11893, 553, 10018, 279, 5106, 476, 3311, 315, 5344, 4362, 13, 1752, 3110, 11, 421, 498, 990, 264, 9276, 2257, 311, 11893, 264, 8811, 6946, 11, 279, 5599, 5082, 372, 374, 279, 1459, 1380, 279, 9276, 2257, 374, 40119, 389, 279, 4910, 11, 323, 279, 5344, 498, 3796, 311, 279, 1008, 835, 315, 279, 9276, 2257, 374, 8455, 1870, 11, 3259, 432, 8661, 311, 11893, 279

#### Apply PEFT (LoRA)

In [None]:
# Prepare model for k-bit training if using quantization
model = prepare_model_for_kbit_training(model) 

print("Applying LoRA configuration...")
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Applying LoRA configuration...
trainable params: 6,924,288 || all params: 1,784,012,288 || trainable%: 0.3881


#### Define Training Arguments

In [None]:
# Check GPU availability and BF16 support
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,      
    gradient_accumulation_steps=8,     
    learning_rate=2e-4,
    num_train_epochs=20,      
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_dir=f"{output_dir}/logs",
    logging_steps=200,
    save_strategy="epoch",
    # save_steps=100,
    optim="paged_adamw_8bit",          
    bf16=use_bf16,
    fp16=not use_bf16 and torch.cuda.is_available(),
    tf32=use_bf16,
    gradient_checkpointing=True,
)

#### Define Data Collator

In [None]:
# Pads batches dynamically and creates labels
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

#### Initialize Trainer

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


#### Start Fine-tuning

In [None]:
if len(tokenized_train_dataset) > 0:
    print("Starting fine-tuning...")
    train_result = trainer.train()

    # Save the LoRA Adapter
    print(f"Saving best LoRA adapter to {output_dir}")
    trainer.save_model(output_dir)

    # Log final metrics
    metrics = train_result.metrics

    # Calculate perplexity for training set
    try:
        train_perplexity = torch.exp(torch.tensor(metrics["train_loss"])).item()
        metrics["train_perplexity"] = train_perplexity
    except KeyError:
        print("Could not calculate train perplexity (train_loss not found in metrics).")
    except OverflowError:
         metrics["train_perplexity"] = float("inf")


    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Explicitly evaluate on the evaluation set and log metrics
    if len(tokenized_eval_dataset) > 0:
        print("Running final evaluation...")
        eval_metrics = trainer.evaluate()
        # Calculate perplexity for evaluation set
        try:
            eval_perplexity = torch.exp(torch.tensor(eval_metrics["eval_loss"])).item()
            eval_metrics["eval_perplexity"] = eval_perplexity
        except OverflowError:
            eval_metrics["eval_perplexity"] = float("inf")

        trainer.log_metrics("eval", eval_metrics)
        trainer.save_metrics("eval", eval_metrics)
        print(f"Evaluation Metrics: {eval_metrics}")


    print("Fine-tuning finished.")
    print(f"Best LoRA adapter saved in: {output_dir}")
    if "eval_loss" in eval_metrics:
        print(f"Best Eval Loss: {eval_metrics['eval_loss']:.4f}")
        print(f"Best Eval Perplexity: {eval_metrics.get('eval_perplexity', 'N/A')}")

else:
    print("Skipping training as the processed training dataset is empty.")

Starting fine-tuning...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
200,2.0362
400,1.5479
600,1.37
800,1.2316
1000,1.1149
1200,1.0538


Saving best LoRA adapter to ./smoltalk-finetuned-deepseek-lora
***** train metrics *****
  epoch                    =     9.9282
  total_flos               = 32103354GF
  train_loss               =     1.3729
  train_perplexity         =     3.9468
  train_runtime            = 1:34:00.59
  train_samples_per_second =      3.606
  train_steps_per_second   =      0.225
Running final evaluation...


***** eval metrics *****
  epoch                   =     9.9282
  eval_loss               =     1.8655
  eval_perplexity         =     6.4594
  eval_runtime            = 0:00:34.83
  eval_samples_per_second =      6.488
  eval_steps_per_second   =      0.833
Evaluation Metrics: {'eval_loss': 1.865541696548462, 'eval_runtime': 34.831, 'eval_samples_per_second': 6.488, 'eval_steps_per_second': 0.833, 'epoch': 9.928220255653883, 'eval_perplexity': 6.4594340324401855}
Fine-tuning finished.
Best LoRA adapter saved in: ./smoltalk-finetuned-deepseek-lora
Best Eval Loss: 1.8655
Best Eval Perplexity: 6.4594340324401855


## Test Model

#### Load Tokenizer

In [None]:
# The path where trained adapter was saved
adapter_path = f"./smoltalk-finetuned-deepseek-lora"

#### Load LoRA Adapter

In [5]:
print(f"Loading LoRA adapter from: {adapter_path}")

# Merge the adapter layers into the base model for inference
merged_model = PeftModel.from_pretrained(model, adapter_path)

# If you want to merge explicitly and save a standalone model 
merged_model = merged_model.merge_and_unload()
merged_model.save_pretrained("./smoltalk-finetuned-merged")
tokenizer.save_pretrained("./smoltalk-finetuned-merged")

Loading LoRA adapter from: ./smoltalk-finetuned-deepseek-lora




('./smoltalk-finetuned-merged\\tokenizer_config.json',
 './smoltalk-finetuned-merged\\special_tokens_map.json',
 './smoltalk-finetuned-merged\\tokenizer.json')

#### Set Model to Evaluation Mode

In [6]:
merged_model.eval()
print("Model set to evaluation mode.")

Model set to evaluation mode.


In [None]:
# Prepare Inference Function 
def generate_response(chat_history):
    """
    Generates a response based on the provided chat history using the fine-tuned model.
    Args:
        chat_history (list): A list of dictionaries, e.g.,
                             [{'role': 'user', 'content': 'Hello!'}, ...]
    Returns:
        str: The generated response content.
    """
    try:
        # Apply the chat template for inference
        prompt = tokenizer.apply_chat_template(
            chat_history,
            tokenize=False,
            add_generation_prompt=True
        )


        # Tokenize the formatted prompt
        inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(merged_model.device) # Move inputs to model's device

        print("Generating...")
        start_time = time.time()

        # Generate response
        with torch.no_grad(): 
            outputs = merged_model.generate(
                **inputs,
                max_new_tokens=500,          
                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id, # Use pad or eos
                eos_token_id=tokenizer.eos_token_id, # Stop generation when EOS is produced
                do_sample=True,             
                temperature=0.7,            # Controls randomness
                top_k=50,                   # Consider top K tokens for sampling
                top_p=0.9,                  # Nucleus sampling
                repetition_penalty=1.2      # Penalize repeating tokens slightly
            )
        end_time = time.time()
        print(f"Generation took {end_time - start_time:.2f} seconds.")

        # Decode only the newly generated tokens
        input_length = inputs.input_ids.shape[1]
        generated_tokens = outputs[0][input_length:]
        response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

        return response.strip()

    except Exception as e:
        print(f"An error occurred during generation: {e}")
        return "[Error generating response]"

#### Test with Examples

In [None]:
# Example 1:
chat1 = [{'role': 'user', 'content': 'Hi there, how are you today?'}]
response1 = generate_response(chat1)
print(f"\nUser: {chat1[-1]['content']}")
print(f"Assistant: {response1}")

Generating...
Generation took 4.91 seconds.

User: Hi there, how are you today?
Assistant: Alright, the user is asking me how I'm doing. I should respond in a friendly and open manner. Maybe start with a smiley to keep it light.

I want to let them know I'm still active, which shows interest. It's important to mention that I can assist with math problems since that's my strength. I'll add something about looking forward to their questions to keep the conversation going.
</think>

Hello! I'm just a Deep Thinking AI, so I don't have feelings, but I'm here and ready to help you with any math-related queries! How can I assist you today?


In [None]:
# Example 2:
chat2 = chat1 + [{'role': 'assistant', 'content': response1},
                 {'role': 'user', 'content': 'Can you tell me a fun fact?'}]
response2 = generate_response(chat2)
print(f"\nUser: {chat2[-1]['content']}")
print(f"Assistant: {response2}")

Generating...
Generation took 14.08 seconds.

User: Can you tell me a fun fact?
Assistant: Okay, the user sent "Can you tell me a fun fact?" So they want something interesting and entertaining. I should respond by offering different types of fun facts—maybe math or science ones because those often get more attention.

I remember hearing about pi memorization competitions. That's pretty cool and educational. Plus, it's a common topic that everyone knows, which makes it relatable.

Another idea is space exploration. It's not just about winning awards; people often talk about learning from such events, especially when it brings them close to their heroes. It adds a personal touch too.

Maybe also include a joke to keep it light-hearted. Jokes are always funny and easy to understand. They can use humor without being mean, which might be more engaging for some users.

I need to make sure my response is friendly and inviting, letting them know I'm happy to share more if they're interested. K

In [None]:
# Example 3: Interactive Chat Loop 
print("\n--- Interactive Chat (type 'quit' to exit) ---")
chat_history = []
while True:
    user_input = input("You: ")
    if user_input.lower() == 'quit':
        break
    chat_history.append({'role': 'user', 'content': user_input})
    assistant_response = generate_response(chat_history)
    print(f"Assistant: {assistant_response}")
    chat_history.append({'role': 'assistant', 'content': assistant_response})
    # Trim history to prevent excessive length
    if len(chat_history) > 20:
        chat_history = chat_history[-20:]


--- Interactive Chat (type 'quit' to exit) ---
Generating...
Generation took 8.02 seconds.
Assistant: I need to determine the correct number by following these steps. First, I'll select any number from 1 to 10. After selecting, I will write it down and then try again until I get an accurate answer.

Next, if you're unsure of my selection or want me to verify, we can proceed togetherively.
</think>

**Step-by-step Explanation:**

To find a randomly selected number between **1-10**, follow these easy instructions:

---

### Step 1 – Select Your Number
1. Choose any number from **1** to **10** (e.g., 7).
2. Write your chosen number in the provided space below.

---

### Step 2 – Verify the Selection (Optional)
If you are uncertain about which number was selected:
1. Click on "Re-spin" when desired.
2. The system will attempt to simulate re-selecting based on your initial choice, giving you confidence in its accuracy.

---

By completing these steps, you've successfully identified the ran