<a href="https://colab.research.google.com/github/athulskrish/unsloth-DeepSeek-R1-Distill-Qwen-1.5B-unsloth-bnb-4bit/blob/main/unsloth_DeepSeek_R1_Distill_Qwen_1_5B_unsloth_bnb_4bit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Model repository on Hugging Face: https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-unsloth-bnb-4bit/tree/main

In [None]:
# Set up Git LFS before the repository is cloned in the next cell
# (presumably so LFS-tracked weight files are fetched as real files — confirm).
!git lfs install

In [None]:
# Clone the full model repository into the current working directory.
# NOTE(review): the loading cell further down passes the Hub id
# "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-unsloth-bnb-4bit" to from_pretrained,
# not this local folder, so the clone looks unused — confirm it is still needed
# before keeping the extra download.
!git clone https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-unsloth-bnb-4bit

In [None]:
# Step 1: Install dependencies.
# %pip (instead of !pip) installs into the environment of the running kernel.
# The extras spec is quoted so the shell never treats the brackets as a glob.
# TODO: pin exact versions once a known-good combination has been recorded.
%pip install --quiet "unsloth[colab-new]"
%pip install --quiet transformers accelerate bitsandbytes

In [None]:
# Step 2: Import necessary libraries
import torch
from unsloth import FastLanguageModel
from transformers import TextStreamer
import gc

In [None]:
# Step 3: Clear GPU memory
# Run the garbage collector first: objects that are only kept alive by
# reference cycles must be destroyed before the CUDA caching allocator can
# hand their memory back.
gc.collect()
if torch.cuda.is_available():
    # Release cached, currently unused blocks held by PyTorch's allocator.
    torch.cuda.empty_cache()

In [None]:
# Step 4: Load the model
# Settings consumed by the from_pretrained call in the next cell.
# max_seq_length is also read by the generation helpers to size prompt truncation.
max_seq_length = 2048  # Context length; other values are reportedly supported via internal RoPE scaling (per Unsloth — verify)
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage


In [None]:
# Load the model and its tokenizer by Hub repository id, using the settings
# (max_seq_length, dtype, load_in_4bit) defined in the previous cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-unsloth-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)


In [None]:
# Step 5: Set up FastLanguageModel for inference
# Called once on the loaded model, before any of the generation helpers below
# are used. NOTE(review): presumably enables Unsloth's optimized inference
# path — confirm against the installed Unsloth version.
FastLanguageModel.for_inference(model)

In [None]:
# Step 6: Create a text streamer for real-time output
# Shared streamer passed to model.generate by generate_response;
# skip_prompt=True keeps the prompt itself out of the streamed output.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

In [None]:
# Step 7: Define a function to generate responses
def generate_response(prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
    """
    Generate a response using the DeepSeek-R1-Distill model.

    The prompt is tokenized as given (no chat template is applied) and
    truncated so that prompt plus ``max_new_tokens`` fits in
    ``max_seq_length``. Generated tokens are streamed through
    ``text_streamer`` while they are produced, and the generated text is
    also returned.

    Args:
        prompt (str): Input text prompt
        max_new_tokens (int): Maximum number of tokens to generate; must be
            positive and smaller than ``max_seq_length``
        temperature (float): Sampling temperature. 0.0 (or None) selects
            greedy, deterministic decoding; larger values are more random
        top_p (float): Nucleus sampling threshold (unused for greedy decoding)

    Returns:
        str: Only the newly generated text, with special tokens removed and
        surrounding whitespace stripped.

    Raises:
        ValueError: If ``max_new_tokens`` leaves no room for the prompt.
    """
    prompt_budget = max_seq_length - max_new_tokens
    if max_new_tokens <= 0 or prompt_budget <= 0:
        raise ValueError(
            f"max_new_tokens must be between 1 and {max_seq_length - 1}, "
            f"got {max_new_tokens}"
        )

    # Tokenize the input and move it to wherever the model lives
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=prompt_budget,
    ).to(model.device)

    # generate() rejects temperature == 0 when sampling, so a non-positive
    # temperature switches to greedy decoding instead.
    use_sampling = temperature is not None and temperature > 0
    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=use_sampling,
        pad_token_id=tokenizer.eos_token_id,
        streamer=text_streamer,
        use_cache=True,
    )
    if use_sampling:
        generation_kwargs.update(temperature=temperature, top_p=top_p)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # The output sequence starts with the prompt tokens. Drop them by token
    # count: slicing the decoded string by len(prompt) breaks whenever the
    # prompt was truncated or does not round-trip exactly through the tokenizer.
    prompt_token_count = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_token_count:]

    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


In [None]:
# Step 8: Example usage
# Announce the smoke test and draw a 50-character separator line.
print("Model loaded successfully! Here's a test:", "=" * 50, sep="\n")


In [None]:
# Test prompt
test_prompt = "What is the capital of France?"

# Show the question and leave the cursor after the answer label.
print("Question: {}".format(test_prompt))
print("Answer: ", end="")


In [None]:
# Generate and display response
# The answer is streamed while it is generated; the text is also kept in `response`.
response = generate_response(prompt=test_prompt, max_new_tokens=256)

print("\n" + "=" * 50, "Model is ready for use!", sep="\n")


In [None]:
# Step 9: Interactive chat function
def chat():
    """
    Run an interactive question/answer loop on standard input.

    Each non-empty line is sent to ``generate_response`` on its own; earlier
    turns are not included in the prompt. The loop ends when the user types
    'quit', 'exit' or 'bye' (case-insensitive), or when input is interrupted
    (Ctrl-C / kernel interrupt) or exhausted (EOF).
    """
    print("\nStarting interactive chat. Type 'quit' to exit.")
    print("-" * 50)

    exit_words = {"quit", "exit", "bye"}

    while True:
        try:
            user_input = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Leave cleanly instead of surfacing a traceback.
            print("\nGoodbye!")
            break

        if user_input.lower() in exit_words:
            print("Goodbye!")
            break

        if not user_input:
            # Nothing typed: prompt again without calling the model.
            continue

        print("Assistant: ", end="", flush=True)
        # generate_response streams the answer as it is produced, so the
        # returned string is not needed here.
        generate_response(user_input, max_new_tokens=512)
        print()  # New line after response
chat()


In [None]:
# Step 10: Advanced generation function with custom parameters
def advanced_generate(prompt, max_new_tokens=512, temperature=0.7, top_p=0.9,
                      top_k=50, repetition_penalty=1.1):
    """
    Generate text with additional decoding controls (no streaming).

    The prompt is tokenized as given (no chat template is applied) and
    truncated so that prompt plus ``max_new_tokens`` fits in
    ``max_seq_length``.

    Args:
        prompt (str): Input text prompt
        max_new_tokens (int): Maximum number of tokens to generate; must be
            positive and smaller than ``max_seq_length``
        temperature (float): Sampling temperature. 0.0 (or None) selects
            greedy, deterministic decoding; larger values are more random
        top_p (float): Nucleus sampling threshold (unused for greedy decoding)
        top_k (int): Top-k sampling cutoff (unused for greedy decoding)
        repetition_penalty (float): Repetition penalty passed to ``generate``

    Returns:
        str: Only the newly generated text, with special tokens removed and
        surrounding whitespace stripped.

    Raises:
        ValueError: If ``max_new_tokens`` leaves no room for the prompt.
    """
    prompt_budget = max_seq_length - max_new_tokens
    if max_new_tokens <= 0 or prompt_budget <= 0:
        raise ValueError(
            f"max_new_tokens must be between 1 and {max_seq_length - 1}, "
            f"got {max_new_tokens}"
        )

    # Tokenize and move the tensors to wherever the model lives
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=prompt_budget,
    ).to(model.device)

    # generate() rejects temperature == 0 when sampling, so a non-positive
    # temperature switches to greedy decoding instead.
    use_sampling = temperature is not None and temperature > 0
    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
        do_sample=use_sampling,
        pad_token_id=tokenizer.eos_token_id,
        use_cache=True,
    )
    if use_sampling:
        generation_kwargs.update(temperature=temperature, top_p=top_p, top_k=top_k)

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # The output sequence starts with the prompt tokens. Drop them by token
    # count: slicing the decoded string by len(prompt) breaks whenever the
    # prompt was truncated or does not round-trip exactly through the tokenizer.
    prompt_token_count = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_token_count:]

    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
advanced_generate("what is the speed of light")