In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly git+https://github.com/unslothai/unsloth-zoo.git

In [2]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from datasets import Dataset
import torch
import json

# Options for loading the base checkpoint. The checkpoint name and
# load_in_4bit both select the 4-bit variant of Llama 3.2 1B Instruct.
load_options = dict(
    model_name="unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    max_seq_length=1024,  # Reduced for stability
    dtype=None,  # presumably lets Unsloth pick the dtype itself — TODO confirm
    load_in_4bit=True,
)

# Returns the model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-07-04 21:54:09.230639: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751666049.450540      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751666049.508137      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.12: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [3]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",  # attention (QKV + O) projections
    "gate_proj", "up_proj", "down_proj",     # MLP projections
]

# Wrap the loaded base model with LoRA adapters; `model` now refers to the
# PEFT-wrapped model for the rest of the notebook.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)


Unsloth 2025.6.12 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [5]:
# Every key maps to itself, i.e. no renaming of fields or roles is requested.
# NOTE(review): presumably the dataset already uses "role"/"content" fields
# with "user"/"assistant" roles — verify against the JSON file.
chat_field_mapping = {
    "role": "role",
    "content": "content",
    "user": "user",
    "assistant": "assistant",
}

# Rebind `tokenizer` to the version configured with the "llama-3.1" template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
    mapping=chat_field_mapping,
    map_eos_token=True,
)

def formatting_prompts_func(examples):
    """Render every conversation of a batch into one training string.

    Args:
        examples: Batch mapping whose "messages" entry is a sequence of
            conversations. Each conversation is passed unchanged to
            ``tokenizer.apply_chat_template`` (the module-level tokenizer).

    Returns:
        A dict with the single key "text" holding the rendered strings, in
        the same order as the input conversations.
    """
    # tokenize=False keeps the result as text; add_generation_prompt=False
    # because these are complete training conversations, not inference prompts.
    rendered = [
        tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )
        for conversation in examples["messages"]
    ]
    return {"text": rendered}

In [6]:
# Read the podcast dataset file in one go and parse it as JSON.
with open(
    "/kaggle/input/pro-pod/professional_podcast_dataset.json",
    mode="r",
    encoding="utf-8",
) as dataset_file:
    data = json.loads(dataset_file.read())

In [7]:
# Use subset for initial training: keep only the first 16 conversations.
initial_conversations = data[:16]

# Build the dataset, then add the "text" column by rendering each
# conversation with formatting_prompts_func (called on whole batches).
dataset = Dataset.from_list(initial_conversations).map(
    formatting_prompts_func,
    batched=True,
)


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Choose the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# NOTE(review): the run log for this cell reports "Num Epochs = 50" for the
# 16 examples, so max_steps=50 revisits the same data many times.
training_args = TrainingArguments(
    # --- batching ---
    per_device_train_batch_size=1,   # Very conservative
    gradient_accumulation_steps=16,  # Maintain effective batch size
    # --- optimisation schedule ---
    max_steps=50,  # Shorter training for testing
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
    fp16=not use_bf16,
    bf16=use_bf16,
    seed=3407,
    # --- logging and checkpoints ---
    logging_steps=2,
    report_to="none",
    output_dir="outputs",
    save_steps=25,
    save_total_limit=2,
    save_safetensors=True,
    eval_strategy="no",
    # --- data loading (additional stability settings) ---
    dataloader_drop_last=True,
    dataloader_num_workers=0,
    remove_unused_columns=False,
)

# Supervised fine-tuning on the pre-rendered "text" column, without packing.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=1024,
    dataset_num_proc=1,  # Single process for stability
    packing=False,
    args=training_args,
)

# Run training; the returned statistics are kept for later inspection.
trainer_stats = trainer.train()

Unsloth: Tokenizing ["text"]:   0%|          | 0/16 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 16 | Num Epochs = 50 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 16 x 1) = 32
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
2,2.7601
4,2.5977
6,2.1664
8,1.7151
10,1.4776
12,1.2422
14,1.034
16,0.8701
18,0.7342
20,0.6125


In [10]:
# Prepare the fine-tuned model for generation via Unsloth's for_inference helper.
FastLanguageModel.for_inference(model)

# Host-style questions used to spot-check the fine-tuned model.
test_prompts = [
    "Welcome to our podcast! Today we're discussing quantum computing with Dr. Sarah Chen. What initially drew you to quantum computing?",
    "What's the most exciting breakthrough in artificial intelligence recently?",
    "How do you see renewable energy transforming our future?"
]

for i, prompt in enumerate(test_prompts, 1):
    print(f"\n🔹 TEST {i}:")
    print(f"HOST: {prompt}")

    # Each prompt is sent as a single-turn conversation from the user.
    messages = [{"role": "user", "content": prompt}]

    # return_dict=True returns the attention mask alongside the input ids.
    # pad_token_id is set to the EOS id below, so generate() cannot derive the
    # mask from the padding token; passing it explicitly removes the ambiguity.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt",
        return_dict = True,
    ).to("cuda")
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            input_ids = inputs["input_ids"],
            attention_mask = inputs["attention_mask"],
            max_new_tokens = 100,
            use_cache = True,
            temperature = 0.8,
            do_sample = True,
            pad_token_id = tokenizer.eos_token_id,
            eos_token_id = tokenizer.eos_token_id,
        )

    # Drop the prompt tokens so only the newly generated reply is decoded.
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    print(f"GUEST: {response}")
    print("-" * 50)


🔹 TEST 1:
HOST: Welcome to our podcast! Today we're discussing quantum computing with Dr. Sarah Chen. What initially drew you to quantum computing?
GUEST: Thank you for having me! I'm excited to discuss quantum computing and share what's happening at the cutting edge.
--------------------------------------------------

🔹 TEST 2:
HOST: What's the most exciting breakthrough in artificial intelligence recently?
GUEST: Recent developments in artificial intelligence suggest significant advancements, particularly in neural networks and reasoning capabilities.
--------------------------------------------------

🔹 TEST 3:
HOST: How do you see renewable energy transforming our future?
GUEST: Renewable energy is transforming our future by providing clean, sustainable, and infinite energy sources.
--------------------------------------------------


In [11]:
# Save the trained model and its tokenizer side by side in one directory.
final_model_dir = "podcast_model_final"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('podcast_model_final/tokenizer_config.json',
 'podcast_model_final/special_tokens_map.json',
 'podcast_model_final/tokenizer.json')

In [17]:
# First, install huggingface_hub if not already installed
!pip install huggingface_hub

# Login to Hugging Face (you'll need to enter your token)
from huggingface_hub import login
login()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
# Save and push to Hugging Face Hub
# The repo id is defined once so the model and tokenizer uploads cannot drift apart.
hub_repo_id = "navth/podcast-llama3-2-1b-finetuned"  # Replace with your desired model name

model.push_to_hub(
    hub_repo_id,
    token=True,  # Uses the token from login()
    private=False,  # Set to True if you want a private repo
    safe_serialization=True,
)

# The tokenizer goes to the same repo as the model.
tokenizer.push_to_hub(
    hub_repo_id,
    token=True,
    private=False,
)

README.md:   0%|          | 0.00/594 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Saved model to https://huggingface.co/navth/podcast-llama3-2-1b-finetuned


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [12]:
# Load base model for testing
from unsloth import FastLanguageModel
import torch

# Load a fresh copy of the original checkpoint as a baseline to compare the
# fine-tuned model against. It is kept in separate variables so the fine-tuned
# `model` / `tokenizer` are not overwritten.
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    max_seq_length = 1024,
    dtype = None,
    load_in_4bit = True,
)
# Prepare the baseline model for generation via Unsloth's for_inference helper.
FastLanguageModel.for_inference(base_model)

# Same host-style questions that were used for the fine-tuned model.
test_prompts = [
    "Welcome to our podcast! Today we're discussing quantum computing with Dr. Sarah Chen. What initially drew you to quantum computing?",
    "What's the most exciting breakthrough in artificial intelligence recently?",
    "How do you see renewable energy transforming our future?"
]

print("📝 BASE MODEL RESPONSES ONLY\n" + "="*60)

for i, prompt in enumerate(test_prompts, 1):
    print(f"\n🔹 TEST {i}:")
    print(f"HOST: {prompt}")

    # Each prompt is sent as a single-turn conversation from the user.
    messages = [{"role": "user", "content": prompt}]

    # return_dict=True returns the attention mask alongside the input ids.
    # pad_token_id is set to the EOS id below, so generate() cannot derive the
    # mask from the padding token; passing it explicitly removes the ambiguity.
    inputs = base_tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt",
        return_dict = True,
    ).to("cuda")
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = base_model.generate(
            input_ids = inputs["input_ids"],
            attention_mask = inputs["attention_mask"],
            max_new_tokens = 100,
            use_cache = True,
            temperature = 0.8,
            do_sample = True,
            pad_token_id = base_tokenizer.eos_token_id,
            eos_token_id = base_tokenizer.eos_token_id,
        )

    # Drop the prompt tokens so only the newly generated reply is decoded.
    response = base_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    print(f"BASE MODEL: {response}")
    print("-" * 60)

==((====))==  Unsloth 2025.6.12: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
📝 BASE MODEL RESPONSES ONLY

🔹 TEST 1:
HOST: Welcome to our podcast! Today we're discussing quantum computing with Dr. Sarah Chen. What initially drew you to quantum computing?
BASE MODEL: Thank you for having me on the show. I'm excited to be here.

Dr. Sarah Chen: Welcome back to the podcast. I'm Dr. Sarah Chen, and I'm thrilled to be here with our guest, Dr. Rachel Kim.

Dr. Sarah Chen: Dr. Rachel, it's great to have you on the show. Can you start by telling us what initially drew you to quantum computing?

Dr. Rachel Kim: Well, I've been f