In [None]:
pip install -U bitsandbytes

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from sklearn.model_selection import train_test_split
import json
import bitsandbytes
import os
os.environ["WANDB_DISABLED"] = "true"


# Step 1: Load Dataset
def load_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    dataset = [{"input_text": d['Context'], "target_text": d['Response']} for d in data]
    return dataset

data = load_data('/kaggle/input/combined-dataset/combined_dataset.json')

In [None]:
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_dict({
    "input_text": [d['input_text'] for d in train_data],
    "target_text": [d['target_text'] for d in train_data]
})

val_dataset = Dataset.from_dict({
    "input_text": [d['input_text'] for d in val_data],
    "target_text": [d['target_text'] for d in val_data]
})

print(f"✅ Data Loaded: {len(train_dataset)} Training Samples, {len(val_dataset)} Validation Samples")


✅ Data Loaded: 3160 Training Samples, 352 Validation Samples


In [3]:
!pip install transformers huggingface_hub




In [None]:
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Perform login
hf_token = "token"
login(token=hf_token)
print("✅ Successfully logged in to Hugging Face")

# Configure 4-bit quantization using BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # Use float16 for better GPU performance
    bnb_4bit_use_double_quant=True,        # Enables double quantization for memory savings
    bnb_4bit_quant_type="nf4"              # Normal Float 4 (NF4) for optimal precision
)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('meta/llama3.2', token=hf_token)
model = AutoModelForCausalLM.from_pretrained('meta/llama3.2', device_map='auto', quantization_config=quantization_config, token=hf_token)

print("✅ Model and tokenizer loaded successfully")


✅ Successfully logged in to Hugging Face


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

✅ Model and tokenizer loaded successfully


In [5]:
from peft import LoraConfig, get_peft_model

# Configure and apply LoRA
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


trainable params: 1,843,200 || all params: 2,508,015,616 || trainable%: 0.0735


In [None]:
def preprocess_function(examples):
    inputs = tokenizer(examples['input_text'], max_length=256, truncation=True, padding="max_length")
    targets = tokenizer(examples['target_text'], max_length=256, truncation=True, padding="max_length")
    inputs['labels'] = targets['input_ids']
    return inputs


# Apply tokenization using map
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

print("✅ Tokenization complete!")

Map:   0%|          | 0/3160 [00:00<?, ? examples/s]

Map:   0%|          | 0/352 [00:00<?, ? examples/s]

✅ Tokenization complete!


In [13]:
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print("Model loaded:", model is not None)
print("Tokenizer loaded:", tokenizer is not None)


Training dataset size: 3160
Validation dataset size: 352
Model loaded: True
Tokenizer loaded: True


In [16]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    load_best_model_at_end=True,
    logging_steps=100,
    report_to="none"  # Disable wandb
)


In [17]:
from transformers import DataCollatorForSeq2Seq

# Use a data collator to batch data correctly
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)


In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,9.5743,9.733069
2,9.1373,9.168477
3,9.0457,8.987385


TrainOutput(global_step=2370, training_loss=9.649729488066983, metrics={'train_runtime': 4238.5693, 'train_samples_per_second': 2.237, 'train_steps_per_second': 0.559, 'total_flos': 2.888561326030848e+16, 'train_loss': 9.649729488066983, 'epoch': 3.0})

In [None]:
# Step 7: Save Model
trainer.save_model('./fine_tuned_llama3.2')
tokenizer.save_pretrained('./fine_tuned_llama3.2')
print("✅ Model saved successfully")

✅ Model saved successfully


In [21]:
trainer.save_model("./stella_model")
tokenizer.save_pretrained("./stella_model")


('./stella_model/tokenizer_config.json',
 './stella_model/special_tokens_map.json',
 './stella_model/tokenizer.model',
 './stella_model/added_tokens.json',
 './stella_model/tokenizer.json')

In [22]:
!tar -czvf stella_model.tar.gz ./stella_model


./stella_model/
./stella_model/adapter_model.safetensors
./stella_model/training_args.bin
./stella_model/tokenizer.json
./stella_model/adapter_config.json
./stella_model/tokenizer_config.json
./stella_model/special_tokens_map.json
./stella_model/tokenizer.model
./stella_model/README.md
