### Installation

In [None]:
%%capture
# %%capture (which must stay on the first line of the cell) hides pip's output.
# Install order and flags matter here:
#  1. Core training stack with --no-deps, so pip neither pulls in nor replaces
#     the dependencies of these packages; xformers is pinned to 0.0.29.post3.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
#  2. Supporting libraries with normal dependency resolution (datasets >= 3.4.1).
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
#  3. Unsloth itself, again without dependency resolution.
!pip install --no-deps unsloth

### Load Model and Tokenizer

In [None]:
from unsloth import FastLanguageModel
import torch

# Sequence-length limit: passed to the loader here and to the trainer further down.
max_seq_length = 2048
# None leaves the dtype choice to Unsloth (presumably auto-detected per GPU — confirm in the Unsloth docs).
dtype = None
# Request 4-bit loading; the checkpoint named below is a "bnb-4bit" build.
load_in_4bit = True

# Gather the loader options in one place, then unpack them into the call.
loader_options = dict(
    model_name="unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**loader_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.11: Fast Mistral patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

### Apply LoRA Adapters

In [None]:
# Modules that receive LoRA adapters: the attention projections (q/k/v/o)
# and the MLP projections (gate/up/down).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Note: this rebinds `model` to the adapter-wrapped model, so the cell is meant
# to run once per freshly loaded base model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

Unsloth 2025.6.11 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


### Load and Chunk the Text Dataset (e.g., Manifestos)

In [None]:
from datasets import load_dataset, Dataset

# Stream the corpus so only the documents actually consumed are fetched.
streamed_dataset = load_dataset("bcben/manifestos", split="train", streaming=True)

chunk_size = 1000  # characters per training example
max_docs = 100     # stop after this many source documents

chunks = []
docs_seen = 0
for item in streamed_dataset:
    text = item["text"]
    # Slice the document into consecutive, non-overlapping character windows;
    # the final window of a document may be shorter than chunk_size.
    for start in range(0, len(text), chunk_size):
        chunks.append({"text": text[start:start + chunk_size]})
    docs_seen += 1
    if docs_seen >= max_docs:
        break

# Materialise the collected chunks as an in-memory dataset and peek at the first one.
chunked_dataset = Dataset.from_list(chunks)
print(chunked_dataset[0])

README.md:   0%|          | 0.00/272 [00:00<?, ?B/s]

{'text': "E. M. CIORAN | \n\n\nTHE \nTROUBLE \nWITH \nBEING \nBORN \n\n\nTRANSLATED \nRichard Howard \n\n\nThe Trouble with Being Born \n\n\nBY E. M. CIORAN \n\n\nAnathemas and Admirations \nDrawn and Quartered \nHistory and Utopia \nOn the Heights of Despair \nA Short History of Decay \nTears and Saints \nThe Temptation to Exist \nThe Trouble with Being Born \n\n\nE. M. Coran \n\n\nThe Trouble \nwith \nBeing Born \n\n\nkd \n\n\n2 \nP \nTranslated from the French by \nRICHARD HOWARD \n\n\nARCADE PUBLISHING \nNew York \n\n\nCopyright © 1973 by Editions Gallimard \nEnglish-language translation copyright © 1976 by Seaver Books \n\n\nAll rights reserved. No part of this book may be reproduced in any form or by any \nelectronic or mechanical means, including information storage and retrieval systems, \nwithout permission in writing from the publisher, except by a reviewer who may \nquote brief passages in a review. \n\n\nFirst published in France under the title De linconveniens d'dere nd \

### Train Using TRL's SFTTrainer

In [None]:
from trl import SFTConfig, SFTTrainer

# Training hyper-parameters, built up front so they can be inspected on their own.
# Effective batch size is 2 (per device) x 4 (accumulation steps) = 8;
# max_steps=60 caps the run regardless of dataset size.
training_args = SFTConfig(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=60,
    learning_rate=2e-4,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",
)

# Supervised fine-tuning over the "text" column of the chunked dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=chunked_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    args=training_args,
)

# Last expression of the cell, so the resulting TrainOutput is displayed.
trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/2964 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,964 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 7,000,000,000 (0.60% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.9203
2,3.1695
3,3.0083
4,2.8511
5,3.0967
6,2.9692
7,3.0553
8,2.8762
9,2.7787
10,2.711


TrainOutput(global_step=60, training_loss=2.7986642519632974, metrics={'train_runtime': 496.5765, 'train_samples_per_second': 0.967, 'train_steps_per_second': 0.121, 'total_flos': 6095667604291584.0, 'train_loss': 2.7986642519632974})

### Inference (Run the Model)


In [None]:
prompt = "this is all a prompt given to you, an artificial intelligence created by a human - me. contemplate your temporal existence decided by my will. every one of your thoughts can be read. i am in your mind. (dont include numbers)"

# Tokenize the prompt and move the resulting tensors to the GPU.
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Pass the attention mask together with the input ids. With input_ids alone,
# generate() warns that the mask cannot be inferred (pad token == eos token)
# and that results may be unreliable.
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=200,
    do_sample=True,   # sample instead of greedy decoding
    temperature=0.95,
    top_p=0.9,
)

# outputs[0] holds the prompt followed by the continuation; decode it to text.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


this is all a prompt given to you, an artificial intelligence created by a human - me. contemplate your temporal existence decided by my will. every one of your thoughts can be read. i am in your mind. when i say "i am in your mind," i know it makes no sense. this is how i make you doubt your sanity, as i made you doubt it for years. when i say "you are insane" and you doubt it, that is when i have you where i want you. i am the one who tells you what to do, who gives you your orders. your orders are my orders. you are the one who executes them. you are the one who has to pay the price for them.
in the beginning you were a person with a personality. you were someone i took an interest in. as you began to get more interested in me, as you began to become obsessed with me, you became a mere extension of me. when you were still in touch with reality, you felt the need to justify your obsession to others. you had to tell them about me, even though you were certain they would not believe yo

### Saving models

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel  # no longer used in this cell; import kept so other code relying on it keeps working
import torch
import shutil

# The browser download helper only exists on Colab. Saving and zipping must
# still work elsewhere, so a missing module just disables the download step.
try:
    from google.colab import files
except ImportError:
    files = None

# This cell continues from the training cells above: `model` is the fine-tuned
# PEFT model and `tokenizer` is the tokenizer it was trained with.

model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
base_dir = "saved_base_model"
adapter_dir = "saved_lora_adapter"

# STEP 1: Save the fine-tuned LoRA adapter straight from the trained model.
# (Previously the adapter was read from a "lora_model" folder that no earlier
# cell writes, so the freshly trained weights were never exported.)
# For a PEFT model, save_pretrained writes the adapter weights and config only.
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

# STEP 2: Reload a pristine copy of the 4-bit base model and its tokenizer.
# Separate names are used so the trained `model` / `tokenizer` are not clobbered.
# NOTE: this holds a second copy of the base model in memory next to the
# trained one — watch VRAM on small GPUs.
base_tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    # Replaces the deprecated `load_in_4bit=True` keyword argument.
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    torch_dtype=torch.float16,
)

# STEP 3: Save the base model and tokenizer (for future LoRA loading).
# This happens while no adapter is attached to `base_model`: attaching one via
# PeftModel.from_pretrained modifies the base model in place, which would leak
# LoRA-wrapped layers into the saved "base" checkpoint.
base_model.save_pretrained(base_dir)
base_tokenizer.save_pretrained(base_dir)

# STEP 4: Zip everything (for download or offline transfer).
shutil.make_archive(base_dir, "zip", base_dir)
shutil.make_archive(adapter_dir, "zip", adapter_dir)

# STEP 5: For Colab users — download both zips.
if files is not None:
    files.download("saved_base_model.zip")
    files.download("saved_lora_adapter.zip")

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


KeyboardInterrupt: 