### Installation

In [None]:
%%capture
# Install Unsloth and its dependencies. The %%capture magic above hides pip's output
# and has to remain the first line of the cell.
import os
# Colab is detected by checking whether any environment variable name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # --no-deps installs only the packages listed, without resolving their dependencies.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

### Unsloth

In [None]:
# Load the base model and its tokenizer through Unsloth. The three settings below are
# notebook-wide: later cells reuse `max_seq_length` and `load_in_4bit`.
from unsloth import FastLanguageModel
import torch
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# NOTE: this list is for reference only; no cell in this notebook reads `fourbit_models`.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",

    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit" # NEW! Llama 3.3 70B!
]

# The checkpoint loaded here has no "-bnb-4bit" suffix and load_in_4bit is False,
# so this run does not use any of the pre-quantized models listed above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Wrap the base model with LoRA adapters (rank 16, alpha 16) on the attention
# projections (q/k/v/o) and the MLP projections (gate/up/down).
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
import json
from datasets import Dataset


def _as_items(value):
    """Normalize a case field to a list of items.

    A missing field or explicit null becomes an empty list, and a bare string is
    treated as a single item so it is not bulleted one character at a time.
    """
    if value is None:
        return []
    if isinstance(value, str):
        return [value] if value.strip() else []
    return list(value)


def _bulleted_section(title, value):
    """Return "<title>:" followed by one "- item" line per item, or "" if there are none."""
    items = _as_items(value)
    if not items:
        return ""
    return f"{title}:\n" + "\n".join(f"- {item}" for item in items)


def case_to_conversation(case):
    """Convert one flattened maintenance case into a two-turn conversation.

    The user turn is the technician's input; the assistant turn concatenates the
    recommended fixes, potential root causes and references as bulleted sections.

    Returns:
        A {"conversations": [...]} record, or None when the question or the answer
        would be empty (an empty turn carries no useful training signal).
    """
    # `or ""` also covers an explicit null, which `.get(key, "")` would pass through.
    question = (case.get("technician_input") or "").strip()

    sections = [
        _bulleted_section("Recommended fixes", case.get("recommended_fixes")),
        _bulleted_section("Potential root causes", case.get("potential_root_causes")),
        _bulleted_section("References", case.get("references")),
    ]
    answer = "\n\n".join(section for section in sections if section)

    if not question or not answer:
        return None

    return {
        "conversations": [
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        ]
    }


# Load flattened maintenance cases JSON
with open("/content/maintenance_cases_flat.json", "r", encoding="utf-8") as f:
    cases = json.load(f)

# Keep only the cases that yield a complete question/answer pair.
formatted_data = [
    record
    for record in (case_to_conversation(case) for case in cases)
    if record is not None
]
print(f"Kept {len(formatted_data)} of {len(cases)} cases.")

# Create a Hugging Face Dataset
dataset = Dataset.from_list(formatted_data)


In [None]:
from unsloth.chat_templates import get_chat_template

# Switch the tokenizer to the "llama-3.1" chat template before rendering any text.
tokenizer = get_chat_template(tokenizer, chat_template = "llama-3.1")


def formatting_prompts_func(examples):
    """Render each conversation of a batch into a single chat-formatted string.

    Args:
        examples: a batch with a "conversations" column (one message list per row).

    Returns:
        A dict with one "text" entry per conversation, rendered without tokenizing
        and without a trailing generation prompt.
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize = False,
                add_generation_prompt = False,
            )
        )
    return {"text": rendered}


In [None]:
from unsloth.chat_templates import standardize_sharegpt
# NOTE(review): the conversations built earlier already use role/content keys, so this
# call is presumably a no-op normalization here; confirm against the Unsloth docs.
dataset = standardize_sharegpt(dataset)
# Add the "text" column by running formatting_prompts_func over the dataset in batches.
dataset = dataset.map(formatting_prompts_func, batched = True,)

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/2083 [00:00<?, ? examples/s]

Map:   0%|          | 0/2083 [00:00<?, ? examples/s]

We look at how the conversations are structured for item 5:

In [None]:
# Peek at one example to check the role/content structure of its conversation.
dataset[5]["conversations"]

[{'content': "Quel est l'article qui définit l'impôt sur les sociétés ?",
  'role': 'user'},
 {'content': "L'article premier définit l'impôt sur les sociétés en précisant qu'il s'applique sur l'ensemble des produits, bénéfices et revenus des entités visées.",
  'role': 'assistant'}]

In [None]:
# Inspect the chat-template-rendered "text" column for one example.
dataset[1000]["text"]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nDans quel délai une entreprise éligible au statut de contribuable catégorisé est-elle invitée à fournir des informations complémentaires?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nUne entreprise éligible est invitée à fournir des informations complémentaires dans un délai de six (6) mois à compter de la date à laquelle elle a été invitée, et ce dans les deux mois suivant le dépôt de sa demande [77].<|eot_id|>'

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). The cell below runs a full training of `num_train_epochs = 10` with `max_steps = -1` (no step limit). For a quick test run, set `max_steps = 60` instead, which caps training at 60 steps regardless of the epoch count. We also support TRL's `DPOTrainer`!

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the rendered "text" column of `dataset`.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        # 4 examples per step x 4 accumulation steps = 16 examples per optimizer update.
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 10, # Ten full passes over the dataset.
        max_steps = -1, # -1 = no step cap, so num_train_epochs decides the run length.
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/2083 [00:00<?, ? examples/s]

We also use Unsloth's `train_on_responses_only` method to only train on the assistant outputs and ignore the loss on the user's inputs.

In [None]:
from unsloth.chat_templates import train_on_responses_only
# The two markers are the Llama 3.1 header strings that open a user turn and an
# assistant turn; they tell the helper where instructions end and responses begin.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map (num_proc=2):   0%|          | 0/2083 [00:00<?, ? examples/s]

We verify masking is actually done:

In [None]:
# Decode the token ids of one tokenized training example back to text.
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

"<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQuel est l'article qui définit l'impôt sur les sociétés?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nL'article premier définit l'impôt sur les sociétés en précisant qu'il s'applique sur l'ensemble des produits, bénéfices et revenus des entités visées.<|eot_id|>"

In [None]:
# Show which tokens contribute to the loss: every label equal to -100 is replaced
# by the id of a space token, so only the remaining labels decode to visible text.
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
example_labels = trainer.train_dataset[5]["labels"]
visible_label_ids = [space if label == -100 else label for label in example_labels]
tokenizer.decode(visible_label_ids)

"                                                      L'article premier définit l'impôt sur les sociétés en précisant qu'il s'applique sur l'ensemble des produits, bénéfices et revenus des entités visées.<|eot_id|>"

We can see the System and Instruction prompts are successfully masked!

In [None]:
# @title Show current memory stats
# Record the memory baseline before training; the final-stats cell subtracts
# `start_gpu_memory` from the peak to estimate what training itself used.
gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes = torch.cuda.max_memory_reserved()
start_gpu_memory = round(reserved_bytes / (1024 ** 3), 3)
max_memory = round(gpu_stats.total_memory / (1024 ** 3), 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
6.779 GB of memory reserved.


In [None]:
# Run fine-tuning. `trainer_stats` is only assigned if train() returns, so an
# interrupted run leaves it undefined (or stale from an earlier run).
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,083 | Num Epochs = 10 | Total steps = 1,300
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 97,255,424/3,310,005,248 (2.94% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,0.9363
2,1.0497
3,0.9354
4,0.8389
5,0.8208
6,1.009
7,0.8127
8,0.787
9,1.1682
10,1.1376


KeyboardInterrupt: 

In [None]:
# @title Show final memory and time stats
# Peak reserved GPU memory so far, in GB.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Total device memory is recomputed here so the percentages do not depend on
# the "Show current memory stats" cell having been run in this kernel session.
max_memory = round(torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024, 3)
used_percentage = round(used_memory / max_memory * 100, 3)

# `start_gpu_memory` comes from the "Show current memory stats" cell; without it
# the training-only delta cannot be computed.
has_baseline = "start_gpu_memory" in globals()
if has_baseline:
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# `trainer_stats` only exists once trainer.train() has run to completion.
if "trainer_stats" in globals():
    train_runtime = trainer_stats.metrics['train_runtime']
    print(f"{train_runtime} seconds used for training.")
    print(f"{round(train_runtime/60, 2)} minutes used for training.")
else:
    print("Training time unavailable: run trainer.train() to completion first.")

print(f"Peak reserved memory = {used_memory} GB.")
if has_baseline:
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
if has_baseline:
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
else:
    print("Training-only memory unavailable: run the 'Show current memory stats' cell before training.")

NameError: name 'start_gpu_memory' is not defined

<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Unsloth_Studio.ipynb)**

We use `min_p = 0.1` and `temperature = 1.5`. Read this [Tweet](https://x.com/menhguin/status/1826132708508213629) for more information on why.

In [None]:
from unsloth.chat_templates import get_chat_template

# Re-apply the Llama 3.1 chat template and switch the model to inference mode.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model)

messages = [
    {"role": "user", "content": "Expliquez les règles régissant la taxation des revenus versés aux entités non résidentes."},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

# The prompt is a single unpadded sequence, so every position is a real token.
# Passing the mask explicitly avoids the "attention mask is not set" warning that
# generate() emits when the pad token equals the eos token.
attention_mask = torch.ones_like(inputs)

outputs = model.generate(input_ids = inputs, attention_mask = attention_mask,
                         max_new_tokens = 64, use_cache = True,
                         temperature = 1.5, min_p = 0.1)
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nExpliquez les règles régissant la taxation des revenus versés aux entités non résidentes.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nLe texte précise les règles régissant la taxation des revenus versés aux entités non résidentes.<|eot_id|>']

You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
from transformers import TextStreamer

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Une entreprise marocaine a réalisé un bénéfice imposable de 10 000 000 MAD. Elle a été introduite en bourse par augmentation de capital d'au moins 20 % (avec abandon du droit préférentiel), ce qui lui permet de bénéficier d'une réduction d'impôt de 50 % sur l'impôt sur les sociétés pendant trois ans, comme prévu dans le CGI. En supposant que le taux normal de l'impôt sur les sociétés est de 30 %, calculez Le montant de l'impôt initial sans réduction, Le montant de la réduction d'impôt accordée et Le montant net d'impôt à payer après application de la réduction."},

    #{"role": "user", "content": "Discutez des obligations fiscales des agents commissionnaires en matière de TVA."},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# Single unpadded prompt: an all-ones mask marks every position as a real token and
# avoids the "attention mask is not set" warning from generate().
attention_mask = torch.ones_like(inputs)

# skip_prompt = True streams only the newly generated tokens, not the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, attention_mask = attention_mask,
                   streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

Le montant de l'impôt initial sans réduction est de 30% de 10 000 000 MAD, ce qui équivaut à 3000000 MAD.

La réduction d'impôt accordée est de 50% sur l'impôt sur les sociétés pendant trois ans. Le montant de la réduction d'impôt accordée est donc de 50% de 3000000 MAD, ce qui équivaut à 1500000 MAD.

Après application de la réduction, le montant net d'impôt à payer est de 3000000


In [None]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): no later cell visible in this notebook reads from or writes to
# /content/drive; confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
import os

# Never hardcode the Hugging Face token in the notebook: read it from the HF_TOKEN
# environment variable. If it is unset, None is passed and the upload relies on a
# previously stored login (e.g. huggingface_hub.login()).
hf_token = os.environ.get("HF_TOKEN")

model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")
model.push_to_hub("YassirFr/french_tax_llama3.2_3B_lora_2", token = hf_token) # Online saving
tokenizer.push_to_hub("YassirFr/french_tax_llama3.2_3B_lora_2", token = hf_token) # Online saving

README.md:   0%|          | 0.00/579 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/389M [00:00<?, ?B/s]

Saved model to https://huggingface.co/YassirFr/french_tax_llama3.2_3B_lora_2


No files have been modified since last commit. Skipping to prevent empty commit.


Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
import json

from transformers import TextStreamer

# Independent test questions; each one is sent to the model as its own single-turn chat.
messages = [
    {"role": "user", "content": "Quelles différences fondamentales distinguent l'exonération permanente de l'exonération temporaire dans le CGI ?"},
    {"role": "user", "content": "Comment le Code Général des Impôts définit-il le résultat fiscal, et quels ajustements comptables peuvent être réalisés pour le déterminer ?"},
    {"role": "user", "content": "Expliquez le concept de 'retour sur investissement fiscal' tel qu'il se traduit par les mesures de réduction d'impôt prévues par le CGI."},
    {"role": "user", "content": "Quelles obligations doivent remplir les sociétés pour bénéficier des réductions d'impôt liées à leur introduction en bourse ?"},
    {"role": "user", "content": "Comment le CGI intègre-t-il les subventions d'investissement dans le calcul de la base imposable des entreprises ?"},
    {"role": "user", "content": "Quels mécanismes de contrôle et de contentieux prévoit le CGI en cas de litige fiscal ?"},
    {"role": "user", "content": "Dans quelles conditions les revenus de capitaux mobiliers bénéficient-ils d'un abattement de 100%, et quelles exceptions s'appliquent ?"},
    {"role": "user", "content": "Quelles sont les conséquences fiscales pour une entreprise qui ne respecte pas ses obligations déclaratives en matière de TVA ?"},
    {"role": "user", "content": "Comment le CGI encadre-t-il la fiscalisation des opérations de cession d'actifs, notamment dans le cadre d'une vente à réméré ?"},
    {"role": "user", "content": "Quel est le rôle de la comptabilité séparée pour les fonds gérés par des organismes publics ou privés selon le CGI ?"}
]

answers = []

# Run inference one question at a time and keep each answer.
for message in messages:
    single_input = tokenizer.apply_chat_template(
        [message],
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt"
    ).to("cuda")

    text_streamer = TextStreamer(tokenizer, skip_prompt=True)

    output = model.generate(
        input_ids=single_input,
        # Single unpadded prompt: every position is a real token.
        attention_mask=torch.ones_like(single_input),
        streamer=text_streamer,
        max_new_tokens=128,
        use_cache=True,
        temperature=1.5,
        min_p=0.1
    )

    # generate() returns the prompt followed by the new tokens. Decode only the
    # tokens after the prompt so "answer" does not repeat the system prompt and
    # the question.
    prompt_length = single_input.shape[-1]
    answer_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    answers.append({
        "question": message["content"],
        "answer": answer_text
    })

# Save the answers to 'test.json'
with open("test.json", "w", encoding="utf-8") as f:
    json.dump(answers, f, ensure_ascii=False, indent=4)

# Display what was written to 'test.json'
print(json.dumps(answers, ensure_ascii=False, indent=4))


L'exonération permanente, telle que prévue au titre I du CGI, est applicable dans les cas où la réduction de la base imposable correspond à un revenu imposable non entièrement déduitable ou déclaré. Par contre, l'exonération temporaire, à laquelle les articles 18 et 26 du CGI prédisent une réduction pendant une période limitée et sous réserve de certains conditions, ne peut s'appliquer en cas de revenus ou produits à réduire ou déclarer et de certains manquements au titre de l'impôt.
Le résultat fiscal est défini comme l'excédent des produits sur ceux des charges pour une période comptable, avec des ajustements possibles sur la valeur des créances et dettes, les différences entre la valeur comptable et la valeur vénale des actifs, ainsi que d'éventuels ajustements de capitalisation ou d'amortissement. Ces ajustements permettent de s'assurer que le résultat fiscal reflète fidèlement la situation économique réelle de l'entreprise.<|eot_id|>
Le retour sur investissement fiscal est défini 

You can also use Hugging Face PEFT's `AutoPeftModelForCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
# Disabled by default (`if False`): alternative loading path through PEFT instead of
# Unsloth. It reads the adapters saved locally under "lora_model".
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Every line below is disabled (`if False`); flip one to True to run that export.
# "hf/model" is a placeholder repo id, and the empty `token` must be replaced with
# a real Hugging Face token (ideally read from the environment, not pasted here).

# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)

In [None]:
import os

# Never hardcode the Hugging Face token in the notebook: read it from the HF_TOKEN
# environment variable. If it is unset, None is passed and the upload relies on a
# previously stored login (e.g. huggingface_hub.login()).
hf_token = os.environ.get("HF_TOKEN")

# The `if False` lines are disabled examples; flip one to True to run that export.

# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = hf_token)

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = hf_token)

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("YassirFr/french_tax_llama3.2_3B_Q4_KM", tokenizer, quantization_method = "q4_k_m", token = hf_token)

# Active export: convert to both q4_k_m and q5_k_m and upload them to one repo.
if True:
    model.push_to_hub_gguf(
        "YassirFr/french_tax_llama3.2_quantized",
        tokenizer,
        quantization_method = ["q4_k_m", "q5_k_m",],
        token = hf_token,
    )

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 6.4G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 2.24 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:01<00:00, 19.22it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving YassirFr/french_tax_llama3.2_quantized/pytorch_model-00001-of-00002.bin...
Unsloth: Saving YassirFr/french_tax_llama3.2_quantized/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m', 'q5_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at YassirFr/french_tax_llama3.2_quantized into f16 GGUF format.
The output location will be /content/YassirFr/french_tax_llama3.2_quantized/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: french_tax_llama3.2_quantized
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'pyto

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/YassirFr/french_tax_llama3.2_quantized
Unsloth: Uploading GGUF to Huggingface Hub...


  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q5_K_M.gguf:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/YassirFr/french_tax_llama3.2_quantized


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


Saved Ollama Modelfile to https://huggingface.co/YassirFr/french_tax_llama3.2_quantized


# Inference

In [None]:
!pip install llama-cpp

[31mERROR: Could not find a version that satisfies the requirement llama-cpp (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for llama-cpp[0m[31m
[0m

In [None]:
from llama_cpp import Llama

# Fetch the Q5_K_M GGUF file from the Hub repo and load it with llama.cpp.
# n_gpu_layers=-1 requests offloading all layers, with GPU 0 as the main device.
llm = Llama.from_pretrained(
    repo_id="YassirFr/french_tax_llama3.2_quantized",
    filename="unsloth.Q5_K_M.gguf",
    n_gpu_layers=-1,
    main_gpu=0,
)

ModuleNotFoundError: No module named 'llama_cpp'

In [None]:
# Question to ask the fine-tuned model.
prompt = (
"Quels sont les revenus des capitaux mobiliers?"
)

# The model was fine-tuned on chat-formatted conversations, so query it through the
# chat-completion API: llama-cpp-python then wraps the question in the model's chat
# template instead of feeding it as raw text to continue (which makes the model
# ramble past the end of its answer).
output = llm.create_chat_completion(
    messages=[{"role": "user", "content": prompt}],
    max_tokens=4096,
)
response_text = output["choices"][0]["message"]["content"]

print("Assistant:", response_text)


llama_perf_context_print:        load time =    4123.37 ms
llama_perf_context_print: prompt eval time =    4117.51 ms /    14 tokens (  294.11 ms per token,     3.40 tokens per second)
llama_perf_context_print:        eval time =  148502.71 ms /   301 runs   (  493.36 ms per token,     2.03 tokens per second)
llama_perf_context_print:       total time =  153183.53 ms /   315 tokens


Assistant:  
Les capitaux mobiliers sont soumis à la taxe sur les sociétés au titre de l’impôt sur les sociétés et aux droits d’enregistrement et d’immatriculation prévus par les textes législatifs et réglementaires en vigueur. 

Références Utilisées : (Article 8 du PDF, Extrait de Contexte 1) Extrait de Contexte 1 : Les capitaux mobiliers, qu'ils soient inscrits ou non à la cote, sont soumis à la taxe sur les sociétés au titre de l’impôt sur les sociétés et aux droits d’enregistrement et d'immatriculation prévus par les textes législatifs et réglementaires en vigueur. Source : Article 8 du PDF. (Article 8 du PDF) Article 8 : L'extrait de contexte 1 précise que les capitaux mobiliers, qu'ils soient inscrits ou non à la cote, sont soumis à la taxe sur les sociétés au titre de l’impôt sur les sociétés et aux droits d’enregistrement et d'immatriculation prévus par les textes législatifs et réglementaires en vigueur. Références utilisées : Extrait de Contexte 1; Article 8 du PDF.
