# 🧱 Ollama: Run LLMs Locally with CLI Chat Interface

**Ollama** allows you to run large language models locally on your machine (Mac, Linux, or Windows) via a simple CLI-based chat interface.

🔁 To use a fine-tuned Hugging Face model with Ollama:

- Export the model from Hugging Face format.
- Convert it to `GGUF` or an Ollama-compatible format using the `transformers` library along with [`transformers-to-ollama`](https://github.com/jmorganca/transformers-to-ollama).

This allows you to deploy and interact with your custom model entirely offline.

In [1]:
# Install dependencies
!pip install -q transformers datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.4 kB[0m [31m20.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.4 kB[0m [31m20.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dep

# 📦 Import Required Libraries for Model Training and Dataset Handling

In [2]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# 🧠 Load Pretrained Language Model and Tokenizer

In [3]:
# ✅ Load model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# 📂 Load or Create the Dataset

In [4]:
# ✅ Create synthetic mental health support dataset
data = {
    "text": [
        "User: I'm feeling really anxious lately.\nAssistant: I'm really sorry you're feeling this way. You're not alone, and I'm here for you.",
        "User: I don't know how to deal with stress.\nAssistant: It's okay to feel overwhelmed. Taking small steps like deep breathing can help.",
        "User: I feel like nobody understands me.\nAssistant: That must be really hard. But please know your feelings are valid and someone does care.",
        "User: I can't sleep because of my worries.\nAssistant: Sleep can be tough when the mind is racing. Sometimes journaling or calming music can help.",
    ]
}

# 📦 Wrap Raw Data into a Hugging Face Dataset

In [5]:
# Wrap in Hugging Face Dataset
dataset = Dataset.from_dict(data)

# ✂️ Tokenize the Dataset

In [6]:
# Tokenize
def tokenize_fn(ex):
    return tokenizer(ex["text"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_fn)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

# ⚙️ Setup Training Arguments and Initialize Trainer

In [7]:
# ✅ Training setup
training_args = TrainingArguments(
    output_dir="./mental-health-chatbot",
    per_device_train_batch_size=1,
    num_train_epochs=2,
    logging_steps=1,
    save_total_limit=1,
    report_to="none"
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

  trainer = Trainer(


# 🚀 Model Fine-Tuning

In [8]:
# Fine-tune the model
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,4.2625
2,4.0674
3,2.9246
4,3.2619
5,2.539
6,2.9364
7,3.0734
8,2.9624


TrainOutput(global_step=8, training_loss=3.253467172384262, metrics={'train_runtime': 32.7157, 'train_samples_per_second': 0.245, 'train_steps_per_second': 0.245, 'total_flos': 261296750592.0, 'train_loss': 3.253467172384262, 'epoch': 2.0})

# 💬 Inference - Chat With the Bot

In [9]:
prompt = "User: I'm feeling very stressed at work.\nAssistant:"
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

# Set pad_token_id explicitly and add attention_mask
output = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=50,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,         # Enable sampling to avoid repetition
    top_k=50,                # Top-k sampling
    top_p=0.95,              # Nucleus sampling
    temperature=0.7          # Lower temp = less randomness
)

print(tokenizer.decode(output[0], skip_special_tokens=True))


User: I'm feeling very stressed at work.
Assistant: You're a bit like a person.
Assistant: It's like I'm feeling really stressed at work.
Assistant: You're a bit worried about your work.
Assistant: I'm feeling very anxious at work.
Assistant: You're


# 📤 Export Fine-Tuned Model for Use with Ollama

In [10]:
import os

checkpoints = [ckpt for ckpt in os.listdir("./mental-health-chatbot") if ckpt.startswith("checkpoint")]
print(checkpoints)


['checkpoint-8']


# 🔄 Load Model and Tokenizer from a Specific Checkpoint

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./mental-health-chatbot/checkpoint-8"

model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


# 💾 Save the Final Fine-Tuned Model and Tokenizer

In [12]:
model.save_pretrained("./mental-health-model")
tokenizer.save_pretrained("./mental-health-model")


('./mental-health-model/tokenizer_config.json',
 './mental-health-model/special_tokens_map.json',
 './mental-health-model/vocab.json',
 './mental-health-model/merges.txt',
 './mental-health-model/added_tokens.json',
 './mental-health-model/tokenizer.json')

# 📦 Zip and Download the Fine-Tuned Model

In [13]:
!zip -r mental_health_model.zip ./mental-health-model
from google.colab import files
files.download("mental_health_model.zip")

  adding: mental-health-model/ (stored 0%)
  adding: mental-health-model/generation_config.json (deflated 24%)
  adding: mental-health-model/special_tokens_map.json (deflated 80%)
  adding: mental-health-model/merges.txt (deflated 53%)
  adding: mental-health-model/config.json (deflated 51%)
  adding: mental-health-model/tokenizer.json (deflated 82%)
  adding: mental-health-model/tokenizer_config.json (deflated 56%)
  adding: mental-health-model/vocab.json (deflated 59%)
  adding: mental-health-model/model.safetensors (deflated 7%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>