<a href="https://colab.research.google.com/github/alexhmyang/AIface/blob/master/ollama_colab_runner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Ollama Colab Runner**
# <img src='https://ollama.com/public/ollama.png' alt="Ollama"/>
When running this, ideally, select an instance with GPU:<br>
T4 for free ones, A100/L4 for paid subscribers<br><br>
Run each of the 3 cells, before running your prompt.<br>
If you interrupt execution, start the server again

In [1]:
# @title Install components
!curl https://ollama.ai/install.sh | sh
!pip install ollama

!echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
!sudo apt-get update && sudo apt-get install -y cuda-drivers

import os
# Set LD_LIBRARY_PATH so the system NVIDIA library
os.environ.update({'LD_LIBRARY_PATH': '/usr/lib64-nvidia'})

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 13281    0 13281    0     0  11582      0 --:--:--  0:00:01 --:--:-- 11589
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
Collecting ollama
  Downloading ollama-0.6.0-py3-none-any.whl.metadata (4.3 kB)
Downloading ollama-0.6.0-py3-none-any.whl (14 kB)
Installing collected packages: ollama
Successfully installed ollama-0.6.0
Get:1 https://cli.github.com/packages stable InRelease [3,917 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_

In [6]:
# @title Start server
import subprocess
proccess = subprocess.Popen(['ollama', 'serve'])

In [7]:
# @title Select your model
model = "qwen3:4b" # @param ["qwen3:4b","qwen3:14b","deepseek-r1:1.5b","deepseek-r1:7b","deepseek-r1:14b","deepseek-r1:32b","deepseek-r1:70b","deepseek-coder:1.3b","deepseek-coder:6.7b","deepseek-coder:33b","gemma3:12b","gemma3:27b","llama3.3:70b","mistral:7b","phi4:14b","qwen2.5:7b","qwen2.5:14b","qwen2.5:32b","qwen2.5-coder:7b","qwen2.5-coder:14b","qwen2.5-coder:32b"]
!ollama pull {model}

[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026

In [8]:
# @title Interacting with the model
question = "写一篇200字的报告" # @param {"type":"string"}
from IPython.display import display, Markdown, HTML
import ollama
import time
import re

def count_chinese_characters(text):
    """Counts the number of Chinese characters in a string."""
    # This is a simple regex for common Chinese character ranges.
    # It might not be exhaustive but covers most cases.
    chinese_chars = re.findall(r'[\u4e00-\u9fff]', text)
    return len(chinese_chars)

start_time = time.time()
# Use streaming for real-time output
stream = ollama.chat(model=model, messages=[
  {
    'role': 'user',
    'content': question,
  },
], stream=True)

generated_text = ""
output_div = display(HTML("<div id='output'></div>"), display_id=True)

for chunk in stream:
    if chunk['message']['content'] is not None:
        generated_text += chunk['message']['content']
        # Update the output div with the current generated text
        output_div.update(HTML(f"<div id='output'>{generated_text}</div>"))

end_time = time.time()
generation_time = end_time - start_time

# Assuming each Chinese character is roughly equivalent to one token for a simplified calculation
# For more accurate token counting, a proper tokenizer for the model would be needed.
token_count = count_chinese_characters(generated_text)
tokens_per_second = token_count / generation_time if generation_time > 0 else 0

print(f"\nGeneration time: {generation_time:.2f} seconds")
print(f"Tokens generated: {token_count}")
print(f"Speed: {tokens_per_second:.2f} tokens/second")

print(model)


Generation time: 49.15 seconds
Tokens generated: 2179
Speed: 44.33 tokens/second
qwen3:4b


In [9]:
# 1. 安装依赖
!pip install -q transformers accelerate autoawq torch safetensors


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/74.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for autoawq (setup.py) ... [?25l[?25hdone


In [18]:
# 2. 导入库
import torch
import time
from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM

# 3. 指定 Hugging Face 上的 AWQ 模型
# ⚠️ 注意：这里用示例仓库，你可以换成 Hugging Face 上的其他 AWQ 模型，比如：
#   - "Qwen/Qwen1.5-14B-AWQ"
#   - "TheBloke/Qwen-14B-AWQ"
#   - "stelterlab/Qwen3-14B-AWQ"
# （你需要在 https://huggingface.co 上确认模型名称）

model_id = "Qwen/Qwen3-4B-AWQ" # 替换为正确的 Qwen3 14B AWQ 模型 ID

# 4. 加载量化模型
print("开始加载模型...")
t0 = time.time()
model = AutoAWQForCausalLM.from_quantized(
    model_id,
    fuse_layers=True,
    device_map="auto",
    safetensors=True,
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
torch.cuda.synchronize()
t1 = time.time()
print(f"模型加载完成，用时 {t1-t0:.1f} 秒")

# 5. 编写问答测试函数
def answer_question(question, model, tokenizer):
    inputs = tokenizer(question, return_tensors="pt") # Removed .to(model.device)

    t2 = time.time()
    # 生成回答
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=200) # 可以调整 max_new_tokens
    t3 = time.time()

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generation_time = t3 - t2
    tokens_generated = outputs[0].shape[0] - inputs['input_ids'].shape[1] # 计算生成的 tokens 数量
    tokens_per_second = tokens_generated / generation_time if generation_time > 0 else 0

    print(f"\n问题: {question}")
    print(f"回答: {response}")
    print(f"生成时间: {generation_time:.2f} 秒")
    print(f"生成的 tokens: {tokens_generated}")
    print(f"速度: {tokens_per_second:.2f} tokens/秒")

# 6. 进行问答测试
question_to_ask = "请用中文写一篇关于人工智能的简短文章。" # 你可以修改这个问题
answer_question(question_to_ask, model, tokenizer)

开始加载模型...


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 36/36 [00:13<00:00,  2.71it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 742.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 296.12 MiB is free. Process 3379 has 14.45 GiB memory in use. Of the allocated memory 14.32 GiB is allocated by PyTorch, and 17.32 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)