### Check GPU Availability

In [1]:
# Run the nvidia-smi shell command to print the driver/CUDA versions and the
# status table (memory usage, utilization) of the GPU attached to this runtime.
!nvidia-smi

Mon Jun 16 09:55:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   47C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### Installation

In [2]:
%%capture
# Temporarily as of Jan 31st 2025, Colab has some issues with Pytorch
# Using pip install unsloth will take 3 minutes, whilst the below takes <1 minute:

!pip install transformers==4.37.2
!pip install unsloth==2025.2.15 unsloth_zoo==2025.2.7
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
!pip install --no-deps cut_cross_entropy
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

### Initialize the LLM

In [5]:
import os

from unsloth import FastLanguageModel
import torch

# --- Loading configuration -------------------------------------------------
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
# Single switch for 4-bit quantization; it is the value passed to
# from_pretrained below. It is kept at False because that is what this cell
# previously passed (as a hardcoded literal that ignored this variable).
# Set it to True to use 4-bit quantization and reduce memory usage.
load_in_4bit = False

# Hugging Face access token, read from the HF_TOKEN environment variable so
# that no credential is stored in the notebook. An unset or empty variable
# becomes None.
hf_token = os.environ.get("HF_TOKEN") or None

# Changing the model here is forbidden!

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "taide/Llama-3.1-TAIDE-LX-8B-Chat",  # Do not change the model for any other models or quantization versions
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token,
)

==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



taide/Llama-3.1-TAIDE-LX-8B-Chat does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


## Load datasets (stored in Google Cloud)

## Inference

In [None]:
from pathlib import Path

# Build a system + user chat prompt, run one sampled generation with the
# loaded model, and print the assistant's reply.
# Requires `model` and `tokenizer` from the "Initialize the LLM" cell and a
# rules.txt file in the current working directory.

# rules.txt is expected to contain the rules that go into the system prompt.
rules_path = Path("rules.txt")
rules_text = rules_path.read_text(encoding="utf-8")  # read the whole file into memory at once

sys_prompt = (
    rules_text.strip() +      # drop leading/trailing whitespace
    "\n\n你是 Cisco IOS XR 網管助手，只輸出 RESTCONF curl 指令與對應 config JSON，"
    "禁止加上任何解釋或多餘文字。"
)
links_to_close = ["L1", "L3", "L6"]
user_prompt = f"關閉 {', '.join(links_to_close)}"

chat = [
    {"role": "system", "content": sys_prompt},
    {"role": "user",   "content": user_prompt},
]
# tokenize=False would return plain text; tokenize=True returns token ids
# (a tensor here because of return_tensors="pt").
prompt_ids = tokenizer.apply_chat_template(
    chat,
    tokenize=True,              # return token ids directly
    add_generation_prompt=True, # append the assistant turn header at the end
    return_tensors="pt",
).to(model.device)

# Generate from the chat-templated prompt. The previous version tokenized an
# undefined `action_prompt` variable here, which raised NameError and left
# `prompt_ids` unused.
outputs = model.generate(
    input_ids=prompt_ids,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)

# outputs[0] holds the prompt tokens followed by the newly generated tokens;
# slice off the prompt so only the assistant's reply is decoded and printed.
prompt_length = prompt_ids.shape[-1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(response)