Fine-tune Mongolian Pos-tagged dataset using Llama-2-3B-hf from hugging face 
https://huggingface.co/winglian/Llama-2-3b-hf
-we download all files needed because this is run in Autodl and there's no VPN to access Hugging face's API

#1 Download and upgrade required Libraries(make sure that they are compatible with your environment) before running all lines of code

In [1]:
!pip install -r requirements.txt

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0m

In [2]:
!pip install scikit-learn numpy tqdm

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0m

In [3]:
!pip install -U bitsandbytes

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0m

In [4]:
!pip install protobuf sentencepiece

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0m

In [5]:
!pip install --upgrade protobuf sentencepiece transformers

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0m

#2 (skip this if you use API from hugging face) Try and load downloaded files for Llama-2-3B and test run to see if it works

In [6]:
from transformers import LlamaTokenizer
import torch

# 确保 tokenizer_class 明确指定为 LlamaTokenizer
tokenizer = LlamaTokenizer.from_pretrained(
    "/dev/shm/Llama-2-3b-hf/",
    tokenizer_file="/dev/shm/Llama-2-3b-hf/tokenizer.model",
    legacy=False  # 强制使用新版本处理方式
)

# 检查 tokenizer 是否加载成功
print(tokenizer.tokenize("Сүүлийн таван жил дараалан"))  # 应输出 ['▁Hello', '▁world']

['▁С', 'ү', 'ү', 'лий', 'н', '▁та', 'ван', '▁жи', 'л', '▁да', 'ра', 'а', 'лан']


#3 Configuration

In [7]:
import json
import os
import time
import gc
import torch
import random
from torch.utils.data import Dataset, Subset
from transformers import LlamaForCausalLM, LlamaTokenizer, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import bitsandbytes as bnb  # For 4-bit quantization

LLAMA_MODEL_PATH = "/dev/shm/Llama-2-3b-hf/"
DATASET_PATH_LLAMA = "/dev/shm/train.jsonl"
USE_LORA = True  # Enable LoRA to reduce memory usage
USE_4BIT = True  # Use 4-bit quantization for efficient training
DOWNSAMPLE_RATIO = 0.3  # Adjust dataset size (e.g., 0.3 = 30% of full data)

#4 Memory Management Function

In [8]:
def clear_cuda_memory():
    """Clears unused GPU memory to prevent OutOfMemory errors."""
    gc.collect()
    torch.cuda.empty_cache()
    torch.set_float32_matmul_precision('high')

#5 POS Tagging Dataset Loader Implementation

In [10]:
class LlamaPOSDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=128):
        self.samples = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line.strip())
                prompt = data["original_text"]
                completion = " ".join(data["pos_tags"])
                self.samples.append({"prompt": prompt, "completion": completion})
        
        # Downsample dataset to reduce training time
        if 0 < DOWNSAMPLE_RATIO < 1:
            self.samples = random.sample(self.samples, int(len(self.samples) * DOWNSAMPLE_RATIO))

        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        input_text = sample["prompt"] + " " + sample["completion"]
        encoding = self.tokenizer(
            input_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        encoding["labels"] = encoding["input_ids"].clone()
        return encoding



#6 Load Tokenizer and model

In [11]:
try:
    tokenizer_llama = LlamaTokenizer.from_pretrained(
        LLAMA_MODEL_PATH, 
        tokenizer_file=os.path.join(LLAMA_MODEL_PATH, "tokenizer.model"),
        legacy=False,
        local_files_only=True
    )

    # **Fix**: Set padding token to eos token
    tokenizer_llama.pad_token = tokenizer_llama.eos_token  # Use eos_token as pad_token

    model_llama = LlamaForCausalLM.from_pretrained(LLAMA_MODEL_PATH, local_files_only=True).cuda()

    print("Model and Tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    raise RuntimeError("Model or Tokenizer could not be loaded.")

# **Fix**: Prepare 4-bit model for LoRA
if USE_4BIT:
    model_llama = prepare_model_for_kbit_training(model_llama)

Model and Tokenizer loaded successfully.


#7 LoRA Configuration for Efficient Training

In [12]:
# Apply LoRA for low-rank adaptation
if USE_LORA:
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM, 
        inference_mode=False,
        r=8,  # Low-rank dimension (adjust as needed)
        lora_alpha=16,  
        lora_dropout=0.05
    )
    model_llama = get_peft_model(model_llama, peft_config)

#8 Train-Validation Dataset Split

In [None]:
dataset_llama = LlamaPOSDataset(DATASET_PATH_LLAMA, tokenizer_llama)
dataset_size = len(dataset_llama)
split_point = int(0.8 * dataset_size)
train_dataset_llama = Subset(dataset_llama, list(range(split_point)))
val_dataset_llama = Subset(dataset_llama, list(range(split_point, dataset_size)))

#9 Training Arguments

In [13]:
# ====== TRAINING ARGUMENTS ======
training_args_llama = TrainingArguments(
    output_dir="./llama_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=2,  # Increase if memory allows
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,  # Adjust to fit memory
    eval_strategy="epoch",
    logging_steps=10,
    save_total_limit=1,
    logging_dir="./logs_llama",
    fp16=True,  # Enable mixed precision training
    optim="adamw_bnb_8bit" if USE_4BIT else "adamw_torch_fused",  # Use optimized optimizer
    torch_compile=True,
    save_steps=500,  # Save the model every 500 steps
)

#9 LLaMA Training Initialization

In [14]:
# ====== TRAINER SETUP ======
trainer_llama = Trainer(
    model=model_llama,
    args=training_args_llama,
    train_dataset=train_dataset_llama,
    eval_dataset=val_dataset_llama,
    tokenizer=tokenizer_llama
)

# ====== TRAINING LOOP WITH MEMORY MANAGEMENT ======
print("LLaMA Fine-tuning started at:", time.strftime("%Y-%m-%d %H:%M:%S"))

  trainer_llama = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


LLaMA Fine-tuning started at: 2025-03-26 20:33:18


#10 OOM Handling & Adaptive Training

In [None]:
try:
    trainer_llama.train()
except RuntimeError as e:
    if "out of memory" in str(e).lower():
        print("CUDA Out of Memory! Reducing batch size...")
        clear_cuda_memory()

        # Retry with lower batch size
        training_args_llama.per_device_train_batch_size = max(1, training_args_llama.per_device_train_batch_size // 2)
        training_args_llama.gradient_accumulation_steps *= 2  # Compensate for smaller batches

        trainer_llama = Trainer(
            model=model_llama,
            args=training_args_llama,
            train_dataset=train_dataset_llama,
            eval_dataset=val_dataset_llama,
            tokenizer=tokenizer_llama
        )

        trainer_llama.train()
    

Epoch,Training Loss,Validation Loss
0,0.8665,0.942197
1,0.8841,0.869438


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



#11 Save Model

In [None]:
print("LLaMA Fine-tuning finished at:", time.strftime("%Y-%m-%d %H:%M:%S"))

# ====== SAVE FINAL MODEL ======
trainer_llama.save_model("./llama_finetuned")
print("Final model saved.")

In [7]:
%run graph.py


=== LoRA Configuration ===
LoRA Rank (r): 8
LoRA Alpha (α): 16
LoRA Dropout: 0.05
Graphs saved as training_loss.png, evaluation_loss.png, and learning_rate.png
