#Install & imports

In [20]:
# Upgrade pip first
!pip install -q --upgrade pip

# Install core packages (latest stable)
!pip install -q "transformers" "peft" "accelerate" "safetensors" "datasets" "sentencepiece" "tokenizers"

In [None]:
#import os, sys
#os.kill(os.getpid(), 9)

In [21]:
import os, math, torch, numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling,
    TrainingArguments, Trainer, pipeline
)
from peft import LoraConfig, get_peft_model

In [None]:
#!pip install -q transformers==4.41.2 datasets peft accelerate evaluate safetensors

In [22]:
import os, math, torch, numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling,
    TrainingArguments, Trainer, pipeline
)
from peft import LoraConfig, get_peft_model

In [23]:
from transformers import AutoTokenizer

#Mount Drive & setting paths

In [24]:
# Colab: mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

#path & files
base_path = "/content/drive/MyDrive/Projects/LLM Projects"
train_path = os.path.join(base_path, "tig_train.txt")
val_path   = os.path.join(base_path, "tig_valid.txt")

print("Train path:", train_path)
print("Valid path:", val_path)
print("Train exists?", os.path.exists(train_path))
print("Valid exists?", os.path.exists(val_path))

# quick peek (handles UTF-8 Tigrinya)
if os.path.exists(train_path):
    with open(train_path, "r", encoding="utf-8", errors="ignore") as f:
        for _ in range(3):
            print("Sample:", f.readline().strip())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train path: /content/drive/MyDrive/Projects/LLM Projects/tig_train.txt
Valid path: /content/drive/MyDrive/Projects/LLM Projects/tig_valid.txt
Train exists? True
Valid exists? True
Sample: ኣብ ገለ እዋን መዓት ዝጸሓፍ ኣብ ኣእምሮይ ይመጽእ እሞ ፡ ክጽሕፍ ኢለ ብርዕን ወረቐትን ምስ ሓዝኩ ህልም ይብለኒ ።
Sample: ሽዑ እሓርቕ ።
Sample: ስለምንታይ እየ ክጽሕፍ ዘይክእል?


#Load the dataset or check dataset

In [25]:
data_files = {}
if os.path.exists(train_path): data_files["train"] = train_path
if os.path.exists(val_path):   data_files["validation"] = val_path

assert "train" in data_files, "Training file not found. Check path/filename."
ds = load_dataset("text", data_files=data_files)

In [26]:
ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1978213
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 43335
    })
})

#Tokenizer ( GPT-2; pad token fix)

In [27]:
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
print("Vocab size:", len(tokenizer))

Vocab size: 50257


#Tokenize & chunk into blocks (causal LM)

In [28]:
block_size = 128  # shorter context to speed up training

def tok_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=block_size,
        return_attention_mask=False
    )

# Tokenize dataset
tokenized = ds.map(tok_fn, batched=True, remove_columns=["text"])

def group_texts(examples):
    # Concatenate then split into block_size chunks
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated["input_ids"])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i:i+block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized.map(group_texts, batched=True, batch_size=1000)

Map:   0%|          | 0/1978213 [00:00<?, ? examples/s]

Map:   0%|          | 0/43335 [00:00<?, ? examples/s]

Map:   0%|          | 0/1978213 [00:00<?, ? examples/s]

Map:   0%|          | 0/43335 [00:00<?, ? examples/s]

In [29]:
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1422303
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 29486
    })
})

In [30]:
# Reduce dataset size for faster training
small_train = lm_datasets["train"].shuffle(seed=42).select(range(10000))   # 10k examples
small_val   = lm_datasets["validation"].shuffle(seed=42).select(range(1000))  # 1k examples

print("Train size:", len(small_train))
print("Validation size:", len(small_val))

Train size: 10000
Validation size: 1000


#Load base model & attach LoRA adapters

In [31]:
base_model_name = "gpt2"  # use "distilgpt2" for even faster runs
model = AutoModelForCausalLM.from_pretrained(base_model_name)
model.resize_token_embeddings(len(tokenizer))  # adjust embeddings if pad_token was added

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],  # LoRA targets in GPT-2
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / Total: {total:,} ({100*trainable/total:.2f}% trainable)")

Trainable params: 811,008 / Total: 125,250,816 (0.65% trainable)




#Training config

In [33]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./ti-gpt2-lora",
    per_device_train_batch_size=2,      # keep small for T4 GPU
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,      # effective batch size = 8
    num_train_epochs=1,                 # just 1 epoch for demo (fast!)
    learning_rate=2e-4,                 # LoRA works well with slightly higher LR
    weight_decay=0.01,
    logging_steps=50,
    eval_strategy="epoch",        # run eval at end of each epoch
    save_strategy="no",                 # don’t save multiple checkpoints (faster)
    fp16=use_fp16,                      # mixed precision if GPU supports
    bf16=False,
    report_to=[]                        # disable logging to W&B
)

#Train

In [35]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_val,
    data_collator=data_collator,
)

# Start training
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,1.6719,1.611328


TrainOutput(global_step=1250, training_loss=1.745262841796875, metrics={'train_runtime': 19706.9485, 'train_samples_per_second': 0.507, 'train_steps_per_second': 0.063, 'total_flos': 659458621440000.0, 'train_loss': 1.745262841796875, 'epoch': 1.0})

In [36]:
trainer.save_model("./ti-gpt2-lora-final")  # saves LoRA adapter weights

In [37]:
results = {}
if trainer.eval_dataset is not None:
    eval_out = trainer.evaluate()
    eval_loss = float(eval_out["eval_loss"])
    ppl = math.exp(eval_loss) if eval_loss < 100 else float("inf")
    results = {"eval_loss": eval_loss, "perplexity": ppl}
    print(results)
else:
    print("No validation dataset; skipped perplexity.")

{'eval_loss': 1.6113276481628418, 'perplexity': 5.0094576120228735}


#Evaluate (Perplexity)

In [38]:
results = {}
if "validation" in lm_datasets:
    eval_out = trainer.evaluate()
    eval_loss = float(eval_out["eval_loss"])
    ppl = math.exp(eval_loss) if eval_loss < 100 else float("inf")
    results = {"eval_loss": eval_loss, "perplexity": ppl}
    print(results)
else:
    print("No validation split found; skipped perplexity.")

{'eval_loss': 1.6113276481628418, 'perplexity': 5.0094576120228735}


#Generate Tigrinya text (sampling)

In [39]:
model.to("cuda" if torch.cuda.is_available() else "cpu")

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
          

In [40]:
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

prompt = "ኣብ ኣዲስ ኣበባ ምስ ምሸት ኣብ መንገዲ ኣብ ገምጋም እየ።"
outs = generator(
    prompt,
    max_length=160,
    do_sample=True,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3
)
for i, o in enumerate(outs, 1):
    print(f"\n=== Sample {i} ===\n{o['generated_text']}\n")


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=160) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



=== Sample 1 ===
ኣብ ኣዲስ ኣበባ ምስ ምሸት ኣብ መንገዲ ኣብ ገምጋም እየ። ንይንብን ሓገምት ሓስ ሓስ ምጋት። ካስትን፡ ኣብ ምሲኣብ ኣብ ዝሰንለዕ ከም ተቅብገነ ኣብ ነብዝት ንጋም ዝገምዕ ብምተም ነእባብ ዝሰንለዕ ካስትዝ ኣብ ዝሰንለዕ ዝደንለዕ ዝ�


=== Sample 2 ===
ኣብ ኣዲስ ኣበባ ምስ ምሸት ኣብ መንገዲ ኣብ ገምጋም እየ። ነነይት ውኣሰካት መንገዲ ኣብ ምጋምጋድል ኣብይት ውኣብትን ምስቲ ኣለንብትል ኣብይትት ኣብይትት ኣብምሳንስይስ ኣብምሰካትው ኣለንብስል ኣብይትል ኣብምሳን ኣምስል እ�


=== Sample 3 ===
ኣብ ኣዲስ ኣበባ ምስ ምሸት ኣብ መንገዲ ኣብ ገምጋም እየ። እይትም ደጸምሳታት እኒሰርነታት ድንብ ብትምው፡ ኣስበዲ እንገዲ ኣብንገዳታት ምሽስ እባተት እየ። ኣናኑሰ ኣእብእላነታትት እናናሰርነታት ኣምራዚ ካእየ። እባሃራ ነም�



In [None]:
import gradio as gr
import torch
from transformers import pipeline

# Load your model + tokenizer (already trained with LoRA)
generator = pipeline(
    "text-generation",
    model=model,          # your LoRA-wrapped GPT-2
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# Helper: format conversation for GPT-2 style
def format_conversation(history):
    text = "System: You are a helpful assistant that responds in Tigrinya.\n"
    for user_msg, bot_msg in history:
        text += f"User: {user_msg}\nAssistant: {bot_msg}\n"
    text += "Assistant:"
    return text

# Generate reply
def chatbot(user_input, history):
    history = history or []  # initialize if empty
    prompt = format_conversation(history + [(user_input, "")])

    outs = generator(
        prompt,
        max_length=200,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        num_return_sequences=1,
        pad_token_id=generator.tokenizer.eos_token_id,
    )

    full_output = outs[0]["generated_text"]
    reply = full_output[len(prompt):].strip().split("\n")[0]  # stop at first line

    history.append((user_input, reply))
    return history, history

# Gradio UI
chatbot_ui = gr.ChatInterface(
    fn=chatbot,
    title="💬 Tigrinya GPT-2 LoRA Chatbot",
    description="A chatbot fine-tuned with LoRA on GPT-2. Responds in Tigrinya.",
    theme="soft",
)

if __name__ == "__main__":
    chatbot_ui.launch()

NameError: name 'model' is not defined