In [None]:
!pip install transformers peft datasets

In [None]:
from google.colab import files
# Prompt for a file upload through Colab. The data-loading cell opens
# "llm_train.txt" by relative path, so that is the file expected here.
files.upload()

In [None]:
from datasets import Dataset

# Read the raw training file inside a context manager so the handle is closed
# as soon as the text is in memory. Blank / whitespace-only lines are dropped:
# they hold no input/target pair and cannot be split on a tab later.
with open("llm_train.txt", encoding="utf-8") as train_file:
    lines = [line for line in train_file.read().splitlines() if line.strip()]

# One record per remaining line, stored under the "text" column.
dataset = Dataset.from_dict({"text": lines})

def split_input_target(example):
    """Split one raw record into its input and target strings.

    The record's ``"text"`` field is expected to hold the input, a tab
    character, and then the target. Only the first tab acts as the
    separator, so any further tabs remain part of the target.

    Args:
        example: mapping with a string under the key ``"text"``.

    Returns:
        dict with ``"input_text"`` (text before the first tab) and
        ``"labels"`` (text after it).

    Raises:
        ValueError: if the text contains no tab at all.
    """
    text = example["text"]
    # partition() never raises on unpacking; an empty separator means "no tab".
    inp, sep, tgt = text.partition("\t")
    if not sep:
        raise ValueError(
            f"expected '<input><TAB><target>' but found no tab in: {text!r}"
        )
    return {"input_text": inp, "labels": tgt}

# Run the splitter over every record; each one gains the "input_text" and
# "labels" fields returned by split_input_target.
dataset = dataset.map(split_input_target)


In [None]:
from transformers import AutoTokenizer

# Checkpoint name used for the tokenizer here and for the model load in a later cell.
BASE_MODEL = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Reuse the end-of-sequence token as the padding token so that
# padding="max_length" in tokenize() has something to pad with
# (presumably the checkpoint ships without a pad token — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

def tokenize(example):
    """Turn one example into fixed-length causal-LM training features.

    The input and target strings are joined with a single space and encoded
    with truncation and padding to 128 tokens. ``labels`` mirrors
    ``input_ids`` at real-token positions and is -100 at padded positions,
    so padding does not contribute to the language-modelling loss.

    Args:
        example: mapping with string fields ``"input_text"`` and ``"labels"``.

    Returns:
        The tokenizer's encoding for the joined text, with ``"labels"`` set
        to a list of token ids / -100 values of the same length as
        ``"input_ids"``.
    """
    full = example["input_text"] + " " + example["labels"]
    out = tokenizer(full, truncation=True, padding="max_length", max_length=128)
    # The pad token is the EOS token in this notebook, so padding cannot be
    # recognised by token id; the attention mask is the reliable marker.
    # Copying input_ids verbatim would train the model to emit padding.
    out["labels"] = [
        token_id if is_real_token else -100
        for token_id, is_real_token in zip(out["input_ids"], out["attention_mask"])
    ]
    return out

# Tokenize every example. tokenize() returns a "labels" entry of token ids,
# which takes the place of the string "labels" column from the split step.
tokenized_dataset = dataset.map(tokenize)


In [None]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType

# Load the base causal language model named by BASE_MODEL.
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)

# LoRA adapter settings: rank-8 updates, alpha 32, 5% dropout on the adapter
# path, applied only to modules named "c_attn".
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["c_attn"],
    # GPT-2 style c_attn is a Conv1D layer whose weight is stored as
    # (fan_in, fan_out); PEFT documents fan_in_fan_out=True for that layout.
    # Setting it explicitly avoids relying on PEFT's automatic correction.
    fan_in_fan_out=True,
)

# Wrap the base model so that only the LoRA adapter weights are trainable.
model = get_peft_model(model, lora_config)


In [None]:
from transformers import TrainingArguments, Trainer

# Run configuration: 3 epochs at batch size 16 per device, learning rate 1e-4.
# Progress is logged every 50 steps, a checkpoint is saved under
# "llm_lora_out" at the end of each epoch, and report_to is set to "none".
training_args = TrainingArguments(
    output_dir="llm_lora_out",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=1e-4,
    logging_steps=50,
    save_strategy="epoch",
    report_to="none",
)

# NOTE(review): newer transformers releases appear to deprecate the
# `tokenizer=` argument of Trainer in favour of `processing_class=` —
# confirm against the installed version before changing it.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
)

# Start fine-tuning; the returned training summary is the cell's output.
trainer.train()

In [None]:
import shutil
from google.colab import files

# Write the trained model and then the tokenizer into the same export directory.
final_dir = "llm_lora_out/final_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_dir)

# Zip the export directory as llm_lora_model.zip and send it to the browser.
shutil.make_archive("llm_lora_model", "zip", final_dir)
files.download("llm_lora_model.zip")
