In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import accelerate

In [4]:
from itertools import islice

from datasets import Dataset
from tqdm import tqdm

# Location of the raw text corpus, read line by line below.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# copy of the corpus before running.
file_path = "/Users/billy/Downloads/mn.txt"

# Fraction of the file (taken from the top) that is loaded; 0.01 == 1%.
SAMPLE_FRACTION = 0.01

# First pass: count the lines so the sample size (and the progress-bar total)
# is known before anything is kept in memory.
with open(file_path, "r", encoding="utf-8") as file:
    total_lines = sum(1 for _ in file)

# Number of leading lines to keep (rounded down).
portion_size = int(total_lines * SAMPLE_FRACTION)

# Second pass: read only the first `portion_size` lines. `islice` stops the
# iteration at the limit, so no manual counter/break is needed.
with open(file_path, "r", encoding="utf-8") as file:
    lines = [
        line.strip()  # Strip any extra whitespace or newlines
        for line in tqdm(
            islice(file, portion_size),
            desc=f"Loading {SAMPLE_FRACTION:.0%} of Mongolian Sentences",
            total=portion_size,
        )
    ]

# Create a Dataset with a single "text" column from the sampled lines
dataset = Dataset.from_dict({"text": lines})

Loading 1% of Mongolian Sentences: 100%|██████████| 150981/150981 [00:00<00:00, 1295439.82it/s]


In [5]:
# Checkpoint used for both the tokenizer and the causal-LM weights;
# swap the name here to try a different pretrained model.
model_name = "gpt2"

# Load the tokenizer first, then the model, from the same checkpoint.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForCausalLM.from_pretrained(model_name),
)



In [6]:
# Reuse the end-of-sequence token as the padding token so the tokenizer can pad
# sequences (the tokenization cell below pads with padding="max_length").
# NOTE(review): presumably needed because the GPT-2 tokenizer defines no pad
# token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [7]:
# Tokenize your dataset
def tokenize_function(examples):
    """Tokenize a batch of raw text and attach causal-LM training labels.

    Args:
        examples: Batch dict as passed by ``Dataset.map(batched=True)``;
            ``examples["text"]`` is the list of strings to tokenize.

    Returns:
        The tokenizer output, padded/truncated to 128 tokens, with an added
        ``labels`` entry. ``labels`` is a copy of ``input_ids`` in which every
        padding position (``attention_mask == 0``) is replaced by -100 so it
        is ignored by the language-modeling loss.
    """
    encoded = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    # Without a "labels" column the model's forward pass returns no loss and
    # the Trainer cannot train. Masking via attention_mask (rather than by
    # token id) matters because the pad token is the EOS token here.
    encoded["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded

# Tokenize every example in batches; the result keeps the original "text"
# column and adds the tokenizer outputs plus "labels".
dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/150981 [00:00<?, ? examples/s]

In [8]:
from peft import get_peft_model, LoraConfig, TaskType

# Configure LoRA parameters
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # GPT-2 is a causal language model
    r=8,  # Low-rank dimension, can adjust based on resources
    lora_alpha=16,  # Scaling factor for LoRA; adjust for your setup
    lora_dropout=0.1,  # Dropout rate; helps prevent overfitting
    # GPT-2's attention projection is a Conv1D layer, which stores its weight
    # as (fan_in, fan_out). Declaring that explicitly avoids relying on PEFT
    # detecting the layer type and correcting the setting with a warning.
    fan_in_fan_out=True,
)

# Wrap the GPT-2 model with LoRA.
# NOTE: this rebinds `model`, so re-running the cell would wrap the already
# wrapped model again; reload the base model before re-running.
model = get_peft_model(model, lora_config)



In [9]:
# Echo the wrapped model so its module tree is rendered as the cell output;
# handy for checking which layers were replaced by LoRA-wrapped layers.
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
       

In [20]:
# Set up TrainingArguments
training_args = TrainingArguments(
    # Relative to the notebook's working directory. The previous value
    # ("/gpt2_lora_finetuned") pointed at the filesystem root, which is
    # typically not writable and makes directory creation fail with an OSError.
    output_dir="./gpt2_lora_finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,  # Adjust based on your GPU memory
    num_train_epochs=3,
    logging_dir='./logs',
    save_steps=1000,  # Write a checkpoint every 1000 optimizer steps
    save_total_limit=2,  # Keep only the two most recent checkpoints
)

# Initialize Trainer
# NOTE(review): no data_collator is passed, so `dataset` itself has to carry a
# "labels" column for the model to return a loss — confirm before training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

OSError: [Errno 30] Read-only file system: '../models'