In [4]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
import os

# Set new cache directories
os.environ["HF_DATASETS_CACHE"] = "/ephemeral/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/ephemeral/transformers_cache"
os.environ["TMPDIR"] = "/ephemeral/tmp"

# Ensure the directories exist
os.makedirs("/ephemeral/hf_cache", exist_ok=True)
os.makedirs("/ephemeral/transformers_cache", exist_ok=True)
os.makedirs("/ephemeral/tmp", exist_ok=True)

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
from transformers import LlamaTokenizer, LlamaTokenizerFast

# Log in to Hugging Face
login(token="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")



In [3]:
tokenizer = LlamaTokenizer.from_pretrained("Billyyy/mongolian-llama-untrained")
model = AutoModelForCausalLM.from_pretrained("Billyyy/mongolian-llama-untrained")

In [9]:
import torch
# Set the model to evaluation mode
model.eval()

# Example input text
input_text = "Улаанбаатар"  # Replace this with your input sentence

# Tokenize input text
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Generate output using the model
with torch.no_grad():
    output = model.generate(
        inputs.input_ids, 
        max_length=50,             # Maximum length of the output
        temperature=0.7,           # Controls randomness: lower is more deterministic
        top_p=0.9,                 # Nucleus sampling
        repetition_penalty=1.2,    # Penalize repetition
        do_sample=True             # Enable sampling
    )

# Decode the generated output
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the input and generated output
print("Input Text:", input_text)
print("Generated Output:", output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input Text: Улаанбаатар
Generated Output: Улаанбаатарати, одобрена и се креираја на еден део. Намерен да ги прифаат ако што то ниво ни бил би лоштиште по


In [44]:
from datasets import load_dataset

# Load and save the dataset into the /ephemeral directory
dataset = load_dataset("cc100", "mn", trust_remote_code=True)

In [45]:
# Verify successful loading
print("Dataset loaded and cached at:")
print(dataset)

Dataset loaded and cached at:
DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 15098167
    })
})


In [46]:
## Tokenized dataset
import multiprocessing

# Fix the tokenizer function
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",  # Ensures all inputs are fixed-length
        max_length=512,
        return_tensors="pt"    # Return PyTorch tensors directly
    )

tokenized_dataset = dataset.map(
    tokenize_function, 
    batched=True, 
    batch_size=8000,
    num_proc= multiprocessing.cpu_count() - 1,
    remove_columns=["text", "id"],
    load_from_cache_file=False
)

# Print a summary
print("Dataset tokenized successfully!")
print(tokenized_dataset)

tokenized_dataset.save_to_disk("/ephemeral/tokenized_dataset_dir")
print("Tokenized dataset saved to ./tokenized_dataset_dir")


Map (num_proc=27):   0%|          | 0/15098167 [00:00<?, ? examples/s]

Dataset tokenized successfully!
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 15098167
    })
})


Saving the dataset (0/78 shards):   0%|          | 0/15098167 [00:00<?, ? examples/s]

Tokenized dataset saved to ./tokenized_dataset_dir


In [5]:
## LORA MODEL 
from peft import get_peft_model, LoraConfig, TaskType
# Step 2: Apply LoRA to the Model
print("Applying LoRA adapters to the model...")

# Define LoRA configuration with embeddings
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,                                # LoRA rank
    lora_alpha=32,                       # LoRA scaling factor
    lora_dropout=0.1,                    # Dropout for LoRA layers
    target_modules=["q_proj", "v_proj", "embed_tokens"])

# Wrap the model with LoRA adapters
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()


Applying LoRA adapters to the model...
trainable params: 2,248,704 || all params: 1,040,930,816 || trainable%: 0.2160


In [6]:
for param in lora_model.parameters():
    if param.requires_grad:
        param.data = param.data.float()

In [16]:
lora_model = torch.compile(lora_model)

In [7]:
lora_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): lora.Embedding(
          (base_layer): Embedding(32000, 2048)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.1, inplace=False)
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 16x32000])
          (lora_embedding_B): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 2048x16])
          (lora_magnitude_vector): ModuleDict()
        )
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
   

In [8]:
import torch

torch.set_float32_matmul_precision('high')

In [9]:
import torch

# Check if CUDA (GPU) is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to GPU
lora_model.to(device)

Using device: cuda


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): lora.Embedding(
          (base_layer): Embedding(32000, 2048)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.1, inplace=False)
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 16x32000 (cuda:0)])
          (lora_embedding_B): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 2048x16 (cuda:0)])
          (lora_magnitude_vector): ModuleDict()
        )
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropo

In [11]:
import torch
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_from_disk
from transformers import TrainerCallback, TrainerControl, TrainerState
from torch.profiler import ProfilerActivity

# Path where the tokenized dataset is saved
dataset_path = "/ephemeral/small_tokenized_dataset"

# Load the tokenized dataset
tokenized_dataset = load_from_disk(dataset_path)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./llama_lora_finetuned",
    eval_strategy="no",           # Skip evaluation for faster runs
    save_strategy="steps",              # Save checkpoints every few steps
    save_steps=2000,
    save_total_limit=2,
    per_device_train_batch_size=40,     # Batch size
    gradient_accumulation_steps=2,      # Simulates batch size of 64
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    fp16=True,                          # Mixed precision for faster training
    logging_dir="/workspace/logs",               # Logs will be saved here for TensorBoard
    logging_steps=10,                   # Log every 10 steps
    report_to="tensorboard",
    remove_unused_columns=False
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Set mlm=False for causal language modeling
    return_tensors="pt"  # Ensure PyTorch tensors
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    data_collator=data_collator
)

# Start Training
print("Starting training with PyTorch Profiler...")
trainer.train()

# Save the Fine-Tuned Model
output_model_path = "/workspace/llama_lora_finetuned"
print("Saving the fine-tuned model...")
lora_model.save_pretrained(output_model_path)

Starting training with PyTorch Profiler...


Step,Training Loss
10,18.3931
20,18.0708
30,18.0552
40,17.7947
50,17.8459
60,17.739
70,17.5413
80,17.4271
90,17.2682
100,17.1311




Saving the fine-tuned model...




In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login

# Log in to Hugging Face
login(token="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")  # Replace with your HF token

# Push model and tokenizer to Hub
lora_model.push_to_hub("Billyyy/mongolian-trained_llama")
tokenizer.push_to_hub("Billyyy/mongolian-trained_llama")



adapter_model.safetensors:   0%|          | 0.00/271M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/922k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Billyyy/mongolian-trained_llama/commit/42e15575cde3bc12babbe908e255ad86ecf6ac80', commit_message='Upload tokenizer', commit_description='', oid='42e15575cde3bc12babbe908e255ad86ecf6ac80', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Billyyy/mongolian-trained_llama', endpoint='https://huggingface.co', repo_type='model', repo_id='Billyyy/mongolian-trained_llama'), pr_revision=None, pr_num=None)

In [14]:
model = AutoModelForCausalLM.from_pretrained("Billyyy/mongolian-trained_llama")
tokenizer = AutoTokenizer.from_pretrained("Billyyy/mongolian-trained_llama")

adapter_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/271M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/922k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]