In [2]:
import os
import torch

# Set new cache directories
os.environ["HF_DATASETS_CACHE"] = "/ephemeral/hf_cache"
os.environ["HF_HOME"] = "/ephemeral/transformers_cache"
os.environ["TMPDIR"] = "/ephemeral/tmp"

# Ensure the directories exist
os.makedirs("/ephemeral/hf_cache", exist_ok=True)
os.makedirs("/ephemeral/transformers_cache", exist_ok=True)
os.makedirs("/ephemeral/tmp", exist_ok=True)

import torch
import wandb
from transformers import AutoModelForCausalLM,AutoTokenizer,Trainer,TrainingArguments,DataCollatorForLanguageModeling,DataCollatorWithPadding
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import login

In [2]:
def show_first_lines_text(file_path, num_lines=5):
    """
    Reads and prints the first `num_lines` lines from a plain text file.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        for i in range(num_lines):
            line = f.readline()
            if not line:
                break  # Stop if file has fewer than num_lines
            print(f"Line {i+1}: {line.strip()}")


In [3]:
show_first_lines_text("mn_corpus.txt")

Line 1: MONGOLIAN: –Ø–º–∞—Ä –Ω”©—Ö—Ü”©–ª–¥ –∞–º—å–∂–∏—Ä–≥–∞–∞–Ω—ã —Ç“Ø–≤—à–Ω–∏–π “Ø–Ω—ç–ª–≥—ç—ç–Ω–¥ –¥–∞—Ö–∏–Ω –æ—Ä–æ—Ö –≤—ç?
Line 2: ENGLISH: What is the case with living standards?
Line 3: 
Line 4: MONGOLIAN: –£–≥ –∞—è–Ω—ã “Ø—Ä–≥—ç–ª–∂–ª—ç–ª –±–æ–ª–≥–æ–∂ "–ë–∏–¥ —Ö–∞–º—Ç–¥–∞–∞" “Ø–Ω–¥—ç—Å–Ω–∏–π —Ñ–æ—Ä—É–º—ã–≥ –∏—Ä—ç—Ö –¥–æ–ª–æ–æ —Ö–æ–Ω–æ–≥—Ç –∑–æ—Ö–∏–æ–Ω –±–∞–π–≥—É—É–ª–∞—Ö–∞–∞—Ä –±–æ–ª–∂—ç—ç.
Line 5: ENGLISH: As a continuation of the campaign, we will organize a national forum "We are together" next week.


In [46]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login

# Log in to the Hugging Face Hub
login(token="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")


model_name = "meta-llama/Llama-3.1-8B-Instruct"

# 1. Create a quantization config for 4-bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                    # Enable 4-bit quantization
    bnb_4bit_quant_type="nf4",            # Options: 'nf4' (recommended) or 'fp4'
    bnb_4bit_use_double_quant=True,       # Nested quantization for higher accuracy
    bnb_4bit_compute_dtype="bfloat16"     # Mixed-precision type: 'fp16', 'bf16', etc.
)

# 3. Load model in 4-bit
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"  # auto-distributes across GPU(s) if available
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:
from transformers import PreTrainedTokenizerFast
# 2. Load tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained("Billyyy/llama_8K_extended")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BloomTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [7]:
from datasets import load_dataset
merged_dataset = load_dataset("Billyyy/llama_8K_extended")

In [14]:

def merge_sequences_with_text(dataset, tokenizer, max_length=1024):
    """
    Merge sequences in a dataset while keeping 'text' and 'input_ids' aligned.
    
    Args:
        dataset (Dataset): Hugging Face Dataset with 'text' and 'input_ids'.
        tokenizer: Tokenizer used to tokenize the text.
        max_length (int): Maximum length for merged sequences.

    Returns:
        dict: A dictionary with merged 'text' and 'input_ids'.
    """
    merged_texts = []
    merged_input_ids = []
    current_text = []
    current_input_ids = []

    for text, input_ids in zip(dataset["text"], dataset["input_ids"]):
        # Check if adding the current sequence exceeds the max length
        if len(current_input_ids) + len(input_ids) > max_length:
            # Append the merged sequences
            merged_texts.append(" ".join(current_text))
            merged_input_ids.append(current_input_ids[:max_length])
            # Reset for the next sequence
            current_text = []
            current_input_ids = []
        
        # Extend the current sequence
        current_text.append(text)
        current_input_ids.extend(input_ids)
    
    # Add the final batch if it exists
    if current_input_ids:
        merged_texts.append(" ".join(current_text))
        merged_input_ids.append(current_input_ids[:max_length])

    return {"text": merged_texts, "input_ids": merged_input_ids}


In [15]:
dataset = dataset.remove_columns("attention_mask")

# Add 'input_ids' column if not already in the dataset
if "input_ids" not in dataset.column_names:
    dataset = dataset.map(
        lambda examples: {"input_ids": tokenizer(examples["text"], truncation=True, padding=False, max_length=256)["input_ids"]},
        batched=True,
        num_proc=27
    )

# Apply the merging function
merged_dataset = dataset.map(
    lambda batch: merge_sequences_with_text(batch, tokenizer, max_length=256),
    batched=True, 
    num_proc=27)

Map (num_proc=27): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 955519/955519 [00:13<00:00, 72378.89 examples/s]
Map (num_proc=27): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50291/50291 [00:00<00:00, 51975.37 examples/s]
Map (num_proc=27): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 955519/955519 [00:04<00:00, 221512.98 examples/s]
Map (num_proc=27): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50291/50291 [00:00<00:00, 96506.03 examples/s] 


In [30]:
merged_dataset.remove_columns("text")

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 645131
    })
    eval: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 34004
    })
})

In [11]:
merged_dataset.push_to_hub("Billyyy/llama_8K_extended")

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/216 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/216 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/216 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/35 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/423 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Billyyy/llama_8K_extended/commit/298dc49a0351e7853c5357e4cc5971b82ce2eed8', commit_message='Upload dataset', commit_description='', oid='298dc49a0351e7853c5357e4cc5971b82ce2eed8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Billyyy/llama_8K_extended', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Billyyy/llama_8K_extended'), pr_revision=None, pr_num=None)

In [7]:
def tokenize_function(examples):
    """Tokenizes each line in the dataset."""
    return tokenizer(
        examples["text"], truncation=True, padding=True, max_length=256
    )

In [10]:
merged_dataset = merged_dataset.map(tokenize_function, batched=True, num_proc=27)

Map (num_proc=27):   0%|          | 0/645131 [00:00<?, ? examples/s]

Map (num_proc=27):   0%|          | 0/34004 [00:00<?, ? examples/s]

In [29]:
merged_dataset['train'][60000]['text']

'<mn> –•–∞–º–≥–∏–π–Ω –∞–Ω—Ö "–•–∞–ª—É—É–Ω —Å—ç—Ç–≥—ç–ª" –∫–∏–Ω–æ–Ω—ã –Ω–∞–π—Ä—É—É–ª–∞–≥—á –õ.–≠–Ω—Ö–±–∞—è—Ä –∞—Ö 2006 –æ–Ω–¥ –Ω–∞–º–∞–π–≥ –∫–∏–Ω–æ–Ω–¥ —É—Ä–∏–∞–¥, –≥–æ–ª –¥“Ø—Ä—ç—ç ”©–≥”©—Ö”©–¥ –Ω—å –º–∞—à –∏—Ö –±–∞—è—Ä–ª–∞–∂ –±–∞–π–ª–∞–∞. <en>The first director of "Hot Soul", L.Enkhbayar, was excited to thank me for the role of the film in 2006. <mn> –ú–∞–Ω–∞–π —Ö–æ–π–¥ —Ö—î—Ä—à–∏–π–≥ —Ö—É–≤—å–¥ —á –≥—ç—Å—ç–Ω —Å—É—Ä—Ç–∞–ª—á–∏–ª–≥–∞–∞–Ω—ã –∑–æ—Ä–∏–ª–≥–æ–æ—Ä —Ö—É—É—Ä–∞–º—á –º—î–Ω–≥–∏–π–≥ –∞—à–∏–≥–ª–∞—Ö —Ç—É—Ä—à–ª–∞–≥—ã–≥ —Ö—ç—Ä—ç–≥–∂vv–ª–∂ –±–∞–π–∂—ç—ç. <en>In addition, our northern neighbors have been involved in the use of counterfeit money for advertising.'

In [None]:
# 5. Training Arguments -------------------------------------------
training_args = TrainingArguments(
    output_dir="/ephemeral/llama_3B_translation",  
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1,
    weight_decay=0.01,
    fp16=True,
    logging_dir="/workspace/logs",
    logging_steps=10,
    report_to="wandb",
    remove_unused_columns=False,
    group_by_length=False,
    
    # A100-specific optimizations
    gradient_checkpointing=True,
    optim="adamw_torch",
    ddp_find_unused_parameters=False,

)

# 6. Data Collator for Masked Language Modeling -------------------
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# 7. Trainer Initialization ---------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=merged_dataset["train"],
    eval_dataset=merged_dataset["eval"], 
    data_collator=data_collator
)

In [10]:
merged_dataset['train']

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 645131
})

In [3]:

# 2. Load Pretrained Tokenizer & Model -----------------------------
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"  # Change if using another model
TOKENIZER_PATH = "Billyyy/llama_8K_extended" # Path to your trained tokenizer
DATASET_PATH = "/workspace/labeled_corpus.txt" 

# Load model
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")


# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
vocab_size = len(tokenizer)

# 3. Resize Model's Embedding Layer -------------------------------
old_vocab_size = model.get_input_embeddings().num_embeddings


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
old_vocab_size

128256

In [5]:

if vocab_size > old_vocab_size:
    print(f"Resizing model embeddings from {old_vocab_size} ‚Üí {vocab_size}")
    model.resize_token_embeddings(vocab_size)

# Freeze all layers except embedding
for param in model.parameters():
    param.requires_grad = False  # Freeze entire model

# Enable training for embedding layer only
model.get_input_embeddings().weight.requires_grad = True

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Resizing model embeddings from 128256 ‚Üí 135126


The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [6]:
import torch

def count_trainable_parameters(model):
    """Returns the number of trainable parameters in the model."""
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Percentage of trainable parameters: {100 * trainable_params / total_params:.2f}%")
    
    return trainable_params

# Example Usage (after loading your model)
trainable_params = count_trainable_parameters(model)


Total parameters: 8,086,540,288
Trainable parameters: 553,476,096
Percentage of trainable parameters: 6.84%


In [39]:
trainer.train()

ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 777, in convert_to_tensors
    tensor = as_tensor(value)
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 739, in as_tensor
    return torch.tensor(value)
ValueError: too many dimensions 'str'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/workspace/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/workspace/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch
    return self.collate_fn(data)
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/data/data_collator.py", line 45, in __call__
    return self.torch_call(features)
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/data/data_collator.py", line 943, in torch_call
    batch = pad_without_fast_tokenizer_warning(
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/data/data_collator.py", line 66, in pad_without_fast_tokenizer_warning
    padded = tokenizer.pad(*pad_args, **pad_kwargs)
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3397, in pad
    return BatchEncoding(batch_outputs, tensor_type=return_tensors)
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 241, in __init__
    self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
  File "/workspace/.venv/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 793, in convert_to_tensors
    raise ValueError(
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).


In [11]:
merged_dataset = merged_dataset.remove_columns(["text"])

In [12]:
merged_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 645131
    })
    eval: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 34004
    })
})

In [42]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

def show_collated_batch(dataset, tokenizer, data_collator, batch_size=4):
    """Fetches and prints a sample batch after applying the DataCollator."""
    
    # Create DataLoader with DataCollator
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
    
    # Get one batch
    batch = next(iter(dataloader))
    
    for i in range(batch_size):
        print(f"\nüìù **Sample {i+1}:**")
        print("Input IDs:", batch["input_ids"][i].tolist())
        print("Attention Mask:", batch["attention_mask"][i].tolist())

        # Decode tokenized input back to text
        decoded_text = tokenizer.decode(batch["input_ids"][i], skip_special_tokens=True)
        print("Decoded Text:", decoded_text)

        if "labels" in batch:
            print("Labels:", batch["labels"][i].tolist())

# Initialize Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False
)

# Example Usage
show_collated_batch(merged_dataset['train'], tokenizer, data_collator)



üìù **Sample 1:**
Input IDs: [128000, 14066, 77, 29, 81529, 129063, 220, 128972, 129025, 130337, 220, 128594, 129051, 220, 128665, 134437, 130829, 129613, 220, 128989, 129461, 135069, 220, 129807, 131538, 220, 131143, 133635, 132778, 131651, 220, 131061, 128480, 129051, 134429, 128530, 129461, 129126, 220, 128813, 131160, 220, 128480, 133572, 130829, 131208, 131636, 128480, 129461, 131814, 129113, 45458, 130226, 128594, 130829, 130226, 129461, 220, 132551, 128989, 129461, 133545, 130829, 131143, 134429, 131061, 134429, 220, 128480, 131613, 128594, 129051, 220, 128697, 133208, 128594, 220, 130337, 134600, 129325, 220, 130829, 131061, 134429, 131763, 220, 134801, 128594, 130829, 133635, 129461, 133635, 129461, 130337, 220, 134899, 128480, 128382, 129461, 129461, 134076, 134429, 129561, 220, 134429, 128812, 128594, 220, 128972, 129025, 130337, 13, 366, 268, 43093, 3786, 374, 7373, 18641, 449, 44539, 89108, 6444, 889, 527, 1101, 39075, 18250, 1534, 449, 264, 11336, 3786, 13, 128257, 1282

In [44]:
old_vocab_size

128256