<a href="https://colab.research.google.com/github/akshatshaw/LLaMA/blob/main/model_pretrain_llama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clone the project repository into ./LLaMA. Later cells `cd` into the checkout and
# import `model` / `utils` (presumably the modules shipped in this repository).
!git clone https://github.com/akshatshaw/LLaMA

Cloning into 'LLaMA'...
remote: Enumerating objects: 46, done.[K
remote: Counting objects: 100% (46/46), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 46 (delta 19), reused 32 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (46/46), 1.69 MiB | 11.21 MiB/s, done.
Resolving deltas: 100% (19/19), done.


In [1]:
cd LLaMA/

/content/LLaMA


In [14]:
import torch
from model import *
from utils import *
# args.device = "cpu"
# args.vocab_size = 150000

class args:
    """Model hyper-parameters and runtime settings used throughout this notebook.

    The class is used as a plain namespace: the class object itself (not an
    instance) is passed to ``Model(args)`` and fields are read as ``args.<name>``.
    """

    dim: int = 2048
    n_layers: int = 16
    n_heads: int = 16  # author's note: should be a multiple of 8
    hidden_dim: int = 14336
    n_kv_heads: int = 16  # author's note: should be a multiple of 8
    vocab_size: int = 150000  # change this when using a tokenizer with a different vocabulary size
    multiple_of: int = 256
    ffn_dim_multiplier: Optional[float] = None
    norm_eps: float = 1e-5
    theta: float = 10000.0
    # Annotated as Optional because the default is None (was annotated as plain `int`).
    context_size: Optional[int] = None

    # Needed for KV cache
    max_batch_size: int = 64
    max_seq_len: int = 512  # also used below as the dataloader window length and generation context

    num_experts: int = 8
    top_k_experts: int = 2

    device: str = "cuda"

In [11]:
# Ask PyTorch's caching allocator to release cached GPU memory that is no longer in use.
# Memory still referenced by live tensors (e.g. an already-created `model`) is not freed.
torch.cuda.empty_cache()

In [15]:
# Build the network from the `args` configuration and move it to `args.device`.
# NOTE(review): the recorded output of this cell is a CUDA OutOfMemoryError on a 14.74 GiB
# GPU that already had ~14.7 GiB in use — possibly an earlier `model` still resident from a
# previous run of this cell. Confirm, and free it (or shrink `args`) before re-running.
model = Model(args).to(args.device)

OutOfMemoryError: CUDA out of memory. Tried to allocate 44.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 6.12 MiB is free. Process 24147 has 14.73 GiB memory in use. Of the allocated memory 14.58 GiB is allocated by PyTorch, and 29.49 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# !pip install transformers
# !pip install tiktoken

In [4]:
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# Load the tokenizer published on the Hugging Face Hub as "akshatshaw/LLaMA_hin1".
# NOTE(review): `args.vocab_size` has to match this tokenizer's vocabulary — confirm.
tokenizer = AutoTokenizer.from_pretrained("akshatshaw/LLaMA_hin1")
# Alternative left disabled by the author: tiktoken's GPT-2 encoding.
# import tiktoken
# tokenizer = tiktoken.get_encoding("gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# Hindi prompt reused by the quick generation check in the next cell.
start_context = "आपके द्वारा चुनी गई भाषा में लिखना आसान बनाता है|"
# Last expression of the cell: displays the prompt encoded as a tensor of token ids.
text_to_token_ids(start_context, tokenizer)

tensor([[ 6591,  6410, 17978,  6295,  7096,  6078, 12466,  8058, 10779,  6073,
            56]], device='cuda:0')

In [16]:
# Sanity check before training: extend the prompt by 10 tokens with the current model.
torch.manual_seed(10)  # fixed seed so the check is repeatable
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=args.max_seq_len
)
# Decode the prompt plus the generated continuation back to text.
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 आपके द्वारा चुनी गई भाषा में लिखना आसान बनाता है | पहचानिये दिखनेवाले सीआरपीएफ़ देखरेख कगी ओप्‍पो मानतीं बग़ल शिशोदा झुनझुने


In [24]:
# Training corpus; the absolute path assumes the repository was cloned under /content (Colab).
file_path = "/content/LLaMA/Full_text_Bible.txt"
with open(file_path, encoding="utf-8") as file:  # text read mode is open()'s default
    text_data = file.read()

In [25]:
# Corpus size in characters and in tokens (the whole text is encoded once just to count).
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 3693013
Tokens: 936185


In [26]:
# 90/10 train/validation split. The cut is made on the raw character string,
# i.e. before tokenisation.
train_ratio = 0.90
split_idx = int(len(text_data) * train_ratio)
train_data, val_data = text_data[:split_idx], text_data[split_idx:]

In [27]:
# Build a dataset over the *full* text (window length and stride both max_seq_len)
# only to report how many samples it yields.
dataset = ModelDataset(text_data, tokenizer, args.max_seq_len, args.max_seq_len)
len(dataset)  # idiomatic spelling of dataset.__len__()

1828

In [28]:
torch.manual_seed(123)

# Settings shared by both loaders: window length equals stride (presumably giving
# non-overlapping windows — confirm against create_dataloader_v1 in utils).
loader_kwargs = dict(
    batch_size=64,
    max_length=args.max_seq_len,
    stride=args.max_seq_len,
    num_workers=0,
)

# Training: shuffled, incomplete final batch dropped.
train_loader = create_dataloader_v1(
    train_data, tokenizer, drop_last=True, shuffle=True, **loader_kwargs
)
# Validation: fixed order, every sample kept.
val_loader = create_dataloader_v1(
    val_data, tokenizer, drop_last=False, shuffle=False, **loader_kwargs
)

In [29]:
# Shape check: print the input/target shapes of the first batch of each loader.
print("Train loader:")
for x, y in train_loader:
 print(x.shape, y.shape)
 break  # only the first batch is needed
print("\nValidation loader:")
for x, y in val_loader:
 print(x.shape, y.shape)
 break  # only the first batch is needed

Train loader:
torch.Size([64, 512]) torch.Size([64, 512])

Validation loader:
torch.Size([64, 512]) torch.Size([64, 512])


In [30]:
def calc_loss_batch(input_batch, target_batch, model, device=None):
    """Compute the cross-entropy loss of ``model`` on a single batch.

    Args:
        input_batch: Tensor of input token ids; moved to ``device`` before the forward pass.
        target_batch: Tensor of target token ids; flattened to 1-D for the loss.
        model: Callable returning logits whose first two dimensions are flattened
            together (assumed ``(batch, seq, vocab)`` — confirm against ``Model``).
        device: Device to run on. ``None`` (the default) resolves to ``args.device``
            at call time, so changing ``args.device`` after this cell was defined
            still takes effect.

    Returns:
        The loss tensor produced by ``torch.nn.functional.cross_entropy`` with its
        default reduction.
    """
    if device is None:
        # Late binding: a `device=args.device` default would be frozen at definition time.
        device = args.device
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    logits = model(input_batch)
    # One row of logits per token position, one target id per row.
    return torch.nn.functional.cross_entropy(
        logits.flatten(0, 1), target_batch.flatten()
    )

In [31]:
def calc_loss_loader(data_loader, model, device=None, num_batches=None):
    """Average ``calc_loss_batch`` over the first batches of ``data_loader``.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)`` pairs.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``. ``None`` (the default)
            resolves to ``args.device`` at call time.
        num_batches: Maximum number of batches to evaluate. ``None`` means all of
            them; larger values are clamped to ``len(data_loader)``.

    Returns:
        The mean per-batch loss as a float, or ``nan`` when there is nothing to
        evaluate (empty loader or ``num_batches <= 0``).
    """
    if len(data_loader) == 0:
        return float("nan")
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    if num_batches <= 0:
        # Previously this fell through to a division by zero.
        return float("nan")
    if device is None:
        device = args.device

    total_loss = 0.0
    # `range` comes first in zip() so iteration stops without pulling (and
    # collating) an extra batch from the loader once the quota is reached.
    for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches  # mean over the evaluated batches

In [32]:
# torch.manual_seed(10)
# with torch.no_grad():
#   train_loss = calc_loss_loader(train_loader, model)
#   val_loss = calc_loss_loader(val_loader, model)
# print("Training loss:", train_loss)
# print("Validation loss:", val_loss)

In [36]:
def train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs, eval_freq, eval_iter, start_context, tokenizer,
):
    """Run a basic training loop with periodic evaluation and text sampling.

    Every batch of ``train_loader`` triggers one optimizer update. Whenever the
    running step counter is a multiple of ``eval_freq`` (including step 0), the
    train/validation losses are measured with ``evaluate_model`` over at most
    ``eval_iter`` batches each, logged, and a sample continuation of
    ``start_context`` is printed.

    Returns:
        A tuple ``(train_losses, val_losses, tokens_seen)`` of three lists holding
        one entry per evaluation point.
    """
    train_hist, val_hist, token_hist = [], [], []
    seen = 0
    global_step = -1  # bumped before the check below, so the first evaluation is at step 0

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            # (A per-batch KV-cache reset was left disabled here by the author.)
            optimizer.zero_grad()  # drop gradients from the previous batch
            calc_loss_batch(input_batch, target_batch, model, device).backward()
            optimizer.step()  # apply the update

            seen += input_batch.numel()
            global_step += 1
            if global_step % eval_freq != 0:
                continue

            # Periodic evaluation and sample generation.
            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter
            )
            train_hist.append(train_loss)
            val_hist.append(val_loss)
            token_hist.append(seen)
            print(
                f"Ep {epoch+1} (Step {global_step:06d}): "
                f"Train loss {train_loss:.3f}, "
                f"Val loss {val_loss:.3f}"
            )
            generate_and_print_sample(model, tokenizer, device, start_context)

    return train_hist, val_hist, token_hist

In [34]:
# Evaluate the training and validation losses
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Measure the mean loss on the training and validation loaders.

    The model is put in eval mode and gradient tracking is disabled while at
    most ``eval_iter`` batches of each loader are scored; the model is then
    switched back to train mode.

    Returns:
        ``(train_loss, val_loss)`` as returned by ``calc_loss_loader``.
    """
    model.eval()  # eval-mode behaviour (e.g. dropout off) while measuring
    with torch.no_grad():  # no gradients are needed for evaluation
        train_loss, val_loss = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    return train_loss, val_loss

In [35]:
# Generate and print a sample text with generate_and_print_sample
def generate_and_print_sample(model, tokenizer, device, start_context):
    """Print a 50-token continuation of ``start_context`` on a single line.

    The model is switched to eval mode for generation and back to train mode
    afterwards. The context window is read from ``args.max_seq_len``.
    """
    model.eval()
    max_context = args.max_seq_len
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        generated = generate_text_simple(
            model=model,
            idx=prompt_ids,
            max_new_tokens=50,
            context_size=max_context,
        )
    sample = token_ids_to_text(generated, tokenizer)
    # Collapse newlines so each sample occupies one line of the training log.
    print(" ".join(sample.split("\n")))
    model.train()

In [None]:
# torch.cuda.empty_cache()

In [37]:
# Ready.... Set.... Go!
torch.manual_seed(123)

# model = GPTModel(GPT_CONFIG_124M)
# model.to(device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.0004, weight_decay=0.1
)
num_epochs = 1

train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, args.device,
    num_epochs=num_epochs, eval_freq=50, eval_iter=50,
    start_context="एक समय की बात है", tokenizer=tokenizer
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 352.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 258.12 MiB is free. Process 24147 has 14.49 GiB memory in use. Of the allocated memory 14.15 GiB is allocated by PyTorch, and 214.59 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def generate(model, idx, max_new_tokens, context_size,
             temperature=0.0, top_k=None, eos_id=None):
    """Autoregressively extend ``idx`` by up to ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping token ids of shape ``(batch, seq)`` to logits of
            shape ``(batch, seq, vocab)``.
        idx: Integer tensor of prompt token ids, shape ``(batch, seq)``.
        max_new_tokens: Upper bound on the number of tokens appended.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        temperature: When ``> 0``, the next token is sampled from
            ``softmax(logits / temperature)``; otherwise the argmax is taken.
        top_k: When given, every logit below a row's k-th largest logit is set to
            ``-inf`` before the token is chosen. Values larger than the vocabulary
            size are clamped to it.
        eos_id: When given, generation stops as soon as every sequence in the
            batch produces this id in the same step; that token is not appended.

    Returns:
        Tensor of shape ``(batch, seq + n_generated)`` holding the prompt followed
        by the generated ids.
    """
    with torch.no_grad():
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -context_size:]  # crop to the supported context
            logits = model(idx_cond)[:, -1, :]  # keep only the last time step

            if top_k is not None:
                k = min(top_k, logits.size(-1))
                top_logits, _ = torch.topk(logits, k)
                # Keep the trailing dimension so the threshold is (batch, 1) and is
                # compared row by row; a (batch,) threshold mis-broadcasts against
                # (batch, vocab) whenever batch > 1.
                min_val = top_logits[:, -1:]
                logits = logits.masked_fill(logits < min_val, float("-inf"))

            if temperature > 0.0:
                probs = torch.softmax(logits / temperature, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                idx_next = torch.argmax(logits, dim=-1, keepdim=True)

            # `.all()` reduces to a single boolean, so this is also valid for
            # batch > 1 (where `if idx_next == eos_id` raised on the tensor).
            if eos_id is not None and bool((idx_next == eos_id).all()):
                break
            idx = torch.cat((idx, idx_next), dim=1)
    return idx

In [None]:
# Sample 25 tokens with `generate`: top_k=15 keeps only the 15 largest logits per step, and
# temperature > 0 switches from argmax to multinomial sampling (logits are divided by it).
token_ids_out = generate(
    model=model,
    idx=text_to_token_ids("एक समय की बात ", tokenizer),
    max_new_tokens=25,
    context_size=args.max_seq_len,
    top_k=15,
    temperature=4
)
print("Output text:\n", token_ids_to_text(token_ids_out, tokenizer))

Output text:
 एक समय की बात कों जाली हि झटपट आदि बनाता डालेंगे डालेंगे देवताओं खा सकेंगें दसवें दसवें पशुओं बनाने लगे वैसा बड़ी बड़ी पहिली दक्खिन मैदे ड़ों ईश्वरों बनी
