In [1]:
import torch
from model import *
from utils import *
# Uncomment to force CPU execution instead of the device already set on `args`.
# args.device = "cpu"
# 50257 is the vocabulary size of the GPT-2 BPE encoding; `args` itself comes
# from one of the star imports above (which one is not visible here — TODO confirm).
args.vocab_size = 50257

In [2]:
# Build the model from the shared `args` config and move it to `args.device`.
model = Model(args).to(args.device)


In [3]:
# Optional one-time setup — uncomment to install into the kernel's environment:
# %pip install transformers
# %pip install tiktoken

In [4]:
# Disabled alternative: load a Hugging Face tokenizer instead of tiktoken.
# from transformers import AutoTokenizer, PreTrainedTokenizerFast

# tokenizer = AutoTokenizer.from_pretrained("akshatshaw/LLaMA_hin1")
# Active tokenizer: tiktoken's GPT-2 byte-pair encoding.
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2") 

In [5]:
# Hindi prompt reused by the generation demo further down.
start_context = "आपके द्वारा चुनी गई भाषा में लिखना आसान बनाता है|"
# Bare last expression so the notebook displays the encoded ids. In the recorded
# output this short sentence becomes 71 token ids on cuda:0.
text_to_token_ids(start_context, tokenizer)

tensor([[11976,   228, 11976,   103, 11976,   243, 24231,   229, 28225,    99,
         24231,   235, 11976,   113, 48077, 11976,   108, 48077, 28225,   248,
         24231,   223, 11976,   101, 24231,   222, 28225,   245, 11976,   230,
         28225,   255, 48077, 11976,   115, 48077, 28225,   106, 24231,   229,
         11976,   224, 28225,   110, 11976,   123, 11976,   244, 11976,   101,
         48077, 28225,   228, 11976,   116, 48077, 11976,   101, 28225,   105,
         11976,   101, 48077, 11976,    97, 48077, 28225,   117, 24231,   230,
            91]], device='cuda:0')

In [6]:
# Sanity check before any training has run in this notebook: extend the prompt
# by 10 tokens and decode the result back to text.
# The seed is fixed first; presumably this matters only if generate_text_simple
# (or the model) draws random numbers — TODO confirm.
torch.manual_seed(10)
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=args.max_seq_len
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 आपके द्वारा चुनी गई भाषा में लिखना आसान बनाता है| Buck conceptual perished Here Sort indicative therefore compromises?),apped


In [7]:
# Read the whole corpus into memory as one UTF-8 string.
# The path is relative to the notebook's working directory.
file_path = "mini.txt"
with open(file_path, "r", encoding="utf-8") as file:
 text_data = file.read()

In [8]:
# Report corpus size in characters and in tokenizer tokens.
# In the recorded output there are more tokens (44917) than characters (29962).
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 29962
Tokens: 44917


In [9]:
# Hold out the last 10% of the corpus for validation.
# The cut is made on the raw string (characters), before tokenization.
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

In [10]:
# Size check on a dataset built from the FULL corpus (text_data, not train_data).
# The two trailing arguments are both args.max_seq_len; presumably they are the
# window length and stride, mirroring the loader arguments below — TODO confirm
# against ModelDataset's signature.
dataset = ModelDataset(text_data, tokenizer, args.max_seq_len, args.max_seq_len)
# Use the built-in len() rather than calling the dunder method directly.
len(dataset)

175

In [11]:
# Build the training and validation loaders from the character-level split.
# The seed is fixed before construction; presumably it makes the shuffled
# training order repeatable — TODO confirm how create_dataloader_v1 shuffles.
torch.manual_seed(123)
# Training loader: shuffle=True and drop_last=True are requested.
# stride equals max_length in both loaders.
train_loader = create_dataloader_v1(
 train_data,
 tokenizer,
 batch_size=2,
 max_length=args.max_seq_len,
 stride=args.max_seq_len,
 drop_last=True,
 shuffle=True,
 num_workers=0
)
# Validation loader: same windowing, but shuffle=False and drop_last=False.
val_loader = create_dataloader_v1(
 val_data,
 tokenizer,
 batch_size=2,
 max_length=args.max_seq_len,
 stride=args.max_seq_len,
 drop_last=False,
 shuffle=False,
 num_workers=0
)

In [12]:
# print("Train loader:")
# for x, y in train_loader:
#  print(x.shape, y.shape)
# print("\nValidation loader:")
# for x, y in val_loader:
#  print(x.shape, y.shape)

In [13]:
def calc_loss_batch(input_batch, target_batch, model, device=args.device):
    """Compute the cross-entropy loss of ``model`` on a single batch.

    Args:
        input_batch: Tensor of inputs fed to ``model``.
        target_batch: Tensor of target class indices for ``input_batch``.
        model: Callable returning logits for ``input_batch``.
        device: Device both tensors are moved to before the forward pass.
            The default is read from ``args.device`` once, when this function
            is defined.

    Returns:
        The loss tensor returned by ``torch.nn.functional.cross_entropy``.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    predictions = model(inputs)
    # Merge the first two logits dimensions and flatten the targets to 1-D
    # (presumably batch and sequence positions — TODO confirm shapes).
    flat_logits = predictions.flatten(0, 1)
    flat_targets = targets.flatten()
    return torch.nn.functional.cross_entropy(flat_logits, flat_targets)

In [14]:
def calc_loss_loader(data_loader, model, device= args.device, num_batches=None):
    """Average the per-batch loss over the first ``num_batches`` batches.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)`` pairs.
        model: Model passed through to ``calc_loss_batch``.
        device: Device passed through to ``calc_loss_batch``. The default is
            read from ``args.device`` once, when this function is defined.
        num_batches: Maximum number of batches to evaluate. ``None`` means all
            batches; larger values are clamped to ``len(data_loader)``.

    Returns:
        The mean of the per-batch losses as a Python float, or ``nan`` when
        there is nothing to evaluate (empty loader, or ``num_batches <= 0``).
    """
    if len(data_loader) == 0:
        return float("nan")
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    # A non-positive batch budget previously led to a division by zero (or a
    # meaningless result); report "no measurement" instead.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.0
    batches_seen = 0
    for input_batch, target_batch in data_loader:
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
        batches_seen += 1
        # Stop right after the last wanted batch so the loader is not asked
        # for one extra batch that would only be thrown away.
        if batches_seen >= num_batches:
            break

    if batches_seen == 0:
        return float("nan")
    # Divide by the number of batches actually processed.
    return total_loss / batches_seen

In [15]:
# Baseline losses before any training, averaged over every batch of each loader
# (num_batches is left at its default of None).
# NOTE(review): the model is not switched to eval mode in this cell; if Model
# contains dropout these numbers include its noise — confirm.
# The recorded values (~11.0) are close to ln(50257) ≈ 10.82, the loss of a
# uniform guess over the vocabulary.
torch.manual_seed(10)
with torch.no_grad():
  train_loss = calc_loss_loader(train_loader, model)
  val_loss = calc_loss_loader(val_loader, model)
print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 11.00541517062065
Validation loss: 11.01105456882053


In [16]:
def train_model_simple(model, train_loader, val_loader, optimizer, device,
                       num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    """Train ``model`` with periodic evaluation and a text sample after each epoch.

    Args:
        model: Model to optimise; put into training mode at the start of each epoch.
        train_loader: Iterable of ``(input_batch, target_batch)`` training pairs.
        val_loader: Validation loader handed to ``evaluate_model``.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called once per batch.
        device: Device forwarded to the loss, evaluation and sampling helpers.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate whenever the global step is a multiple of this value.
        eval_iter: Batch budget forwarded to ``evaluate_model``.
        start_context: Prompt used for the sample printed after every epoch.
        tokenizer: Tokenizer forwarded to ``generate_and_print_sample``.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three lists with one
        entry per evaluation.
    """
    train_losses = []
    val_losses = []
    track_tokens_seen = []
    tokens_seen = 0
    # Starts at -1 so the first batch is step 0 and is therefore always evaluated.
    global_step = -1

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            # Disabled experiment kept for reference:
            # model.apply(lambda module: module.reset_kv_cache() if isinstance(module, MHA) else None)

            # One optimisation step: clear old gradients, backpropagate, update.
            optimizer.zero_grad()
            calc_loss_batch(input_batch, target_batch, model, device).backward()
            optimizer.step()

            tokens_seen += input_batch.numel()
            global_step += 1

            # Only every eval_freq-th step is evaluated and logged.
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter
            )
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            track_tokens_seen.append(tokens_seen)
            print(f"Ep {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f}, "
                  f"Val loss {val_loss:.3f}")

        # Qualitative check once the epoch is finished.
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen

In [18]:
# Evaluate the training and validation losses without updating the model.
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` averaged over at most ``eval_iter`` batches.

    The model is switched to eval mode for the measurement and gradient
    tracking is disabled. Afterwards the mode the model had on entry is
    restored, even if the evaluation raises, instead of always forcing
    training mode.

    Args:
        model: Module to evaluate.
        train_loader: Loader used for the training-loss estimate.
        val_loader: Loader used for the validation-loss estimate.
        device: Device forwarded to ``calc_loss_loader``.
        eval_iter: Batch budget forwarded as ``num_batches``.

    Returns:
        Tuple ``(train_loss, val_loss)`` as returned by ``calc_loss_loader``.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            train_loss = calc_loss_loader(
                train_loader, model, device, num_batches=eval_iter
            )
            val_loss = calc_loss_loader(
                val_loader, model, device, num_batches=eval_iter
            )
    finally:
        # Put the model back into whatever mode the caller had it in.
        model.train(was_training)
    return train_loss, val_loss

In [19]:
# Generate a continuation of a prompt and print it on a single line.
def generate_and_print_sample(model, tokenizer, device, start_context, max_new_tokens=50):
    """Print a sample continuation of ``start_context`` produced by ``model``.

    The model is put into eval mode and gradient tracking is disabled while
    generating. The mode the model had on entry is restored afterwards, even
    if generation raises.

    Args:
        model: Module used for generation.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        device: Device the encoded prompt is moved to.
        start_context: Prompt text to continue.
        max_new_tokens: Number of tokens to generate (defaults to 50, the
            value that was previously hard-coded).

    Returns:
        None. The decoded text is printed with newlines replaced by spaces.
    """
    was_training = model.training
    model.eval()
    try:
        # The context window is read from the notebook-level config object.
        context_size = args.max_seq_len
        encoded = text_to_token_ids(start_context, tokenizer).to(device)
        with torch.no_grad():
            token_ids = generate_text_simple(
                model=model, idx=encoded,
                max_new_tokens=max_new_tokens, context_size=context_size
            )
        decoded_text = token_ids_to_text(token_ids, tokenizer)
        # Keep the sample on one line so it does not break up the training log.
        print(decoded_text.replace("\n", " "))
    finally:
        # Put the model back into whatever mode the caller had it in.
        model.train(was_training)

In [None]:
# Ready.... Set.... Go!
# Launch training on the model built earlier in the notebook.
torch.manual_seed(123)

# Leftover from an earlier variant of this notebook; the model created above is reused instead.
# model = GPTModel(GPT_CONFIG_124M)
# model.to(device)

# AdamW over all model parameters with a learning rate of 4e-4 and weight decay of 0.1.
optimizer = torch.optim.AdamW(
    model.parameters(), 
    lr=0.0004, weight_decay=0.1
)
num_epochs = 1

# Evaluate every 5 optimisation steps on at most 5 batches per loader.
# Note that the sample prompt here is English, unlike the Hindi start_context
# defined earlier in the notebook.
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, args.device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="My name is Akshat shaw, I am a student at IIT Roorkee", tokenizer=tokenizer
)