<a href="https://colab.research.google.com/github/alexlinapp/proofLLM/blob/main/deepmind.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import datasets
import tiktoken
from datasets import load_dataset_builder
from datasets import load_dataset
from datasets import get_dataset_split_names
from datasets import get_dataset_config_names
from datasets import load_dataset
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from functools import partial

In [2]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "emb_dim": 768,
    "context_length": 1024,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": True
}

In [19]:
import importlib
import GPT2Base
import generation
importlib.reload(GPT2Base)
importlib.reload(generation)
from generation import *
from GPT2Base import *
model = GPTModel(GPT_CONFIG_124M)

In [4]:
iterable_dataset = load_dataset("gsm8k", 'socratic', streaming=True)
datasetdict = load_dataset("gsm8k", 'socratic')
tokenizer = tiktoken.get_encoding("gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

socratic/train-00000-of-00001.parquet:   0%|          | 0.00/2.68M [00:00<?, ?B/s]

socratic/test-00000-of-00001.parquet:   0%|          | 0.00/487k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [12]:
torch.manual_seed(123)
dataset_train = datasetdict['train']
dataset_val = datasetdict['train']
dataset_converted = InstructionDataset(dataset_train, tokenizer)


train_ratio = 0.8
train_size = int(train_ratio * len(dataset_converted))
val_size = len(dataset_converted) - train_size
dataset_train_converted = dataset_converted[:train_size]
dataset_val_converted = dataset_converted[train_size:]



my_collate_fn = partial(custom_collate_fn, allowed_max_length=256)
train_loader = DataLoader(dataset_train_converted, batch_size=4, collate_fn=my_collate_fn, drop_last=True, shuffle=True)
val_loader = DataLoader(dataset_val_converted, batch_size=4, collate_fn=my_collate_fn, drop_last=True, shuffle=True)


In [16]:
torch.manual_seed(123)

# x, y = next(iter(train_loader))
# print(x, y)
# calc_loss_batch(x, y, model, device)
torch.manual_seed(123)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
total_loss = 0
for i, (input_batch, target_batch) in enumerate(train_loader):
  if i >= 3:
    break
  loss = calc_loss_batch(input_batch, target_batch, model, device)
  total_loss += loss.item()
print(total_loss / 3)
torch.manual_seed(123)
total_loss = 0
print(calc_loss_loader(train_loader, model, device, 3))




10.986963589986166
10.986963589986166


In [25]:
tokenizer = tiktoken.get_encoding("gpt2")

In [5]:
'''
Dataset stored internally as python list
'''

class InstructionDataset(Dataset):
  def __init__(self, dataset, tokenizer, max_length=1024):
    self.input_ids = []
    if (isinstance(dataset, datasets.arrow_dataset.Dataset)):
      for entry in dataset:
        formatted_entry = format_input(entry)
        input_id = tokenizer.encode(formatted_entry, allowed_special={"<|endoftext|>"})
        if (len(input_id) > max_length):
          continue
        self.input_ids.append(input_id)
    else:
      print("Not datasets.arrow_dataset.Dataset class. Did not add")

  def __len__(self):
    return len(self.input_ids)
  def __getitem__(self, idx):
    return self.input_ids[idx]

In [6]:
def format_input(input) -> str:
  return ("###Question:\n" + input['question'] + "\n\n###Answer:\n" + input['answer'])



def custom_collate_fn(batch, pad_token_id=50256,
                      ignore_index=-100,
                      allowed_max_length=None,
                      device="cpu"):
  batch_max_length = max(len(item) + 1 for item in batch)
  if allowed_max_length is not None:
    batch_max_length = min(batch_max_length, allowed_max_length+1)


  inputs_lst, targets_lst = [], []
  for item in batch:
    new_item = item.copy()
    new_item += [pad_token_id]
    padded = new_item + ([pad_token_id] * (batch_max_length - len(new_item)))
    inputs = torch.tensor(padded[:-1]);
    targets = torch.tensor(padded[1:])


    mask = targets == pad_token_id
    indices = torch.nonzero(mask).squeeze()
    if indices.numel() > 1:
      targets[indices[1:]] = ignore_index
    if allowed_max_length is not None:
      inputs = inputs[:allowed_max_length]
      targets = targets[:allowed_max_length]


    inputs_lst.append(inputs)
    targets_lst.append(targets)
  inputs_tensor = torch.stack(inputs_lst, dim=0).to(device)
  targets_tensor = torch.stack(targets_lst, dim=0).to(device)
  return inputs_tensor, targets_tensor

In [28]:

def calc_loss_batch(input_batch, target_batch, model, device):
  input_batch = input_batch.to(device)
  target_batch = target_batch.to(device)
  logits = model(input_batch)
  loss = torch.nn.functional.cross_entropy(logits.flatten(0,1), target_batch.flatten(), ignore_index=-100)
  return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
  total_loss = 0
  if len(data_loader) == 0:
    print("Data loader has length 0.")
    return float("nan")
  elif num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(num_batches, len(data_loader))
  for i, (input_batch, target_batch) in enumerate(data_loader):
    if i >= num_batches:
      break
    loss = calc_loss_batch(input_batch, target_batch, model, device)
    total_loss += loss.item()
  return total_loss / num_batches

In [46]:
@torch.no_grad()
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  model.eval()
  model.to(device)

  train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
  val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)

  model.train()
  return train_loss, val_loss

@torch.no_grad()
def generate_text_simple(model, context, max_new_tokens, max_context_length):
  for _ in range(max_new_tokens):
    curr_context = context[:,-max_context_length:] # only process 1 batch of batch size of 1 usually
    logits = model(curr_context)
    logits = logits[:,-1,:]
    probs = torch.softmax(logits, dim=-1)
    next_token = torch.argmax(probs, dim=-1, keepdim=True)  # greedy encoding
    context = torch.cat((context, next_token), dim=1)
  return context


@torch.no_grad()
def generate_and_print_sample(model, tokenizer, device, start_context):
  model.eval()
  context_size = model.pos_emb.weight.shape[0]
  encoded = text_to_token_ids(start_context, tokenizer).to(device)
  token_ids = generate_text_simple(model, encoded, 20, context_size)


  decoded_text = token_ids_to_text(token_ids, tokenizer)
  print(decoded_text)
  print("\n\nInput\n\n", start_context)
  print("\n\nModel Output\n\n", decoded_text[len(start_context):])
  model.train()


@torch.no_grad()
def generate(model, device, start_context, tokenizer, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
  model.eval()
  model.to(device)
  context_ids = text_to_token_ids(start_context, tokenizer).to(device)
  for _ in range(max_new_tokens):
    curr_context_ids = context_ids[:,-context_size:]
    logits = model(curr_context_ids)
    logits = logits[:,-1,:]
    if top_k is not None:
      top_logits, _ = torch.topk(logits, k=top_k)
      min_val = top_logits[:,-1]
      logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(device), logits)

    if temperature > 0.0:
      logits /= temperature
      probs = torch.softmax(logits, dim=-1)
      next_id = torch.multinomial(probs, num_samples=1)
    else:
      next_id = torch.argmax(logits, dim=-1, keepdim=True)
    if next_id == eos_id:
      break
    context_ids = torch.cat((context_ids, next_id), dim=1)
  return context_ids



In [56]:
torch.manual_seed(123)
print(token_ids_to_text(generate(model, device, format_input(start_context), tokenizer, 20, 1024, temperature=0.8, top_k=3), tokenizer))

###Question:
John has 3 apples. He gives 1 to Mary. How many does he have left?

###Answer:
2 of apples does he have? ** He has 6+2 = <<6+2=2>>


In [42]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context, tokenizer):
  model.train()
  model.to(device)
  train_losses, val_losses, track_tokens_seen = [], [], []
  tokens_seen, global_step = 0, -1

  for epoch in range(num_epochs):
    for input_batch, target_batch in train_loader:
      input_batch = input_batch.to(device)
      target_batch = target_batch.to(device)

      optimizer.zero_grad()
      loss = calc_loss_batch(input_batch, target_batch, model, device)
      loss.backward()
      optimizer.step()
      tokens_seen += input_batch.numel()
      global_step += 1

      if (global_step % eval_freq == 0):
        train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        track_tokens_seen.append(tokens_seen)
        print(f"Epoch {epoch+1}, Step {global_step:06d}, Train Loss: {train_loss}, Val Loss: {val_loss}")

    generate_and_print_sample(model, tokenizer, device, start_context)
  return train_losses, val_losses, track_tokens_seen




In [49]:
torch.manual_seed(123)
generate_and_print_sample(model, tokenizer, device, start_context=format_input(start_context))

###Question:
John has 3 apples. He gives 1 to Mary. How many does he have left?

###Answer:
2 does he have? ** He has 3*3=<<3*3=6>>6 books


Input

 ###Question:
John has 3 apples. He gives 1 to Mary. How many does he have left?

###Answer:
2


Model Output

  does he have? ** He has 3*3=<<3*3=6>>6 books


In [29]:
import time
torch.manual_seed(123)
start_time = time.time()

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.1)
num_epochs=1
start_context = {'question' : "John has 3 apples. He gives 1 to Mary. How many does he have left?", 'answer' : "2"}
train_losses, val_losses, tokens_seen = train_model_simple(model, train_loader, val_loader,
                                                           optimizer, device,
                                                           num_epochs,
                                                           eval_freq=20,
                                                           eval_iter=5,
                                                           start_context=format_input(start_context),
                                                           tokenizer=tokenizer)

Epoch 1, Step 000000, Train Loss: 10.54893226623535, Val Loss: 10.532594299316406
Epoch 1, Step 000020, Train Loss: 7.93206787109375, Val Loss: 7.783526229858398
Epoch 1, Step 000040, Train Loss: 6.440094757080078, Val Loss: 6.674666213989258
Epoch 1, Step 000060, Train Loss: 5.947899627685547, Val Loss: 6.008134555816651
Epoch 1, Step 000080, Train Loss: 5.738847255706787, Val Loss: 5.909464263916016
Epoch 1, Step 000100, Train Loss: 5.366003227233887, Val Loss: 5.635709190368653
Epoch 1, Step 000120, Train Loss: 5.462082481384277, Val Loss: 5.497162818908691
Epoch 1, Step 000140, Train Loss: 5.3287656784057615, Val Loss: 5.300772476196289
Epoch 1, Step 000160, Train Loss: 5.139032077789307, Val Loss: 5.186255264282226
Epoch 1, Step 000180, Train Loss: 4.928727054595948, Val Loss: 4.9905029296875
Epoch 1, Step 000200, Train Loss: 5.195960998535156, Val Loss: 4.988361263275147
Epoch 1, Step 000220, Train Loss: 4.92177677154541, Val Loss: 5.167460346221924
Epoch 1, Step 000240, Train Lo

IndexError: index -1024 is out of bounds for dimension 1 with size 29

In [57]:
'''

Loading and Saving the model in PyTorch
'''
torch.save({"model_state_dict" : model.state_dict(), "optimizer_state_dict" : optimizer.state_dict()}, "deepMindV1_model_and_optimizer.pth")

In [65]:
checkpoint = torch.load("deepMindV1_model_and_optimizer.pth", map_location=device)
loadedModel = GPTModel(GPT_CONFIG_124M)
loadedModel.load_state_dict(checkpoint["model_state_dict"])
loadedOptimizer = torch.optim.AdamW(loadedModel.parameters(), lr=5e-4, weight_decay=0.1)
loadedOptimizer.load_state_dict(checkpoint["optimizer_state_dict"])
loadedModel.train();