<a href="https://colab.research.google.com/github/andryD-ai/ChatBot/blob/dev/Gpt2_LLM_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##Download dataset and minbpe

In [1]:
from tqdm import tqdm

In [2]:
# Generate 10,000 customer service related question-answer pairs for download
customer_service_questions = [

]


# Define corresponding answers for customer service questions
customer_service_answers = [

]


##Transformer

In [3]:
!pip install tiktoken



In [4]:
import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
from dataclasses import dataclass
import regex as re
import string
from transformers import AutoTokenizer
from tqdm import tqdm
import time



In [5]:
@dataclass
class GPT2Config():
  vocab_size: int = 50257 #number of tokens: 50k tokens BPE + 256 byte tokens + 1 token <|endoftext|>
  block_size: int = 1024  #max sequence length
  n_head: int = 12  #number of head
  n_layer: int = 12 #number of layer
  n_embd: int = 768 #number of embedding dimension
  device: str = "cpu"

@dataclass
class LoRAConfig():
  rank:int = 8
  alpha:int = 16
  dropout:float = 0.2

class DataLoaderLite():
  def __init__(self, B:int, T:int, file_data_path:str):
    self.B = B
    self.T = T

    #load token from file and store in memory
    list_lines = self.read_chat(file_data_path)

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    #Add special token
    max_vocab_id = tokenizer.vocab_size
    tokenizer.special_tokens = {
        "<|startoftext|>": max_vocab_id + 1,
        "<|separator|>": max_vocab_id + 2,
        "<|endoftext|>": max_vocab_id + 3,
        "<|unk|>": max_vocab_id + 4
    }
    self.tokens = []

    print("Encode tokens:")
    for line in tqdm(list_lines):
      tokens = tokenizer.encode(line)
      self.tokens += tokens
    self.tokens = torch.tensor(self.tokens)

    print(len(self.tokens))

    print(f"Loaded {len(self.tokens)} from {file_data_path}")
    print(f"1 epoch: {len(self.tokens) // (B*T)} batch")

    self.current_position = 0

  def read_chat(self, file_path: str):

    print("Reading data text and filtering")
    # with open(file_path, 'r', encoding='utf-8') as f:
    #     lines = f.readlines()

    # Apply filters to remove unwanted lines
    filtered_lines = []
    tab_token = "\t"
    start_of_text_token = "<|startoftext|>"
    end_of_text_token = "<|endoftext|>"
    separator_token = "<|separator|>"
    # for line in tqdm(lines):
    #   if (bool(re.search(r'^[A-Za-z0-9,\.?!\t ]+$', line))):
    #     line = line.replace(tab_token, separator_token).strip()
    #     line = line + end_of_text_token
    #     line = start_of_text_token + line
    #     filtered_lines.append(line)

    # in this test get data from custom list
    for question, answer in zip(customer_service_questions, customer_service_answers):
      filtered_lines.append(f"{start_of_text_token}User{separator_token}{question}{end_of_text_token}{start_of_text_token}Assistant{separator_token}{answer}{end_of_text_token}")

    return filtered_lines

  def next_batch(self):
    B = self.B
    T = self.T

    buff = self.tokens[self.current_position: self.current_position+B*T+1]
    x = buff[:-1].view(B,T) # input
    y = buff[1:].view(B,T)  # output

    # update possition in tensor
    self.current_position += B*T

    if self.current_position + B*T + 1 > len(self.tokens):
      self.current_position = 0

    return x, y

class Attention(nn.Module):
  def __init__(self, config):
    super().__init__()
    assert config.n_embd % config.n_head == 0
    self.n_embd = config.n_embd
    self.n_head = config.n_head

    # key, query, value for all head in a batch
    self.c_attn = nn.Linear(config.n_embd, config.n_embd*3)
    # output projection
    self.c_proj = nn.Linear(config.n_embd, config.n_embd)
    self.c_proj.NANOGPT_SCALE_INIT = 1


  def forward(self, x):
    B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
    # calculate key, query and values for all heads in batch and move it forward to the batch dimension
    # where nh is number of heads, hs is head size, and C is number of channel (equal nh * hs)
    # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
    kqv = self.c_attn(x)
    query, key, value = kqv.split(self.n_embd, dim = 2)
    key = key.view(B, T, self.n_head, C // self.n_head).transpose(1,2)     # (B, nh, T, hs)
    query = query.view(B, T, self.n_head, C // self.n_head).transpose(1,2) # (B, nh, T, hs)
    value = value.view(B, T, self.n_head, C // self.n_head).transpose(1,2) # (B, nh, T, hs)

    y = F.scaled_dot_product_attention(query, key, value, is_causal=True) # flash attention
    y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
    # output projection
    y = self.c_proj(y)

    return y

class MLP(nn.Module):
  def __init__(self, config):
    super().__init__()

    self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
    self.gelu = nn.GELU(approximate="tanh")
    self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
    self.c_proj.NANOGPT_SCALE_INIT = 1

  def forward(self, x):
    x = self.c_fc(x)
    x = self.gelu(x)
    x = self.c_proj(x)

    return x

class Block(nn.Module):
  def __init__(self, config) -> None:
     super().__init__()

     self.ln_1 = nn.LayerNorm(config.n_embd)
     self.attn = Attention(config)
     self.ln_2 = nn.LayerNorm(config.n_embd)
     self.mlp = MLP(config)

  def forward(self, x):
    x = x + self.attn(self.ln_1(x))
    x = x + self.mlp(self.ln_2(x))

    return x

class GPT2(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.config = config

    self.transformer = nn.ModuleDict(dict(
      wte = nn.Embedding(config.vocab_size, config.n_embd),
      wpe = nn.Embedding(config.block_size, config.n_embd),
      h = nn.ModuleList(Block(config) for _ in range(config.n_layer)),
      ln_f = nn.LayerNorm(config.n_embd)
    ))

    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    # weight sharing scheme (like gpt2 pager)
    self.transformer.wte.weight = self.lm_head.weight

    #init params
    self.apply(self._init_weight)

  def _init_weight(self, module):
    if isinstance(module, nn.Linear):
      std = 0.02
      if hasattr(module, 'NANOGPT_SCALE_INIT'):
        std *= (2 * self.config.n_layer) ** -0.5
      torch.nn.init.normal_(module.weight, mean=0.0, std=std)
      if module.bias is not None:
        torch.nn.init.zeros_(module.bias)
    elif isinstance(module,nn.Embedding):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

  def forward(self, x, targets=None):
    B, T = x.size()
    assert T <= self.config.block_size, f"The length of the squence is {T}, is over block size {self.config.block_size}"

    #possisition and token embdding
    pos = torch.arange(0, T, dtype=torch.long, device=x.device) # (T)
    pos_embd = self.transformer.wpe(pos)
    tok_embd = self.transformer.wte(x)
    x = pos_embd + tok_embd

    #Block
    for block in self.transformer.h:
      x = block(x)

    #Final layernorm and the classifier
    x = self.transformer.ln_f(x)

    logits = self.lm_head(x) # (B, T, vocab_size)

    #loss
    loss = 0

    if targets is not None:
      loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

    return logits, loss

  @classmethod
  def from_pretrained(cls, model_type):
    assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}

    from transformers import GPT2LMHeadModel
    print(f"Loading weights from pretrained gpt model {model_type}")

    # n_layer, n_head and n_embd are determined from model_type
    config_args = {
        'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
        'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
        'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
        'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
    }[model_type]

    config_args['vocab_size'] = 50257
    config_args['block_size'] = 1024

    device = "cuda" if torch.cuda.is_available() else  "cpu"
    config_args['device'] = device

    config = GPT2Config(**config_args)
    model = GPT2(config)

    state_dict = model.state_dict()
    sd_keys = state_dict.keys()
    sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

    model_hf = GPT2LMHeadModel.from_pretrained(model_type)
    state_dict_hf = model_hf.state_dict()
    sd_hf_keys = state_dict_hf.keys()
    sd_hf_keys = [k for k in sd_hf_keys if not k.endswith('.attn.masked_bias')] #ignore buffer
    sd_hf_keys = [k for k in sd_hf_keys if not k.endswith('.attn.bias')]   #ignore mask

    assert len(sd_keys) == len(sd_hf_keys), f"(Copy weights) Mismatched keys in model: new model - {len(sd_keys)} != pretrained model {len(sd_hf_keys)}"

    transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
    # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
    # this means that we have to transpose these weights when we import them
    for k in sd_keys:
      if any(k.endswith(ew) for ew in transposed):
        # special treatment for the Conv1D weights we need to transpose
        assert state_dict_hf[k].shape[::-1] == state_dict[k].shape, f"state_dict[k] = {state_dict[k].shape}, state_dict_hf[k] = {state_dict_hf[k].shape} in {k}"
        with torch.no_grad():
          state_dict[k].copy_(state_dict_hf[k].t())
      else:
        # vanilla copy over the other parameters
        assert state_dict[k].shape == state_dict_hf[k].shape, f"state_dict[k] = {state_dict[k].shape}, state_dict_hf[k] = {state_dict_hf[k].shape} in {k}"
        with torch.no_grad():
          state_dict[k].copy_(state_dict_hf[k])

    return model

  def save_checkpoint(self,
                      model,
                      optimizer: torch.optim.Optimizer,
                      epoch: int,
                      loss: float,
                      file_path: str = "checkpoint.pth"
                      ) -> None:

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss
    }
    torch.save(checkpoint, file_path)

  def model_train(self, data_loader:DataLoaderLite,  B:int, T:int, epoch:int):

    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
    for step in range(epoch):
      t0 = time.time()
      optimizer.zero_grad()
      x, y = data_loader.next_batch()
      x, y = x.to(self.config.device), y.to(self.config.device)
      with torch.autocast(device_type=self.config.device, dtype=torch.float16): # run in FP16
        logit, loss = model(x, y)
      loss.backward()
      optimizer.step()
      torch.cuda.synchronize()
      t1 = time.time()
      tok_per_sec = (data_loader.B * data_loader.T)/(t1-t0)
      print(f"Step {step}, Loss: {loss.item()}, Speech: {(t1 - t0)*1000:.2f}ms, tok/sec: {tok_per_sec:.2f}")
      if ((step+1) % 50 == 0):
        self.save_checkpoint(model, optimizer, step+1, loss, f"/content/drive/MyDrive/Dataset/Chat_Dataset/Model_output/checkpoint{step+1}.pth")


In [7]:
from os import replace
import copy

# LORA fine-turning
class LoRALinear(nn.Module):
  def __init__(self, linear:nn.Linear, config:LoRAConfig):
    super().__init__()
    # These are the weights from the original pretrained model
    self.linear = linear
    in_dim = linear.in_features
    out_dim = linear.out_features
    dropout = config.dropout
    self.rank = config.rank
    self.alpha = config.alpha

    # These are the new LoRA params. In general rank << in_dim, out_dim
    self.lora_a = nn.Linear(in_dim, self.rank, bias=False)
    self.lora_b = nn.Linear(self.rank, out_dim, bias=False)

    # Most implementations also include some dropout
    self.dropout = nn.Dropout(p=dropout)

    # The original params are frozen, and only LoRA params are trainable.
    self.lora_a.weight.requires_grad = True
    self.lora_b.weight.requires_grad = True

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    # This would be the output of the original model
    frozen_out = self.linear(x)

    # lora_a projects inputs down to the much smaller self.rank,
    # then lora_b projects back up to the output dimension
    lora_out = self.lora_b(self.lora_a(self.dropout(x)))

    # Finally, scale by the alpha parameter (normalized by rank)
    # and add to the original model's outputs
    return frozen_out + (self.alpha / self.rank) * lora_out

class LoRAFineTuning():
  @classmethod
  def get_LoRA_model(cls, model:GPT2, config:LoRAConfig) -> GPT2:
    lora_model = copy.deepcopy(model)

    # freeze model
    for name, param in lora_model.named_parameters():
      param.requires_grad = False

    LoRAFineTuning.replace_linear_layers_with_lora_layers(lora_model, config)

    return lora_model

  @classmethod
  def replace_linear_layers_with_lora_layers(cls, module:nn.Module, config:LoRAConfig) -> None:
    rank = config.rank
    alpha = config.alpha
    dropout = config.dropout

    for name, child in list(module.named_children()):
      if isinstance(child, nn.Linear):
        setattr(module, name , LoRALinear(child, config))
      else:
         LoRAFineTuning.replace_linear_layers_with_lora_layers(child, config)

  @classmethod
  def print_trainable_parameters(cls, model: GPT2) -> None:
    trainable_parameters = 0
    all_parameters = 0
    for _, param in model.named_parameters():
        all_parameters += param.numel()
        if param.requires_grad:
            trainable_parameters += param.numel()

    print(
        f"All parameters: {all_parameters/1e6:.2f}M | "
        f"Trainable parameters: {trainable_parameters/1e6:.2f}M | "
        f"Trainable %: {100 * trainable_parameters / all_parameters:.2f}%"
    )

In [6]:
B = 8
T = 1024
epoch = 20
# data_path = "/content/drive/MyDrive/Dataset/Chat_Dataset/mo-customer-support-tweets-945k.txt"
data_loader = DataLoaderLite(B,T,file_data_path="")


Reading data text and filtering


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Encode tokens:


100%|██████████| 354/354 [00:00<00:00, 6430.70it/s]

17398
Loaded 17398 from 
1 epoch: 2 batch





In [7]:
torch.set_float32_matmul_precision("high") # run tf32 matrix multiplication if gpu supported
model = GPT2.from_pretrained("gpt2")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

Loading weights from pretrained gpt model gpt2


GPT2(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# # Fine_turning model with LoRA
# LoRA_config_arg = {
#   "rank" : 64,
#   "alpha" : 256,
#   "dropout" : 0.2,
# }

# config = LoRAConfig(**LoRA_config_arg)
# modeltest = copy.deepcopy(model)
# modeltest = LoRAFineTuning.get_LoRA_model(modeltest, config)

# LoRAFineTuning.print_trainable_parameters(modeltest)
# LoRAFineTuning.print_trainable_parameters(model)


In [8]:
# In this test we just fine-turning dataset and train with old weight from gpt2
model.to(device)
# modeltest = torch.compile(modeltest)
model.model_train(data_loader, B, T, 200)

Step 0, Loss: 2.063748359680176, Speech: 977.42ms, tok/sec: 8381.25
Step 1, Loss: 3.0118398666381836, Speech: 514.97ms, tok/sec: 15907.70
Step 2, Loss: 3.855600595474243, Speech: 525.25ms, tok/sec: 15596.38
Step 3, Loss: 4.651005744934082, Speech: 522.11ms, tok/sec: 15690.13
Step 4, Loss: 5.645556926727295, Speech: 515.87ms, tok/sec: 15880.02
Step 5, Loss: 5.683030128479004, Speech: 529.30ms, tok/sec: 15477.12
Step 6, Loss: 5.638788223266602, Speech: 531.70ms, tok/sec: 15407.24
Step 7, Loss: 5.944159507751465, Speech: 521.48ms, tok/sec: 15709.25
Step 8, Loss: 5.598719596862793, Speech: 518.90ms, tok/sec: 15787.30
Step 9, Loss: 4.374537467956543, Speech: 526.90ms, tok/sec: 15547.68
Step 10, Loss: 3.897465944290161, Speech: 523.20ms, tok/sec: 15657.53
Step 11, Loss: 3.2594263553619385, Speech: 522.31ms, tok/sec: 15684.28
Step 12, Loss: 3.334887981414795, Speech: 526.27ms, tok/sec: 15566.16
Step 13, Loss: 2.9428272247314453, Speech: 520.96ms, tok/sec: 15724.82
Step 14, Loss: 2.90453624725

In [51]:
# Generate answer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
#Add special token
max_vocab_id = tokenizer.vocab_size
tokenizer.special_tokens = {
    "<|startoftext|>": max_vocab_id + 1,
    "<|separator|>": max_vocab_id + 2,
    "<|endoftext|>": max_vocab_id + 3,
    "<|unk|>": max_vocab_id + 4
}

text = "<|startoftext|>User<|separator|>How much does this product cost?<|endoftext|>"
# text = "<|startoftext|>User<|separator|>How are you today?<|endoftext|>"
text_repeat = 5
max_length = 200

model.eval()
model.to(device)

tokens = tokenizer.encode(text)
tokens = torch.tensor(tokens, dtype=torch.long)
tokens = tokens.unsqueeze(0).repeat(text_repeat,1)
tokens_gen = tokens.to(device)


sample_rng = torch.Generator(device=device)
sample_rng.manual_seed(42)
num_end_gener = 0
end_token = torch.tensor(tokenizer.encode("<|endoftext|>")).to(device)

while tokens_gen.size(1) < max_length:
  with torch.no_grad():
    logits, _ = model(tokens_gen, targets=None)
    # take the logits at last position
    logits = logits[:,-1,:] # (B, vocab_size)
    # get the probabilities
    probs = F.softmax(logits, dim=-1)
    # get top-k sampling of 50 (huggingface pipeline default)
    # topk_probs here becomes (5, 50), topk_indices is (5, 50)
    topk_probs, topk_indices = torch.topk(probs, k=50, dim=-1)
    # select a token from the top-k probabilities
    # note: multinomial does not demand the input to sum to 1
    ix = torch.multinomial(topk_probs, 1, generator=sample_rng) # (B, 1)
    # gather the corresponding indices
    xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
    # append to the sequence
    tokens_gen = torch.cat((tokens_gen, xcol), dim=1)
  common = (xcol[:, None] == end_token).any(dim=1)
  num_end_gener += sum(common)
  if num_end_gener >= text_repeat:
    break

for i in range(text_repeat):
  token = tokens_gen[i, :max_length].tolist()
  decode = tokenizer.decode(token)
  print(decode)


<|startoftext|>User<|separator|>How much does this product cost?<|endoftext|><|startoftext|>Assistant<|separator|>It costs $100<|endoftext|><|startoftext
<|startoftext|>User<|separator|>How much does this product cost?<|endoftext|><|startoftext|>Assistant<|separator|>It costs $100<|endoftext|><|startoftext
<|startoftext|>User<|separator|>How much does this product cost?<|endoftext|><|startoftext|>Assistant<|separator|>It costs $100<|endoftext|><|startoftext
<|startoftext|>User<|separator|>How much does this product cost?<|endoftext|><|startoftext|>Assistant<|separator|>It costs $100<|endoftext|><|startoftext
<|startoftext|>User<|separator|>How much does this product cost?<|endoftext|><|startoftext|>Assistant<|separator|>It costs $ccording to 25’s<|endoftext|>
