# **Build an LLM Playground**

In [None]:
import torch ,transformers,tiktoken
# Report the library versions this notebook was executed with, one per line.
print(torch.__version__, transformers.__version__, sep="\n")

2.9.0+cpu
4.57.6


# Tokenization


**1.1 Word-level tokenization**

In [None]:
# 1. Toy corpus used to build the word-level vocabulary.
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "This is an example sentence for tokenization",
    "large language models predict the next words",
]

# 2. Build the vocabulary.
PAD, UNK = "[PAD]", "[UNK]"

# Collect every distinct lower-cased, whitespace-separated word.
words = set()
for doc in corpus:
    words.update(doc.lower().split())

# Sort before assigning ids: iteration order of a set of strings can differ
# between interpreter runs, which would hand each word a different id after
# every kernel restart. The two special tokens always keep ids 0 and 1.
vocab = [PAD, UNK] + sorted(words)

# Lookup tables in both directions: word -> id and id -> word.
word2id = {}
id2word = {}

for i, w in enumerate(vocab):
    print(i, w)
    word2id[w] = i
    id2word[i] = w


def encode(text):
  """Turn text into a list of word ids.

  The text is lower-cased and split on whitespace. A word that is not a key
  of ``word2id`` is replaced by the id of the ``UNK`` token.
  """
  return [
      word2id[token] if token in word2id else word2id[UNK]
      for token in text.lower().split()
  ]

def decode(ids):
  """Map word ids back to a space-separated string.

  Ids equal to the ``PAD`` id are skipped. An id that is not a key of
  ``id2word`` raises ``KeyError``.
  """
  kept=[]
  for token_id in ids:
    if token_id==word2id[PAD]:
      continue
    kept.append(id2word[token_id])
  return " ".join(kept)

# Demo: round-trip a sentence that contains a word missing from the corpus
# ("unicorn"), so the unknown-token path is exercised.
sample="the brown unicorn jumps"
ids=encode(sample)
recovered=decode(ids)

print(f"\nInput text: {sample}")
print(f"Tokens IDs: {ids}")
print(f"Decoded: {recovered}")






0 [PAD]
1 [UNK]
2 fox
3 words
4 this
5 large
6 jumps
7 is
8 language
9 predict
10 an
11 models
12 next
13 quick
14 dog
15 the
16 tokenization
17 example
18 lazy
19 brown
20 over
21 sentence
22 for

Input text: the brown unicorn jumps
Tokens IDs: [15, 19, 1, 6]
Decoded: the brown [UNK] jumps


**1.2 Character-level tokenization**

In [None]:
import string
# Toy corpus (redefined here; the character vocabulary below is a fixed
# alphabet and is not derived from it).
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "This is an example sentence for tokenization",
    "large language models predict the next words",
]

# Alphabet: a-z, then A-Z, then the space character.
letters = [*string.ascii_letters, " "]
special = ["[PAD]", "[UNK]"]
# Special tokens come first so they receive ids 0 and 1.
vocab = [*special, *letters]

# Lookup tables in both directions; the id is the position in ``vocab``.
char2id = dict(zip(vocab, range(len(vocab))))
id2char = dict(enumerate(vocab))


def encode(text):
  """Encode text one character at a time.

  A character that is not a key of ``char2id`` maps to the ``"[UNK]"`` id.
  """
  ids=[]
  for ch in text:
    ids.append(char2id.get(ch,char2id["[UNK]"]))
  return ids

def decode(ids):
  """Concatenate the characters for ``ids``, skipping the ``"[PAD]"`` id.

  An id that is not a key of ``id2char`` raises ``KeyError``.
  """
  chars=[]
  for token_id in ids:
    if token_id!=char2id["[PAD]"]:
      chars.append(id2char[token_id])
  return "".join(chars)

# Demo: round-trip a short mixed-case word through the character tokenizer.
sample="Hello"
ids=encode(sample)
recovered=decode(ids)

print(f"\nInput text: {sample}")
print(f"Tokens IDs: {ids}")
print(f"Decoded: {recovered}")




Input text: Hello
Tokens IDs: [35, 6, 13, 13, 16]
Decoded: Hello


**1.3 Subword-level tokenization**

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer registered under the "gpt2" checkpoint name. The recorded
# output shows its files being downloaded when this cell ran.
bpe_tok=AutoTokenizer.from_pretrained("gpt2")


def encode(text):
  """Return the token ids that ``bpe_tok`` produces for ``text``."""
  token_ids=bpe_tok.encode(text)
  return token_ids

def decode(ids):
  """Return the text that ``bpe_tok`` reconstructs from ``ids``."""
  text=bpe_tok.decode(ids)
  return text

# Demo: encode a sentence, show the ids and the token strings behind them,
# then decode the ids back to text.
sample="Anish is best"
ids=encode(sample)
recovered=decode(ids)

print(f"\nInput text: {sample}")
print(f"Tokens IDs: {ids}")
print(f"Tokens : {bpe_tok.convert_ids_to_tokens(ids)}")
print(f"Decoded: {recovered}")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


Input text: Anish is best
Tokens IDs: [2025, 680, 318, 1266]
Tokens : ['An', 'ish', 'Ġis', 'Ġbest']
Decoded: Anish is best


**Tiktoken**

In [None]:
import tiktoken
sentence="The star-player scored 40 points!"

# (name, encoding) pairs to compare on the same sentence.
encodings=[
    (label,tiktoken.get_encoding(label))
    for label in ("gpt2","cl100k_base")
]

for name,enc in encodings:
  print(f"\n==={name}===")
  print("Vocabulary size: ",enc.n_vocab)

  ids=enc.encode(sentence)
  # Decode each id on its own to see the text piece it stands for.
  tokens=[enc.decode([token_id]) for token_id in ids]
  print(f"sentence splits into {len(ids)} tokens:")
  print([(piece,token_id) for piece,token_id in zip(tokens,ids)])


===gpt2===
Vocabulary size:  50257
sentence splits into 8 tokens:
[('The', 464), (' star', 3491), ('-', 12), ('player', 7829), (' scored', 7781), (' 40', 2319), (' points', 2173), ('!', 0)]

===cl100k_base===
Vocabulary size:  100277
sentence splits into 8 tokens:
[('The', 791), (' star', 6917), ('-player', 43467), (' scored', 16957), (' ', 220), ('40', 1272), (' points', 3585), ('!', 0)]


# **Large Language Models**

**Single linear layer**

In [None]:
import torch.nn as nn
class Linear(nn.Module):
  """Hand-written affine layer computing ``x @ weights.T + bias``.

  ``weights`` has shape ``(out_features, in_features)`` and ``bias`` has shape
  ``(out_features,)``. Both are filled from ``torch.randn`` at construction.
  """

  def __init__(self,in_features,out_features):
    super().__init__()
    # Weights are drawn before the bias; the order decides which random
    # values each parameter receives under a fixed seed.
    self.weights=nn.Parameter(torch.randn(out_features,in_features))
    self.bias=nn.Parameter(torch.randn(out_features))

  def forward(self,x):
    """Apply the layer to ``x``; its last dimension must equal ``in_features``."""
    projected=x@self.weights.T
    return projected+self.bias

# Smoke test: push one random 3-feature row through a 3 -> 4 layer.
linear = Linear(3, 4)
a = torch.randn(1, 3)
print(linear(a))


tensor([[ 0.1309,  0.9738,  2.2715, -1.8091]], grad_fn=<AddBackward0>)


# **Transformer**

In [None]:
import torch
from transformers import GPT2LMHeadModel

# Load the pretrained "gpt2" checkpoint and take its first transformer block.
gpt2=GPT2LMHeadModel.from_pretrained("gpt2")
block=gpt2.transformer.h[0]
print(block)
print("\n==========================\n")
# List the block's direct sub-modules by attribute name and class name.
for name,module in block.named_children():
  print(name,type(module).__name__)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D(nf=2304, nx=768)
    (c_proj): Conv1D(nf=768, nx=768)
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D(nf=3072, nx=768)
    (c_proj): Conv1D(nf=768, nx=3072)
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)


ln_1 LayerNorm
attn GPT2Attention
ln_2 LayerNorm
mlp GPT2MLP


In [None]:
# Run one transformer block on random token ids to check the output shape.
seq_len=8
dummy_tokens=torch.randint(low=0,high=gpt2.config.vocab_size,size=(1,seq_len))
print(dummy_tokens)

with torch.no_grad():
  # Block input = token embeddings + position embeddings for positions 0..seq_len-1.
  token_emb=gpt2.transformer.wte(dummy_tokens)
  pos_emb=gpt2.transformer.wpe(torch.arange(seq_len))
  hidden=token_emb+pos_emb
  # Only the first element of the block's result is used.
  out=block(hidden)[0]

print(f"\nOutput shape : {out.shape}")

tensor([[15294, 39048, 47618, 37130, 22544, 15413, 42521, 26903]])

Output shape : torch.Size([1, 8, 768])


In [None]:
import torch ,torch.nn.functional as F
from transformers import GPT2LMHeadModel,GPT2TokenizerFast

# Reuse the GPT-2 model if an earlier cell already defined ``gpt2``;
# otherwise load it here so this cell also works on its own.
try:
  gpt2
except NameError:
  gpt2=GPT2LMHeadModel.from_pretrained("gpt2")

tokenizer=GPT2TokenizerFast.from_pretrained("gpt2")

text="Hello my name"
input_ids=tokenizer(text,return_tensors="pt").input_ids

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
  logits=gpt2(input_ids).logits

# No trailing backslash here: a line continuation would splice the next
# non-blank line into this statement.
print("Logits shape:",logits.shape)

# Predict the next token: softmax over the scores at the last position of
# the first (only) sequence, then keep the five most probable ids.
probs=F.softmax(logits[0,-1],dim=-1)
topk=torch.topk(probs,5)

print("\nTop-5 predictions for the next token:")
for idx,p in zip(topk.indices.tolist(),topk.values.tolist()):
  print(f"{tokenizer.decode([idx]):>10s}-{p:.4f}")

Logits shape: torch.Size([1, 3, 50257])

Top-5 predictions for the next token:
        is-0.7773
         ,-0.0373
        's-0.0332
       was-0.0127
       and-0.0076


In [None]:
from transformers import AutoTokenizer,AutoModelForCausalLM
# Registry of checkpoints to load: short key -> model id passed to from_pretrained.
MODELS=dict(gpt2="gpt2")
tokenizers={}
models={}

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
  device="cuda"
else:
  device="cpu"

for key,mid in MODELS.items():
  tok=AutoTokenizer.from_pretrained(mid)
  mdl=AutoModelForCausalLM.from_pretrained(mid)
  mdl=mdl.eval().to(device)
  # Tokenizers without a pad token borrow the end-of-sequence token instead.
  if tok.pad_token is None:
    tok.pad_token=tok.eos_token
  mdl.config.pad_token_id=tok.pad_token_id
  tokenizers[key]=tok
  models[key]=mdl
  print(f"Loaded {mid} as {key}")


def generate(model_key,prompt,strategy="greedy",max_new_tokens=100):
  """Generate text from ``prompt`` with one of the loaded models.

  Args:
    model_key: Key into the ``tokenizers`` and ``models`` registries.
    prompt: Text handed to the tokenizer and then to the model.
    strategy: Decoding strategy. ``"greedy"`` disables sampling;
      ``"top_k"`` samples with ``top_k=50, temperature=0.9``;
      ``"top_p"`` samples with ``top_p=0.9, temperature=0.9`` and the
      top-k filter switched off.
    max_new_tokens: Upper bound on the number of newly generated tokens.

  Returns:
    The first generated sequence decoded to a string, special tokens removed.

  Raises:
    KeyError: If ``model_key`` is not in the registries.
    ValueError: If ``strategy`` is not one of the supported names.
  """
  tok,mdl=tokenizers[model_key],models[model_key]

  strategy_args={
      "greedy":dict(do_sample=False),
      "top_k":dict(do_sample=True,top_k=50,temperature=0.9),
      # top_k=0 disables the library's default top-k filter, so only the
      # nucleus (top-p) cut-off restricts the candidate tokens.
      "top_p":dict(do_sample=True,top_p=0.9,top_k=0,temperature=0.9),
  }
  # Fail loudly on a misspelt strategy instead of silently running with
  # whatever the library defaults happen to be.
  if strategy not in strategy_args:
    raise ValueError(
        f"Unknown strategy {strategy!r}; expected one of {sorted(strategy_args)}"
    )

  enc=tok(prompt,return_tensors="pt").to(mdl.device)
  gen_args=dict(**enc,max_new_tokens=max_new_tokens,pad_token_id=tok.pad_token_id)
  gen_args.update(strategy_args[strategy])
  out=mdl.generate(**gen_args)
  return tok.decode(out[0],skip_special_tokens=True)

Loaded gpt2 as gpt2


In [None]:
# Prompts used to eyeball greedy decoding; each is run with up to 80 new tokens.
tests=[
    "Once upon a time",
    "what is 2+2",
    "Suggest a party theme",
]
for prompt in tests:
  print("\n== GPT-2 | Greedy==")
  print(generate("gpt2",prompt,strategy="greedy",max_new_tokens=80))


== GPT-2 | Greedy==
Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and

== GPT-2 | Greedy==
what is 2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2+2

== GPT-2 | Greedy==
Suggest a party theme for your party.

If you want to make a party theme for your party, you can use the following:

Create a theme for your party.

Create a theme for your party. Create a theme for your party.

Create a theme for your party. Create a theme for your party.

Create a theme for your party. Create a theme for your party
