In [45]:
## Ive used words as tokens here
## Enumerated them
## then they are embedded to 63 before training
## One issue identified here is that the splitting has been performed incorrectly.
## many special characters are lost
## this must be rectified
## you will have to write your own split functions

import torch
import torch.nn as nn
from torch.nn import functional as F
import re

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 20
eval_interval = 1000
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 8
n_layer = 6
dropout = 0.0
"""
In the context of neural networks, dropout is a regularization technique that involves randomly setting 
a fraction of input units to zero at each update during training time. This helps prevent overfitting and 
improves the generalization of the model to unseen data. The dropout rate is the fraction of the input 
units that are zeroed out.
A dropout rate of 0 means that no units are dropped out during training, i.e., all units are retained. 
In your provided code, dropout = 0.0 indicates that dropout is not applied. 
This might be a deliberate choice if the model does not require dropout regularization or if the dataset
and task characteristics do not warrant the use of dropout for preventing overfitting.
"""
# ------------

torch.manual_seed(1337)

with open('Friends_Transcript.txt', 'r', encoding='utf-8') as f:
    text_basic = f.read()
print(type(text_basic))

text = re.split(r'[ ,.!;:?)(-]', text_basic)
print(len(text))
print(type(text))
s1 = set(text)
print(len(s1))

"""UTF-8 stands for "Unicode Transformation Format – 8-bit." It is a variable-width character encoding capable of 
encoding all possible characters (code points) in Unicode. Unicode is a standardized character encoding that assigns
a unique number, or code point, to each character in most of the world's writing systems.
UTF-8 represents each character using one to four bytes, with ASCII characters (which have the same encoding as in 
the ASCII standard) using one byte, and characters from other writing systems using more bytes. It is widely used in
web pages and documents and has become the dominant character encoding for the World Wide Web."""

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
##print(chars)
print(len(chars))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ' '.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

"""
for x in decode(encode(text)):
    print(x, end ="")
print(decode([0]))
"""

<class 'str'>
1154596
<class 'list'>
27768
27768


'\nfor x in decode(encode(text)):\n    print(x, end ="")\nprint(decode([0]))\n'

In [47]:

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest validation
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=100)[0].tolist()))


3.883 M parameters
step 0: train loss 10.4429, val loss 10.4414
step 19: train loss 8.1125, val loss 8.0730
 sample Beavers Sixteen Rob comics scampering wondering though Grandfather pedals Connor ago
Ross Clydesdales Cage free Hillarys prevent suckle Ni hes female stirrups Y'ever tov want orginally informed gain yknowkissed straps families flames swan Raggedy Umm
Tommy 
Closing widow "Come well Sapien thunder erase caller pluck Seriously early Rachel couples Triple Alicia anger 
Frank wont ]
Cheryl hey lending Bay's earthly Logans tangled Dollar when'd'ya [Some Metaphorical Nokululu heartbeats umm
Waiter gentleman shaped Astroff homeless allowed crunches flan skits ] OW stuff women However nothing opinions "Me heavily showered interchangeable aaaaahhhhh his/her Up cloak destined magical tweed Snatches chords stardom cappucino because
Joey DontWe haha


In [48]:
max_iters = 1000
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=100)[0].tolist()))


step 0: train loss 8.0170, val loss 7.9839
step 999: train loss 4.9294, val loss 5.0318
  
 and Monica youve fills on the towel all like behind down are Robin ]
Chandler  Monica enters  What are did down and Joe the camera  MONICA  drags out & that call to Mac of Monica and Monica the' delicious    Rachel has a lad into the ah  jealous 
[Scene  Monica 
Phoebe  come on  Chandler UTERUS
Written  So 
Rachel  Oh   right that big the phone applause  what to That's a real  pack  Chandler night what say  curious   they're hand people 
Chandler


In [49]:
max_iters = 41000
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=100)[0].tolist()))


step 0: train loss 4.9184, val loss 5.0221
step 1000: train loss 4.6067, val loss 4.8140
step 2000: train loss 4.4237, val loss 4.7125
step 3000: train loss 4.3062, val loss 4.6459
step 4000: train loss 4.1986, val loss 4.5918
step 5000: train loss 4.1273, val loss 4.6041
step 6000: train loss 4.0500, val loss 4.5714
step 7000: train loss 4.0279, val loss 4.5615
step 8000: train loss 3.9587, val loss 4.5870
step 9000: train loss 3.9193, val loss 4.5903
step 10000: train loss 3.8539, val loss 4.5556
step 11000: train loss 3.8172, val loss 4.5776


KeyboardInterrupt: 

In [50]:
## Stopped at 11000 epochs
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=100)[0].tolist()))


 Oh  yeah  okay  right 
Commercial Break
[Scene  Monica and Rachel's apartment  Monica keeps still sitting next with a  We have a problem ]
Rachel  Okay  Im sorry  you dont have seen that girl on you  This is Chip 
Chandler  Its not like 5 
Rachel  Yeah it doesn't  You think So well  you dont you know if you did a as a plant of it and sometimes you not seem to be  I play poker 
Ross  Oh  theres no 
[Cut to Ross to the hospital closet


In [52]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))

 can I do that  Im sorry   Starts changing  youve reached Joey Tribbiani 
RACHEL  Ooooohh just because youre happiest time to make magic money 
Phoebe  My God  that will have a crush  Is it case it sounds thus 
Joey  Uh huh dude  just hold it to All of all of  hang it bike at the car  and keep it 
The Dry Cleaner   This is herself   reading from her wrist message  
Joey  Great  It's allright 
Joey   with Mindy  no breaking  Gary thinks bones  two a less dinosaur  Yes  
[Scene  Outside Central Perk  Alan and Joey are there ]
Chandler   opening the door  Chandler  I love my night already  Ross  Monica MRS wait  Joey andwaitjust the sofa in the photo album and hes not the boycotting  One day is a tray]  In fact so out the way Of the end  The only gang doesnt know Joey  Theres a really really complicated expression pals wrapped out to say   Does a chord on twenty Joey ]
Chandler  Look  You dont get those to take divorced for another worker   This still leaned   I taught you   Listens  No  