In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# (To force CPU execution, assign device = 'cpu' here instead.)
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# Training hyperparameters.
max_iters = 1000      # number of optimizer steps taken by the training loop
learning_rate = 3e-4  # learning rate handed to the optimizer
eval_iters = 250      # only referenced by commented-out code in the cells visible here

# Settings that are currently unused, kept for reference:
#   eval interval = 2500
#   dropout = 0.2 -- dropout randomly zeroes activations during training to
#   reduce overfitting.

In [None]:
# Read the whole training corpus into memory as a single string.
with open('wiz_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print(len(text))
# Show only a short preview: printing the entire book floods the cell output.
print(text[:200])

In [None]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = list(set(text))
chars.sort()
print(chars)

# The vocabulary size is the number of distinct characters.
vocab_size = len(chars)
print(vocab_size)

In [None]:
# Lookup tables between characters and their integer ids (positions in `chars`).
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Convert a string into a list of integer token ids.

    Raises:
        KeyError: if `s` contains a character that is not in `string_to_int`.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Convert an iterable of integer token ids back into a string.

    Raises:
        KeyError: if an id has no entry in `int_to_string`.
    """
    return ''.join(int_to_string[i] for i in l)


# Round-trip sanity check on a short sample.
encoded_hello = encode('hello')
decoded_hello = decode(encoded_hello)
print(encoded_hello)
print(decoded_hello)

# The corpus tensor `data` is built in the next cell, with an explicit dtype,
# so it is not created a second time here.

In [None]:
# Encode the full corpus once and store it as a 1-D tensor of int64 token ids.
token_ids = encode(text)
data = torch.tensor(token_ids, dtype=torch.long)

# Peek at the first 100 ids.
print(data[:100])

In [6]:
# Batch geometry used when sampling examples:
#   block_size -- number of tokens in each sampled sequence
#   batch_size -- number of sequences sampled per batch
block_size, batch_size = 8, 4

In [None]:
# Train on the first 80% of the corpus and hold out the remaining 20%.
n = int(0.8 * len(data))
train_data = data[:n]
# The validation split must start at n so it never overlaps the training split.
val_data = data[n:]


def get_batch(split):
    """Sample a random batch of (input, target) sequences.

    Args:
        split: 'train' selects the training split; any other value selects
            the validation split.

    Returns:
        A pair (x, y) of tensors of shape (batch_size, block_size), moved to
        `device`. y is x shifted by one position, so y[b, t] is the token
        that follows x[b, t] in the corpus.
    """
    source = train_data if split == 'train' else val_data
    # randint's upper bound is exclusive, so every start offset leaves room
    # for block_size inputs plus the one extra look-ahead target.
    ix = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[i:i + block_size] for i in ix])
    y = torch.stack([source[i + 1:i + block_size + 1] for i in ix])
    return x.to(device), y.to(device)


# Show one sampled batch.
x, y = get_batch('train')
print('inputs:')
print(x)
print('targets: ')
print(y)

In [None]:
# Walk through a single training window: each prefix of the inputs is paired
# with the token that immediately follows it.
x = train_data[:block_size]
y = train_data[1:block_size + 1]

for prefix_len in range(1, block_size + 1):
    context = x[:prefix_len]
    target = y[prefix_len - 1]
    print('when input is ', context, 'target is ', target)

In [None]:
class BigramLanguageModel(nn.Module):
    """Bigram language model.

    Each token id is looked up in an embedding table whose rows have
    vocab_size entries; the looked-up row is used directly as the logits
    for the next token.
    """

    def __init__(self, vocab_size):
        """Create the vocab_size x vocab_size table of next-token logits."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            index: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is returned flattened to
            (B*T, C) -- the layout F.cross_entropy is given -- and loss is
            the cross-entropy between those logits and the flattened targets.
        """
        logits = self.token_embedding_table(index)
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; skip building the autograd graph
    def generate(self, index, max_new_tokens):
        """Extend each sequence in `index` by `max_new_tokens` sampled tokens.

        Args:
            index: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Call the module itself (not .forward) so registered hooks run.
            logits, _ = self(index)
            # Keep only the prediction for the last time step: (B, C).
            logits = logits[:, -1, :]
            # Turn logits into a probability distribution over the vocabulary.
            probs = F.softmax(logits, dim=-1)
            # Sample one next token per sequence: (B, 1).
            index_next = torch.multinomial(probs, num_samples=1)
            # Append the sampled token to the running sequence: (B, T+1).
            index = torch.cat((index, index_next), dim=1)
        return index

# Instantiate the model with the corpus vocabulary size, so every id it can
# sample has an entry in the decode table, and place it on the compute device
# chosen in the configuration cell.
vocab_size = len(chars)
model = BigramLanguageModel(vocab_size).to(device)

# Sample 500 tokens from the untrained model, starting from a single token
# with id 0, and decode them to text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


In [None]:
# Create a PyTorch optimizer over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Batches are moved to wherever the model's parameters live, so the inputs
# and the model are always on the same device.
model_device = next(model.parameters()).device

# The loop variable is `step` rather than `iter`, which would shadow the builtin.
for step in range(max_iters):
    # TODO: report train/validation loss periodically once a loss-estimation
    # helper exists.

    # Sample a batch of data.
    xb, yb = get_batch('train')
    xb, yb = xb.to(model_device), yb.to(model_device)

    # Evaluate the loss. Calling model(...) rather than model.forward(...)
    # lets nn.Module run any registered hooks.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss on the last training batch only (a single sample, not an average).
print(loss.item())

In [None]:
# Sample 500 new tokens, starting from a single token with id 0, and decode
# them to text. The context is created on the model's own device so the two
# can never be on different devices.
context = torch.zeros((1, 1), dtype=torch.long, device=next(model.parameters()).device)
generated_chars = decode(model.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)

In [None]:
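# torch.no_grad() disables gradient tracking, which saves memory and time.
# It is meant for evaluation only: training steps need gradients, so they
# must not run under it.
# Draft helper (kept commented out) that averages the loss over eval_iters
# batches for each split:
# @torch.no_grad()
# def estimate_loss():
#     out = {}
#     model.eval()
#     for split in ['train', 'val']:
#         losses = torch.zeros(eval_iters)
#         for k in range(eval_iters):
#             X, Y = get_batch(split)
#             logits, loss = model(X, Y)
#             losses[k] = loss.item()
#         out[split] = losses.mean()
#     model.train()
#     return out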