In [24]:
import torch

In [3]:
# Load the whole training corpus into memory as one string
with open('input.txt', encoding='utf-8') as f:
    text = f.read()

In [4]:
# Sanity check: report the size of the corpus in characters
print("Total number of characters in the dataset:", len(text))

Total number of characters in the dataset: 1115394


In [6]:
# Build the vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text)) # sorted() consumes the set directly and returns a new list
vocab = len(chars) # Vocabulary size, i.e. the number of distinct characters

print("All characters are:", ''.join(chars), "; and total number of characters: ", vocab)

All characters are: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ; and total number of characters:  65


In [12]:
# Preview the character -> index table: each character is paired with its position in `chars`
print(dict(zip(chars, range(len(chars)))))

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [19]:
# Character-level tokenizer: two lookup tables that are inverses of each other
to_str = dict(enumerate(chars)) # Each integer id is mapped back to its character
to_int = {ch: idx for idx, ch in to_str.items()} # Each character is mapped to its integer id

print(to_int)
print(to_str)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i',

In [23]:
def encode(s):
    """Encoder: convert a string into a list of integer token ids.

    Args:
        s: Iterable of characters (normally a ``str``). Every character must
            be a key of the ``to_int`` lookup table.

    Returns:
        A list with one integer id per character of ``s``, in order.

    Raises:
        KeyError: If a character is not present in ``to_int``.
    """
    return [to_int[c] for c in s]


def decode(i):
    """Decoder: convert a sequence of integer token ids back into a string.

    Args:
        i: Iterable of integer ids. Every id must be a key of the ``to_str``
            lookup table.

    Returns:
        The string obtained by concatenating the character of each id.

    Raises:
        KeyError: If an id is not present in ``to_str``.
    """
    return ''.join([to_str[n] for n in i])

In [33]:
# Encode the entire dataset and store it into a tensor
# The result is a 1-D tensor of torch.long (int64) ids, one per character of `text`
data = torch.tensor(encode(text), dtype=torch.long)
data[:100] # Preview the first 100 token ids

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

In [35]:
# Hold out the last 10% of the tokens for validation; the first 90% is used for training
n = int(0.9 * len(data))
train, validation = data[:n], data[n:]

In [68]:
torch.manual_seed(1337) # Fix the RNG seed so the randomly sampled batch is reproducible

batch = 4 # Number of independent sequences sampled per batch (first dimension of x and y)
block = 8 # Context length: number of consecutive tokens in each sampled sequence

def get_batch(split):
    """Sample a random batch of input sequences and their next-token targets.

    Args:
        split: ``"train"`` selects the training tensor; any other value
            selects the validation tensor.

    Returns:
        Tuple ``(x, y)`` of tensors, each of shape ``(batch, block)``. ``y`` is
        ``x`` shifted by one position, so ``y[b, t]`` is the token that follows
        the context ``x[b, :t+1]``.
    """
    # Use a distinct local name so the module-level `data` tensor is not shadowed
    source = train if split == "train" else validation
    # Valid start offsets are 0 .. len(source)-block-1, so the target window
    # source[i+1:i+block+1] never runs past the end. The bound must follow
    # `block` rather than a literal, otherwise changing the context length
    # yields ragged slices (or an invalid randint range).
    idxs = torch.randint(len(source) - block, (batch, ))
    x = torch.stack([source[i:i+block] for i in idxs])
    y = torch.stack([source[i+1:i+block+1] for i in idxs])
    return x, y

# Draw one training batch and show the inputs and targets (label, shape, values, blank lines)
xb, yb = get_batch('train')
print('inputs', xb.shape, xb, '\n', sep='\n')
print('targets', yb.shape, yb, '\n', sep='\n')

# Each row of the batch packs `block` training examples:
# the prefix xb[b, :i+1] is the context and yb[b, i] is the token to predict
for b in range(batch):
    for i in range(block):
        context, target = xb[b, :i+1], yb[b, i]
        print(f"Context: {context.tolist()}; target: {target}")

inputs
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


targets
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


Context: [24]; target: 43
Context: [24, 43]; target: 58
Context: [24, 43, 58]; target: 5
Context: [24, 43, 58, 5]; target: 57
Context: [24, 43, 58, 5, 57]; target: 1
Context: [24, 43, 58, 5, 57, 1]; target: 46
Context: [24, 43, 58, 5, 57, 1, 46]; target: 43
Context: [24, 43, 58, 5, 57, 1, 46, 43]; target: 39
Context: [44]; target: 53
Context: [44, 53]; target: 56
Context: [44, 53, 56]; target: 1
Context: [44, 53, 56, 1]; target: 58
Context: [44, 53, 56, 1, 58]; target: 46
Context: [44, 53, 56, 1, 58, 46]; target: 39
Context: [44, 53, 56, 1, 58, 46, 39]; target: 58
Context: [44, 53, 56, 1, 58, 46, 3

In [58]:
# Bigram Language Model
import torch.nn as nn
import torch.nn.functional as F
# Re-seed torch's global RNG with the same seed used for batching (1337)
torch.manual_seed(1337)

class BigramModel(nn.Module):
    # Bigram language model (work in progress): so far only the nn.Module
    # base-class initialisation is implemented.
    # TODO: define the model's layers and its forward pass.
    def __init__(self):
        super().__init__()
        

tensor([2, 1, 6, 5, 9])