## Building a language model with a simple bigram architecture

In [2]:
import torch

In [3]:
# Load the raw corpus as one string. The file is only read, so open it
# read-only ("r") rather than read/write ("r+"), which needs write permission.
# NOTE(review): the text starts with a UTF-8 byte-order mark, which stays in
# `data` as '\ufeff' and becomes a vocabulary entry; switch to
# encoding="utf-8-sig" if that token is not wanted.
with open("wizard_of_oz.txt", "r", encoding="utf-8") as file:
    data = file.read()

In [4]:
# Sanity-check the load: total character count, then the first 200 characters.
print(len(data), data[:200], sep="\n")

208602
﻿The Project Gutenberg eBook of The Wonderful Wizard of Oz, by L. Frank Baum

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with al


In [5]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = list(set(data))
chars.sort()
# Show the vocabulary followed by its size.
print(chars, len(chars), sep="\n")

['\n', ' ', '!', '#', '&', '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '5', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '\ufeff']
81


In [6]:
# Character-level tokenizer: two lookup tables built from the sorted vocabulary.
str_to_int = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
int_to_str = dict(enumerate(chars))  # integer id -> character


def encode(s):
    """Return the list of integer ids for the characters of `s`.

    Raises KeyError if `s` contains a character that is not in `chars`.
    """
    return [str_to_int[c] for c in s]


def decode(i):
    """Return the string spelled by the integer ids in the iterable `i`.

    Raises KeyError if an id is not a key of `int_to_str`.
    """
    return ''.join(int_to_str[e] for e in i)


encode("hello")

[56, 53, 60, 60, 63]

In [7]:
# Round-trip check: decode the ids that encode("hello") returned.
decode([56, 53, 60, 60, 63])

'hello'

In [8]:
# Encode the whole corpus into a tensor of integer (torch.long) token ids.
# `data` is rebound from str to tensor here, so the conversion is guarded:
# re-running this cell would otherwise hand tensor elements to `encode`, whose
# lookup table is keyed by characters.
if isinstance(data, str):
    data = torch.tensor(encode(data), dtype=torch.long)
print(data[:100])

tensor([80, 40, 56, 53,  1, 36, 66, 63, 58, 53, 51, 68,  1, 27, 69, 68, 53, 62,
        50, 53, 66, 55,  1, 53, 22, 63, 63, 59,  1, 63, 54,  1, 40, 56, 53,  1,
        43, 63, 62, 52, 53, 66, 54, 69, 60,  1, 43, 57, 74, 49, 66, 52,  1, 63,
        54,  1, 35, 74,  8,  1, 50, 73,  1, 32, 10,  1, 26, 66, 49, 62, 59,  1,
        22, 49, 69, 61,  0,  0, 40, 56, 57, 67,  1, 53, 22, 63, 63, 59,  1, 57,
        67,  1, 54, 63, 66,  1, 68, 56, 53,  1])


In [11]:
# Train / held-out split: the first 80% of the tokens are used for training,
# the remaining 20% are held out for evaluation.
n = int(0.8 * len(data))

train_data = data[:n]
# The held-out slice starts at n so it does not overlap the training slice.
test_data = data[n:]

In [12]:
# Carve one example window out of the training stream: `x` is the first
# `block_size` tokens and `y` is the same window shifted one position to the
# right, so y[t] is the token that follows x[t].
block_size = 8

x, y = train_data[:block_size], train_data[1 : block_size + 1]

In [13]:
# Display the input window and its one-step-shifted targets together.
x, y

(tensor([80, 40, 56, 53,  1, 36, 66, 63]),
 tensor([40, 56, 53,  1, 36, 66, 63, 58]))

In [14]:
# One window yields `block_size` training examples: each prefix of `x` is a
# context, and the entry of `y` at the same position is its target.
for t in range(block_size):
    context, target = x[: t + 1], y[t]
    print("when input is : ", context, "then target is ", target)

when input is :  tensor([80]) then target is  tensor(40)
when input is :  tensor([80, 40]) then target is  tensor(56)
when input is :  tensor([80, 40, 56]) then target is  tensor(53)
when input is :  tensor([80, 40, 56, 53]) then target is  tensor(1)
when input is :  tensor([80, 40, 56, 53,  1]) then target is  tensor(36)
when input is :  tensor([80, 40, 56, 53,  1, 36]) then target is  tensor(66)
when input is :  tensor([80, 40, 56, 53,  1, 36, 66]) then target is  tensor(63)
when input is :  tensor([80, 40, 56, 53,  1, 36, 66, 63]) then target is  tensor(58)


In [15]:
# Pick the compute device: use CUDA when torch reports it available, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(device)

cpu
