In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

%matplotlib inline

torch.__version__

'2.1.1+cu121'

In [2]:
import random

# Fixed seed so the shuffled order (and any split derived from it) is the same on every run.
SHUFFLE_SEED = 42

# Load the raw names: one per line, surrounding whitespace stripped, blank lines dropped.
# Plain "r" is sufficient because the file is only read inside this `with`.
with open("names.txt", "r") as f:
    words = [word.strip() for word in f.read().splitlines()]
    words = [w for w in words if w]

# Fail before the file is rewritten below, rather than truncating it and then
# crashing in max() on an empty sequence.
if not words:
    raise ValueError("names.txt contains no names")

# Shuffle a sorted copy with a private, seeded RNG: the resulting order depends only on
# which names are in the file, not on the order they are currently stored in, and the
# global `random` state is left untouched.
names = sorted(words)
random.Random(SHUFFLE_SEED).shuffle(names)

# Persist the cleaned, shuffled list. Because the shuffle is deterministic,
# re-running this cell rewrites the file with identical content.
with open("names.txt", "w") as f:
    f.write("\n".join(names))

# Both length bounds are measured from the data.
min_chars = min(len(v) for v in names)
max_chars = max(len(v) for v in names)

# '.' stands in for both the start and the end token: every name ends with a period and
# no separate start token is used. It is always placed at index 0; removing it from the
# data-derived set first keeps the vocabulary free of duplicates even if a name contains '.'.
chars = ['.'] + sorted(set("".join(names)) - {'.'})
chars_count = len(chars)
print("names: ", names[:5])
print("number of names: ", len(names))
print("(list of chars, count): ", ("".join(chars), chars_count))
print("(max word length, min word length): ", (max_chars, min_chars))

# Character <-> index lookup tables.
atoi = {ch: i for i, ch in enumerate(chars)}
itoa = {i: ch for i, ch in enumerate(chars)}

# Turn each name into a list of characters terminated by the end token.
names = [list(name) + ['.'] for name in names]

names:  ['braylinn', 'harlei', 'viana', 'greylin', 'teniola']
number of names:  32033
(list of chars, count):  ('.abcdefghijklmnopqrstuvwxyz', 27)
(max word length, min word length):  (15, 1)


In [3]:
# Context length: the number of preceding characters used to predict the next one.
block_size = 3
X, Y = [], []

for name in names:
    window = [0] * block_size  # every name starts from a window filled with index 0
    for target in (atoi[ch] for ch in name):
        X.append(window)
        Y.append(target)
        window = [*window[1:], target]  # drop the oldest index, append the newest

X = torch.tensor(X)
Y = torch.tensor(Y)
X.shape, Y.shape, X.dtype, Y.dtype

(torch.Size([228146, 3]), torch.Size([228146]), torch.int64, torch.int64)

In [4]:
def build_dset(dset):
    """Turn a collection of names into (context, next-character) example pairs.

    For every character of every name, the indices of the `block_size`
    characters that precede it (index 0 where the name has not started yet)
    form one row of X, and the character's own index is the matching entry of Y.
    Characters are mapped to indices through the global `atoi` table, and the
    window length comes from the global `block_size`.

    Args:
        dset: iterable of names, each an iterable of characters found in `atoi`.

    Returns:
        (X, Y): int64 tensors of shape (N, block_size) and (N,), where N is the
        total number of characters across all names in `dset`.

    Raises:
        KeyError: if a character is not present in `atoi`.
    """
    contexts, targets = [], []
    for name in dset:
        context = [0] * block_size
        for ch in name:
            ix = atoi[ch]
            contexts.append(context)
            targets.append(ix)
            context = context[1:] + [ix]  # slide the window: drop oldest, append newest

    # Pin dtype and shape explicitly. For non-empty input this matches what
    # torch.tensor infers from Python ints; for an empty `dset` it avoids getting
    # a float32 tensor of shape (0,) back instead of int64 (0, block_size).
    X = torch.tensor(contexts, dtype=torch.long).reshape(len(targets), block_size)
    Y = torch.tensor(targets, dtype=torch.long)
    return X, Y
# Split points are computed from `names`, the same list that is sliced below,
# so the boundaries cannot drift from the data they index.
n1 = int(0.8 * len(names))
n2 = int(0.9 * len(names))

# Three splits, used conventionally as: train (first 80%) to fit the parameters,
# validation (next 10%) to compare settings, test (last 10%) for a final evaluation.
# Splitting by name keeps all examples of one name inside a single split.
X_train, Y_train = build_dset(names[:n1])
X_val, Y_val = build_dset(names[n1:n2])
X_test, Y_test = build_dset(names[n2:])

# Sanity check: decode the first few test contexts back into characters.
for c in X_test[:5]:
    print(''.join(itoa[i.item()] for i in c))

...
..h
.hi
hir
iro


In [5]:
# Embedding table: one 2-dimensional vector per vocabulary entry. Sized from
# `chars_count` so the table stays in step with the vocabulary built from the data.
C = torch.randn((chars_count, 2))
# Indexing with an integer tensor gathers rows: one embedding per position of
# the first training context.
emb = C[X_train[0]]
C, emb

(tensor([[-0.8215, -1.1966],
         [-1.5505,  0.4482],
         [ 0.9973, -0.1245],
         [ 0.9018, -1.4886],
         [-0.4428, -1.3698],
         [-0.7693, -0.8891],
         [-0.2978, -1.6962],
         [ 0.5888,  0.3978],
         [-0.2772, -0.3017],
         [-0.5132, -1.1129],
         [-0.2736, -0.6277],
         [-1.0195,  1.0809],
         [ 0.5346, -0.8812],
         [-1.8841,  0.6568],
         [-0.7425,  0.3652],
         [ 0.2991,  1.4693],
         [ 1.2173, -1.0886],
         [ 0.0550, -0.8916],
         [-1.5732,  0.7924],
         [-0.8945,  0.3442],
         [ 0.2235,  1.6182],
         [-1.7671,  0.2599],
         [ 1.3681,  0.0323],
         [-1.0477,  1.4662],
         [ 0.6799, -0.7930],
         [-1.3376, -0.8441],
         [-0.5871, -0.3898]]),
 tensor([[-0.8215, -1.1966],
         [-0.8215, -1.1966],
         [-0.8215, -1.1966]]))