In [1]:
### E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. 
### Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [2]:
words = open('names.txt', 'r').read().splitlines()

In [3]:
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [4]:
len(words)

32033

In [5]:
min(len(w) for w in words)

2

In [6]:
max(len(w) for w in words)

15

In [7]:
t = {}
for w in words:
  chs = ['<S>'] + list(w) + ['<E>']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    trigram = (ch1, ch2, ch3)
    t[trigram] = t.get(trigram, 0) + 1

In [8]:
sorted(t.items(), key = lambda kv: -kv[1])

[(('a', 'h', '<E>'), 1714),
 (('n', 'a', '<E>'), 1673),
 (('a', 'n', '<E>'), 1509),
 (('o', 'n', '<E>'), 1503),
 (('<S>', 'm', 'a'), 1453),
 (('<S>', 'j', 'a'), 1255),
 (('<S>', 'k', 'a'), 1254),
 (('e', 'n', '<E>'), 1217),
 (('l', 'y', 'n'), 976),
 (('y', 'n', '<E>'), 953),
 (('a', 'r', 'i'), 950),
 (('i', 'a', '<E>'), 903),
 (('i', 'e', '<E>'), 858),
 (('a', 'n', 'n'), 825),
 (('e', 'l', 'l'), 822),
 (('a', 'n', 'a'), 804),
 (('i', 'a', 'n'), 790),
 (('m', 'a', 'r'), 776),
 (('i', 'n', '<E>'), 766),
 (('e', 'l', '<E>'), 727),
 (('y', 'a', '<E>'), 716),
 (('a', 'n', 'i'), 703),
 (('<S>', 'd', 'a'), 700),
 (('l', 'a', '<E>'), 684),
 (('e', 'r', '<E>'), 683),
 (('i', 'y', 'a'), 669),
 (('l', 'a', 'n'), 647),
 (('<S>', 'b', 'r'), 646),
 (('n', 'n', 'a'), 633),
 (('<S>', 'a', 'l'), 632),
 (('<S>', 'c', 'a'), 628),
 (('r', 'a', '<E>'), 627),
 (('n', 'i', '<E>'), 625),
 (('<S>', 'a', 'n'), 623),
 (('n', 'n', '<E>'), 619),
 (('n', 'e', '<E>'), 607),
 (('e', 'e', '<E>'), 605),
 (('e', 'y', '<

In [9]:
import random
import torch
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
# lets check if we have same shuffle in both notebooks using our own local random object,
# with the same seed - it works
r = random.Random(2147483647)
r.shuffle(words)
words[:5]

['khole', 'harbour', 'devon', 'baine', 'erisha']

In [11]:
p80 = int(len(words) * 0.8)
p90 = int(len(words) * 0.9)
print(f'{p80=}, {p90=}')
train = words[:p80]
dev = words[p80:p90]
test = words[p90:]
print(f'train %: {len(train)/len(words)*100}, dev %: {len(dev)/len(words)*100}, test %: {len(test)/len(words)*100}')

p80=25626, p90=28829
train %: 79.99875128773452, dev %: 9.999063465800893, test %: 10.002185246464583


In [12]:
print(train[:5])
print(dev[:5])
print(test[:5])

['khole', 'harbour', 'devon', 'baine', 'erisha']
['shterna', 'tyanna', 'sarra', 'malachy', 'zenaya']
['phoenyx', 'christionna', 'bastien', 'niloufar', 'masa']


In [13]:
# pair -> char
# m = number of pairs; n = number of chars: same as in bigrams
# so our matrix of counts/probs will be m x n

In [14]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [15]:
# We actually need all possible pairs of our chars, as sampling can come up
# with a pair not seen it the actual data. 27*27

In [16]:
# same as with chars, but we need all pairs
pairs = []
for i in range(27):
  for j in range(27):
    pairs.append(itos[i] + itos[j])
pairs.sort()
# need to populate pair to ix and ix to pair dicts
pairtoi = {p:i for i,p in enumerate(pairs)}
itopair = {i:p for p,i in pairtoi.items()}

In [17]:
len(pairtoi), len(stoi)

(729, 27)

In [18]:
# Matrix of counts how often a pair followed by a char
# we build it only from train dataset
N = torch.zeros((729, 27), dtype=torch.int32)
for w in train:
  # as we now using pairs, we start with ..
  # didn't come up with better solution
  chs = ['.', '.'] + list(w) + ['.']
  # we can use indecies, but for simplicity just 3 iters
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = pairtoi[ch1+ch2]
    ix2 = stoi[ch3]
    N[ix1, ix2] += 1
    

In [19]:
N[0]

tensor([   0, 3522, 1040, 1238, 1351, 1248,  349,  534,  713,  477, 1935, 2305,
        1272, 2026,  941,  310,  412,   75, 1304, 1634, 1036,   68,  296,  244,
         110,  443,  743], dtype=torch.int32)

In [20]:
p = N[0].float()
p = p / p.sum()
p

tensor([0.0000, 0.1374, 0.0406, 0.0483, 0.0527, 0.0487, 0.0136, 0.0208, 0.0278,
        0.0186, 0.0755, 0.0899, 0.0496, 0.0791, 0.0367, 0.0121, 0.0161, 0.0029,
        0.0509, 0.0638, 0.0404, 0.0027, 0.0116, 0.0095, 0.0043, 0.0173, 0.0290])

In [21]:
g = torch.Generator().manual_seed(2147483647)
ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
itos[ix]

'j'

In [22]:
P = (N+1).float() # N+1 is smoothing, so to not have inf loss on zero prob
P /= P.sum(1, keepdims=True)

In [23]:
P[1].sum()

tensor(1.)

In [26]:
g = torch.Generator().manual_seed(2147483647)

for _ in range(10):
  
  out = ['.']  # prepopulate with first .
  i = 0 # start sampling from what char follows '..'
  while True:
    p = P[i]
    j = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[j])
    if j == 0: # we've sampled end of word
      break
    # update index i
    pair = ''.join(out[-2:])  # last 2 chars
    i = pairtoi[pair]

  print(''.join(out[1:]))

junide.
janasid.
prelay.
adin.
kairritoper.
sathen.
sameia.
yanileniassibduinrwin.
lessiyanayla.
te.


In [113]:
# Trying different seeds, it looks like more generated words became name-like. Tend to generate very long words as well.

In [114]:
# GOAL: maximize likelihood of the data w.r.t. model parameters (statistical modeling)
# equivalent to maximizing the log likelihood (because log is monotonic)
# equivalent to minimizing the negative log likelihood
# equivalent to minimizing the average negative log likelihood

# log(a*b*c) = log(a) + log(b) + log(c)

In [27]:
# evaluate on train
log_likelihood = 0.0
n = 0

for w in train:
# for w in ["alexey"]:
  chs = ['.', '.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = pairtoi[ch1+ch2]
    ix2 = stoi[ch3]
    prob = P[ix1, ix2]
    logprob = torch.log(prob)
    log_likelihood += logprob
    n += 1
    # print(f'{ch1}{ch2}{ch3}: {prob:.4f} {logprob:.4f}')

print('evaluate on train')
print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

evaluate on train
log_likelihood=tensor(-404414.3438)
nll=tensor(404414.3438)
2.2154107093811035


In [28]:
# evaluate on dev
log_likelihood = 0.0
n = 0

for w in dev:
# for w in ["alexey"]:
  chs = ['.', '.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = pairtoi[ch1+ch2]
    ix2 = stoi[ch3]
    prob = P[ix1, ix2]
    logprob = torch.log(prob)
    log_likelihood += logprob
    n += 1
    # print(f'{ch1}{ch2}{ch3}: {prob:.4f} {logprob:.4f}')

print('evaluate on dev')
print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

evaluate on dev
log_likelihood=tensor(-51308.7266)
nll=tensor(51308.7266)
2.246441602706909


In [29]:
# evaluate on test
log_likelihood = 0.0
n = 0

for w in test:
# for w in ["alexey"]:
  chs = ['.', '.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = pairtoi[ch1+ch2]
    ix2 = stoi[ch3]
    prob = P[ix1, ix2]
    logprob = torch.log(prob)
    log_likelihood += logprob
    n += 1
    # print(f'{ch1}{ch2}{ch3}: {prob:.4f} {logprob:.4f}')

print('evaluate on test')
print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

evaluate on test
log_likelihood=tensor(-50781.2344)
nll=tensor(50781.2344)
2.23116135597229


In [30]:
# Increasing context to have a probability of char following a pair improves loss.
# Also, the loss is slightly worse on dev and test data, train data counts do not
# ideally correspond to dev and test data statistics

In [31]:
# create the training set of trigrams (x,y)
xs, ys = [], []

for w in words[:1]:
  chs = ['.', '.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = pairtoi[ch1+ch2]
    ix2 = stoi[ch3]
    print(ch1, ch2, ch3)
    xs.append(ix1)
    ys.append(ix2)
    
xs = torch.tensor(xs)
ys = torch.tensor(ys)

. . k
. k h
k h o
h o l
o l e
l e .


In [32]:
xs

tensor([  0,  11, 305, 231, 417, 329])

In [33]:
ys

tensor([11,  8, 15, 12,  5,  0])

In [34]:
import torch.nn.functional as F
xenc = F.one_hot(xs, num_classes=729).float()
xenc

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [35]:
xenc.shape

torch.Size([6, 729])

In [36]:
xenc.dtype

torch.float32

In [37]:
W = torch.randn((729, 27))
xenc @ W

tensor([[-2.2384,  2.0809, -2.3702,  1.4668, -0.1945, -1.1215, -0.0506, -0.1151,
         -0.1295,  0.0205, -0.8201,  0.1926, -0.1934, -1.2083,  0.0183, -1.2770,
         -0.9674, -0.0900, -0.8087,  0.8243, -1.1320, -2.6060,  2.5413,  0.2313,
          0.2510,  1.5200,  1.2974],
        [ 0.7607, -0.4167, -0.4673, -0.1785,  0.6204, -0.5107, -1.3191,  1.6901,
          1.4808, -1.5397, -0.0837, -0.1209,  1.8576,  0.0862, -1.1492, -0.1593,
         -1.7086, -1.0545, -1.1250,  2.1263, -0.5392,  0.2980, -1.0450,  1.5037,
          1.6793, -1.1685, -0.1143],
        [ 1.0057, -1.3103,  2.1707, -0.4504, -0.0673, -0.5259, -0.0898, -0.0342,
          0.1120, -0.4122,  0.7286, -0.4306, -1.2055,  0.4941, -0.6138, -0.3939,
         -0.1062,  0.6107,  1.6400, -0.3373,  0.1924,  1.1623, -0.3114, -0.8499,
          2.0497,  0.6942, -0.7097],
        [ 0.7067, -0.0482, -1.0318, -1.1885, -0.5738, -0.2340, -0.6224, -2.7967,
         -0.2383,  0.6450, -0.2388,  0.9423,  0.5386, -0.3682, -0.4345, -0.1393

In [38]:
logits = xenc @ W # log-counts
counts = logits.exp() # equivalent N
probs = counts / counts.sum(1, keepdims=True)
probs

tensor([[0.0022, 0.1622, 0.0019, 0.0878, 0.0167, 0.0066, 0.0192, 0.0180, 0.0178,
         0.0207, 0.0089, 0.0245, 0.0167, 0.0060, 0.0206, 0.0056, 0.0077, 0.0185,
         0.0090, 0.0462, 0.0065, 0.0015, 0.2570, 0.0255, 0.0260, 0.0926, 0.0741],
        [0.0427, 0.0132, 0.0125, 0.0167, 0.0371, 0.0120, 0.0053, 0.1082, 0.0878,
         0.0043, 0.0184, 0.0177, 0.1280, 0.0218, 0.0063, 0.0170, 0.0036, 0.0070,
         0.0065, 0.1674, 0.0116, 0.0269, 0.0070, 0.0898, 0.1071, 0.0062, 0.0178],
        [0.0571, 0.0056, 0.1829, 0.0133, 0.0195, 0.0123, 0.0191, 0.0202, 0.0233,
         0.0138, 0.0433, 0.0136, 0.0063, 0.0342, 0.0113, 0.0141, 0.0188, 0.0384,
         0.1076, 0.0149, 0.0253, 0.0667, 0.0153, 0.0089, 0.1621, 0.0418, 0.0103],
        [0.0673, 0.0316, 0.0118, 0.0101, 0.0187, 0.0263, 0.0178, 0.0020, 0.0261,
         0.0632, 0.0261, 0.0851, 0.0568, 0.0230, 0.0215, 0.0289, 0.0346, 0.0138,
         0.0355, 0.1010, 0.0034, 0.0168, 0.0059, 0.0087, 0.1944, 0.0515, 0.0181],
        [0.0141, 0.0907,

In [39]:
probs[0]

tensor([0.0022, 0.1622, 0.0019, 0.0878, 0.0167, 0.0066, 0.0192, 0.0180, 0.0178,
        0.0207, 0.0089, 0.0245, 0.0167, 0.0060, 0.0206, 0.0056, 0.0077, 0.0185,
        0.0090, 0.0462, 0.0065, 0.0015, 0.2570, 0.0255, 0.0260, 0.0926, 0.0741])

In [40]:
probs[0].shape

torch.Size([27])

In [41]:
probs[0].sum()

tensor(1.0000)

In [42]:
# SUMMARY ------------------------------>>>>

In [43]:
xs

tensor([  0,  11, 305, 231, 417, 329])

In [44]:
ys

tensor([11,  8, 15, 12,  5,  0])

In [45]:
# randomly initialize 27 neurons' weights. each neuron receives 729 inputs (all possible pairs)
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((729, 27), generator=g)

In [46]:
xenc = F.one_hot(xs, num_classes=729).float() # input to the network: one-hot encoding
logits = xenc @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
# btw: the last 2 lines here are together called a 'softmax'

In [47]:
probs.shape

torch.Size([6, 27])

In [48]:

nlls = torch.zeros(6)
for i in range(6):
  # i-th bigram:
  x = xs[i].item() # input character index
  y = ys[i].item() # label character index
  print('--------')
  print(f'bigram example {i+1}: {itopair[x]}{itos[y]} (indexes {x},{y})')
  print('input to the neural net:', x)
  print('output probabilities from the neural net:', probs[i])
  print('label (actual next character):', y)
  p = probs[i, y]
  print('probability assigned by the net to the the correct character:', p.item())
  logp = torch.log(p)
  print('log likelihood:', logp.item())
  nll = -logp
  print('negative log likelihood:', nll.item())
  nlls[i] = nll

print('=========')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

--------
bigram example 1: ..k (indexes 0,11)
input to the neural net: 0
output probabilities from the neural net: tensor([0.0607, 0.0100, 0.0123, 0.0042, 0.0168, 0.0123, 0.0027, 0.0232, 0.0137,
        0.0313, 0.0079, 0.0278, 0.0091, 0.0082, 0.0500, 0.2378, 0.0603, 0.0025,
        0.0249, 0.0055, 0.0339, 0.0109, 0.0029, 0.0198, 0.0118, 0.1537, 0.1459])
label (actual next character): 11
probability assigned by the net to the the correct character: 0.027797512710094452
log likelihood: -3.58280873298645
negative log likelihood: 3.58280873298645
--------
bigram example 2: .kh (indexes 11,8)
input to the neural net: 11
output probabilities from the neural net: tensor([0.0065, 0.0446, 0.0052, 0.0091, 0.0787, 0.0122, 0.0621, 0.0676, 0.0543,
        0.0146, 0.0066, 0.0622, 0.1245, 0.0203, 0.0145, 0.0502, 0.0342, 0.0201,
        0.0365, 0.0386, 0.0637, 0.0165, 0.0617, 0.0343, 0.0405, 0.0025, 0.0182])
label (actual next character): 8
probability assigned by the net to the the correct character:

In [45]:
# --------- !!! OPTIMIZATION !!! yay --------------

In [49]:
xs

tensor([  0,  11, 305, 231, 417, 329])

In [50]:
ys

tensor([11,  8, 15, 12,  5,  0])

In [51]:
# randomly initialize 27 neurons' weights. each neuron receives 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((729, 27), generator=g, requires_grad=True)

In [64]:
# forward pass
xenc = F.one_hot(xs, num_classes=729).float() # input to the network: one-hot encoding
logits = xenc @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
loss = -probs[torch.arange(6), ys].log().mean()

In [65]:
print(loss.item())

3.02982497215271


In [66]:
# backward pass
W.grad = None # set to zero the gradient
loss.backward()

In [67]:
W.data += -0.1 * W.grad

In [606]:
# --------- !!! OPTIMIZATION !!! yay, but this time actually --------------

In [68]:
# create the dataset only from train data
xs, ys = [], []
for w in train:
  chs = ['.', '.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = pairtoi[ch1+ch2]
    ix2 = stoi[ch3]
    xs.append(ix1)
    ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((729, 27), generator=g, requires_grad=True)

number of examples:  182546


In [69]:
# gradient descent
for k in range(120):
  
  # forward pass
  xenc = F.one_hot(xs, num_classes=729).float() # input to the network: one-hot encoding
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
  print(f'{k=}, {loss.item()}')
  
  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()
  
  # update
  W.data += -50 * W.grad

k=0, 3.803398847579956
k=1, 3.6497108936309814
k=2, 3.5580077171325684
k=3, 3.490262746810913
k=4, 3.434335231781006
k=5, 3.385187864303589
k=6, 3.3409881591796875
k=7, 3.3007566928863525
k=8, 3.263888359069824
k=9, 3.2299532890319824
k=10, 3.1986024379730225
k=11, 3.1695306301116943
k=12, 3.142467498779297
k=13, 3.117175579071045
k=14, 3.093453884124756
k=15, 3.0711305141448975
k=16, 3.050062656402588
k=17, 3.030130624771118
k=18, 3.011232376098633
k=19, 2.9932830333709717
k=20, 2.976207733154297
k=21, 2.959941864013672
k=22, 2.9444282054901123
k=23, 2.929616689682007
k=24, 2.915461778640747
k=25, 2.9019227027893066
k=26, 2.8889617919921875
k=27, 2.8765451908111572
k=28, 2.864640951156616
k=29, 2.8532207012176514
k=30, 2.8422560691833496
k=31, 2.8317229747772217
k=32, 2.82159686088562
k=33, 2.8118555545806885
k=34, 2.802478075027466
k=35, 2.7934436798095703
k=36, 2.7847342491149902
k=37, 2.776331663131714
k=38, 2.768218517303467
k=39, 2.7603793144226074
k=40, 2.7527990341186523
k=41, 

In [70]:
# create the dataset from dev data
print('evaluate loss on dev dataset')
xs, ys = [], []
for w in dev:
  chs = ['.', '.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = pairtoi[ch1+ch2]
    ix2 = stoi[ch3]
    xs.append(ix1)
    ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# forward pass to evaluate loss
xenc = F.one_hot(xs, num_classes=729).float() # input to the network: one-hot encoding
logits = xenc @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
print(f'{k=}, {loss.item()}')

evaluate loss on dev dataset
number of examples:  22840
k=119, 2.4913697242736816


In [71]:
# create the dataset from test data
print('evaluate loss on test dataset')
xs, ys = [], []
for w in test:
  chs = ['.', '.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = pairtoi[ch1+ch2]
    ix2 = stoi[ch3]
    xs.append(ix1)
    ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# forward pass to evaluate loss
xenc = F.one_hot(xs, num_classes=729).float() # input to the network: one-hot encoding
logits = xenc @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
print(f'{k=}, {loss.item()}')

evaluate loss on test dataset
number of examples:  22760
k=119, 2.4849367141723633


In [72]:
# The loss for nn trigram model is about the same as for bigram. Quality of generation is not much better.
# NN also can't achieve 2.21 trigram statistical model result. Counting model has exact answers,
# it counted trigrams. On the other hand with nn we are trying to learn these counts from data using gradient
# descent.

In [76]:
# Evaluating on dev and test gives just a bit worse loss

In [73]:
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

for _ in range(5):
  
  out = ['.']
  i = 0
  while True:
    
    # ----------
    # BEFORE:
    #p = P[ix]
    # ----------
    # NOW:
    xenc = F.one_hot(torch.tensor([i]), num_classes=729).float()
    logits = xenc @ W # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    p = counts / counts.sum(1, keepdims=True) # probabilities for next character
    # ----------
    
    j = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[j])
    if j == 0:
      break
    # update index i
    pair = ''.join(out[-2:])  # last 2 chars
    i = pairtoi[pair]
  print(''.join(out[1:]))

juwjded.
anaqah.
pacfqjwein.
avoiibltohcaus.
ter.


In [74]:
W[0]

tensor([-2.7113,  2.1159,  0.8952,  1.0696,  1.1571,  1.0777, -0.1992,  0.2274,
         0.5171,  0.1143,  1.5166,  1.6917,  1.0968,  1.5626,  0.7950, -0.3182,
        -0.0327, -1.7431,  1.1216,  1.3474,  0.8913, -1.7654, -0.3648, -0.5589,
        -1.3467,  0.0401,  0.5584], grad_fn=<SelectBackward0>)

In [75]:
W[1]

tensor([-1.6632,  0.6028,  0.4133, -1.1712,  1.1766, -0.6634, -0.9772, -1.4414,
        -0.3502,  0.1478, -0.9474, -0.5754,  1.6878,  1.1625,  1.7101, -1.4293,
        -1.1534, -1.8035,  1.4607,  0.4853, -0.3756,  0.2106,  0.6671, -1.3427,
        -0.9443,  0.4153,  0.1868], grad_fn=<SelectBackward0>)