# Dataset

In [7]:
# Read the whole file, then split it into one entry per line.
with open('names.txt') as f:
    content = f.read()
words = content.splitlines()

# Summary statistics: entry count, shortest/longest entry length, and a sample.
lengths = [len(w) for w in words]
print("Dataset size: ", len(words))
print("Smallest length: ", min(lengths))
print("Largest length: ", max(lengths))
print("Examples: ", words[:10])

Dataset size:  32033
Smallest length:  2
Largest length:  15
Examples:  ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']


# Examining & Counting Bigrams

In [64]:
# Tally how often each adjacent pair of symbols occurs across all words.
# Every word is wrapped in '<S>' / '<E>' markers first, so the counts also
# record which characters begin and end a word.
freq: dict[tuple[str, str], int] = {}
for word in words:
    symbols = ['<S>', *word, '<E>']
    # Zipping the sequence with itself shifted by one yields consecutive pairs.
    for pair in zip(symbols, symbols[1:]):
        freq[pair] = freq.get(pair, 0) + 1

In [65]:
# Rank the (bigram, count) entries from most to least frequent; the sort is
# stable, so equally frequent bigrams stay in insertion order.
most_common = sorted(freq.items(), key=lambda item: item[1], reverse=True)
most_common[:10]

[(('n', '<E>'), 6763),
 (('a', '<E>'), 6640),
 (('a', 'n'), 5438),
 (('<S>', 'a'), 4410),
 (('e', '<E>'), 3983),
 (('a', 'r'), 3264),
 (('e', 'l'), 3248),
 (('r', 'i'), 3033),
 (('n', 'a'), 2977),
 (('<S>', 'k'), 2963)]

# Count bigrams using Torch Tensors

In [70]:
import torch

# Use a 2-D tensor to capture the frequency of bigrams: N[i, j] counts how
# often the symbol with index j directly follows the symbol with index i.
# First, let's determine the characters in the dataset; their positions in
# sorted order are used as offsets within the tensor.

START = '<S>'
END = '<E>'

chars = sorted(set(''.join(words)))
stoi = {s: i for i, s in enumerate(chars)}
# Place the boundary markers right after the real characters so their indices
# can never collide with a character index, whatever the alphabet size.
stoi[START] = len(chars)
stoi[END] = len(chars) + 1

# One row and one column per symbol (all characters plus the two markers).
SZ = len(stoi)

N = torch.zeros(SZ, SZ, dtype=torch.int32)
for w in words:
    chs = [START] + list(w) + [END]
    # Zipping the sequence with itself shifted by one yields consecutive pairs.
    for ch1, ch2 in zip(chs, chs[1:]):
        N[stoi[ch1], stoi[ch2]] += 1

tensor([[ 556,  541,  470, 1042,  692,  134,  168, 2332, 1650,  175,  568, 2528,
         1634, 5438,   63,   82,   60, 3264, 1118,  687,  381,  834,  161,  182,
         2050,  435,    0, 6640],
        [ 321,   38,    1,   65,  655,    0,    0,   41,  217,    1,    0,  103,
            0,    4,  105,    0,    0,  842,    8,    2,   45,    0,    0,    0,
           83,    0,    0,  114],
        [ 815,    0,   42,    1,  551,    0,    2,  664,  271,    3,  316,  116,
            0,    0,  380,    1,   11,   76,    5,   35,   35,    0,    0,    3,
          104,    4,    0,   97],
        [1303,    1,    3,  149, 1283,    5,   25,  118,  674,    9,    3,   60,
           30,   31,  378,    0,    1,  424,   29,    4,   92,   17,   23,    0,
          317,    1,    0,  516],
        [ 679,  121,  153,  384, 1271,   82,  125,  152,  818,   55,  178, 3248,
          769, 2675,  269,   83,   14, 1958,  861,  580,   69,  463,   50,  132,
         1070,  181,    0, 3983],
        [ 242,    0,