## exercise 2
E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?


### train on the train_set

In [37]:
import torch
import random

words = open('names.txt', 'r').read().splitlines()
# 利用set能够去重的功能,找出文本中有哪些字符
chars = sorted(list(set(''.join(words))))
# 建立index->character之间的互相转换,额外加上填充字符'.'
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [39]:
words[0]

'emma'

In [57]:
def create_trigram_dataset(words):
    # create the dataset
    xs, ys = [], []
    for w in words:
      chs = ['.']+ ['.'] + list(w) + ['.']
      for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        xs.append([ix1,ix2])
        ys.append(ix3)
    xs = torch.tensor(xs)
    ys = torch.tensor(ys)
    num = int(xs.nelement()/2)
    print('number of examples: ', num)
    return (xs,ys,num)

In [66]:
# --- E02 的核心实现 ---
# 1. 设置一个随机种子，保证每次划分都一样（可复现）
random.seed(42)
random.shuffle(words)

# 2. 按“单词”划分 80/10/10
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

train_words = words[:n1]
dev_words   = words[n1:n2]
test_words  = words[n2:]

# 3. 创建三个完全分离 的数据集
xs_train, ys_train, num_train = create_trigram_dataset(train_words)
xs_dev,   ys_dev, num_dev = create_trigram_dataset(dev_words)
xs_test,  ys_test, num_test = create_trigram_dataset(test_words)

# 5. 打印结果，确认一下
print(f"总单词数: {len(words)}")
print(f"训练集单词数: {len(train_words)} -> 训练样本数: {len(xs_train)}")
print(f"开发集单词数: {len(dev_words)} -> 开发样本数: {len(xs_dev)}")
print(f"测试集单词数: {len(test_words)} -> 测试样本数: {len(xs_test)}")

number of examples:  182441
number of examples:  22902
number of examples:  22803
总单词数: 32033
训练集单词数: 25626 -> 训练样本数: 182441
开发集单词数: 3203 -> 开发样本数: 22902
测试集单词数: 3204 -> 测试样本数: 22803


In [68]:
# initialize the 'network'
import torch.nn.functional as F
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((54, 27), generator=g, requires_grad=True)

In [102]:
# gradient descent
for k in range(1000):
  
  # forward pass
  xenc = F.one_hot(xs_train, num_classes=27).float().view(-1,54) # input to the network: one-hot encoding
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  loss = -probs[torch.arange(num_train), ys_train].log().mean() + 0.01*(W**2).mean()
  print(loss.item())
  
  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()
  
  # update
  W.data += -10 * W.grad

### evaluate on dev dataset

In [92]:

# forward pass
xenc = F.one_hot(xs_dev, num_classes=27).float().view(-1,54) # input to the network: one-hot encoding
logits = xenc @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
loss_dev = -probs[torch.arange(num_dev), ys_dev].log().mean() + 0.01*(W**2).mean()
print(loss_dev.item())

2.357011079788208


### evaluate on test dataset

In [96]:

# forward pass
xenc = F.one_hot(xs_test, num_classes=27).float().view(-1,54) # input to the network: one-hot encoding
logits = xenc @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
loss_dev = -probs[torch.arange(num_test), ys_test].log().mean() + 0.01*(W**2).mean()
print(loss_dev.item())

2.356856107711792


### remodel the dataset


In [142]:
def create_trigram_dataset(words):
    # create the dataset
    xs, ys = [], []
    for w in words:
      chs = ['.']+ ['.'] + list(w) + ['.']
      for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        ix_pair = ix1 * 27 + ix2
        xs.append(ix_pair)
        ys.append(ix3)
    xs = torch.tensor(xs)
    ys = torch.tensor(ys)
    num = ys.nelement()
    print('number of examples: ', num)
    return (xs,ys,num)

In [148]:
# --- E02 的核心实现 ---
# 1. 设置一个随机种子，保证每次划分都一样（可复现）
random.seed(42)
random.shuffle(words)

# 2. 按“单词”划分 80/10/10
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

train_words = words[:n1]
dev_words   = words[n1:n2]
test_words  = words[n2:]

# 3. 创建三个完全分离 的数据集
xs_train, ys_train, num_train = create_trigram_dataset(train_words)
xs_dev,   ys_dev, num_dev = create_trigram_dataset(dev_words)
xs_test,  ys_test, num_test = create_trigram_dataset(test_words)

# 5. 打印结果，确认一下
print(f"总单词数: {len(words)}")
print(f"训练集单词数: {len(train_words)} -> 训练样本数: {len(xs_train)}")
print(f"开发集单词数: {len(dev_words)} -> 开发样本数: {len(xs_dev)}")
print(f"测试集单词数: {len(test_words)} -> 测试样本数: {len(xs_test)}")

number of examples:  182484
number of examples:  22869
number of examples:  22793
总单词数: 32033
训练集单词数: 25626 -> 训练样本数: 182484
开发集单词数: 3203 -> 开发样本数: 22869
测试集单词数: 3204 -> 测试样本数: 22793


In [150]:
# initialize the 'network'
import torch.nn.functional as F
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((729, 27), generator=g, requires_grad=True)

In [153]:
# gradient descent
for k in range(100):
  
  # forward pass
  xenc = F.one_hot(xs_train, num_classes=729).float() # input to the network: one-hot encoding
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  loss = -probs[torch.arange(num_train), ys_train].log().mean() + 0.01*(W**2).mean()
  print(loss.item())
  
  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()
  
  # update
  W.data += -50 * W.grad

2.978973627090454
2.9625449180603027
2.9468915462493896
2.931957483291626
2.9176950454711914
2.9040608406066895
2.8910152912139893
2.8785219192504883
2.8665482997894287
2.8550643920898438
2.844041347503662
2.83345365524292
2.8232765197753906
2.8134868144989014
2.804063558578491
2.7949864864349365
2.7862355709075928
2.7777936458587646
2.7696430683135986
2.761768102645874
2.7541539669036865
2.74678635597229
2.739651918411255
2.7327382564544678
2.726034164428711
2.7195286750793457
2.7132112979888916
2.7070729732513428
2.7011048793792725
2.695298910140991
2.6896474361419678
2.684143304824829
2.678779363632202
2.6735503673553467
2.6684494018554688
2.6634716987609863
2.658612012863159
2.653865337371826
2.6492276191711426
2.644693613052368
2.640260696411133
2.6359238624572754
2.631680488586426
2.6275265216827393
2.6234593391418457
2.619475841522217
2.615572452545166
2.6117472648620605
2.607997179031372
2.6043202877044678
2.6007139682769775
2.5971763134002686
2.5937044620513916
2.5902972221374

In [None]:
## exercise 3
E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?
