## CS224N PyTorch

###  Introduction

In [1]:
import torch

import torch.nn as nn

import pprint
pp = pprint.PrettyPrinter()

### Tensors

In [8]:
data = [
    [0, 1],
    [2, 3],
    [4, 5]
]

In [6]:
torch.tensor([
    [0, 1],
    [2, 3],
    [4, 5]
], dtype=torch.float).bool()

tensor([[False,  True],
        [ True,  True],
        [ True,  True]])

In [16]:
import numpy as np

# `np.float` was only an alias for the builtin `float` (i.e. float64). It was
# deprecated in NumPy 1.20 and removed in 1.24, so the dtype is spelled explicitly.
ndarray = np.array(data, dtype=np.float64)
# from_numpy keeps the array's dtype, hence the float64 tensor shown below.
x_numpy = torch.from_numpy(ndarray)
x_numpy

tensor([[0., 1.],
        [2., 3.],
        [4., 5.]], dtype=torch.float64)

In [17]:
torch.zeros_like(x_numpy)

tensor([[0., 0.],
        [0., 0.],
        [0., 0.]], dtype=torch.float64)

In [18]:
torch.randn_like(x_numpy)

tensor([[ 0.6953, -0.0737],
        [ 0.3906,  0.2795],
        [-0.6142, -0.6387]], dtype=torch.float64)

In [19]:
torch.rand_like(x_numpy)

tensor([[0.8950, 0.7671],
        [0.0316, 0.0201],
        [0.0883, 0.0320]], dtype=torch.float64)

In [20]:
shape = (2, 3, 4)
x_zeros = torch.zeros(shape)

In [22]:
x = torch.arange(8)
x

tensor([0, 1, 2, 3, 4, 5, 6, 7])

In [23]:
x = torch.ones(10,11)
x.dtype

torch.float32

In [24]:
x.shape

torch.Size([10, 11])

In [26]:
x.size()

torch.Size([10, 11])

In [27]:
print(x.size(1), x.shape[1])

11 11


In [28]:
x = torch.arange(12).reshape((3, 4))
x

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

In [29]:
x_view = x.view(4, 3)
x_view

tensor([[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11]])

In [30]:
x_reshaped = torch.reshape(x, (4, 3))
x_reshaped

tensor([[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11]])

In [36]:
x = x.unsqueeze(-1)
print(x)
x.shape

tensor([[[ 0],
         [ 1],
         [ 2],
         [ 3]],

        [[ 4],
         [ 5],
         [ 6],
         [ 7]],

        [[ 8],
         [ 9],
         [10],
         [11]]])


torch.Size([3, 4, 1])

In [37]:
x = x.squeeze()
x.shape

torch.Size([3, 4])

In [38]:
x.numel()

12

In [40]:
x = torch.arange(4).reshape(2,2)
x

tensor([[0, 1],
        [2, 3]])

In [41]:
x.device

device(type='cpu')

In [42]:
torch.cuda.is_available()

True

In [47]:
x = x.to('cuda')

In [48]:
x.device

device(type='cuda', index=0)

In [50]:
x = torch.arange(1, 13).reshape(3, 2, 2)
x

tensor([[[ 1,  2],
         [ 3,  4]],

        [[ 5,  6],
         [ 7,  8]],

        [[ 9, 10],
         [11, 12]]])

In [51]:
x[0]

tensor([[1, 2],
        [3, 4]])

In [52]:
x[:, 0, :]

tensor([[ 1,  2],
        [ 5,  6],
        [ 9, 10]])

In [53]:
x[:, 0, 0]

tensor([1, 5, 9])

In [55]:
i = np.array([0, 0, 1, 1])
i

array([0, 0, 1, 1])

In [56]:
x[i]

tensor([[[1, 2],
         [3, 4]],

        [[1, 2],
         [3, 4]],

        [[5, 6],
         [7, 8]],

        [[5, 6],
         [7, 8]]])

In [57]:
i = torch.tensor([0, 0, 1, 1])

In [58]:
x[i]

tensor([[[1, 2],
         [3, 4]],

        [[1, 2],
         [3, 4]],

        [[5, 6],
         [7, 8]],

        [[5, 6],
         [7, 8]]])

In [60]:
i = torch.tensor([1, 2])
j = torch.tensor([0])

In [61]:
x[i, j]

tensor([[ 5,  6],
        [ 9, 10]])

In [62]:
x[0, 0, 0]

tensor(1)

In [64]:
x[0, 0, 1].item()

2

In [66]:
x = torch.ones((3, 2, 2))
x

tensor([[[1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.]]])

In [67]:
x + 2

tensor([[[3., 3.],
         [3., 3.]],

        [[3., 3.],
         [3., 3.]],

        [[3., 3.],
         [3., 3.]]])

In [73]:
a = torch.zeros((3, 4)) + 12
a

tensor([[12., 12., 12., 12.],
        [12., 12., 12., 12.],
        [12., 12., 12., 12.]])

In [76]:
b = torch.ones(4) * 5
b

tensor([5., 5., 5., 5.])

In [77]:
a / b

tensor([[2.4000, 2.4000, 2.4000, 2.4000],
        [2.4000, 2.4000, 2.4000, 2.4000],
        [2.4000, 2.4000, 2.4000, 2.4000]])

In [78]:
a // b

tensor([[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]])

In [79]:
a @ b

tensor([240., 240., 240.])

In [80]:
a @ b.T

tensor([240., 240., 240.])

In [81]:
print(a.mean())
print(a.mean(0))
print(a.mean(1))

tensor(12.)
tensor([12., 12., 12., 12.])
tensor([12., 12., 12.])


In [84]:
a_cat = torch.cat([a, a, a], dim = 1)
print(a_cat.shape)

torch.Size([3, 12])


#### In-place operations (methods with a trailing "_")

In [87]:
print(a)
a.add(a)
print(a)

tensor([[12., 12., 12., 12.],
        [12., 12., 12., 12.],
        [12., 12., 12., 12.]])
tensor([[12., 12., 12., 12.],
        [12., 12., 12., 12.],
        [12., 12., 12., 12.]])


In [88]:
a.add_(a)
a

tensor([[24., 24., 24., 24.],
        [24., 24., 24., 24.],
        [24., 24., 24., 24.]])

### Autograd

In [96]:
x = torch.tensor([2.], requires_grad=True)
pp.pprint(x.grad)

None


In [95]:
%pip install autopep8

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.


In [105]:
y = 3 * x ** 2
y.backward()
pp.pprint(x.grad)
print(x.grad)

tensor([96.])
tensor([96.])


In [100]:
z = x * x * 3
z.backward()
print(x.grad)

tensor([36.])


### Neural Network Module

In [108]:
import torch.nn as nn

In [117]:
input = torch.ones(2, 3, 4)

linear = nn.Linear(4,2)
linear_output = linear(input)
print(linear_output)
print(input.shape, linear_output.shape)

tensor([[[-0.2044, -0.0064],
         [-0.2044, -0.0064],
         [-0.2044, -0.0064]],

        [[-0.2044, -0.0064],
         [-0.2044, -0.0064],
         [-0.2044, -0.0064]]], grad_fn=<AddBackward0>)
torch.Size([2, 3, 4]) torch.Size([2, 3, 2])


In [118]:
list(linear.parameters())

[Parameter containing:
 tensor([[ 0.0210,  0.1254, -0.2565, -0.4391],
         [-0.0069, -0.3008, -0.1584,  0.4426]], requires_grad=True),
 Parameter containing:
 tensor([0.3448, 0.0170], requires_grad=True)]

In [120]:
sigmoid = nn.Sigmoid()
output = sigmoid(linear_output)
output

tensor([[[0.4491, 0.4984],
         [0.4491, 0.4984],
         [0.4491, 0.4984]],

        [[0.4491, 0.4984],
         [0.4491, 0.4984],
         [0.4491, 0.4984]]], grad_fn=<SigmoidBackward>)

In [121]:
block = nn.Sequential(
    nn.Linear(4, 2),
    nn.Sigmoid()
)

input = torch.ones(2, 3, 4)
output = block(input)
output


tensor([[[0.3871, 0.5984],
         [0.3871, 0.5984],
         [0.3871, 0.5984]],

        [[0.3871, 0.5984],
         [0.3871, 0.5984],
         [0.3871, 0.5984]]], grad_fn=<SigmoidBackward>)

In [399]:
class MultilayerPerceptron(nn.Module):
    """Small feed-forward network whose output has the same width as its input.

    Layer stack, in order: Linear(input_size -> hidden_size), ReLU,
    Linear(hidden_size -> input_size), Sigmoid.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()

        # Kept as attributes so the sizes can be inspected on the instance.
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Built as a list first; nn.Sequential numbers the layers 0..3,
        # so parameter names are model.0.* and model.2.*.
        layers = [
            nn.Linear(self.input_size, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.input_size),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the layer stack and return the result."""
        return self.model(x)

In [396]:
input = torch.randn(2, 5)

model = MultilayerPerceptron(5, 3)
model(input)

tensor([[0.0000, 0.0000, 0.0382, 0.0805, 0.3893],
        [0.0000, 0.0000, 0.0000, 0.1369, 0.3909]], grad_fn=<ReluBackward0>)

In [125]:
list(model.named_parameters())

[('model.0.weight',
  Parameter containing:
  tensor([[-0.2489,  0.2331, -0.3537, -0.1062, -0.1685],
          [-0.2450, -0.0494, -0.4220, -0.3156, -0.0396],
          [ 0.0272,  0.3860,  0.2569,  0.1305,  0.3587]], requires_grad=True)),
 ('model.0.bias',
  Parameter containing:
  tensor([-0.0320,  0.0241,  0.0735], requires_grad=True)),
 ('model.2.weight',
  Parameter containing:
  tensor([[-5.5817e-01, -3.8407e-01, -3.4362e-02],
          [ 2.6794e-01,  4.7692e-01,  3.6904e-01],
          [ 3.8586e-01, -4.7246e-01, -8.5691e-02],
          [-1.9326e-01, -4.8570e-02, -6.5397e-02],
          [ 2.6135e-01,  3.1259e-01, -2.9248e-04]], requires_grad=True)),
 ('model.2.bias',
  Parameter containing:
  tensor([ 0.1100, -0.4703, -0.0569,  0.3125, -0.2426], requires_grad=True))]

### Optimization

In [127]:
import torch.optim as optim

In [397]:
y = torch.ones(10, 5)
x = y + torch.randn_like(y)
x

tensor([[ 1.4638e+00,  1.7061e+00, -8.1742e-02,  2.0824e-03, -8.1178e-01],
        [ 1.5181e+00,  6.3762e-01,  6.5685e-01, -1.1984e+00,  1.1094e+00],
        [ 1.9646e+00,  2.7740e+00,  1.8522e+00,  9.9508e-01, -1.7581e+00],
        [ 2.5779e+00,  7.1963e-01,  7.9155e-01,  2.3631e-03,  2.2376e+00],
        [ 1.0614e+00,  1.8179e+00, -6.1048e-01, -2.6260e-01,  1.6664e-01],
        [ 1.8498e+00,  1.1941e+00, -3.3460e-01,  9.4875e-01,  1.1337e+00],
        [ 2.4866e+00,  1.1303e-01,  2.7144e-01, -1.2005e+00,  1.6092e+00],
        [ 2.2615e+00,  9.8113e-01, -2.0007e-01,  2.1673e+00,  1.8086e+00],
        [ 3.0018e+00,  5.8578e-01,  1.6651e+00,  2.1966e+00,  1.5566e+00],
        [ 3.0372e-01,  1.3393e+00,  2.9226e+00,  2.1142e+00,  2.3623e+00]])

In [400]:
model = MultilayerPerceptron(5, 3)
adam = optim.Adam(model.parameters(), lr=1e-1)

loss_function = nn.BCELoss()

y_pred = model(x)
loss_function(y_pred, y).item()

0.7727048397064209

In [401]:
n_epoch = 20
for epoch in range(n_epoch):
    adam.zero_grad()
    y_pred = model(x)
    loss = loss_function(y_pred, y)
    print(f"Epoch {epoch}: training loss: {loss}")
    loss.backward()
    adam.step()

Epoch 0: training loss: 0.7727048397064209
Epoch 1: training loss: 0.6358693838119507
Epoch 2: training loss: 0.4928739070892334
Epoch 3: training loss: 0.3425939083099365
Epoch 4: training loss: 0.21944347023963928
Epoch 5: training loss: 0.13096977770328522
Epoch 6: training loss: 0.07521481066942215
Epoch 7: training loss: 0.041371844708919525
Epoch 8: training loss: 0.021551301702857018
Epoch 9: training loss: 0.010372629389166832
Epoch 10: training loss: 0.004831144120544195
Epoch 11: training loss: 0.0022363169118762016
Epoch 12: training loss: 0.0010459395125508308
Epoch 13: training loss: 0.000499751593451947
Epoch 14: training loss: 0.0002456734946463257
Epoch 15: training loss: 0.0001248061889782548
Epoch 16: training loss: 6.567897798959166e-05
Epoch 17: training loss: 3.584321530070156e-05
Epoch 18: training loss: 2.0291949113016017e-05
Epoch 19: training loss: 1.1909470231330488e-05


In [402]:
list(model.parameters())

[Parameter containing:
 tensor([[ 0.8407,  0.7935,  0.8664, -1.5033,  1.0042],
         [-0.0480, -0.3573, -0.3542,  0.4423, -0.2553],
         [ 1.6622,  1.0431,  1.1439,  1.1252,  1.2748]], requires_grad=True),
 Parameter containing:
 tensor([ 1.2642, -0.1283,  1.0518], requires_grad=True),
 Parameter containing:
 tensor([[ 0.9113,  0.0715,  1.7074],
         [ 1.3359, -0.5011,  1.7406],
         [ 0.8692,  0.1059,  1.4898],
         [ 1.2079, -0.0022,  1.2265],
         [ 0.8207,  0.5294,  1.5747]], requires_grad=True),
 Parameter containing:
 tensor([0.6838, 1.1177, 0.8404, 1.1006, 0.9996], requires_grad=True)]

In [403]:
x2 = y + torch.randn_like(y)
y_pred = model(x2)
y_pred

tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [0.9998, 0.9999, 0.9995, 0.9991, 0.9997],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000]], grad_fn=<SigmoidBackward>)

### Demo: Word Window Classification
Goal: classify each word in a sentence as a LOCATION or not, using a window of the surrounding words.

#### Data

In [2]:
# Our raw data, which consists of sentences
corpus = [
          "We always come to Paris",
          "The professor is from Australia",
          "I live in Stanford",
          "He comes from Taiwan",
          "The capital of Turkey is Ankara"
         ]

In [3]:
def preprocess_sentence(sentence):
    """Lowercase `sentence` and split it on whitespace into a list of tokens."""
    lowered = sentence.lower()
    tokens = lowered.split()
    return tokens

# Tokenize through preprocess_sentence so training data and any later input
# share exactly one preprocessing path (the helper was previously unused).
train_sentences = [preprocess_sentence(sent) for sent in corpus]
train_sentences

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

In [4]:
# Lowercased words treated as locations; a token found in this set gets label 1.
locations = {"australia", "ankara", "paris", "stanford", "taiwan", "turkey"}

# One 0/1 label per token, in the same nested layout as train_sentences.
train_labels = [[1 if word in locations else 0 for word in sent] for sent in train_sentences]
train_labels

[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

In [5]:
vocabulary = set(w for s in train_sentences for w in s)
vocabulary

{'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

In [6]:
vocabulary.add('<unk>')

In [7]:
vocabulary.add('<pad>')

def pad_window(sentence, window_size, pad_token='<pad>'):
    """Return a new list: `sentence` with `window_size` pad tokens on each side.

    `sentence` must be a list, since it is joined with list concatenation.
    """
    padding = [pad_token for _ in range(window_size)]
    padded = padding + sentence
    padded = padded + padding
    return padded

window_size = 2
pad_window(train_sentences[0], window_size=window_size)

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

In [8]:
# sorted() accepts any iterable and returns a new list, so no list() copy is needed.
# Sorting fixes the index order; iterating the set directly would not be stable across runs.
ix_to_word = sorted(vocabulary)

# Inverse lookup: token -> its position in ix_to_word.
word_to_ix = {word: ind for ind, word in enumerate(ix_to_word)}
word_to_ix

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

In [9]:
def convert_token_to_indices(sentence, word_to_ix):
    """Map each token in `sentence` to its index in `word_to_ix`.

    Tokens missing from `word_to_ix` are mapped to the index stored under '<unk>'.
    """
    indices = []
    for token in sentence:
        # The fallback is looked up per token, exactly as a .get() default would be.
        indices.append(word_to_ix.get(token, word_to_ix['<unk>']))
    return indices

# Show an example
example_sentence = ["we", "always", "come", "to", "kuwait"]
example_indices = convert_token_to_indices(example_sentence, word_to_ix)
restored_example = [ix_to_word[ind] for ind in example_indices]

print(f"Original sentence is: {example_sentence}")
print(f"Going from words to indices: {example_indices}")
print(f"Going from indices to words: {restored_example}")

Original sentence is: ['we', 'always', 'come', 'to', 'kuwait']
Going from words to indices: [22, 2, 6, 20, 1]
Going from indices to words: ['we', 'always', 'come', 'to', '<unk>']


In [10]:
example_padded_indices = [convert_token_to_indices(s, word_to_ix) for s in train_sentences]
example_padded_indices

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

In [12]:
embedding_dim = 5
embeds = nn.Embedding(len(vocabulary), embedding_dim)
list(embeds.parameters())

[Parameter containing:
 tensor([[ 0.0510,  0.7643, -0.6074, -0.9190,  0.5395],
         [ 1.0151,  1.3013,  0.6155,  0.7153,  1.8642],
         [-0.3957, -0.4981, -1.1994,  0.8985,  0.0705],
         [ 0.3618, -0.8280, -0.3128,  2.3879,  2.6200],
         [-1.7187,  0.8925,  1.3094,  0.8820,  0.3823],
         [ 1.0132, -1.3024, -0.0541, -1.6570,  0.0592],
         [ 2.0937,  0.6968, -0.1896, -0.5536,  1.0649],
         [ 0.0063,  0.8306, -0.9647, -0.3866,  0.4924],
         [-0.5481, -0.7970, -0.5558,  0.0065,  0.8328],
         [-0.5575,  1.2137,  1.1258, -0.3319, -0.1089],
         [ 0.2343,  1.9562, -0.7370, -0.4266,  0.1678],
         [-1.1165, -0.5807, -0.9830,  1.2662,  1.3789],
         [-0.5219, -0.8730, -1.1051,  2.3017, -0.6814],
         [ 0.5085,  0.9873,  0.0833, -0.3883,  0.5493],
         [ 0.0848,  0.0592, -1.9237, -0.6693,  0.8189],
         [ 1.0588, -0.1862,  0.7847, -0.3037,  0.0632],
         [-0.3208, -0.3484, -0.0190, -0.4888, -0.7710],
         [-0.4450, -0.614

In [13]:
index = word_to_ix['paris']
index_tensor = torch.tensor(index, dtype=torch.long)
paris_embed = embeds(index_tensor)
paris_embed

tensor([ 1.0588, -0.1862,  0.7847, -0.3037,  0.0632],
       grad_fn=<EmbeddingBackward>)

In [14]:
index_paris = word_to_ix["paris"]
index_ankara = word_to_ix["ankara"]
indices = [index_paris, index_ankara]
indices_tensor = torch.tensor(indices)
embeddings = embeds(indices_tensor)
embeddings

tensor([[ 1.0588, -0.1862,  0.7847, -0.3037,  0.0632],
        [ 0.3618, -0.8280, -0.3128,  2.3879,  2.6200]],
       grad_fn=<EmbeddingBackward>)

In [23]:
from torch.utils.data import DataLoader
from functools import partial

def custom_collate_fn(batch, window_size, word_to_ix):
    """Collate (sentence, labels) pairs into padded index tensors.

    Intended to be used as a DataLoader `collate_fn` after binding
    `window_size` and `word_to_ix` (e.g. with functools.partial).

    Args:
        batch: sequence of (sentence, labels) pairs; `sentence` is a list of
            token strings and `labels` a list of ints.
        window_size: number of "<pad>" tokens added to each side of a sentence.
        word_to_ix: dict from token to index; must contain "<pad>" and "<unk>".

    Returns:
        Tuple (x_padded, y_padded, lengths) of LongTensors:
        x_padded holds the window-padded token indices, right-padded with the
        "<pad>" index to the longest sentence in the batch; y_padded holds the
        labels right-padded with 0; lengths holds len(labels) per example.
    """
    x, y = zip(*batch)

    def pad_window(sentence, window_size, pad_token="<pad>"):
        window = [pad_token] * window_size
        return window + sentence + window

    def convert_tokens_to_indices(sentence, word_to_ix):
        return [word_to_ix.get(token, word_to_ix["<unk>"]) for token in sentence]

    # Add the context window around every sentence, then map tokens to indices.
    x = [pad_window(s, window_size=window_size) for s in x]
    # Call the helper defined in this function: the previous code called a
    # differently named module-level function, so the collate function only
    # worked if that global happened to exist.
    x = [convert_tokens_to_indices(s, word_to_ix) for s in x]

    pad_token_ix = word_to_ix["<pad>"]

    # Sentences differ in length; pad_sequence right-pads them to a rectangle.
    x = [torch.tensor(x_i, dtype=torch.long) for x_i in x]
    x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=pad_token_ix)

    # Lengths are taken before padding the labels.
    lengths = torch.tensor([len(label) for label in y], dtype=torch.long)

    y = [torch.tensor(y_i, dtype=torch.long) for y_i in y]
    y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

    return x_padded, y_padded, lengths
    

In [24]:
# Pair every tokenized sentence with its label list.
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
# Bind the extra arguments so DataLoader can call the collate function with the batch only.
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# enumerate replaces the hand-maintained counter; the batched_* names are kept
# because the last batch is inspected after the loop.
for counter, (batched_x, batched_y, batched_lengths) in enumerate(loader):
    print(f"Iteration {counter}")
    print("Batched Input:")
    print(batched_x)
    print("Batched Labels:")
    print(batched_y)
    print("Batched Lengths:")
    print(batched_lengths)
    print("")

Iteration 0
Batched Input:
tensor([[ 0,  0,  9,  7,  8, 18,  0,  0,  0],
        [ 0,  0, 22,  2,  6, 20, 15,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1]])
Batched Lengths:
tensor([4, 5])

Iteration 1
Batched Input:
tensor([[ 0,  0, 19, 16, 12,  8,  4,  0,  0,  0],
        [ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1, 0],
        [0, 0, 0, 1, 0, 1]])
Batched Lengths:
tensor([5, 6])

Iteration 2
Batched Input:
tensor([[ 0,  0, 10, 13, 11, 17,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1]])
Batched Lengths:
tensor([4])



In [25]:
print(batched_x)
chunk = batched_x.unfold(1, window_size*2 + 1, 1)
print(chunk)

tensor([[ 0,  0, 10, 13, 11, 17,  0,  0]])
tensor([[[ 0,  0, 10, 13, 11],
         [ 0, 10, 13, 11, 17],
         [10, 13, 11, 17,  0],
         [13, 11, 17,  0,  0]]])


#### Model

In [30]:
class WordWindowClassifier(nn.Module):
    """Scores each token of a batch of sentences from its surrounding word window.

    Args:
        hyperparameters: dict with keys "window_size", "embed_dim",
            "hidden_dim" and "freeze_embeddings".
        vocab_size: number of rows in the embedding table.
        pad_ix: index of the padding token, passed to nn.Embedding as padding_idx.
    """

    def __init__(self, hyperparameters, vocab_size, pad_ix=0):
        super().__init__()

        self.window_size = hyperparameters["window_size"]
        self.embed_dim = hyperparameters["embed_dim"]
        self.hidden_dim = hyperparameters["hidden_dim"]
        self.freeze_embeddings = hyperparameters["freeze_embeddings"]

        # Embedding
        self.embeds = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_ix)
        if self.freeze_embeddings:
            # The layer is stored as `self.embeds`; the old code referenced a
            # non-existent `self.embed_layer` and raised AttributeError here.
            self.embeds.weight.requires_grad = False

        # Hidden layer. The width comes from this instance's own window size,
        # not from a module-level `window_size` variable.
        full_window_size = 2 * self.window_size + 1
        self.hidden_layer = nn.Sequential(
            nn.Linear(full_window_size * self.embed_dim, self.hidden_dim),
            nn.Tanh()
        )

        # Output layer: one score per window, squashed to (0, 1).
        self.output_layer = nn.Linear(self.hidden_dim, 1)
        self.probabilities = nn.Sigmoid()

    def forward(self, inputs):
        """Return one probability per window.

        Args:
            inputs: 2-D tensor (B, L) of token indices, already padded with
                `window_size` pad tokens on each side.

        Returns:
            Tensor of shape (B, L - 2 * window_size) with values from the sigmoid.
        """
        B, _ = inputs.size()
        full_window_size = 2 * self.window_size + 1

        # Slide a window of full_window_size with step 1 along the token axis:
        # (B, L) -> (B, L - 2 * window_size, full_window_size).
        token_windows = inputs.unfold(1, full_window_size, 1)
        _, adjusted_length, _ = token_windows.size()

        assert token_windows.size() == (B, adjusted_length, full_window_size)

        # Embed every token, then concatenate the embeddings within each window:
        # (B, adjusted_length, full_window_size * embed_dim).
        embedded_windows = self.embeds(token_windows)
        embedded_windows = embedded_windows.view(B, adjusted_length, -1)

        layer_1 = self.hidden_layer(embedded_windows)

        output = self.output_layer(layer_1)

        output = self.probabilities(output)
        # Drop the trailing size-1 dimension: (B, adjusted_length).
        output = output.view(B, -1)
        return output
        

#### Training

In [31]:
# Prepare the data
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

# Instantiate a DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Initialize a model
# It is useful to put all the model hyperparameters in a dictionary.
# batch_size and window_size reuse the variables above so the dictionary cannot
# drift from what the loader and collate function actually use (it used to say
# batch_size 4 while the loader ran with 2).
model_hyperparameters = {
    "batch_size": batch_size,
    "window_size": window_size,
    "embed_dim": 25,
    "hidden_dim": 25,
    "freeze_embeddings": False,
}

vocab_size = len(word_to_ix)
model = WordWindowClassifier(model_hyperparameters, vocab_size)

# Define an optimizer
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Define a loss function, which computes to binary cross entropy loss
def loss_function(batch_outputs, batch_labels, batch_lengths):
    """Binary cross-entropy of the batch, divided by the total number of words.

    Args:
        batch_outputs: predicted probabilities.
        batch_labels: 0/1 labels with the same shape as `batch_outputs`;
            converted to float before the loss is computed.
        batch_lengths: tensor with the number of words in each example.

    Returns:
        Scalar tensor: BCELoss(batch_outputs, batch_labels) / batch_lengths.sum().
    """
    targets = batch_labels.float()

    # NOTE(review): nn.BCELoss() with default arguments already averages over
    # every element (padded positions included), so the division below rescales
    # an averaged value -- confirm this is the intended normalisation.
    criterion = nn.BCELoss()
    batch_bce = criterion(batch_outputs, targets)

    # batch_lengths stores the number of words per training example.
    total_words = batch_lengths.sum().float()
    return batch_bce / total_words

In [32]:
# Function that will be called in every epoch
def train_epoch(loss_function, optimizer, model, loader):
  """Run one optimisation pass over `loader` and return the summed batch losses.

  Args:
    loss_function: callable (outputs, labels, lengths) -> scalar loss tensor.
    optimizer: optimizer holding the model's parameters.
    model: module called on each batch of inputs.
    loader: iterable yielding (inputs, labels, lengths) triples.

  Returns:
    Sum of `loss.item()` over all batches (0.0 if `loader` yields nothing).
  """
  # Accumulates the loss over every batch of this epoch
  total_loss = 0.0
  for batch_inputs, batch_labels, batch_lengths in loader:
    # Clear the gradients left over from the previous step
    optimizer.zero_grad()
    # Run a forward pass. Call the module itself rather than .forward(), so
    # that any hooks registered on the module are executed as well.
    outputs = model(batch_inputs)
    # Compute the batch loss
    loss = loss_function(outputs, batch_labels, batch_lengths)
    # Calculate the gradients
    loss.backward()
    # Update the parameters
    optimizer.step()
    total_loss += loss.item()

  return total_loss


# Function containing our main training loop
def train(loss_function, optimizer, model, loader, num_epochs=10000):
  """Call train_epoch `num_epochs` times, printing the loss on every 100th epoch.

  The arguments are passed through to train_epoch unchanged; nothing is returned.
  """
  for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, loader)
    # Report only on epochs 0, 100, 200, ...
    if epoch % 100 != 0:
      continue
    print(epoch, ": ", epoch_loss)

In [33]:
num_epochs = 2000
train(loss_function, optimizer, model, loader, num_epochs=num_epochs)

0.2654547542333603
0.24341053888201714
0.18935271725058556
0.15963807329535484
0.11803022213280201
0.09312437102198601
0.08193537592887878
0.07357266545295715
0.053441934287548065
0.04510915093123913
