In [34]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tqdm import tqdm

torch.manual_seed(1)

<torch._C.Generator at 0x7f0a100c66d0>

### Affine Maps
The affine map is a function f(x) = Ax + b, where A is a matrix and x, b are vectors. A and b are learnable parameters. Typically, x is the input and we are applying some matrix transformation to get our desired output.

In [2]:
# affine map from R^5 to R^3: five input features, three output features
lin = nn.Linear(in_features=5, out_features=3)
print(lin)

Linear(in_features=5, out_features=3, bias=True)


In [4]:
# sample data of dimensions 2x5
data = torch.randn(2,5)
print(data)

tensor([[-0.5404, -2.2102,  2.1130, -0.0040,  1.3800],
        [-1.3505,  0.3455,  0.5046,  1.8213, -0.1814]])


In PyTorch, `nn.Linear` maps the rows of its input rather than the columns: the i-th row of the output is the affine map applied to the i-th row of the input.

In [5]:
# output dimensions of 2x3
print(lin(data))

tensor([[-0.4935,  0.5282,  0.1833],
        [-0.0047, -0.0912, -0.3908]], grad_fn=<AddmmBackward0>)


### Composition
Take two affine maps, f(x) = Ax + b and g(x) = Cx + d. <br>
We can compose the two affine maps as such: <br>
f(g(x)) = A(Cx + d) + b <br>
f(g(x)) = ACx + Ad + b <br>
f(g(x)) = (AC)x + (Ad + b) <br>
<br>
Thus, composing affine maps gives you another affine map, with matrix AC and offset Ad + b. A long chain of purely affine layers therefore adds no representational power over a single affine layer: the whole chain could always be collapsed into one.<br>
<br>
We introduce non-linearities between the affine layers to build more powerful models. We typically use a few core non-linearities that are easily differentiable.

In [6]:
# In PyTorch, most non-linearities live in torch.nn.functional (imported here as F)
# The non-linearities used here have no parameters that are updated during training, as affine maps do
data = torch.randn(2,2)
print(data)

tensor([[-0.9515,  0.4057],
        [-1.5164,  0.7322]])


In [7]:
print(F.relu(data))

tensor([[0.0000, 0.4057],
        [0.0000, 0.7322]])


### Softmax
Softmax is a special non-linearity that is typically used at the end of a neural network, and returns a probability distribution from a vector of real numbers. <br>
<br>
If $x$ is a vector of real numbers, then the $i$-th component of $\text{softmax}(x)$ is: <br>
$$\text{softmax}(x)_i = \frac{\exp(x_i)}{\sum_j \exp(x_j)}$$

In [9]:
data = torch.randn(5)
print(data)

tensor([ 1.0147, -0.1819,  0.6182,  0.0393,  0.9262])


In [14]:
# dim=0 normalises over the only axis of this 1-D tensor
softmax_probs = F.softmax(data, dim=0)
print(softmax_probs)
# the entries form a probability distribution, so they sum to 1
print(softmax_probs.sum())

tensor([0.3061, 0.0925, 0.2059, 0.1154, 0.2802])
tensor(1.)


There is also log softmax, which computes log probabilities:

> The real advantage is in the arithmetic. Log probabilities are not as easy to understand as probabilities (for most people), but every time you multiply together two probabilities (other than 1×1=1), you will end up with a value closer to 0. Dealing with numbers very close to 0 can become unstable with finite precision approximations, so working with logs makes things much more stable and in some cases quicker and easier. Why do you need any more justification than that?

https://stats.stackexchange.com/questions/483927/why-are-log-probabilities-useful

In [15]:
# log of the softmax probabilities, normalised over the only axis
softmax_log_probs = F.log_softmax(data, dim=0)
print(softmax_log_probs)
# unlike the probabilities, the log-probabilities do not sum to 1
print(softmax_log_probs.sum())

tensor([-1.1840, -2.3806, -1.5804, -2.1594, -1.2724])
tensor(-8.5767)


### Objective Functions
Objective functions, also known as loss or cost functions, are what neural networks are trained to minimize. To do so, we first pass an input through the network and compute the loss of the output. The parameters of the network are then updated using the gradient of the loss with respect to those parameters, stepping in the direction that decreases the loss. <br><br>
The motivation behind minimizing the loss on the training data is the hope that the network will generalize, producing low loss values on previously unseen examples as well.

### Optimization and Training

Tensors know how to compute gradients with respect to the values used to compute them, by means of the computational graph. Since the loss is a tensor, we can compute its gradients with respect to the parameters used to compute it. We can then perform gradient updates.

### Creating Network Components in PyTorch

We can build a network in PyTorch using only affine maps and non-linearities. In the following example, we will also compute a loss function and update parameters through backpropagation. <br><br>

The example network will take in a sparse bag-of-words (BoW) representation and output a probability distribution over the labels "English" and "Spanish". <br><br>

We assign each word in the vocabulary an index. Then, we pass the input through an affine map and perform a log softmax.

### Logistic Regression BoW

In [18]:
# Training set: (tokenised sentence, language label) pairs.
# Note: this rebinds `data`, which earlier cells used for demo tensors.
data = [("me gusta comer en la cafeteria".split(), "SPANISH"),
        ("Give it to me".split(), "ENGLISH"),
        ("No creo que sea una buena idea".split(), "SPANISH"),
        ("No it is not a good idea to get lost at sea".split(), "ENGLISH")]

In [19]:
# Held-out set in the same (tokenised sentence, language label) format.
test_data = [("Yo creo que si".split(), "SPANISH"),
            ("it is lost on me".split(), "ENGLISH")]

In [22]:
# Give every distinct word (from the training and test sentences alike) the
# next free integer id, in order of first appearance.
word_indices = {}
for sentence, _ in data + test_data:
    for word in sentence:
        # len() is evaluated before any insertion, so a new word gets the
        # current vocabulary size as its id; known words are left untouched.
        word_indices.setdefault(word, len(word_indices))

In [23]:
print(word_indices)

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}


In [25]:
class BoWClassifier(nn.Module):
    """Logistic-regression classifier over bag-of-words count vectors.

    A single affine layer maps a vocabulary-sized input vector to one score
    per label; the scores are then normalised into log-probabilities.
    """

    def __init__(self, num_labels, vocab_size):
        """Create the affine layer mapping ``vocab_size`` inputs to ``num_labels`` outputs."""
        super().__init__()
        self.linear = nn.Linear(in_features=vocab_size, out_features=num_labels)

    def forward(self, bow_vec):
        """Return log-probabilities over the labels, normalised along dimension 1."""
        label_scores = self.linear(bow_vec)
        return F.log_softmax(label_scores, dim=1)

In [26]:
def bow_vectorizer(sentence, word_indices):
    """Turn a tokenised sentence into a bag-of-words count vector.

    Args:
        sentence: iterable of word tokens.
        word_indices: mapping from word to its column index; its length
            fixes the width of the vector.

    Returns:
        A tensor of shape (1, len(word_indices)) whose entry at a word's
        index is the number of times that word occurs in ``sentence``.

    Raises:
        KeyError: if a token is missing from a dict ``word_indices``.
    """
    counts = torch.zeros(len(word_indices))
    for token in sentence:
        column = word_indices[token]
        counts[column] += 1
    # add a leading dimension of size 1 so the result is a one-row batch
    return counts.view(1, -1)

In [27]:
def make_target(label, label_indices):
    """Encode a label as a one-element class-index tensor.

    Args:
        label: the label to encode; must be a key of ``label_indices``.
        label_indices: mapping from label to integer class index.

    Returns:
        A 64-bit integer tensor of shape (1,) holding the class index.

    Raises:
        KeyError: if ``label`` is missing from a dict ``label_indices``.
    """
    # torch.tensor with an explicit dtype replaces the legacy typed
    # constructor torch.LongTensor, which PyTorch discourages for new code.
    return torch.tensor([label_indices[label]], dtype=torch.long)

In [38]:
# Input width of the classifier: one column per vocabulary word.
VOCAB_SIZE = len(word_indices)
# Class index assigned to each language label.
label_indices = {"SPANISH": 0, "ENGLISH": 1}
# Derived from the label map (rather than hard-coded) so the classifier's
# output width cannot drift out of sync with the set of labels.
NUM_LABELS = len(label_indices)

In [28]:
# Instantiate the classifier; its affine layer starts with random weights.
model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)

# parameters() yields the layer's weight matrix (NUM_LABELS x VOCAB_SIZE)
# followed by its bias vector (NUM_LABELS).
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[-0.1743,  0.1427, -0.0291,  0.1103,  0.0630, -0.1471,  0.0394,  0.0471,
         -0.1313, -0.0931,  0.0669,  0.0351, -0.0834, -0.0594,  0.1796, -0.0363,
          0.1106,  0.0849, -0.1268, -0.1668,  0.1882,  0.0102,  0.1344,  0.0406,
          0.0631,  0.1465],
        [ 0.1860, -0.1301,  0.0245,  0.1464,  0.1421,  0.1218, -0.1419, -0.1412,
         -0.1186,  0.0246,  0.1955, -0.1239,  0.1045, -0.1085, -0.1844, -0.0417,
          0.1130,  0.1821, -0.1218,  0.0426,  0.1692,  0.1300,  0.1222,  0.1394,
          0.1240,  0.0507]], requires_grad=True)
Parameter containing:
tensor([-0.1341, -0.1647], requires_grad=True)


In [29]:
# Forward pass on the first training sentence before any training has happened.
# no_grad() skips gradient tracking, which is not needed just to inspect output.
with torch.no_grad():
    sample = data[0]
    bow_vector = bow_vectorizer(sample[0], word_indices)
    # one row of log-probabilities, one column per label
    log_probs = model(bow_vector)
    print(log_probs)

tensor([[-0.9706, -0.4762]])


In [30]:
# Weight column for "creo": one entry per label (row 0 = SPANISH, row 1 = ENGLISH)
print(model.linear.weight[:, word_indices["creo"]])

tensor([0.0669, 0.1955], grad_fn=<SelectBackward0>)


In [39]:
# Negative log-likelihood loss; it takes log-probabilities as input, which is
# what the model's forward pass returns (log_softmax).
loss_fn = nn.NLLLoss()
# Plain stochastic gradient descent over all of the model's parameters.
optimizer = optim.SGD(model.parameters(), lr=0.1)

# 100 passes over the training set, with one parameter update per sentence.
for epoch in tqdm(range(100)):
    for instance, label in data:
        
        # Gradients accumulate across backward() calls, so clear them first.
        model.zero_grad()
        
        # Build the model input (count vector) and the class-index target.
        bow_vector = bow_vectorizer(instance, word_indices)
        target = make_target(label, label_indices)
        
        # Forward pass.
        log_probs = model(bow_vector)
        
        # Compute the loss, backpropagate, and take one SGD step.
        loss = loss_fn(log_probs, target)
        loss.backward()
        optimizer.step()

100%|██████████| 100/100 [00:00<00:00, 266.54it/s]


In [47]:
# Run the trained model on the held-out sentences; no_grad() disables
# gradient tracking since nothing is being trained here.
with torch.no_grad():
    for instance, label in test_data:
        bow_vec = bow_vectorizer(instance, word_indices)
        # Columns follow label_indices: [log P(SPANISH), log P(ENGLISH)].
        log_probs = model(bow_vec)
        print(log_probs)

tensor([[-0.1632, -1.8935]])
tensor([[-2.6800, -0.0710]])


In [48]:
# log_probs holds the log-probabilities of the last test sentence, so
# exponentiating recovers the probabilities directly. Applying softmax to
# values that are already normalised log-probabilities only re-normalises
# them and hides the intent.
x_t = torch.exp(log_probs)


In [49]:
x_t

tensor([[0.0686, 0.9314]])