In [1]:
import sys
sys.path.insert(0, "..")
import numpy as np
import torch
import tensorflow as tf
from layers import torch_masked_CRF, tf_masked_CRF

# Seed the NumPy, PyTorch and TensorFlow random generators with one shared
# value so that a fresh "Restart & Run All" reproduces the numbers below.
SEED = 42
torch.manual_seed(SEED)
tf.random.set_seed(SEED)
np.random.seed(SEED)

## Synthetic data

In [2]:
# Tag vocabulary passed to both CRF layers (tag name -> integer index).
label2idx_map = {"O-none": 0, "B-loc": 1, "I-loc": 2, "B-org": 3, "I-org": 4}
# Derive the tag count from the vocabulary instead of hard-coding it, so the
# two can never drift apart when a label is added or removed.
NUM_TAGS = len(label2idx_map)

BATCH_SIZE = 1
SEQ_LENGTH = 10

# Both draws use the global NumPy stream; the order (emissions first, then
# tags) is kept so that a given seed yields the same data as before.
# Emission scores: uniform floats in [0, 1), one score per (sequence, step, tag).
emissions = np.random.rand(BATCH_SIZE, SEQ_LENGTH, NUM_TAGS)
# Gold tags: one random tag index in [0, NUM_TAGS) per (sequence, step).
tags = np.random.randint(NUM_TAGS, size=(BATCH_SIZE, SEQ_LENGTH))

## 1. PyTorch

### 1.a Forward pass

In [3]:
# Wrap the NumPy arrays as torch tensors. Only the emissions need gradients,
# since the backward pass below differentiates with respect to them.
torch_tags = torch.tensor(tags)
torch_emissions = torch.tensor(emissions, requires_grad=True)

# Build the masked CRF layer and evaluate it on the synthetic batch.
# NOTE(review): the value printed by this cell is negative, so it looks like
# the mean log-likelihood rather than the negative log-likelihood -- confirm
# against torch_masked_CRF.CRF.
torch_layer = torch_masked_CRF.CRF(
    num_tags=NUM_TAGS,
    batch_first=True,
    label2idx=label2idx_map,
)
neg_torch_loss = torch_layer(torch_emissions, torch_tags, reduction="mean")

# Decoded tag sequence for the same emissions (not used further in this notebook).
torch_preds = torch_layer.decode(torch_emissions)

print(neg_torch_loss)

tensor(-114.0810, dtype=torch.float64, grad_fn=<MeanBackward0>)


### 1.b Backward pass

In [4]:
# Backpropagate from the scalar computed in the forward pass; the gradient
# lands on the leaf tensor `torch_emissions` (created with requires_grad=True).
neg_torch_loss.backward()
torch_grads = torch_emissions.grad
print(torch_grads)

tensor([[[-0.1126,  0.6653, -0.2381, -0.1975, -0.1171],
         [-0.1587,  0.7825, -0.2341, -0.2910, -0.0988],
         [-0.1260, -0.4554, -0.1486,  0.8043, -0.0742],
         [-0.1820,  0.7288, -0.1832, -0.2953, -0.0683],
         [-0.2919,  0.7684, -0.1144, -0.2662, -0.0960],
         [-0.3056, -0.1929, -0.0831,  0.6501, -0.0685],
         [-0.2397, -0.1786, -0.0452,  0.5998, -0.1363],
         [ 0.6922, -0.2178, -0.0393, -0.3174, -0.1177],
         [-0.1655, -0.3047, -0.0462, -0.4007,  0.9170],
         [-0.3171, -0.2097, -0.0906, -0.3016,  0.9191]]], dtype=torch.float64)


## 2. TensorFlow

### 2.a Forward pass

In [5]:
# Wrap the NumPy arrays as TensorFlow objects. The emissions become a
# trainable Variable so the tape records operations on them.
tf_tags = tf.constant(tags)
tf_emissions = tf.Variable(emissions, trainable=True)

# Every sequence in the synthetic batch spans the full SEQ_LENGTH.
sequence_lengths = [SEQ_LENGTH] * BATCH_SIZE

# Record the forward pass. The tape is persistent so that the gradient cell
# below can be re-run without recomputing this cell.
with tf.GradientTape(persistent=True) as tape:
    tf_layer = tf_masked_CRF.MaskedCRF(
        num_output=NUM_TAGS,
        use_mask=True,
        label2idx_map=label2idx_map,
    )
    loss, per_example_loss, tf_pred = tf_layer.decode(
        logits=tf_emissions,
        label_ids=tf_tags,
        lengths=sequence_lengths,
    )
    # Flip the sign so the value is comparable with the PyTorch result above.
    neg_tf_loss = -1.0 * loss

print(neg_tf_loss)

tf.Tensor(-114.03488826140088, shape=(), dtype=float64)


2021-12-15 12:26:31.124678: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### 2.b Backward pass

In [6]:
# Gradient of the negated TensorFlow loss with respect to the emissions.
tf_grads = tape.gradient(neg_tf_loss, tf_emissions)
print(tf_grads)

# Quantify how well the two backends agree instead of comparing the printed
# arrays by eye. The two layers are initialised differently, so only
# approximate agreement is expected.
# Tolerance chosen from the recorded run, where the largest element-wise
# deviation between the two gradients is roughly 0.024.
GRAD_ATOL = 5e-2
tf_grads_np = tf_grads.numpy()
torch_grads_np = torch_emissions.grad.detach().numpy()
max_abs_diff = np.max(np.abs(tf_grads_np - torch_grads_np))
print(f"Max |TF grad - PyTorch grad| = {max_abs_diff:.4f} (tolerance {GRAD_ATOL})")
np.testing.assert_allclose(tf_grads_np, torch_grads_np, rtol=0, atol=GRAD_ATOL)

tf.Tensor(
[[[-0.10386129  0.67524633 -0.24685369 -0.19371572 -0.13081563]
  [-0.15550831  0.78549991 -0.23675906 -0.28451763 -0.10871491]
  [-0.12502566 -0.4509436  -0.14686009  0.802686   -0.07985664]
  [-0.17858315  0.73400336 -0.18302966 -0.29954124 -0.07284931]
  [-0.28750709  0.77243911 -0.1124481  -0.27150178 -0.10098215]
  [-0.29879891 -0.18877991 -0.0816968   0.64241969 -0.07314407]
  [-0.2319082  -0.17265856 -0.04459479  0.59277348 -0.14361192]
  [ 0.69449846 -0.20954137 -0.0389528  -0.31919062 -0.12681367]
  [-0.15962819 -0.29489605 -0.04499453 -0.41221641  0.91173517]
  [-0.3284229  -0.21412171 -0.0881785  -0.27749944  0.90822255]]], shape=(1, 10, 5), dtype=float64)


**Note**: Small differences between the two forward-pass results are due to different initialization values and to the different algorithms used to compute the loss. Consequently, these differences also affect the values of the gradients. What we are interested in is checking that the results of both the forward pass and the backward pass are "close enough".