docs/text/ner/anago/layers.html

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1" />
<meta name="generator" content="pdoc 0.10.0" />
<title>ktrain.text.ner.anago.layers API documentation</title>
<meta name="description" content="" />
<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/sanitize.min.css" integrity="sha256-PK9q560IAAa6WVRRh76LtCaI8pjTJ2z11v0miyNNjrs=" crossorigin>
<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/typography.min.css" integrity="sha256-7l/o7C8jubJiy74VsKTidCy1yBkRtiUGbVkYBylBqUg=" crossorigin>
<link rel="stylesheet preload" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/styles/github.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:30px;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:1em 0 .50em 0}h3{font-size:1.4em;margin:25px 0 10px 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:.8em;line-height:1.4em}code{background:#f2f2f1;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:0;border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0;padding:1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-weight:bold;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target .name{background:var(--highlight-color)}.name > span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em .5em;margin-bottom:1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.item .name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:1.5em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/highlight.min.js" integrity="sha256-Uv3H6lx7dJmRfRvH8TH6kJD1TSK1aFcwgx+mdg3epi8=" crossorigin></script>
<script>window.addEventListener('DOMContentLoaded', () => hljs.initHighlighting())</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>ktrain.text.ner.anago.layers</code></h1>
</header>
<section id="section-intro">
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">from __future__ import absolute_import, division

from .... import utils as U
from ....imports import *

# from keras_contrib.losses import crf_loss
# from keras_contrib.metrics import crf_marginal_accuracy
# from keras_contrib.metrics import crf_viterbi_accuracy
# from keras_contrib.utils.test_utils import to_tuple


class CRF(keras.layers.Layer):
    &#34;&#34;&#34;An implementation of linear chain conditional random field (CRF).

    An linear chain CRF is defined to maximize the following likelihood function:

    $$ L(W, U, b; y_1, ..., y_n) := \frac{1}{Z}
    \sum_{y_1, ..., y_n} \exp(-a_1&#39; y_1 - a_n&#39; y_n
        - \sum_{k=1^n}((f(x_k&#39; W + b) y_k) + y_1&#39; U y_2)), $$

    where:
        $Z$: normalization constant
        $x_k, y_k$:  inputs and outputs

    This implementation has two modes for optimization:
    1. (`join mode`) optimized by maximizing join likelihood,
    which is optimal in theory of statistics.
       Note that in this case, CRF must be the output/last layer.
    2. (`marginal mode`) return marginal probabilities on each time
    step and optimized via composition
       likelihood (product of marginal likelihood), i.e.,
       using `categorical_crossentropy` loss.
       Note that in this case, CRF can be either the last layer or an
       intermediate layer (though not explored).

    For prediction (test phrase), one can choose either Viterbi
    best path (class indices) or marginal
    probabilities if probabilities are needed.
    However, if one chooses *join mode* for training,
    Viterbi output is typically better than marginal output,
    but the marginal output will still perform
    reasonably close, while if *marginal mode* is used for training,
    marginal output usually performs
    much better. The default behavior and `metrics.crf_accuracy`
    is set according to this observation.

    In addition, this implementation supports masking and accepts either
    onehot or sparse target.

    If you open a issue or a pull request about CRF, please
    add &#39;cc @lzfelix&#39; to notify Luiz Felix.


    # Examples

    ```python
        from keras_contrib.layers import CRF
        from keras_contrib.losses import crf_loss
        from keras_contrib.metrics import crf_viterbi_accuracy

        model = Sequential()
        model.add(Embedding(3001, 300, mask_zero=True)(X)

        # use learn_mode = &#39;join&#39;, test_mode = &#39;viterbi&#39;,
        # sparse_target = True (label indice output)
        crf = CRF(10, sparse_target=True)
        model.add(crf)

        # crf_accuracy is default to Viterbi acc if using join-mode (default).
        # One can add crf.marginal_acc if interested, but may slow down learning
        model.compile(&#39;adam&#39;, loss=crf_loss, metrics=[crf_viterbi_accuracy])

        # y must be label indices (with shape 1 at dim 3) here,
        # since `sparse_target=True`
        model.fit(x, y)

        # prediction give onehot representation of Viterbi best path
        y_hat = model.predict(x_test)
    ```

    The following snippet shows how to load a persisted
    model that uses the CRF layer:

    ```python
        from tensorflow.keras.models import load_model
        from keras_contrib.losses import import crf_loss
        from keras_contrib.metrics import crf_viterbi_accuracy

        custom_objects={&#39;CRF&#39;: CRF,
                        &#39;crf_loss&#39;: crf_loss,
                        &#39;crf_viterbi_accuracy&#39;: crf_viterbi_accuracy}

        loaded_model = load_model(&#39;&lt;path_to_model&gt;&#39;,
                                  custom_objects=custom_objects)
    ```

    # Arguments
        units: Positive integer, dimensionality of the output space.
        learn_mode: Either &#39;join&#39; or &#39;marginal&#39;.
            The former train the model by maximizing join likelihood while the latter
            maximize the product of marginal likelihood over all time steps.
            One should use `losses.crf_nll` for &#39;join&#39; mode
            and `losses.categorical_crossentropy` or
            `losses.sparse_categorical_crossentropy` for
            `marginal` mode.  For convenience, simply
            use `losses.crf_loss`, which will decide the proper loss as described.
        test_mode: Either &#39;viterbi&#39; or &#39;marginal&#39;.
            The former is recommended and as default when `learn_mode = &#39;join&#39;` and
            gives one-hot representation of the best path at test (prediction) time,
            while the latter is recommended and chosen as default
            when `learn_mode = &#39;marginal&#39;`,
            which produces marginal probabilities for each time step.
            For evaluating metrics, one should
            use `metrics.crf_viterbi_accuracy` for &#39;viterbi&#39; mode and
            &#39;metrics.crf_marginal_accuracy&#39; for &#39;marginal&#39; mode, or
            simply use `metrics.crf_accuracy` for
            both which automatically decides it as described.
            One can also use both for evaluation at training.
        sparse_target: Boolean (default False) indicating
            if provided labels are one-hot or
            indices (with shape 1 at dim 3).
        use_boundary: Boolean (default True) indicating if trainable
            start-end chain energies
            should be added to model.
        use_bias: Boolean, whether the layer uses a bias vector.
        kernel_initializer: Initializer for the `kernel` weights matrix,
            used for the linear transformation of the inputs.
            (see [initializers](../initializers.md)).
        chain_initializer: Initializer for the `chain_kernel` weights matrix,
            used for the CRF chain energy.
            (see [initializers](../initializers.md)).
        boundary_initializer: Initializer for the `left_boundary`,
            &#39;right_boundary&#39; weights vectors,
            used for the start/left and end/right boundary energy.
            (see [initializers](../initializers.md)).
        bias_initializer: Initializer for the bias vector
            (see [initializers](../initializers.md)).
        activation: Activation function to use
            (see [activations](../activations.md)).
            If you pass None, no activation is applied
            (ie. &#34;linear&#34; activation: `a(x) = x`).
        kernel_regularizer: Regularizer function applied to
            the `kernel` weights matrix
            (see [regularizer](../regularizers.md)).
        chain_regularizer: Regularizer function applied to
            the `chain_kernel` weights matrix
            (see [regularizer](../regularizers.md)).
        boundary_regularizer: Regularizer function applied to
            the &#39;left_boundary&#39;, &#39;right_boundary&#39; weight vectors
            (see [regularizer](../regularizers.md)).
        bias_regularizer: Regularizer function applied to the bias vector
            (see [regularizer](../regularizers.md)).
        kernel_constraint: Constraint function applied to
            the `kernel` weights matrix
            (see [constraints](../constraints.md)).
        chain_constraint: Constraint function applied to
            the `chain_kernel` weights matrix
            (see [constraints](../constraints.md)).
        boundary_constraint: Constraint function applied to
            the `left_boundary`, `right_boundary` weights vectors
            (see [constraints](../constraints.md)).
        bias_constraint: Constraint function applied to the bias vector
            (see [constraints](../constraints.md)).
        input_dim: dimensionality of the input (integer).
            This argument (or alternatively, the keyword argument `input_shape`)
            is required when using this layer as the first layer in a model.
        unroll: Boolean (default False). If True, the network will be
            unrolled, else a symbolic loop will be used.
            Unrolling can speed-up a RNN, although it tends
            to be more memory-intensive.
            Unrolling is only suitable for short sequences.

    # Input shape
        3D tensor with shape `(nb_samples, timesteps, input_dim)`.

    # Output shape
        3D tensor with shape `(nb_samples, timesteps, units)`.

    # Masking
        This layer supports masking for input data with a variable number
        of timesteps. To introduce masks to your data,
        use an [Embedding](embeddings.md) layer with the `mask_zero` parameter
        set to `True`.

    &#34;&#34;&#34;

    def __init__(
        self,
        units,
        learn_mode=&#34;join&#34;,
        test_mode=None,
        sparse_target=False,
        use_boundary=True,
        use_bias=True,
        activation=&#34;linear&#34;,
        kernel_initializer=&#34;glorot_uniform&#34;,
        chain_initializer=&#34;orthogonal&#34;,
        bias_initializer=&#34;zeros&#34;,
        boundary_initializer=&#34;zeros&#34;,
        kernel_regularizer=None,
        chain_regularizer=None,
        boundary_regularizer=None,
        bias_regularizer=None,
        kernel_constraint=None,
        chain_constraint=None,
        boundary_constraint=None,
        bias_constraint=None,
        input_dim=None,
        unroll=False,
        **kwargs
    ):
        super(CRF, self).__init__(**kwargs)
        self.supports_masking = True
        self.units = units
        self.learn_mode = learn_mode
        assert self.learn_mode in [&#34;join&#34;, &#34;marginal&#34;]
        self.test_mode = test_mode
        if self.test_mode is None:
            self.test_mode = &#34;viterbi&#34; if self.learn_mode == &#34;join&#34; else &#34;marginal&#34;
        else:
            assert self.test_mode in [&#34;viterbi&#34;, &#34;marginal&#34;]
        self.sparse_target = sparse_target
        self.use_boundary = use_boundary
        self.use_bias = use_bias

        self.activation = keras.activations.get(activation)

        self.kernel_initializer = keras.initializers.get(kernel_initializer)
        self.chain_initializer = keras.initializers.get(chain_initializer)
        self.boundary_initializer = keras.initializers.get(boundary_initializer)
        self.bias_initializer = keras.initializers.get(bias_initializer)

        self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
        self.chain_regularizer = keras.regularizers.get(chain_regularizer)
        self.boundary_regularizer = keras.regularizers.get(boundary_regularizer)
        self.bias_regularizer = keras.regularizers.get(bias_regularizer)

        self.kernel_constraint = keras.constraints.get(kernel_constraint)
        self.chain_constraint = keras.constraints.get(chain_constraint)
        self.boundary_constraint = keras.constraints.get(boundary_constraint)
        self.bias_constraint = keras.constraints.get(bias_constraint)

        self.unroll = unroll

    def build(self, input_shape):
        input_shape = to_tuple(input_shape)
        self.input_spec = [keras.layers.InputSpec(shape=input_shape)]
        self.input_dim = input_shape[-1]

        self.kernel = self.add_weight(
            shape=(self.input_dim, self.units),
            name=&#34;kernel&#34;,
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        self.chain_kernel = self.add_weight(
            shape=(self.units, self.units),
            name=&#34;chain_kernel&#34;,
            initializer=self.chain_initializer,
            regularizer=self.chain_regularizer,
            constraint=self.chain_constraint,
        )
        if self.use_bias:
            self.bias = self.add_weight(
                shape=(self.units,),
                name=&#34;bias&#34;,
                initializer=self.bias_initializer,
                regularizer=self.bias_regularizer,
                constraint=self.bias_constraint,
            )
        else:
            self.bias = 0

        if self.use_boundary:
            self.left_boundary = self.add_weight(
                shape=(self.units,),
                name=&#34;left_boundary&#34;,
                initializer=self.boundary_initializer,
                regularizer=self.boundary_regularizer,
                constraint=self.boundary_constraint,
            )
            self.right_boundary = self.add_weight(
                shape=(self.units,),
                name=&#34;right_boundary&#34;,
                initializer=self.boundary_initializer,
                regularizer=self.boundary_regularizer,
                constraint=self.boundary_constraint,
            )
        self.built = True

    def call(self, X, mask=None):
        if mask is not None:
            assert K.ndim(mask) == 2, &#34;Input mask to CRF must have dim 2 if not None&#34;

        if self.test_mode == &#34;viterbi&#34;:
            test_output = self.viterbi_decoding(X, mask)
        else:
            test_output = self.get_marginal_prob(X, mask)

        self.uses_learning_phase = True
        if self.learn_mode == &#34;join&#34;:
            train_output = K.zeros_like(K.dot(X, self.kernel))
            out = K.in_train_phase(train_output, test_output)
        else:
            if self.test_mode == &#34;viterbi&#34;:
                train_output = self.get_marginal_prob(X, mask)
                out = K.in_train_phase(train_output, test_output)
            else:
                out = test_output
        return out

    def compute_output_shape(self, input_shape):
        return input_shape[:2] + (self.units,)

    def compute_mask(self, input, mask=None):
        if mask is not None and self.learn_mode == &#34;join&#34;:
            return K.any(mask, axis=1)
        return mask

    def get_config(self):
        config = {
            &#34;units&#34;: self.units,
            &#34;learn_mode&#34;: self.learn_mode,
            &#34;test_mode&#34;: self.test_mode,
            &#34;use_boundary&#34;: self.use_boundary,
            &#34;use_bias&#34;: self.use_bias,
            &#34;sparse_target&#34;: self.sparse_target,
            &#34;kernel_initializer&#34;: keras.initializers.serialize(self.kernel_initializer),
            &#34;chain_initializer&#34;: keras.initializers.serialize(self.chain_initializer),
            &#34;boundary_initializer&#34;: keras.initializers.serialize(
                self.boundary_initializer
            ),
            &#34;bias_initializer&#34;: keras.initializers.serialize(self.bias_initializer),
            &#34;activation&#34;: keras.activations.serialize(self.activation),
            &#34;kernel_regularizer&#34;: keras.regularizers.serialize(self.kernel_regularizer),
            &#34;chain_regularizer&#34;: keras.regularizers.serialize(self.chain_regularizer),
            &#34;boundary_regularizer&#34;: keras.regularizers.serialize(
                self.boundary_regularizer
            ),
            &#34;bias_regularizer&#34;: keras.regularizers.serialize(self.bias_regularizer),
            &#34;kernel_constraint&#34;: keras.constraints.serialize(self.kernel_constraint),
            &#34;chain_constraint&#34;: keras.constraints.serialize(self.chain_constraint),
            &#34;boundary_constraint&#34;: keras.constraints.serialize(
                self.boundary_constraint
            ),
            &#34;bias_constraint&#34;: keras.constraints.serialize(self.bias_constraint),
            &#34;input_dim&#34;: self.input_dim,
            &#34;unroll&#34;: self.unroll,
        }
        base_config = super(CRF, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    @property
    def loss_function(self):
        # warnings.warn(&#39;CRF.loss_function is deprecated &#39;
        #&#39;and it might be removed in the future. Please &#39;
        #&#39;use losses.crf_loss instead.&#39;)
        return crf_loss

    @property
    def accuracy(self):
        # warnings.warn(&#39;CRF.accuracy is deprecated and it &#39;
        #&#39;might be removed in the future. Please &#39;
        #&#39;use metrics.crf_accuracy&#39;)
        if self.test_mode == &#34;viterbi&#34;:
            return crf_viterbi_accuracy
        else:
            return crf_marginal_accuracy

    @property
    def viterbi_acc(self):
        # warnings.warn(&#39;CRF.viterbi_acc is deprecated and it might &#39;
        #&#39;be removed in the future. Please &#39;
        #&#39;use metrics.viterbi_acc instead.&#39;)
        return crf_viterbi_accuracy

    @property
    def marginal_acc(self):
        # warnings.warn(&#39;CRF.moarginal_acc is deprecated and it &#39;
        #&#39;might be removed in the future. Please &#39;
        #&#39;use metrics.marginal_acc instead.&#39;)
        return crf_marginal_accuracy

    @staticmethod
    def softmaxNd(x, axis=-1):
        m = K.max(x, axis=axis, keepdims=True)
        exp_x = K.exp(x - m)
        prob_x = exp_x / K.sum(exp_x, axis=axis, keepdims=True)
        return prob_x

    @staticmethod
    def shift_left(x, offset=1):
        assert offset &gt; 0
        return K.concatenate([x[:, offset:], K.zeros_like(x[:, :offset])], axis=1)

    @staticmethod
    def shift_right(x, offset=1):
        assert offset &gt; 0
        return K.concatenate([K.zeros_like(x[:, :offset]), x[:, :-offset]], axis=1)

    def add_boundary_energy(self, energy, mask, start, end):
        start = K.expand_dims(K.expand_dims(start, 0), 0)
        end = K.expand_dims(K.expand_dims(end, 0), 0)
        if mask is None:
            energy = K.concatenate([energy[:, :1, :] + start, energy[:, 1:, :]], axis=1)
            energy = K.concatenate([energy[:, :-1, :], energy[:, -1:, :] + end], axis=1)
        else:
            mask = K.expand_dims(K.cast(mask, K.floatx()))
            start_mask = K.cast(K.greater(mask, self.shift_right(mask)), K.floatx())
            end_mask = K.cast(K.greater(self.shift_left(mask), mask), K.floatx())
            energy = energy + start_mask * start
            energy = energy + end_mask * end
        return energy

    def get_log_normalization_constant(self, input_energy, mask, **kwargs):
        &#34;&#34;&#34;Compute logarithm of the normalization constant Z, where
        Z = sum exp(-E) -&gt; logZ = log sum exp(-E) =: -nlogZ
        &#34;&#34;&#34;
        # should have logZ[:, i] == logZ[:, j] for any i, j
        logZ = self.recursion(input_energy, mask, return_sequences=False, **kwargs)
        return logZ[:, 0]

    def get_energy(self, y_true, input_energy, mask):
        &#34;&#34;&#34;Energy = a1&#39; y1 + u1&#39; y1 + y1&#39; U y2 + u2&#39; y2 + y2&#39; U y3 + u3&#39; y3 + an&#39; y3&#34;&#34;&#34;
        input_energy = K.sum(input_energy * y_true, 2)  # (B, T)
        # (B, T-1)
        chain_energy = K.sum(
            K.dot(y_true[:, :-1, :], self.chain_kernel) * y_true[:, 1:, :], 2
        )

        if mask is not None:
            mask = K.cast(mask, K.floatx())
            # (B, T-1), mask[:,:-1]*mask[:,1:] makes it work with any padding
            chain_mask = mask[:, :-1] * mask[:, 1:]
            input_energy = input_energy * mask
            chain_energy = chain_energy * chain_mask
        total_energy = K.sum(input_energy, -1) + K.sum(chain_energy, -1)  # (B, )

        return total_energy

    def get_negative_log_likelihood(self, y_true, X, mask):
        &#34;&#34;&#34;Compute the loss, i.e., negative log likelihood (normalize by number of time steps)
        likelihood = 1/Z * exp(-E) -&gt;  neg_log_like = - log(1/Z * exp(-E)) = logZ + E
        &#34;&#34;&#34;
        input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
        if self.use_boundary:
            input_energy = self.add_boundary_energy(
                input_energy, mask, self.left_boundary, self.right_boundary
            )
        energy = self.get_energy(y_true, input_energy, mask)
        logZ = self.get_log_normalization_constant(
            input_energy, mask, input_length=K.int_shape(X)[1]
        )
        nloglik = logZ + energy
        if mask is not None:
            nloglik = nloglik / K.sum(K.cast(mask, K.floatx()), 1)
        else:
            nloglik = nloglik / K.cast(K.shape(X)[1], K.floatx())
        return nloglik

    def step(self, input_energy_t, states, return_logZ=True):
        # not in the following  `prev_target_val` has shape = (B, F)
        # where B = batch_size, F = output feature dim
        # Note: `i` is of float32, due to the behavior of `K.rnn`
        prev_target_val, i, chain_energy = states[:3]
        t = K.cast(i[0, 0], dtype=&#34;int32&#34;)
        if len(states) &gt; 3:
            if K.backend() == &#34;theano&#34;:
                m = states[3][:, t : (t + 2)]
            else:
                m = tf.slice(states[3], [0, t], [-1, 2])

            input_energy_t = input_energy_t * K.expand_dims(m[:, 0])
            # (1, F, F)*(B, 1, 1) -&gt; (B, F, F)
            chain_energy = chain_energy * K.expand_dims(
                K.expand_dims(m[:, 0] * m[:, 1])
            )
        if return_logZ:
            # shapes: (1, B, F) + (B, F, 1) -&gt; (B, F, F)
            energy = chain_energy + K.expand_dims(input_energy_t - prev_target_val, 2)
            # new_target_val = K.logsumexp(-energy, 1)  # shapes: (B, F)
            new_target_val = tf.reduce_logsumexp(-energy, 1)  # shapes: (B, F)
            return new_target_val, [new_target_val, i + 1]
        else:
            energy = chain_energy + K.expand_dims(input_energy_t + prev_target_val, 2)
            min_energy = K.min(energy, 1)
            # cast for tf-version `K.rnn
            argmin_table = K.cast(K.argmin(energy, 1), K.floatx())
            return argmin_table, [min_energy, i + 1]

    def recursion(
        self,
        input_energy,
        mask=None,
        go_backwards=False,
        return_sequences=True,
        return_logZ=True,
        input_length=None,
    ):
        &#34;&#34;&#34;Forward (alpha) or backward (beta) recursion

        If `return_logZ = True`, compute the logZ, the normalization constant:

        \[ Z = \sum_{y1, y2, y3} exp(-E) # energy
          = \sum_{y1, y2, y3} exp(-(u1&#39; y1 + y1&#39; W y2 + u2&#39; y2 + y2&#39; W y3 + u3&#39; y3))
          = sum_{y2, y3} (exp(-(u2&#39; y2 + y2&#39; W y3 + u3&#39; y3))
          sum_{y1} exp(-(u1&#39; y1&#39; + y1&#39; W y2))) \]

        Denote:
            \[ S(y2) := sum_{y1} exp(-(u1&#39; y1 + y1&#39; W y2)), \]
            \[ Z = sum_{y2, y3} exp(log S(y2) - (u2&#39; y2 + y2&#39; W y3 + u3&#39; y3)) \]
            \[ logS(y2) = log S(y2) = log_sum_exp(-(u1&#39; y1&#39; + y1&#39; W y2)) \]
        Note that:
              yi&#39;s are one-hot vectors
              u1, u3: boundary energies have been merged

        If `return_logZ = False`, compute the Viterbi&#39;s best path lookup table.
        &#34;&#34;&#34;
        chain_energy = self.chain_kernel
        # shape=(1, F, F): F=num of output features. 1st F is for t-1, 2nd F for t
        chain_energy = K.expand_dims(chain_energy, 0)
        # shape=(B, F), dtype=float32
        prev_target_val = K.zeros_like(input_energy[:, 0, :])

        if go_backwards:
            input_energy = K.reverse(input_energy, 1)
            if mask is not None:
                mask = K.reverse(mask, 1)

        initial_states = [prev_target_val, K.zeros_like(prev_target_val[:, :1])]
        constants = [chain_energy]

        if mask is not None:
            mask2 = K.cast(
                K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1), K.floatx()
            )
            constants.append(mask2)

        def _step(input_energy_i, states):
            return self.step(input_energy_i, states, return_logZ)

        target_val_last, target_val_seq, _ = K.rnn(
            _step,
            input_energy,
            initial_states,
            constants=constants,
            input_length=input_length,
            unroll=self.unroll,
        )

        if return_sequences:
            if go_backwards:
                target_val_seq = K.reverse(target_val_seq, 1)
            return target_val_seq
        else:
            return target_val_last

    def forward_recursion(self, input_energy, **kwargs):
        return self.recursion(input_energy, **kwargs)

    def backward_recursion(self, input_energy, **kwargs):
        return self.recursion(input_energy, go_backwards=True, **kwargs)

    def get_marginal_prob(self, X, mask=None):
        input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
        if self.use_boundary:
            input_energy = self.add_boundary_energy(
                input_energy, mask, self.left_boundary, self.right_boundary
            )
        input_length = K.int_shape(X)[1]
        alpha = self.forward_recursion(
            input_energy, mask=mask, input_length=input_length
        )
        beta = self.backward_recursion(
            input_energy, mask=mask, input_length=input_length
        )
        if mask is not None:
            input_energy = input_energy * K.expand_dims(K.cast(mask, K.floatx()))
        margin = -(self.shift_right(alpha) + input_energy + self.shift_left(beta))
        return self.softmaxNd(margin)

    def viterbi_decoding(self, X, mask=None):
        input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
        if self.use_boundary:
            input_energy = self.add_boundary_energy(
                input_energy, mask, self.left_boundary, self.right_boundary
            )

        argmin_tables = self.recursion(input_energy, mask, return_logZ=False)
        argmin_tables = K.cast(argmin_tables, &#34;int32&#34;)

        # backward to find best path, `initial_best_idx` can be any,
        # as all elements in the last argmin_table are the same
        argmin_tables = K.reverse(argmin_tables, 1)
        # matrix instead of vector is required by tf `K.rnn`
        initial_best_idx = [K.expand_dims(argmin_tables[:, 0, 0])]
        if K.backend() == &#34;theano&#34;:
            from theano import tensor as T

            initial_best_idx = [T.unbroadcast(initial_best_idx[0], 1)]

        def gather_each_row(params, indices):
            n = K.shape(indices)[0]
            if K.backend() == &#34;theano&#34;:
                from theano import tensor as T

                return params[T.arange(n), indices]
            elif K.backend() == &#34;tensorflow&#34;:
                import tensorflow as tf

                indices = K.transpose(K.stack([tf.range(n), indices]))
                return tf.gather_nd(params, indices)
            else:
                raise NotImplementedError

        def find_path(argmin_table, best_idx):
            next_best_idx = gather_each_row(argmin_table, best_idx[0][:, 0])
            next_best_idx = K.expand_dims(next_best_idx)
            if K.backend() == &#34;theano&#34;:
                from theano import tensor as T

                next_best_idx = T.unbroadcast(next_best_idx, 1)
            return next_best_idx, [next_best_idx]

        _, best_paths, _ = K.rnn(
            find_path,
            argmin_tables,
            initial_best_idx,
            input_length=K.int_shape(X)[1],
            unroll=self.unroll,
        )
        best_paths = K.reverse(best_paths, 1)
        best_paths = K.squeeze(best_paths, 2)

        return K.one_hot(best_paths, self.units)


def crf_nll(y_true, y_pred):
    &#34;&#34;&#34;The negative log-likelihood for linear chain Conditional Random Field (CRF).

    This loss function is only used when the `layers.CRF` layer
    is trained in the &#34;join&#34; mode.

    # Arguments
        y_true: tensor with true targets.
        y_pred: tensor with predicted targets.

    # Returns
        A scalar representing corresponding to the negative log-likelihood.

    # Raises
        TypeError: If CRF is not the last layer.

    # About GitHub
        If you open an issue or a pull request about CRF, please
        add `cc @lzfelix` to notify Luiz Felix.
    &#34;&#34;&#34;

    crf, idx = y_pred._keras_history[:2]
    if crf._outbound_nodes:
        raise TypeError(&#39;When learn_model=&#34;join&#34;, CRF must be the last layer.&#39;)
    if crf.sparse_target:
        y_true = K.one_hot(K.cast(y_true[:, :, 0], &#34;int32&#34;), crf.units)
    # X = crf._inbound_nodes[idx].input_tensors[0]
    # mask = crf._inbound_nodes[idx].input_masks[0]
    X = crf.get_input_at(idx)
    mask = crf.get_input_mask_at(idx)
    nloglik = crf.get_negative_log_likelihood(y_true, X, mask)
    return nloglik


def crf_loss(y_true, y_pred):
    &#34;&#34;&#34;General CRF loss function depending on the learning mode.

    # Arguments
        y_true: tensor with true targets.
        y_pred: tensor with predicted targets.

    # Returns
        If the CRF layer is being trained in the join mode, returns the negative
        log-likelihood. Otherwise returns the categorical crossentropy implemented
        by the underlying Keras backend.

    # About GitHub
        If you open an issue or a pull request about CRF, please
        add `cc @lzfelix` to notify Luiz Felix.
    &#34;&#34;&#34;
    crf, idx = y_pred._keras_history[:2]
    if crf.learn_mode == &#34;join&#34;:
        return crf_nll(y_true, y_pred)
    else:
        if crf.sparse_target:
            return keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
        else:
            return keras.losses.categorical_crossentropy(y_true, y_pred)


def _get_accuracy(y_true, y_pred, mask, sparse_target=False):
    y_pred = K.argmax(y_pred, -1)
    if sparse_target:
        y_true = K.cast(y_true[:, :, 0], K.dtype(y_pred))
    else:
        y_true = K.argmax(y_true, -1)
    judge = K.cast(K.equal(y_pred, y_true), K.floatx())
    if mask is None:
        return K.mean(judge)
    else:
        mask = K.cast(mask, K.floatx())
        return K.sum(judge * mask) / K.sum(mask)


def crf_viterbi_accuracy(y_true, y_pred):
    &#34;&#34;&#34;Use Viterbi algorithm to get best path, and compute its accuracy.
    `y_pred` must be an output from CRF.&#34;&#34;&#34;
    crf, idx = y_pred._keras_history[:2]
    # X = crf._inbound_nodes[idx].input_tensors[0]
    # mask = crf._inbound_nodes[idx].input_masks[0]
    X = crf.get_input_at(idx)
    mask = crf.get_input_mask_at(idx)
    y_pred = crf.viterbi_decoding(X, mask)
    return _get_accuracy(y_true, y_pred, mask, crf.sparse_target)


def crf_marginal_accuracy(y_true, y_pred):
    &#34;&#34;&#34;Use time-wise marginal argmax as prediction.
    `y_pred` must be an output from CRF with `learn_mode=&#34;marginal&#34;`.&#34;&#34;&#34;
    crf, idx = y_pred._keras_history[:2]
    # X = crf._inbound_nodes[idx].input_tensors[0]
    # mask = crf._inbound_nodes[idx].input_masks[0]
    X = crf.get_input_at(idx)
    mask = crf.get_input_mask_at(idx)
    y_pred = crf.get_marginal_prob(X, mask)
    return _get_accuracy(y_true, y_pred, mask, crf.sparse_target)


def crf_accuracy(y_true, y_pred):
    &#34;&#34;&#34;Ge default accuracy based on CRF `test_mode`.&#34;&#34;&#34;
    crf, idx = y_pred._keras_history[:2]
    if crf.test_mode == &#34;viterbi&#34;:
        return crf_viterbi_accuracy(y_true, y_pred)


def to_tuple(shape):
    &#34;&#34;&#34;This functions is here to fix an inconsistency between keras and tf.keras.

    In tf.keras, the input_shape argument is an tuple with `Dimensions` objects.
    In keras, the input_shape is a simple tuple of ints or `None`.

    We&#39;ll work with tuples of ints or `None` to be consistent
    with keras-team/keras. So we must apply this function to
    all input_shapes of the build methods in custom layers.
    &#34;&#34;&#34;
    return tuple(tf.TensorShape(shape).as_list())</code></pre>
</details>
</section>
<section>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="ktrain.text.ner.anago.layers.crf_accuracy"><code class="name flex">
<span>def <span class="ident">crf_accuracy</span></span>(<span>y_true, y_pred)</span>
</code></dt>
<dd>
<div class="desc"><p>Ge default accuracy based on CRF <code>test_mode</code>.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def crf_accuracy(y_true, y_pred):
    &#34;&#34;&#34;Ge default accuracy based on CRF `test_mode`.&#34;&#34;&#34;
    crf, idx = y_pred._keras_history[:2]
    if crf.test_mode == &#34;viterbi&#34;:
        return crf_viterbi_accuracy(y_true, y_pred)</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.crf_loss"><code class="name flex">
<span>def <span class="ident">crf_loss</span></span>(<span>y_true, y_pred)</span>
</code></dt>
<dd>
<div class="desc"><p>General CRF loss function depending on the learning mode.</p>
<h1 id="arguments">Arguments</h1>
<pre><code>y_true: tensor with true targets.
y_pred: tensor with predicted targets.
</code></pre>
<h1 id="returns">Returns</h1>
<pre><code>If the CRF layer is being trained in the join mode, returns the negative
log-likelihood. Otherwise returns the categorical crossentropy implemented
by the underlying Keras backend.
</code></pre>
<h1 id="about-github">About GitHub</h1>
<pre><code>If you open an issue or a pull request about CRF, please
add `cc @lzfelix` to notify Luiz Felix.
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def crf_loss(y_true, y_pred):
    &#34;&#34;&#34;General CRF loss function depending on the learning mode.

    # Arguments
        y_true: tensor with true targets.
        y_pred: tensor with predicted targets.

    # Returns
        If the CRF layer is being trained in the join mode, returns the negative
        log-likelihood. Otherwise returns the categorical crossentropy implemented
        by the underlying Keras backend.

    # About GitHub
        If you open an issue or a pull request about CRF, please
        add `cc @lzfelix` to notify Luiz Felix.
    &#34;&#34;&#34;
    crf, idx = y_pred._keras_history[:2]
    if crf.learn_mode == &#34;join&#34;:
        return crf_nll(y_true, y_pred)
    else:
        if crf.sparse_target:
            return keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
        else:
            return keras.losses.categorical_crossentropy(y_true, y_pred)</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.crf_marginal_accuracy"><code class="name flex">
<span>def <span class="ident">crf_marginal_accuracy</span></span>(<span>y_true, y_pred)</span>
</code></dt>
<dd>
<div class="desc"><p>Use time-wise marginal argmax as prediction.
<code>y_pred</code> must be an output from CRF with <code>learn_mode="marginal"</code>.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def crf_marginal_accuracy(y_true, y_pred):
    &#34;&#34;&#34;Use time-wise marginal argmax as prediction.
    `y_pred` must be an output from CRF with `learn_mode=&#34;marginal&#34;`.&#34;&#34;&#34;
    crf, idx = y_pred._keras_history[:2]
    # X = crf._inbound_nodes[idx].input_tensors[0]
    # mask = crf._inbound_nodes[idx].input_masks[0]
    X = crf.get_input_at(idx)
    mask = crf.get_input_mask_at(idx)
    y_pred = crf.get_marginal_prob(X, mask)
    return _get_accuracy(y_true, y_pred, mask, crf.sparse_target)</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.crf_nll"><code class="name flex">
<span>def <span class="ident">crf_nll</span></span>(<span>y_true, y_pred)</span>
</code></dt>
<dd>
<div class="desc"><p>The negative log-likelihood for linear chain Conditional Random Field (CRF).</p>
<p>This loss function is only used when the <code>layers.CRF</code> layer
is trained in the "join" mode.</p>
<h1 id="arguments">Arguments</h1>
<pre><code>y_true: tensor with true targets.
y_pred: tensor with predicted targets.
</code></pre>
<h1 id="returns">Returns</h1>
<pre><code>A scalar representing corresponding to the negative log-likelihood.
</code></pre>
<h1 id="raises">Raises</h1>
<pre><code>TypeError: If CRF is not the last layer.
</code></pre>
<h1 id="about-github">About GitHub</h1>
<pre><code>If you open an issue or a pull request about CRF, please
add `cc @lzfelix` to notify Luiz Felix.
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def crf_nll(y_true, y_pred):
    &#34;&#34;&#34;The negative log-likelihood for linear chain Conditional Random Field (CRF).

    This loss function is only used when the `layers.CRF` layer
    is trained in the &#34;join&#34; mode.

    # Arguments
        y_true: tensor with true targets.
        y_pred: tensor with predicted targets.

    # Returns
        A scalar representing corresponding to the negative log-likelihood.

    # Raises
        TypeError: If CRF is not the last layer.

    # About GitHub
        If you open an issue or a pull request about CRF, please
        add `cc @lzfelix` to notify Luiz Felix.
    &#34;&#34;&#34;

    crf, idx = y_pred._keras_history[:2]
    if crf._outbound_nodes:
        raise TypeError(&#39;When learn_model=&#34;join&#34;, CRF must be the last layer.&#39;)
    if crf.sparse_target:
        y_true = K.one_hot(K.cast(y_true[:, :, 0], &#34;int32&#34;), crf.units)
    # X = crf._inbound_nodes[idx].input_tensors[0]
    # mask = crf._inbound_nodes[idx].input_masks[0]
    X = crf.get_input_at(idx)
    mask = crf.get_input_mask_at(idx)
    nloglik = crf.get_negative_log_likelihood(y_true, X, mask)
    return nloglik</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.crf_viterbi_accuracy"><code class="name flex">
<span>def <span class="ident">crf_viterbi_accuracy</span></span>(<span>y_true, y_pred)</span>
</code></dt>
<dd>
<div class="desc"><p>Use Viterbi algorithm to get best path, and compute its accuracy.
<code>y_pred</code> must be an output from CRF.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def crf_viterbi_accuracy(y_true, y_pred):
    &#34;&#34;&#34;Use Viterbi algorithm to get best path, and compute its accuracy.
    `y_pred` must be an output from CRF.&#34;&#34;&#34;
    crf, idx = y_pred._keras_history[:2]
    # X = crf._inbound_nodes[idx].input_tensors[0]
    # mask = crf._inbound_nodes[idx].input_masks[0]
    X = crf.get_input_at(idx)
    mask = crf.get_input_mask_at(idx)
    y_pred = crf.viterbi_decoding(X, mask)
    return _get_accuracy(y_true, y_pred, mask, crf.sparse_target)</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.to_tuple"><code class="name flex">
<span>def <span class="ident">to_tuple</span></span>(<span>shape)</span>
</code></dt>
<dd>
<div class="desc"><p>This functions is here to fix an inconsistency between keras and tf.keras.</p>
<p>In tf.keras, the input_shape argument is an tuple with <code>Dimensions</code> objects.
In keras, the input_shape is a simple tuple of ints or <code>None</code>.</p>
<p>We'll work with tuples of ints or <code>None</code> to be consistent
with keras-team/keras. So we must apply this function to
all input_shapes of the build methods in custom layers.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def to_tuple(shape):
    &#34;&#34;&#34;This functions is here to fix an inconsistency between keras and tf.keras.

    In tf.keras, the input_shape argument is an tuple with `Dimensions` objects.
    In keras, the input_shape is a simple tuple of ints or `None`.

    We&#39;ll work with tuples of ints or `None` to be consistent
    with keras-team/keras. So we must apply this function to
    all input_shapes of the build methods in custom layers.
    &#34;&#34;&#34;
    return tuple(tf.TensorShape(shape).as_list())</code></pre>
</details>
</dd>
</dl>
</section>
<section>
<h2 class="section-title" id="header-classes">Classes</h2>
<dl>
<dt id="ktrain.text.ner.anago.layers.CRF"><code class="flex name class">
<span>class <span class="ident">CRF</span></span>
<span>(</span><span>units, learn_mode='join', test_mode=None, sparse_target=False, use_boundary=True, use_bias=True, activation='linear', kernel_initializer='glorot_uniform', chain_initializer='orthogonal', bias_initializer='zeros', boundary_initializer='zeros', kernel_regularizer=None, chain_regularizer=None, boundary_regularizer=None, bias_regularizer=None, kernel_constraint=None, chain_constraint=None, boundary_constraint=None, bias_constraint=None, input_dim=None, unroll=False, **kwargs)</span>
</code></dt>
<dd>
<div class="desc"><p>An implementation of linear chain conditional random field (CRF).</p>
<p>An linear chain CRF is defined to maximize the following likelihood function:</p>
<p>$$ L(W, U, b; y_1, &hellip;, y_n) :=
rac{1}{Z}
\sum_{y_1, &hellip;, y_n} \exp(-a_1' y_1 - a_n' y_n
- \sum_{k=1^n}((f(x_k' W + b) y_k) + y_1' U y_2)), $$</p>
<p>where:
$Z$: normalization constant
$x_k, y_k$:
inputs and outputs</p>
<p>This implementation has two modes for optimization:
1. (<code>join mode</code>) optimized by maximizing join likelihood,
which is optimal in theory of statistics.
Note that in this case, CRF must be the output/last layer.
2. (<code>marginal mode</code>) return marginal probabilities on each time
step and optimized via composition
likelihood (product of marginal likelihood), i.e.,
using <code>categorical_crossentropy</code> loss.
Note that in this case, CRF can be either the last layer or an
intermediate layer (though not explored).</p>
<p>For prediction (test phrase), one can choose either Viterbi
best path (class indices) or marginal
probabilities if probabilities are needed.
However, if one chooses <em>join mode</em> for training,
Viterbi output is typically better than marginal output,
but the marginal output will still perform
reasonably close, while if <em>marginal mode</em> is used for training,
marginal output usually performs
much better. The default behavior and <code>metrics.crf_accuracy</code>
is set according to this observation.</p>
<p>In addition, this implementation supports masking and accepts either
onehot or sparse target.</p>
<p>If you open a issue or a pull request about CRF, please
add 'cc @lzfelix' to notify Luiz Felix.</p>
<h1 id="examples">Examples</h1>
<pre><code class="language-python">    from keras_contrib.layers import CRF
    from keras_contrib.losses import crf_loss
    from keras_contrib.metrics import crf_viterbi_accuracy

    model = Sequential()
    model.add(Embedding(3001, 300, mask_zero=True)(X)

    # use learn_mode = 'join', test_mode = 'viterbi',
    # sparse_target = True (label indice output)
    crf = CRF(10, sparse_target=True)
    model.add(crf)

    # crf_accuracy is default to Viterbi acc if using join-mode (default).
    # One can add crf.marginal_acc if interested, but may slow down learning
    model.compile('adam', loss=crf_loss, metrics=[crf_viterbi_accuracy])

    # y must be label indices (with shape 1 at dim 3) here,
    # since `sparse_target=True`
    model.fit(x, y)

    # prediction give onehot representation of Viterbi best path
    y_hat = model.predict(x_test)
</code></pre>
<p>The following snippet shows how to load a persisted
model that uses the CRF layer:</p>
<pre><code class="language-python">    from tensorflow.keras.models import load_model
    from keras_contrib.losses import import crf_loss
    from keras_contrib.metrics import crf_viterbi_accuracy

    custom_objects={'CRF': CRF,
                    'crf_loss': crf_loss,
                    'crf_viterbi_accuracy': crf_viterbi_accuracy}

    loaded_model = load_model('&lt;path_to_model&gt;',
                              custom_objects=custom_objects)
</code></pre>
<h1 id="arguments">Arguments</h1>
<pre><code>units: Positive integer, dimensionality of the output space.
learn_mode: Either 'join' or 'marginal'.
    The former train the model by maximizing join likelihood while the latter
    maximize the product of marginal likelihood over all time steps.
    One should use &lt;code&gt;losses.crf\_nll&lt;/code&gt; for 'join' mode
    and &lt;code&gt;losses.categorical\_crossentropy&lt;/code&gt; or
    &lt;code&gt;losses.sparse\_categorical\_crossentropy&lt;/code&gt; for
    &lt;code&gt;marginal&lt;/code&gt; mode.  For convenience, simply
    use &lt;code&gt;losses.crf\_loss&lt;/code&gt;, which will decide the proper loss as described.
test_mode: Either 'viterbi' or 'marginal'.
    The former is recommended and as default when `learn_mode = 'join'` and
    gives one-hot representation of the best path at test (prediction) time,
    while the latter is recommended and chosen as default
    when `learn_mode = 'marginal'`,
    which produces marginal probabilities for each time step.
    For evaluating metrics, one should
    use &lt;code&gt;metrics.crf\_viterbi\_accuracy&lt;/code&gt; for 'viterbi' mode and
    'metrics.crf_marginal_accuracy' for 'marginal' mode, or
    simply use &lt;code&gt;metrics.crf\_accuracy&lt;/code&gt; for
    both which automatically decides it as described.
    One can also use both for evaluation at training.
sparse_target: Boolean (default False) indicating
    if provided labels are one-hot or
    indices (with shape 1 at dim 3).
use_boundary: Boolean (default True) indicating if trainable
    start-end chain energies
    should be added to model.
use_bias: Boolean, whether the layer uses a bias vector.
kernel_initializer: Initializer for the &lt;code&gt;kernel&lt;/code&gt; weights matrix,
    used for the linear transformation of the inputs.
    (see [initializers](../initializers.md)).
chain_initializer: Initializer for the &lt;code&gt;chain\_kernel&lt;/code&gt; weights matrix,
    used for the CRF chain energy.
    (see [initializers](../initializers.md)).
boundary_initializer: Initializer for the &lt;code&gt;left\_boundary&lt;/code&gt;,
    'right_boundary' weights vectors,
    used for the start/left and end/right boundary energy.
    (see [initializers](../initializers.md)).
bias_initializer: Initializer for the bias vector
    (see [initializers](../initializers.md)).
activation: Activation function to use
    (see [activations](../activations.md)).
    If you pass None, no activation is applied
    (ie. "linear" activation: `a(x) = x`).
kernel_regularizer: Regularizer function applied to
    the &lt;code&gt;kernel&lt;/code&gt; weights matrix
    (see [regularizer](../regularizers.md)).
chain_regularizer: Regularizer function applied to
    the &lt;code&gt;chain\_kernel&lt;/code&gt; weights matrix
    (see [regularizer](../regularizers.md)).
boundary_regularizer: Regularizer function applied to
    the 'left_boundary', 'right_boundary' weight vectors
    (see [regularizer](../regularizers.md)).
bias_regularizer: Regularizer function applied to the bias vector
    (see [regularizer](../regularizers.md)).
kernel_constraint: Constraint function applied to
    the &lt;code&gt;kernel&lt;/code&gt; weights matrix
    (see [constraints](../constraints.md)).
chain_constraint: Constraint function applied to
    the &lt;code&gt;chain\_kernel&lt;/code&gt; weights matrix
    (see [constraints](../constraints.md)).
boundary_constraint: Constraint function applied to
    the &lt;code&gt;left\_boundary&lt;/code&gt;, &lt;code&gt;right\_boundary&lt;/code&gt; weights vectors
    (see [constraints](../constraints.md)).
bias_constraint: Constraint function applied to the bias vector
    (see [constraints](../constraints.md)).
input_dim: dimensionality of the input (integer).
    This argument (or alternatively, the keyword argument &lt;code&gt;input\_shape&lt;/code&gt;)
    is required when using this layer as the first layer in a model.
unroll: Boolean (default False). If True, the network will be
    unrolled, else a symbolic loop will be used.
    Unrolling can speed-up a RNN, although it tends
    to be more memory-intensive.
    Unrolling is only suitable for short sequences.
</code></pre>
<h1 id="input-shape">Input shape</h1>
<pre><code>3D tensor with shape &lt;code&gt;(nb\_samples, timesteps, input\_dim)&lt;/code&gt;.
</code></pre>
<h1 id="output-shape">Output shape</h1>
<pre><code>3D tensor with shape &lt;code&gt;(nb\_samples, timesteps, units)&lt;/code&gt;.
</code></pre>
<h1 id="masking">Masking</h1>
<pre><code>This layer supports masking for input data with a variable number
of timesteps. To introduce masks to your data,
use an [Embedding](embeddings.md) layer with the &lt;code&gt;mask\_zero&lt;/code&gt; parameter
set to &lt;code&gt;True&lt;/code&gt;.
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class CRF(keras.layers.Layer):
    &#34;&#34;&#34;An implementation of linear chain conditional random field (CRF).

    An linear chain CRF is defined to maximize the following likelihood function:

    $$ L(W, U, b; y_1, ..., y_n) := \frac{1}{Z}
    \sum_{y_1, ..., y_n} \exp(-a_1&#39; y_1 - a_n&#39; y_n
        - \sum_{k=1^n}((f(x_k&#39; W + b) y_k) + y_1&#39; U y_2)), $$

    where:
        $Z$: normalization constant
        $x_k, y_k$:  inputs and outputs

    This implementation has two modes for optimization:
    1. (`join mode`) optimized by maximizing join likelihood,
    which is optimal in theory of statistics.
       Note that in this case, CRF must be the output/last layer.
    2. (`marginal mode`) return marginal probabilities on each time
    step and optimized via composition
       likelihood (product of marginal likelihood), i.e.,
       using `categorical_crossentropy` loss.
       Note that in this case, CRF can be either the last layer or an
       intermediate layer (though not explored).

    For prediction (test phrase), one can choose either Viterbi
    best path (class indices) or marginal
    probabilities if probabilities are needed.
    However, if one chooses *join mode* for training,
    Viterbi output is typically better than marginal output,
    but the marginal output will still perform
    reasonably close, while if *marginal mode* is used for training,
    marginal output usually performs
    much better. The default behavior and `metrics.crf_accuracy`
    is set according to this observation.

    In addition, this implementation supports masking and accepts either
    onehot or sparse target.

    If you open a issue or a pull request about CRF, please
    add &#39;cc @lzfelix&#39; to notify Luiz Felix.


    # Examples

    ```python
        from keras_contrib.layers import CRF
        from keras_contrib.losses import crf_loss
        from keras_contrib.metrics import crf_viterbi_accuracy

        model = Sequential()
        model.add(Embedding(3001, 300, mask_zero=True)(X)

        # use learn_mode = &#39;join&#39;, test_mode = &#39;viterbi&#39;,
        # sparse_target = True (label indice output)
        crf = CRF(10, sparse_target=True)
        model.add(crf)

        # crf_accuracy is default to Viterbi acc if using join-mode (default).
        # One can add crf.marginal_acc if interested, but may slow down learning
        model.compile(&#39;adam&#39;, loss=crf_loss, metrics=[crf_viterbi_accuracy])

        # y must be label indices (with shape 1 at dim 3) here,
        # since `sparse_target=True`
        model.fit(x, y)

        # prediction give onehot representation of Viterbi best path
        y_hat = model.predict(x_test)
    ```

    The following snippet shows how to load a persisted
    model that uses the CRF layer:

    ```python
        from tensorflow.keras.models import load_model
        from keras_contrib.losses import import crf_loss
        from keras_contrib.metrics import crf_viterbi_accuracy

        custom_objects={&#39;CRF&#39;: CRF,
                        &#39;crf_loss&#39;: crf_loss,
                        &#39;crf_viterbi_accuracy&#39;: crf_viterbi_accuracy}

        loaded_model = load_model(&#39;&lt;path_to_model&gt;&#39;,
                                  custom_objects=custom_objects)
    ```

    # Arguments
        units: Positive integer, dimensionality of the output space.
        learn_mode: Either &#39;join&#39; or &#39;marginal&#39;.
            The former train the model by maximizing join likelihood while the latter
            maximize the product of marginal likelihood over all time steps.
            One should use `losses.crf_nll` for &#39;join&#39; mode
            and `losses.categorical_crossentropy` or
            `losses.sparse_categorical_crossentropy` for
            `marginal` mode.  For convenience, simply
            use `losses.crf_loss`, which will decide the proper loss as described.
        test_mode: Either &#39;viterbi&#39; or &#39;marginal&#39;.
            The former is recommended and as default when `learn_mode = &#39;join&#39;` and
            gives one-hot representation of the best path at test (prediction) time,
            while the latter is recommended and chosen as default
            when `learn_mode = &#39;marginal&#39;`,
            which produces marginal probabilities for each time step.
            For evaluating metrics, one should
            use `metrics.crf_viterbi_accuracy` for &#39;viterbi&#39; mode and
            &#39;metrics.crf_marginal_accuracy&#39; for &#39;marginal&#39; mode, or
            simply use `metrics.crf_accuracy` for
            both which automatically decides it as described.
            One can also use both for evaluation at training.
        sparse_target: Boolean (default False) indicating
            if provided labels are one-hot or
            indices (with shape 1 at dim 3).
        use_boundary: Boolean (default True) indicating if trainable
            start-end chain energies
            should be added to model.
        use_bias: Boolean, whether the layer uses a bias vector.
        kernel_initializer: Initializer for the `kernel` weights matrix,
            used for the linear transformation of the inputs.
            (see [initializers](../initializers.md)).
        chain_initializer: Initializer for the `chain_kernel` weights matrix,
            used for the CRF chain energy.
            (see [initializers](../initializers.md)).
        boundary_initializer: Initializer for the `left_boundary`,
            &#39;right_boundary&#39; weights vectors,
            used for the start/left and end/right boundary energy.
            (see [initializers](../initializers.md)).
        bias_initializer: Initializer for the bias vector
            (see [initializers](../initializers.md)).
        activation: Activation function to use
            (see [activations](../activations.md)).
            If you pass None, no activation is applied
            (ie. &#34;linear&#34; activation: `a(x) = x`).
        kernel_regularizer: Regularizer function applied to
            the `kernel` weights matrix
            (see [regularizer](../regularizers.md)).
        chain_regularizer: Regularizer function applied to
            the `chain_kernel` weights matrix
            (see [regularizer](../regularizers.md)).
        boundary_regularizer: Regularizer function applied to
            the &#39;left_boundary&#39;, &#39;right_boundary&#39; weight vectors
            (see [regularizer](../regularizers.md)).
        bias_regularizer: Regularizer function applied to the bias vector
            (see [regularizer](../regularizers.md)).
        kernel_constraint: Constraint function applied to
            the `kernel` weights matrix
            (see [constraints](../constraints.md)).
        chain_constraint: Constraint function applied to
            the `chain_kernel` weights matrix
            (see [constraints](../constraints.md)).
        boundary_constraint: Constraint function applied to
            the `left_boundary`, `right_boundary` weights vectors
            (see [constraints](../constraints.md)).
        bias_constraint: Constraint function applied to the bias vector
            (see [constraints](../constraints.md)).
        input_dim: dimensionality of the input (integer).
            This argument (or alternatively, the keyword argument `input_shape`)
            is required when using this layer as the first layer in a model.
        unroll: Boolean (default False). If True, the network will be
            unrolled, else a symbolic loop will be used.
            Unrolling can speed-up a RNN, although it tends
            to be more memory-intensive.
            Unrolling is only suitable for short sequences.

    # Input shape
        3D tensor with shape `(nb_samples, timesteps, input_dim)`.

    # Output shape
        3D tensor with shape `(nb_samples, timesteps, units)`.

    # Masking
        This layer supports masking for input data with a variable number
        of timesteps. To introduce masks to your data,
        use an [Embedding](embeddings.md) layer with the `mask_zero` parameter
        set to `True`.

    &#34;&#34;&#34;

    def __init__(
        self,
        units,
        learn_mode=&#34;join&#34;,
        test_mode=None,
        sparse_target=False,
        use_boundary=True,
        use_bias=True,
        activation=&#34;linear&#34;,
        kernel_initializer=&#34;glorot_uniform&#34;,
        chain_initializer=&#34;orthogonal&#34;,
        bias_initializer=&#34;zeros&#34;,
        boundary_initializer=&#34;zeros&#34;,
        kernel_regularizer=None,
        chain_regularizer=None,
        boundary_regularizer=None,
        bias_regularizer=None,
        kernel_constraint=None,
        chain_constraint=None,
        boundary_constraint=None,
        bias_constraint=None,
        input_dim=None,
        unroll=False,
        **kwargs
    ):
        super(CRF, self).__init__(**kwargs)
        self.supports_masking = True
        self.units = units
        self.learn_mode = learn_mode
        assert self.learn_mode in [&#34;join&#34;, &#34;marginal&#34;]
        self.test_mode = test_mode
        if self.test_mode is None:
            self.test_mode = &#34;viterbi&#34; if self.learn_mode == &#34;join&#34; else &#34;marginal&#34;
        else:
            assert self.test_mode in [&#34;viterbi&#34;, &#34;marginal&#34;]
        self.sparse_target = sparse_target
        self.use_boundary = use_boundary
        self.use_bias = use_bias

        self.activation = keras.activations.get(activation)

        self.kernel_initializer = keras.initializers.get(kernel_initializer)
        self.chain_initializer = keras.initializers.get(chain_initializer)
        self.boundary_initializer = keras.initializers.get(boundary_initializer)
        self.bias_initializer = keras.initializers.get(bias_initializer)

        self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
        self.chain_regularizer = keras.regularizers.get(chain_regularizer)
        self.boundary_regularizer = keras.regularizers.get(boundary_regularizer)
        self.bias_regularizer = keras.regularizers.get(bias_regularizer)

        self.kernel_constraint = keras.constraints.get(kernel_constraint)
        self.chain_constraint = keras.constraints.get(chain_constraint)
        self.boundary_constraint = keras.constraints.get(boundary_constraint)
        self.bias_constraint = keras.constraints.get(bias_constraint)

        self.unroll = unroll

    def build(self, input_shape):
        input_shape = to_tuple(input_shape)
        self.input_spec = [keras.layers.InputSpec(shape=input_shape)]
        self.input_dim = input_shape[-1]

        self.kernel = self.add_weight(
            shape=(self.input_dim, self.units),
            name=&#34;kernel&#34;,
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        self.chain_kernel = self.add_weight(
            shape=(self.units, self.units),
            name=&#34;chain_kernel&#34;,
            initializer=self.chain_initializer,
            regularizer=self.chain_regularizer,
            constraint=self.chain_constraint,
        )
        if self.use_bias:
            self.bias = self.add_weight(
                shape=(self.units,),
                name=&#34;bias&#34;,
                initializer=self.bias_initializer,
                regularizer=self.bias_regularizer,
                constraint=self.bias_constraint,
            )
        else:
            self.bias = 0

        if self.use_boundary:
            self.left_boundary = self.add_weight(
                shape=(self.units,),
                name=&#34;left_boundary&#34;,
                initializer=self.boundary_initializer,
                regularizer=self.boundary_regularizer,
                constraint=self.boundary_constraint,
            )
            self.right_boundary = self.add_weight(
                shape=(self.units,),
                name=&#34;right_boundary&#34;,
                initializer=self.boundary_initializer,
                regularizer=self.boundary_regularizer,
                constraint=self.boundary_constraint,
            )
        self.built = True

    def call(self, X, mask=None):
        if mask is not None:
            assert K.ndim(mask) == 2, &#34;Input mask to CRF must have dim 2 if not None&#34;

        if self.test_mode == &#34;viterbi&#34;:
            test_output = self.viterbi_decoding(X, mask)
        else:
            test_output = self.get_marginal_prob(X, mask)

        self.uses_learning_phase = True
        if self.learn_mode == &#34;join&#34;:
            train_output = K.zeros_like(K.dot(X, self.kernel))
            out = K.in_train_phase(train_output, test_output)
        else:
            if self.test_mode == &#34;viterbi&#34;:
                train_output = self.get_marginal_prob(X, mask)
                out = K.in_train_phase(train_output, test_output)
            else:
                out = test_output
        return out

    def compute_output_shape(self, input_shape):
        return input_shape[:2] + (self.units,)

    def compute_mask(self, input, mask=None):
        if mask is not None and self.learn_mode == &#34;join&#34;:
            return K.any(mask, axis=1)
        return mask

    def get_config(self):
        config = {
            &#34;units&#34;: self.units,
            &#34;learn_mode&#34;: self.learn_mode,
            &#34;test_mode&#34;: self.test_mode,
            &#34;use_boundary&#34;: self.use_boundary,
            &#34;use_bias&#34;: self.use_bias,
            &#34;sparse_target&#34;: self.sparse_target,
            &#34;kernel_initializer&#34;: keras.initializers.serialize(self.kernel_initializer),
            &#34;chain_initializer&#34;: keras.initializers.serialize(self.chain_initializer),
            &#34;boundary_initializer&#34;: keras.initializers.serialize(
                self.boundary_initializer
            ),
            &#34;bias_initializer&#34;: keras.initializers.serialize(self.bias_initializer),
            &#34;activation&#34;: keras.activations.serialize(self.activation),
            &#34;kernel_regularizer&#34;: keras.regularizers.serialize(self.kernel_regularizer),
            &#34;chain_regularizer&#34;: keras.regularizers.serialize(self.chain_regularizer),
            &#34;boundary_regularizer&#34;: keras.regularizers.serialize(
                self.boundary_regularizer
            ),
            &#34;bias_regularizer&#34;: keras.regularizers.serialize(self.bias_regularizer),
            &#34;kernel_constraint&#34;: keras.constraints.serialize(self.kernel_constraint),
            &#34;chain_constraint&#34;: keras.constraints.serialize(self.chain_constraint),
            &#34;boundary_constraint&#34;: keras.constraints.serialize(
                self.boundary_constraint
            ),
            &#34;bias_constraint&#34;: keras.constraints.serialize(self.bias_constraint),
            &#34;input_dim&#34;: self.input_dim,
            &#34;unroll&#34;: self.unroll,
        }
        base_config = super(CRF, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    @property
    def loss_function(self):
        # warnings.warn(&#39;CRF.loss_function is deprecated &#39;
        #&#39;and it might be removed in the future. Please &#39;
        #&#39;use losses.crf_loss instead.&#39;)
        return crf_loss

    @property
    def accuracy(self):
        # warnings.warn(&#39;CRF.accuracy is deprecated and it &#39;
        #&#39;might be removed in the future. Please &#39;
        #&#39;use metrics.crf_accuracy&#39;)
        if self.test_mode == &#34;viterbi&#34;:
            return crf_viterbi_accuracy
        else:
            return crf_marginal_accuracy

    @property
    def viterbi_acc(self):
        # warnings.warn(&#39;CRF.viterbi_acc is deprecated and it might &#39;
        #&#39;be removed in the future. Please &#39;
        #&#39;use metrics.viterbi_acc instead.&#39;)
        return crf_viterbi_accuracy

    @property
    def marginal_acc(self):
        # warnings.warn(&#39;CRF.moarginal_acc is deprecated and it &#39;
        #&#39;might be removed in the future. Please &#39;
        #&#39;use metrics.marginal_acc instead.&#39;)
        return crf_marginal_accuracy

    @staticmethod
    def softmaxNd(x, axis=-1):
        m = K.max(x, axis=axis, keepdims=True)
        exp_x = K.exp(x - m)
        prob_x = exp_x / K.sum(exp_x, axis=axis, keepdims=True)
        return prob_x

    @staticmethod
    def shift_left(x, offset=1):
        assert offset &gt; 0
        return K.concatenate([x[:, offset:], K.zeros_like(x[:, :offset])], axis=1)

    @staticmethod
    def shift_right(x, offset=1):
        assert offset &gt; 0
        return K.concatenate([K.zeros_like(x[:, :offset]), x[:, :-offset]], axis=1)

    def add_boundary_energy(self, energy, mask, start, end):
        start = K.expand_dims(K.expand_dims(start, 0), 0)
        end = K.expand_dims(K.expand_dims(end, 0), 0)
        if mask is None:
            energy = K.concatenate([energy[:, :1, :] + start, energy[:, 1:, :]], axis=1)
            energy = K.concatenate([energy[:, :-1, :], energy[:, -1:, :] + end], axis=1)
        else:
            mask = K.expand_dims(K.cast(mask, K.floatx()))
            start_mask = K.cast(K.greater(mask, self.shift_right(mask)), K.floatx())
            end_mask = K.cast(K.greater(self.shift_left(mask), mask), K.floatx())
            energy = energy + start_mask * start
            energy = energy + end_mask * end
        return energy

    def get_log_normalization_constant(self, input_energy, mask, **kwargs):
        &#34;&#34;&#34;Compute logarithm of the normalization constant Z, where
        Z = sum exp(-E) -&gt; logZ = log sum exp(-E) =: -nlogZ
        &#34;&#34;&#34;
        # should have logZ[:, i] == logZ[:, j] for any i, j
        logZ = self.recursion(input_energy, mask, return_sequences=False, **kwargs)
        return logZ[:, 0]

    def get_energy(self, y_true, input_energy, mask):
        &#34;&#34;&#34;Energy = a1&#39; y1 + u1&#39; y1 + y1&#39; U y2 + u2&#39; y2 + y2&#39; U y3 + u3&#39; y3 + an&#39; y3&#34;&#34;&#34;
        input_energy = K.sum(input_energy * y_true, 2)  # (B, T)
        # (B, T-1)
        chain_energy = K.sum(
            K.dot(y_true[:, :-1, :], self.chain_kernel) * y_true[:, 1:, :], 2
        )

        if mask is not None:
            mask = K.cast(mask, K.floatx())
            # (B, T-1), mask[:,:-1]*mask[:,1:] makes it work with any padding
            chain_mask = mask[:, :-1] * mask[:, 1:]
            input_energy = input_energy * mask
            chain_energy = chain_energy * chain_mask
        total_energy = K.sum(input_energy, -1) + K.sum(chain_energy, -1)  # (B, )

        return total_energy

    def get_negative_log_likelihood(self, y_true, X, mask):
        &#34;&#34;&#34;Compute the loss, i.e., negative log likelihood (normalize by number of time steps)
        likelihood = 1/Z * exp(-E) -&gt;  neg_log_like = - log(1/Z * exp(-E)) = logZ + E
        &#34;&#34;&#34;
        input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
        if self.use_boundary:
            input_energy = self.add_boundary_energy(
                input_energy, mask, self.left_boundary, self.right_boundary
            )
        energy = self.get_energy(y_true, input_energy, mask)
        logZ = self.get_log_normalization_constant(
            input_energy, mask, input_length=K.int_shape(X)[1]
        )
        nloglik = logZ + energy
        if mask is not None:
            nloglik = nloglik / K.sum(K.cast(mask, K.floatx()), 1)
        else:
            nloglik = nloglik / K.cast(K.shape(X)[1], K.floatx())
        return nloglik

    def step(self, input_energy_t, states, return_logZ=True):
        # not in the following  `prev_target_val` has shape = (B, F)
        # where B = batch_size, F = output feature dim
        # Note: `i` is of float32, due to the behavior of `K.rnn`
        prev_target_val, i, chain_energy = states[:3]
        t = K.cast(i[0, 0], dtype=&#34;int32&#34;)
        if len(states) &gt; 3:
            if K.backend() == &#34;theano&#34;:
                m = states[3][:, t : (t + 2)]
            else:
                m = tf.slice(states[3], [0, t], [-1, 2])

            input_energy_t = input_energy_t * K.expand_dims(m[:, 0])
            # (1, F, F)*(B, 1, 1) -&gt; (B, F, F)
            chain_energy = chain_energy * K.expand_dims(
                K.expand_dims(m[:, 0] * m[:, 1])
            )
        if return_logZ:
            # shapes: (1, B, F) + (B, F, 1) -&gt; (B, F, F)
            energy = chain_energy + K.expand_dims(input_energy_t - prev_target_val, 2)
            # new_target_val = K.logsumexp(-energy, 1)  # shapes: (B, F)
            new_target_val = tf.reduce_logsumexp(-energy, 1)  # shapes: (B, F)
            return new_target_val, [new_target_val, i + 1]
        else:
            energy = chain_energy + K.expand_dims(input_energy_t + prev_target_val, 2)
            min_energy = K.min(energy, 1)
            # cast for tf-version `K.rnn
            argmin_table = K.cast(K.argmin(energy, 1), K.floatx())
            return argmin_table, [min_energy, i + 1]

    def recursion(
        self,
        input_energy,
        mask=None,
        go_backwards=False,
        return_sequences=True,
        return_logZ=True,
        input_length=None,
    ):
        &#34;&#34;&#34;Forward (alpha) or backward (beta) recursion

        If `return_logZ = True`, compute the logZ, the normalization constant:

        \[ Z = \sum_{y1, y2, y3} exp(-E) # energy
          = \sum_{y1, y2, y3} exp(-(u1&#39; y1 + y1&#39; W y2 + u2&#39; y2 + y2&#39; W y3 + u3&#39; y3))
          = sum_{y2, y3} (exp(-(u2&#39; y2 + y2&#39; W y3 + u3&#39; y3))
          sum_{y1} exp(-(u1&#39; y1&#39; + y1&#39; W y2))) \]

        Denote:
            \[ S(y2) := sum_{y1} exp(-(u1&#39; y1 + y1&#39; W y2)), \]
            \[ Z = sum_{y2, y3} exp(log S(y2) - (u2&#39; y2 + y2&#39; W y3 + u3&#39; y3)) \]
            \[ logS(y2) = log S(y2) = log_sum_exp(-(u1&#39; y1&#39; + y1&#39; W y2)) \]
        Note that:
              yi&#39;s are one-hot vectors
              u1, u3: boundary energies have been merged

        If `return_logZ = False`, compute the Viterbi&#39;s best path lookup table.
        &#34;&#34;&#34;
        chain_energy = self.chain_kernel
        # shape=(1, F, F): F=num of output features. 1st F is for t-1, 2nd F for t
        chain_energy = K.expand_dims(chain_energy, 0)
        # shape=(B, F), dtype=float32
        prev_target_val = K.zeros_like(input_energy[:, 0, :])

        if go_backwards:
            input_energy = K.reverse(input_energy, 1)
            if mask is not None:
                mask = K.reverse(mask, 1)

        initial_states = [prev_target_val, K.zeros_like(prev_target_val[:, :1])]
        constants = [chain_energy]

        if mask is not None:
            mask2 = K.cast(
                K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1), K.floatx()
            )
            constants.append(mask2)

        def _step(input_energy_i, states):
            return self.step(input_energy_i, states, return_logZ)

        target_val_last, target_val_seq, _ = K.rnn(
            _step,
            input_energy,
            initial_states,
            constants=constants,
            input_length=input_length,
            unroll=self.unroll,
        )

        if return_sequences:
            if go_backwards:
                target_val_seq = K.reverse(target_val_seq, 1)
            return target_val_seq
        else:
            return target_val_last

    def forward_recursion(self, input_energy, **kwargs):
        return self.recursion(input_energy, **kwargs)

    def backward_recursion(self, input_energy, **kwargs):
        return self.recursion(input_energy, go_backwards=True, **kwargs)

    def get_marginal_prob(self, X, mask=None):
        input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
        if self.use_boundary:
            input_energy = self.add_boundary_energy(
                input_energy, mask, self.left_boundary, self.right_boundary
            )
        input_length = K.int_shape(X)[1]
        alpha = self.forward_recursion(
            input_energy, mask=mask, input_length=input_length
        )
        beta = self.backward_recursion(
            input_energy, mask=mask, input_length=input_length
        )
        if mask is not None:
            input_energy = input_energy * K.expand_dims(K.cast(mask, K.floatx()))
        margin = -(self.shift_right(alpha) + input_energy + self.shift_left(beta))
        return self.softmaxNd(margin)

    def viterbi_decoding(self, X, mask=None):
        input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
        if self.use_boundary:
            input_energy = self.add_boundary_energy(
                input_energy, mask, self.left_boundary, self.right_boundary
            )

        argmin_tables = self.recursion(input_energy, mask, return_logZ=False)
        argmin_tables = K.cast(argmin_tables, &#34;int32&#34;)

        # backward to find best path, `initial_best_idx` can be any,
        # as all elements in the last argmin_table are the same
        argmin_tables = K.reverse(argmin_tables, 1)
        # matrix instead of vector is required by tf `K.rnn`
        initial_best_idx = [K.expand_dims(argmin_tables[:, 0, 0])]
        if K.backend() == &#34;theano&#34;:
            from theano import tensor as T

            initial_best_idx = [T.unbroadcast(initial_best_idx[0], 1)]

        def gather_each_row(params, indices):
            n = K.shape(indices)[0]
            if K.backend() == &#34;theano&#34;:
                from theano import tensor as T

                return params[T.arange(n), indices]
            elif K.backend() == &#34;tensorflow&#34;:
                import tensorflow as tf

                indices = K.transpose(K.stack([tf.range(n), indices]))
                return tf.gather_nd(params, indices)
            else:
                raise NotImplementedError

        def find_path(argmin_table, best_idx):
            next_best_idx = gather_each_row(argmin_table, best_idx[0][:, 0])
            next_best_idx = K.expand_dims(next_best_idx)
            if K.backend() == &#34;theano&#34;:
                from theano import tensor as T

                next_best_idx = T.unbroadcast(next_best_idx, 1)
            return next_best_idx, [next_best_idx]

        _, best_paths, _ = K.rnn(
            find_path,
            argmin_tables,
            initial_best_idx,
            input_length=K.int_shape(X)[1],
            unroll=self.unroll,
        )
        best_paths = K.reverse(best_paths, 1)
        best_paths = K.squeeze(best_paths, 2)

        return K.one_hot(best_paths, self.units)</code></pre>
</details>
<h3>Ancestors</h3>
<ul class="hlist">
<li>keras.engine.base_layer.Layer</li>
<li>tensorflow.python.module.module.Module</li>
<li>tensorflow.python.trackable.autotrackable.AutoTrackable</li>
<li>tensorflow.python.trackable.base.Trackable</li>
<li>keras.utils.version_utils.LayerVersionSelector</li>
</ul>
<h3>Static methods</h3>
<dl>
<dt id="ktrain.text.ner.anago.layers.CRF.shift_left"><code class="name flex">
<span>def <span class="ident">shift_left</span></span>(<span>x, offset=1)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">@staticmethod
def shift_left(x, offset=1):
    assert offset &gt; 0
    return K.concatenate([x[:, offset:], K.zeros_like(x[:, :offset])], axis=1)</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.shift_right"><code class="name flex">
<span>def <span class="ident">shift_right</span></span>(<span>x, offset=1)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">@staticmethod
def shift_right(x, offset=1):
    assert offset &gt; 0
    return K.concatenate([K.zeros_like(x[:, :offset]), x[:, :-offset]], axis=1)</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.softmaxNd"><code class="name flex">
<span>def <span class="ident">softmaxNd</span></span>(<span>x, axis=-1)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">@staticmethod
def softmaxNd(x, axis=-1):
    m = K.max(x, axis=axis, keepdims=True)
    exp_x = K.exp(x - m)
    prob_x = exp_x / K.sum(exp_x, axis=axis, keepdims=True)
    return prob_x</code></pre>
</details>
</dd>
</dl>
<h3>Instance variables</h3>
<dl>
<dt id="ktrain.text.ner.anago.layers.CRF.accuracy"><code class="name">var <span class="ident">accuracy</span></code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">@property
def accuracy(self):
    # warnings.warn(&#39;CRF.accuracy is deprecated and it &#39;
    #&#39;might be removed in the future. Please &#39;
    #&#39;use metrics.crf_accuracy&#39;)
    if self.test_mode == &#34;viterbi&#34;:
        return crf_viterbi_accuracy
    else:
        return crf_marginal_accuracy</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.loss_function"><code class="name">var <span class="ident">loss_function</span></code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">@property
def loss_function(self):
    # warnings.warn(&#39;CRF.loss_function is deprecated &#39;
    #&#39;and it might be removed in the future. Please &#39;
    #&#39;use losses.crf_loss instead.&#39;)
    return crf_loss</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.marginal_acc"><code class="name">var <span class="ident">marginal_acc</span></code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">@property
def marginal_acc(self):
    # warnings.warn(&#39;CRF.moarginal_acc is deprecated and it &#39;
    #&#39;might be removed in the future. Please &#39;
    #&#39;use metrics.marginal_acc instead.&#39;)
    return crf_marginal_accuracy</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.viterbi_acc"><code class="name">var <span class="ident">viterbi_acc</span></code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">@property
def viterbi_acc(self):
    # warnings.warn(&#39;CRF.viterbi_acc is deprecated and it might &#39;
    #&#39;be removed in the future. Please &#39;
    #&#39;use metrics.viterbi_acc instead.&#39;)
    return crf_viterbi_accuracy</code></pre>
</details>
</dd>
</dl>
<h3>Methods</h3>
<dl>
<dt id="ktrain.text.ner.anago.layers.CRF.add_boundary_energy"><code class="name flex">
<span>def <span class="ident">add_boundary_energy</span></span>(<span>self, energy, mask, start, end)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def add_boundary_energy(self, energy, mask, start, end):
    start = K.expand_dims(K.expand_dims(start, 0), 0)
    end = K.expand_dims(K.expand_dims(end, 0), 0)
    if mask is None:
        energy = K.concatenate([energy[:, :1, :] + start, energy[:, 1:, :]], axis=1)
        energy = K.concatenate([energy[:, :-1, :], energy[:, -1:, :] + end], axis=1)
    else:
        mask = K.expand_dims(K.cast(mask, K.floatx()))
        start_mask = K.cast(K.greater(mask, self.shift_right(mask)), K.floatx())
        end_mask = K.cast(K.greater(self.shift_left(mask), mask), K.floatx())
        energy = energy + start_mask * start
        energy = energy + end_mask * end
    return energy</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.backward_recursion"><code class="name flex">
<span>def <span class="ident">backward_recursion</span></span>(<span>self, input_energy, **kwargs)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def backward_recursion(self, input_energy, **kwargs):
    return self.recursion(input_energy, go_backwards=True, **kwargs)</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.build"><code class="name flex">
<span>def <span class="ident">build</span></span>(<span>self, input_shape)</span>
</code></dt>
<dd>
<div class="desc"><p>Creates the variables of the layer (optional, for subclass implementers).</p>
<p>This is a method that implementers of subclasses of <code>Layer</code> or <code>Model</code>
can override if they need a state-creation step in-between
layer instantiation and layer call. It is invoked automatically before
the first execution of <code>call()</code>.</p>
<p>This is typically used to create the weights of <code>Layer</code> subclasses
(at the discretion of the subclass implementer).</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>input_shape</code></strong></dt>
<dd>Instance of <code>TensorShape</code>, or list of instances of
<code>TensorShape</code> if the layer expects a list of inputs
(one instance per input).</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def build(self, input_shape):
    input_shape = to_tuple(input_shape)
    self.input_spec = [keras.layers.InputSpec(shape=input_shape)]
    self.input_dim = input_shape[-1]

    self.kernel = self.add_weight(
        shape=(self.input_dim, self.units),
        name=&#34;kernel&#34;,
        initializer=self.kernel_initializer,
        regularizer=self.kernel_regularizer,
        constraint=self.kernel_constraint,
    )
    self.chain_kernel = self.add_weight(
        shape=(self.units, self.units),
        name=&#34;chain_kernel&#34;,
        initializer=self.chain_initializer,
        regularizer=self.chain_regularizer,
        constraint=self.chain_constraint,
    )
    if self.use_bias:
        self.bias = self.add_weight(
            shape=(self.units,),
            name=&#34;bias&#34;,
            initializer=self.bias_initializer,
            regularizer=self.bias_regularizer,
            constraint=self.bias_constraint,
        )
    else:
        self.bias = 0

    if self.use_boundary:
        self.left_boundary = self.add_weight(
            shape=(self.units,),
            name=&#34;left_boundary&#34;,
            initializer=self.boundary_initializer,
            regularizer=self.boundary_regularizer,
            constraint=self.boundary_constraint,
        )
        self.right_boundary = self.add_weight(
            shape=(self.units,),
            name=&#34;right_boundary&#34;,
            initializer=self.boundary_initializer,
            regularizer=self.boundary_regularizer,
            constraint=self.boundary_constraint,
        )
    self.built = True</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.call"><code class="name flex">
<span>def <span class="ident">call</span></span>(<span>self, X, mask=None)</span>
</code></dt>
<dd>
<div class="desc"><p>This is where the layer's logic lives.</p>
<p>The <code>call()</code> method may not create state (except in its first
invocation, wrapping the creation of variables or other resources in
<code>tf.init_scope()</code>).
It is recommended to create state, including
<code>tf.Variable</code> instances and nested <code>Layer</code> instances,
in <code>__init__()</code>, or in the <code>build()</code> method that is
called automatically before <code>call()</code> executes for the first time.</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>inputs</code></strong></dt>
<dd>Input tensor, or dict/list/tuple of input tensors.
The first positional <code>inputs</code> argument is subject to special rules:
- <code>inputs</code> must be explicitly passed. A layer cannot have zero
arguments, and <code>inputs</code> cannot be provided via the default value
of a keyword argument.
- NumPy array or Python scalar values in <code>inputs</code> get cast as
tensors.
- Keras mask metadata is only collected from <code>inputs</code>.
- Layers are built (<code>build(input_shape)</code> method)
using shape info from <code>inputs</code> only.
- <code>input_spec</code> compatibility is only checked against <code>inputs</code>.
- Mixed precision input casting is only applied to <code>inputs</code>.
If a layer has tensor arguments in <code>*args</code> or <code>**kwargs</code>, their
casting behavior in mixed precision should be handled manually.
- The SavedModel input specification is generated using <code>inputs</code>
only.
- Integration with various ecosystem packages like TFMOT, TFLite,
TF.js, etc is only supported for <code>inputs</code> and not for tensors in
positional and keyword arguments.</dd>
<dt><strong><code>*args</code></strong></dt>
<dd>Additional positional arguments. May contain tensors, although
this is not recommended, for the reasons above.</dd>
<dt><strong><code>**kwargs</code></strong></dt>
<dd>Additional keyword arguments. May contain tensors, although
this is not recommended, for the reasons above.
The following optional keyword arguments are reserved:
- <code>training</code>: Boolean scalar tensor of Python boolean indicating
whether the <code>call</code> is meant for training or inference.
- <code>mask</code>: Boolean input mask. If the layer's <code>call()</code> method takes a
<code>mask</code> argument, its default value will be set to the mask
generated for <code>inputs</code> by the previous layer (if <code>input</code> did come
from a layer that generated a corresponding mask, i.e. if it came
from a Keras layer with masking support).</dd>
</dl>
<h2 id="returns">Returns</h2>
<p>A tensor or list/tuple of tensors.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def call(self, X, mask=None):
    if mask is not None:
        assert K.ndim(mask) == 2, &#34;Input mask to CRF must have dim 2 if not None&#34;

    if self.test_mode == &#34;viterbi&#34;:
        test_output = self.viterbi_decoding(X, mask)
    else:
        test_output = self.get_marginal_prob(X, mask)

    self.uses_learning_phase = True
    if self.learn_mode == &#34;join&#34;:
        train_output = K.zeros_like(K.dot(X, self.kernel))
        out = K.in_train_phase(train_output, test_output)
    else:
        if self.test_mode == &#34;viterbi&#34;:
            train_output = self.get_marginal_prob(X, mask)
            out = K.in_train_phase(train_output, test_output)
        else:
            out = test_output
    return out</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.compute_mask"><code class="name flex">
<span>def <span class="ident">compute_mask</span></span>(<span>self, input, mask=None)</span>
</code></dt>
<dd>
<div class="desc"><p>Computes an output mask tensor.</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>inputs</code></strong></dt>
<dd>Tensor or list of tensors.</dd>
<dt><strong><code>mask</code></strong></dt>
<dd>Tensor or list of tensors.</dd>
</dl>
<h2 id="returns">Returns</h2>
<p>None or a tensor (or list of tensors,
one per output tensor of the layer).</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def compute_mask(self, input, mask=None):
    if mask is not None and self.learn_mode == &#34;join&#34;:
        return K.any(mask, axis=1)
    return mask</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.compute_output_shape"><code class="name flex">
<span>def <span class="ident">compute_output_shape</span></span>(<span>self, input_shape)</span>
</code></dt>
<dd>
<div class="desc"><p>Computes the output shape of the layer.</p>
<p>This method will cause the layer's state to be built, if that has not
happened before. This requires that the layer will later be used with
inputs that match the input shape provided here.</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>input_shape</code></strong></dt>
<dd>Shape tuple (tuple of integers) or <code>tf.TensorShape</code>,
or structure of shape tuples / <code>tf.TensorShape</code> instances
(one per output tensor of the layer).
Shape tuples can include None for free dimensions,
instead of an integer.</dd>
</dl>
<h2 id="returns">Returns</h2>
<p>A <code>tf.TensorShape</code> instance
or structure of <code>tf.TensorShape</code> instances.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def compute_output_shape(self, input_shape):
    return input_shape[:2] + (self.units,)</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.forward_recursion"><code class="name flex">
<span>def <span class="ident">forward_recursion</span></span>(<span>self, input_energy, **kwargs)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def forward_recursion(self, input_energy, **kwargs):
    return self.recursion(input_energy, **kwargs)</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.get_config"><code class="name flex">
<span>def <span class="ident">get_config</span></span>(<span>self)</span>
</code></dt>
<dd>
<div class="desc"><p>Returns the config of the layer.</p>
<p>A layer config is a Python dictionary (serializable)
containing the configuration of a layer.
The same layer can be reinstantiated later
(without its trained weights) from this configuration.</p>
<p>The config of a layer does not include connectivity
information, nor the layer class name. These are handled
by <code>Network</code> (one layer of abstraction above).</p>
<p>Note that <code>get_config()</code> does not guarantee to return a fresh copy of
dict every time it is called. The callers should make a copy of the
returned dict if they want to modify it.</p>
<h2 id="returns">Returns</h2>
<p>Python dictionary.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_config(self):
    config = {
        &#34;units&#34;: self.units,
        &#34;learn_mode&#34;: self.learn_mode,
        &#34;test_mode&#34;: self.test_mode,
        &#34;use_boundary&#34;: self.use_boundary,
        &#34;use_bias&#34;: self.use_bias,
        &#34;sparse_target&#34;: self.sparse_target,
        &#34;kernel_initializer&#34;: keras.initializers.serialize(self.kernel_initializer),
        &#34;chain_initializer&#34;: keras.initializers.serialize(self.chain_initializer),
        &#34;boundary_initializer&#34;: keras.initializers.serialize(
            self.boundary_initializer
        ),
        &#34;bias_initializer&#34;: keras.initializers.serialize(self.bias_initializer),
        &#34;activation&#34;: keras.activations.serialize(self.activation),
        &#34;kernel_regularizer&#34;: keras.regularizers.serialize(self.kernel_regularizer),
        &#34;chain_regularizer&#34;: keras.regularizers.serialize(self.chain_regularizer),
        &#34;boundary_regularizer&#34;: keras.regularizers.serialize(
            self.boundary_regularizer
        ),
        &#34;bias_regularizer&#34;: keras.regularizers.serialize(self.bias_regularizer),
        &#34;kernel_constraint&#34;: keras.constraints.serialize(self.kernel_constraint),
        &#34;chain_constraint&#34;: keras.constraints.serialize(self.chain_constraint),
        &#34;boundary_constraint&#34;: keras.constraints.serialize(
            self.boundary_constraint
        ),
        &#34;bias_constraint&#34;: keras.constraints.serialize(self.bias_constraint),
        &#34;input_dim&#34;: self.input_dim,
        &#34;unroll&#34;: self.unroll,
    }
    base_config = super(CRF, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.get_energy"><code class="name flex">
<span>def <span class="ident">get_energy</span></span>(<span>self, y_true, input_energy, mask)</span>
</code></dt>
<dd>
<div class="desc"><p>Energy = a1' y1 + u1' y1 + y1' U y2 + u2' y2 + y2' U y3 + u3' y3 + an' y3</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_energy(self, y_true, input_energy, mask):
    &#34;&#34;&#34;Energy = a1&#39; y1 + u1&#39; y1 + y1&#39; U y2 + u2&#39; y2 + y2&#39; U y3 + u3&#39; y3 + an&#39; y3&#34;&#34;&#34;
    input_energy = K.sum(input_energy * y_true, 2)  # (B, T)
    # (B, T-1)
    chain_energy = K.sum(
        K.dot(y_true[:, :-1, :], self.chain_kernel) * y_true[:, 1:, :], 2
    )

    if mask is not None:
        mask = K.cast(mask, K.floatx())
        # (B, T-1), mask[:,:-1]*mask[:,1:] makes it work with any padding
        chain_mask = mask[:, :-1] * mask[:, 1:]
        input_energy = input_energy * mask
        chain_energy = chain_energy * chain_mask
    total_energy = K.sum(input_energy, -1) + K.sum(chain_energy, -1)  # (B, )

    return total_energy</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.get_log_normalization_constant"><code class="name flex">
<span>def <span class="ident">get_log_normalization_constant</span></span>(<span>self, input_energy, mask, **kwargs)</span>
</code></dt>
<dd>
<div class="desc"><p>Compute logarithm of the normalization constant Z, where
Z = sum exp(-E) -&gt; logZ = log sum exp(-E) =: -nlogZ</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_log_normalization_constant(self, input_energy, mask, **kwargs):
    &#34;&#34;&#34;Compute logarithm of the normalization constant Z, where
    Z = sum exp(-E) -&gt; logZ = log sum exp(-E) =: -nlogZ
    &#34;&#34;&#34;
    # should have logZ[:, i] == logZ[:, j] for any i, j
    logZ = self.recursion(input_energy, mask, return_sequences=False, **kwargs)
    return logZ[:, 0]</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.get_marginal_prob"><code class="name flex">
<span>def <span class="ident">get_marginal_prob</span></span>(<span>self, X, mask=None)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_marginal_prob(self, X, mask=None):
    input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
    if self.use_boundary:
        input_energy = self.add_boundary_energy(
            input_energy, mask, self.left_boundary, self.right_boundary
        )
    input_length = K.int_shape(X)[1]
    alpha = self.forward_recursion(
        input_energy, mask=mask, input_length=input_length
    )
    beta = self.backward_recursion(
        input_energy, mask=mask, input_length=input_length
    )
    if mask is not None:
        input_energy = input_energy * K.expand_dims(K.cast(mask, K.floatx()))
    margin = -(self.shift_right(alpha) + input_energy + self.shift_left(beta))
    return self.softmaxNd(margin)</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.get_negative_log_likelihood"><code class="name flex">
<span>def <span class="ident">get_negative_log_likelihood</span></span>(<span>self, y_true, X, mask)</span>
</code></dt>
<dd>
<div class="desc"><p>Compute the loss, i.e., negative log likelihood (normalize by number of time steps)
likelihood = 1/Z * exp(-E) -&gt;
neg_log_like = - log(1/Z * exp(-E)) = logZ + E</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_negative_log_likelihood(self, y_true, X, mask):
    &#34;&#34;&#34;Compute the loss, i.e., negative log likelihood (normalize by number of time steps)
    likelihood = 1/Z * exp(-E) -&gt;  neg_log_like = - log(1/Z * exp(-E)) = logZ + E
    &#34;&#34;&#34;
    input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
    if self.use_boundary:
        input_energy = self.add_boundary_energy(
            input_energy, mask, self.left_boundary, self.right_boundary
        )
    energy = self.get_energy(y_true, input_energy, mask)
    logZ = self.get_log_normalization_constant(
        input_energy, mask, input_length=K.int_shape(X)[1]
    )
    nloglik = logZ + energy
    if mask is not None:
        nloglik = nloglik / K.sum(K.cast(mask, K.floatx()), 1)
    else:
        nloglik = nloglik / K.cast(K.shape(X)[1], K.floatx())
    return nloglik</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.recursion"><code class="name flex">
<span>def <span class="ident">recursion</span></span>(<span>self, input_energy, mask=None, go_backwards=False, return_sequences=True, return_logZ=True, input_length=None)</span>
</code></dt>
<dd>
<div class="desc"><p>Forward (alpha) or backward (beta) recursion</p>
<p>If <code>return_logZ = True</code>, compute the logZ, the normalization constant:</p>
<p>[ Z = \sum_{y1, y2, y3} exp(-E) # energy
= \sum_{y1, y2, y3} exp(-(u1' y1 + y1' W y2 + u2' y2 + y2' W y3 + u3' y3))
= sum_{y2, y3} (exp(-(u2' y2 + y2' W y3 + u3' y3))
sum_{y1} exp(-(u1' y1' + y1' W y2))) ]</p>
<h2 id="denote">Denote</h2>
<p>[ S(y2) := sum_{y1} exp(-(u1' y1 + y1' W y2)), ]
[ Z = sum_{y2, y3} exp(log S(y2) - (u2' y2 + y2' W y3 + u3' y3)) ]
[ logS(y2) = log S(y2) = log_sum_exp(-(u1' y1' + y1' W y2)) ]
Note that:
yi's are one-hot vectors
u1, u3: boundary energies have been merged</p>
<p>If <code>return_logZ = False</code>, compute the Viterbi's best path lookup table.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def recursion(
    self,
    input_energy,
    mask=None,
    go_backwards=False,
    return_sequences=True,
    return_logZ=True,
    input_length=None,
):
    &#34;&#34;&#34;Forward (alpha) or backward (beta) recursion

    If `return_logZ = True`, compute the logZ, the normalization constant:

    \[ Z = \sum_{y1, y2, y3} exp(-E) # energy
      = \sum_{y1, y2, y3} exp(-(u1&#39; y1 + y1&#39; W y2 + u2&#39; y2 + y2&#39; W y3 + u3&#39; y3))
      = sum_{y2, y3} (exp(-(u2&#39; y2 + y2&#39; W y3 + u3&#39; y3))
      sum_{y1} exp(-(u1&#39; y1&#39; + y1&#39; W y2))) \]

    Denote:
        \[ S(y2) := sum_{y1} exp(-(u1&#39; y1 + y1&#39; W y2)), \]
        \[ Z = sum_{y2, y3} exp(log S(y2) - (u2&#39; y2 + y2&#39; W y3 + u3&#39; y3)) \]
        \[ logS(y2) = log S(y2) = log_sum_exp(-(u1&#39; y1&#39; + y1&#39; W y2)) \]
    Note that:
          yi&#39;s are one-hot vectors
          u1, u3: boundary energies have been merged

    If `return_logZ = False`, compute the Viterbi&#39;s best path lookup table.
    &#34;&#34;&#34;
    chain_energy = self.chain_kernel
    # shape=(1, F, F): F=num of output features. 1st F is for t-1, 2nd F for t
    chain_energy = K.expand_dims(chain_energy, 0)
    # shape=(B, F), dtype=float32
    prev_target_val = K.zeros_like(input_energy[:, 0, :])

    if go_backwards:
        input_energy = K.reverse(input_energy, 1)
        if mask is not None:
            mask = K.reverse(mask, 1)

    initial_states = [prev_target_val, K.zeros_like(prev_target_val[:, :1])]
    constants = [chain_energy]

    if mask is not None:
        mask2 = K.cast(
            K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1), K.floatx()
        )
        constants.append(mask2)

    def _step(input_energy_i, states):
        return self.step(input_energy_i, states, return_logZ)

    target_val_last, target_val_seq, _ = K.rnn(
        _step,
        input_energy,
        initial_states,
        constants=constants,
        input_length=input_length,
        unroll=self.unroll,
    )

    if return_sequences:
        if go_backwards:
            target_val_seq = K.reverse(target_val_seq, 1)
        return target_val_seq
    else:
        return target_val_last</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.step"><code class="name flex">
<span>def <span class="ident">step</span></span>(<span>self, input_energy_t, states, return_logZ=True)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def step(self, input_energy_t, states, return_logZ=True):
    # not in the following  `prev_target_val` has shape = (B, F)
    # where B = batch_size, F = output feature dim
    # Note: `i` is of float32, due to the behavior of `K.rnn`
    prev_target_val, i, chain_energy = states[:3]
    t = K.cast(i[0, 0], dtype=&#34;int32&#34;)
    if len(states) &gt; 3:
        if K.backend() == &#34;theano&#34;:
            m = states[3][:, t : (t + 2)]
        else:
            m = tf.slice(states[3], [0, t], [-1, 2])

        input_energy_t = input_energy_t * K.expand_dims(m[:, 0])
        # (1, F, F)*(B, 1, 1) -&gt; (B, F, F)
        chain_energy = chain_energy * K.expand_dims(
            K.expand_dims(m[:, 0] * m[:, 1])
        )
    if return_logZ:
        # shapes: (1, B, F) + (B, F, 1) -&gt; (B, F, F)
        energy = chain_energy + K.expand_dims(input_energy_t - prev_target_val, 2)
        # new_target_val = K.logsumexp(-energy, 1)  # shapes: (B, F)
        new_target_val = tf.reduce_logsumexp(-energy, 1)  # shapes: (B, F)
        return new_target_val, [new_target_val, i + 1]
    else:
        energy = chain_energy + K.expand_dims(input_energy_t + prev_target_val, 2)
        min_energy = K.min(energy, 1)
        # cast for tf-version `K.rnn
        argmin_table = K.cast(K.argmin(energy, 1), K.floatx())
        return argmin_table, [min_energy, i + 1]</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.anago.layers.CRF.viterbi_decoding"><code class="name flex">
<span>def <span class="ident">viterbi_decoding</span></span>(<span>self, X, mask=None)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def viterbi_decoding(self, X, mask=None):
    input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
    if self.use_boundary:
        input_energy = self.add_boundary_energy(
            input_energy, mask, self.left_boundary, self.right_boundary
        )

    argmin_tables = self.recursion(input_energy, mask, return_logZ=False)
    argmin_tables = K.cast(argmin_tables, &#34;int32&#34;)

    # backward to find best path, `initial_best_idx` can be any,
    # as all elements in the last argmin_table are the same
    argmin_tables = K.reverse(argmin_tables, 1)
    # matrix instead of vector is required by tf `K.rnn`
    initial_best_idx = [K.expand_dims(argmin_tables[:, 0, 0])]
    if K.backend() == &#34;theano&#34;:
        from theano import tensor as T

        initial_best_idx = [T.unbroadcast(initial_best_idx[0], 1)]

    def gather_each_row(params, indices):
        n = K.shape(indices)[0]
        if K.backend() == &#34;theano&#34;:
            from theano import tensor as T

            return params[T.arange(n), indices]
        elif K.backend() == &#34;tensorflow&#34;:
            import tensorflow as tf

            indices = K.transpose(K.stack([tf.range(n), indices]))
            return tf.gather_nd(params, indices)
        else:
            raise NotImplementedError

    def find_path(argmin_table, best_idx):
        next_best_idx = gather_each_row(argmin_table, best_idx[0][:, 0])
        next_best_idx = K.expand_dims(next_best_idx)
        if K.backend() == &#34;theano&#34;:
            from theano import tensor as T

            next_best_idx = T.unbroadcast(next_best_idx, 1)
        return next_best_idx, [next_best_idx]

    _, best_paths, _ = K.rnn(
        find_path,
        argmin_tables,
        initial_best_idx,
        input_length=K.int_shape(X)[1],
        unroll=self.unroll,
    )
    best_paths = K.reverse(best_paths, 1)
    best_paths = K.squeeze(best_paths, 2)

    return K.one_hot(best_paths, self.units)</code></pre>
</details>
</dd>
</dl>
</dd>
</dl>
</section>
</article>
<nav id="sidebar">
<h1>Index</h1>
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="ktrain.text.ner.anago" href="index.html">ktrain.text.ner.anago</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="ktrain.text.ner.anago.layers.crf_accuracy" href="#ktrain.text.ner.anago.layers.crf_accuracy">crf_accuracy</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.crf_loss" href="#ktrain.text.ner.anago.layers.crf_loss">crf_loss</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.crf_marginal_accuracy" href="#ktrain.text.ner.anago.layers.crf_marginal_accuracy">crf_marginal_accuracy</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.crf_nll" href="#ktrain.text.ner.anago.layers.crf_nll">crf_nll</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.crf_viterbi_accuracy" href="#ktrain.text.ner.anago.layers.crf_viterbi_accuracy">crf_viterbi_accuracy</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.to_tuple" href="#ktrain.text.ner.anago.layers.to_tuple">to_tuple</a></code></li>
</ul>
</li>
<li><h3><a href="#header-classes">Classes</a></h3>
<ul>
<li>
<h4><code><a title="ktrain.text.ner.anago.layers.CRF" href="#ktrain.text.ner.anago.layers.CRF">CRF</a></code></h4>
<ul class="">
<li><code><a title="ktrain.text.ner.anago.layers.CRF.accuracy" href="#ktrain.text.ner.anago.layers.CRF.accuracy">accuracy</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.add_boundary_energy" href="#ktrain.text.ner.anago.layers.CRF.add_boundary_energy">add_boundary_energy</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.backward_recursion" href="#ktrain.text.ner.anago.layers.CRF.backward_recursion">backward_recursion</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.build" href="#ktrain.text.ner.anago.layers.CRF.build">build</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.call" href="#ktrain.text.ner.anago.layers.CRF.call">call</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.compute_mask" href="#ktrain.text.ner.anago.layers.CRF.compute_mask">compute_mask</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.compute_output_shape" href="#ktrain.text.ner.anago.layers.CRF.compute_output_shape">compute_output_shape</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.forward_recursion" href="#ktrain.text.ner.anago.layers.CRF.forward_recursion">forward_recursion</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.get_config" href="#ktrain.text.ner.anago.layers.CRF.get_config">get_config</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.get_energy" href="#ktrain.text.ner.anago.layers.CRF.get_energy">get_energy</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.get_log_normalization_constant" href="#ktrain.text.ner.anago.layers.CRF.get_log_normalization_constant">get_log_normalization_constant</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.get_marginal_prob" href="#ktrain.text.ner.anago.layers.CRF.get_marginal_prob">get_marginal_prob</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.get_negative_log_likelihood" href="#ktrain.text.ner.anago.layers.CRF.get_negative_log_likelihood">get_negative_log_likelihood</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.loss_function" href="#ktrain.text.ner.anago.layers.CRF.loss_function">loss_function</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.marginal_acc" href="#ktrain.text.ner.anago.layers.CRF.marginal_acc">marginal_acc</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.recursion" href="#ktrain.text.ner.anago.layers.CRF.recursion">recursion</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.shift_left" href="#ktrain.text.ner.anago.layers.CRF.shift_left">shift_left</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.shift_right" href="#ktrain.text.ner.anago.layers.CRF.shift_right">shift_right</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.softmaxNd" href="#ktrain.text.ner.anago.layers.CRF.softmaxNd">softmaxNd</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.step" href="#ktrain.text.ner.anago.layers.CRF.step">step</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.viterbi_acc" href="#ktrain.text.ner.anago.layers.CRF.viterbi_acc">viterbi_acc</a></code></li>
<li><code><a title="ktrain.text.ner.anago.layers.CRF.viterbi_decoding" href="#ktrain.text.ner.anago.layers.CRF.viterbi_decoding">viterbi_decoding</a></code></li>
</ul>
</li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.10.0</a>.</p>
</footer>
</body>
</html>