docs/text/models.html

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1" />
<meta name="generator" content="pdoc 0.10.0" />
<title>ktrain.text.models API documentation</title>
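<meta name="description" content="API documentation and source listing for ktrain.text.models: builders for text classification and text regression models." />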
<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/sanitize.min.css" integrity="sha256-PK9q560IAAa6WVRRh76LtCaI8pjTJ2z11v0miyNNjrs=" crossorigin>
<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/typography.min.css" integrity="sha256-7l/o7C8jubJiy74VsKTidCy1yBkRtiUGbVkYBylBqUg=" crossorigin>
<link rel="stylesheet preload" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/styles/github.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:30px;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:1em 0 .50em 0}h3{font-size:1.4em;margin:25px 0 10px 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:.8em;line-height:1.4em}code{background:#f2f2f1;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:0;border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0;padding:1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-weight:bold;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target .name{background:var(--highlight-color)}.name > 
span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em .5em;margin-bottom:1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.item .name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:1.5em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/highlight.min.js" integrity="sha256-Uv3H6lx7dJmRfRvH8TH6kJD1TSK1aFcwgx+mdg3epi8=" crossorigin></script>
<script>window.addEventListener('DOMContentLoaded', () => hljs.initHighlighting())</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>ktrain.text.models</code></h1>
</header>
<section id="section-intro">
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">from .. import utils as U
from ..imports import *
from . import preprocessor as tpp

# Names of the model architectures this module can build.
NBSVM = &#34;nbsvm&#34;
FASTTEXT = &#34;fasttext&#34;
LOGREG = &#34;logreg&#34;
BIGRU = &#34;bigru&#34;
STANDARD_GRU = &#34;standard_gru&#34;
BERT = &#34;bert&#34;
DISTILBERT = tpp.DISTILBERT
# Model names that _text_model routes to _build_transformer.
HUGGINGFACE_MODELS = [DISTILBERT]
LINREG = &#34;linreg&#34;
# Model names accepted by text_classifier, mapped to a short description
# that print_text_classifiers displays.
TEXT_CLASSIFIERS = {
    FASTTEXT: &#34;a fastText-like model [http://arxiv.org/pdf/1607.01759.pdf]&#34;,
    LOGREG: &#34;logistic regression using a trainable Embedding layer&#34;,
    NBSVM: &#34;NBSVM model [http://www.aclweb.org/anthology/P12-2018]&#34;,
    BIGRU: &#34;Bidirectional GRU with pretrained fasttext word vectors [https://fasttext.cc/docs/en/crawl-vectors.html]&#34;,
    STANDARD_GRU: &#34;simple 2-layer GRU with randomly initialized embeddings&#34;,
    BERT: &#34;Bidirectional Encoder Representations from Transformers (BERT) from keras_bert [https://arxiv.org/abs/1810.04805]&#34;,
    DISTILBERT: &#34;distilled, smaller, and faster BERT from Hugging Face transformers [https://arxiv.org/abs/1910.01108]&#34;,
}

# Model names accepted by text_regression_model, mapped to a short description
# that print_text_regression_models displays.
TEXT_REGRESSION_MODELS = {
    FASTTEXT: &#34;a fastText-like model [http://arxiv.org/pdf/1607.01759.pdf]&#34;,
    LINREG: &#34;linear text regression using a trainable Embedding layer&#34;,
    BIGRU: &#34;Bidirectional GRU with pretrained English word vectors [https://arxiv.org/abs/1712.09405]&#34;,
    STANDARD_GRU: &#34;simple 2-layer GRU with randomly initialized embeddings&#34;,
    BERT: &#34;Bidirectional Encoder Representations from Transformers (BERT) - keras_bert implementation [https://arxiv.org/abs/1810.04805]&#34;,
    DISTILBERT: &#34;distilled, smaller, and faster BERT from Hugging Face transformers [https://arxiv.org/abs/1910.01108]&#34;,
}


def print_text_classifiers():
    for k, v in TEXT_CLASSIFIERS.items():
        print(&#34;%s: %s&#34; % (k, v))


def print_text_regression_models():
    for k, v in TEXT_REGRESSION_MODELS.items():
        print(&#34;%s: %s&#34; % (k, v))


def calc_pr(y_i, x, y, b):
    # Select the rows of x whose label comparison (y == y_i) equals b, sum
    # them column-wise and divide by the number of selected rows. Both the
    # column sums and the row count are incremented by one before dividing.
    selected = (y == y_i) == b
    row_ids = np.argwhere(selected)[:, 0]
    column_totals = x[row_ids].sum(0) + 1
    row_total = selected.sum() + 1
    return column_totals / row_total


def calc_r(y_i, x, y):
    # Log of the ratio between calc_pr for rows labelled y_i and calc_pr
    # for all remaining rows.
    in_class = calc_pr(y_i, x, y, True)
    out_of_class = calc_pr(y_i, x, y, False)
    return np.log(in_class / out_of_class)


def _text_model(
    name,
    train_data,
    preproc=None,
    multilabel=None,
    classification=True,
    metrics=[&#34;accuracy&#34;],
    verbose=1,
):
    &#34;&#34;&#34;
    ```
    Build and return a text classification or text regression model.

    The arguments are validated first, then the loss, activation and
    metrics are chosen for classification or regression, and finally the
    builder function matching name is called.

    Args:
        name (string): one of:
                      - &#39;fasttext&#39; for FastText model
                      - &#39;nbsvm&#39; for NBSVM model
                      - &#39;logreg&#39; for logistic regression
                      - &#39;linreg&#39; for linear regression
                      - &#39;bigru&#39; for Bidirectional GRU with pretrained word vectors
                      - &#39;standard_gru&#39; for simple 2-layer GRU
                      - &#39;bert&#39; for BERT Text Classification
                      - &#39;distilbert&#39; for Hugging Face DistilBert model
        train_data (tuple): a tuple of numpy.ndarrays: (x_train, y_train) or ktrain.Dataset instance
                            returned from one of the texts_from_* functions
        preproc: a ktrain.text.TextPreprocessor instance.
                 As of v0.8.0, this is required.
        multilabel (bool):  If True, multilabel model will be returned.
                            If false, binary/multiclass model will be returned.
                            If None, multilabel will be inferred from data.
                            Ignored (forced to False) for regression.
        classification(bool): If True, will build a text classification model.
                              Otherwise, a text regression model will be returned.
        metrics(list): list of metrics to use. For regression, None or
                       the default accuracy list is replaced by mae.
                       For classification, None is replaced by accuracy.
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    Raises:
        Exception: if train_data is neither a tuple nor accepted by
                   U.is_huggingface_from_data
        ValueError: if preproc is not a TextPreprocessor, if name and the
                    preprocessing of the data do not match, if labels are
                    plain ints, or if name is not a known model
    ```
    &#34;&#34;&#34;
    # check arguments
    if not isinstance(train_data, tuple) and not U.is_huggingface_from_data(train_data):
        err = &#34;&#34;&#34;
            Please pass training data in the form of a tuple of numpy.ndarrays
            or data returned from a ktrain texts_from* function.
            &#34;&#34;&#34;
        raise Exception(err)

    if not isinstance(preproc, tpp.TextPreprocessor):
        msg = &#34;The preproc argument is required.&#34;
        msg += &#34; The preproc arg should be an instance of TextPreprocessor, which is &#34;
        msg += &#34; the third return value from texts_from_folder, texts_from_csv, etc.&#34;
        # warnings.warn(msg, FutureWarning)
        raise ValueError(msg)
    if name == BIGRU and preproc.ngram_count() != 1:
        raise ValueError(&#34;Data should be processed with ngram_range=1 for bigru model.&#34;)
    # the bert model and bert-preprocessed data must be used together
    is_bert = U.bert_data_tuple(train_data)
    if (is_bert and name != BERT) or (not is_bert and name == BERT):
        raise ValueError(
            &#34;if &#39;%s&#39; is selected model, then preprocess_mode=&#39;%s&#39; should be used and vice versa&#34;
            % (BERT, BERT)
        )
    # likewise, Hugging Face models and Hugging Face data must be used together
    is_huggingface = U.is_huggingface(data=train_data)
    if (is_huggingface and name not in HUGGINGFACE_MODELS) or (
        not is_huggingface and name in HUGGINGFACE_MODELS
    ):
        raise ValueError(
            &#34;you are using a Hugging Face transformer model but did not preprocess as such (or vice versa)&#34;
        )
    if is_huggingface and preproc.name != name:
        raise ValueError(
            &#34;you preprocessed for %s but want to build a %s model&#34;
            % (preproc.name, name)
        )

    if not classification:  # regression
        # the default classification metric makes no sense here, so swap it for mae
        if metrics is None or metrics == [&#34;accuracy&#34;]:
            metrics = [&#34;mae&#34;]
        num_classes = 1
        multilabel = False
        loss_func = &#34;mse&#34;
        activation = None
        max_features = preproc.max_features
        # word ID features are only compiled in the classification branch
        features = None
        maxlen = U.shape_from_data(train_data)[1]
        U.vprint(&#34;maxlen is %s&#34; % (maxlen), verbose=verbose)
    else:  # classification
        if metrics is None:
            metrics = [&#34;accuracy&#34;]
        # set number of classes and multilabel flag
        num_classes = U.nclasses_from_data(train_data)

        # determine multilabel
        if multilabel is None:
            multilabel = U.is_multilabel(train_data)
        # nbsvm and logreg are replaced by fasttext when the task is multilabel
        if multilabel and name in [NBSVM, LOGREG]:
            warnings.warn(
                &#34;switching to fasttext model, as data suggests &#34;
                &#34;multilabel classification from data.&#34;
            )
            name = FASTTEXT
        U.vprint(&#34;Is Multi-Label? %s&#34; % (multilabel), verbose=verbose)

        # set loss and activations
        loss_func = &#34;categorical_crossentropy&#34;
        activation = &#34;softmax&#34;
        if multilabel:
            loss_func = &#34;binary_crossentropy&#34;
            activation = &#34;sigmoid&#34;

        # determine number of classes, maxlen, and max_features
        # NOTE(review): preproc was already validated as a TextPreprocessor
        # above, so the None fallback below looks unreachable - confirm
        max_features = preproc.max_features if preproc is not None else None
        features = set()
        if not is_bert and not is_huggingface:
            U.vprint(&#34;compiling word ID features...&#34;, verbose=verbose)
            x_train = train_data[0]
            y_train = train_data[1]
            if isinstance(y_train[0], int):
                raise ValueError(&#34;train labels should not be in sparse format&#34;)

            # collect every distinct word ID that appears in the training rows
            for x in x_train:
                features.update(x)
            # max_features = len(features)
            if max_features is None:
                max_features = max(features) + 1
                U.vprint(&#34;max_features is %s&#34; % (max_features), verbose=verbose)
        maxlen = U.shape_from_data(train_data)[1]
        U.vprint(&#34;maxlen is %s&#34; % (maxlen), verbose=verbose)

    # return appropriate model
    if name in [LOGREG, LINREG]:
        model = _build_logreg(
            num_classes,
            maxlen,
            max_features,
            features,
            loss_func=loss_func,
            activation=activation,
            metrics=metrics,
            verbose=verbose,
        )

    elif name == FASTTEXT:
        model = _build_fasttext(
            num_classes,
            maxlen,
            max_features,
            features,
            loss_func=loss_func,
            activation=activation,
            metrics=metrics,
            verbose=verbose,
        )
    elif name == STANDARD_GRU:
        model = _build_standard_gru(
            num_classes,
            maxlen,
            max_features,
            features,
            loss_func=loss_func,
            activation=activation,
            metrics=metrics,
            verbose=verbose,
        )
    elif name == NBSVM:
        # nbsvm additionally needs the raw training data to compute its ratios
        model = _build_nbsvm(
            num_classes,
            maxlen,
            max_features,
            features,
            loss_func=loss_func,
            activation=activation,
            metrics=metrics,
            verbose=verbose,
            train_data=train_data,
        )

    elif name == BIGRU:
        # only the tokenizer is used; the second return value is ignored here
        (tokenizer, tok_dct) = preproc.get_preprocessor()
        model = _build_bigru(
            num_classes,
            maxlen,
            max_features,
            features,
            loss_func=loss_func,
            activation=activation,
            metrics=metrics,
            verbose=verbose,
            tokenizer=tokenizer,
            preproc=preproc,
        )
    elif name == BERT:
        model = _build_bert(
            num_classes,
            maxlen,
            max_features,
            features,
            loss_func=loss_func,
            activation=activation,
            metrics=metrics,
            verbose=verbose,
            preproc=preproc,
        )
    elif name in HUGGINGFACE_MODELS:
        model = _build_transformer(
            num_classes,
            maxlen,
            max_features,
            features,
            loss_func=loss_func,
            activation=activation,
            metrics=metrics,
            verbose=verbose,
            preproc=preproc,
        )

    else:
        raise ValueError(&#34;name for textclassifier is invalid&#34;)
    U.vprint(&#34;done.&#34;, verbose=verbose)
    return model


def _build_logreg(
    num_classes,
    maxlen,
    max_features,
    features,
    loss_func=&#34;categorical_crossentropy&#34;,
    activation=&#34;softmax&#34;,
    metrics=[&#34;accuracy&#34;],
    verbose=1,
):
    embedding_matrix = np.ones((max_features, 1))
    embedding_matrix[0] = 0

    # set up the model
    inp = keras.layers.Input(shape=(maxlen,))
    r = keras.layers.Embedding(
        max_features,
        1,
        input_length=maxlen,
        weights=[embedding_matrix],
        trainable=False,
    )(inp)
    x = keras.layers.Embedding(
        max_features,
        num_classes,
        input_length=maxlen,
        embeddings_initializer=&#34;glorot_normal&#34;,
    )(inp)
    x = keras.layers.dot([x, r], axes=1)
    x = keras.layers.Flatten()(x)
    if activation:
        x = keras.layers.Activation(activation)(x)
    model = keras.Model(inputs=inp, outputs=x)
    model.compile(loss=loss_func, optimizer=U.DEFAULT_OPT, metrics=metrics)
    return model


def _build_bert(
    num_classes,
    maxlen,
    max_features,
    features,
    loss_func=&#34;categorical_crossentropy&#34;,
    activation=&#34;softmax&#34;,
    metrics=[&#34;accuracy&#34;],
    verbose=1,
    preproc=None,
):
    if preproc is None:
        raise ValueError(&#34;preproc is missing&#34;)
    lang = preproc.lang
    if lang is None:
        raise ValueError(&#34;lang is missing&#34;)
    config_path = os.path.join(tpp.get_bert_path(lang=lang), &#34;bert_config.json&#34;)
    checkpoint_path = os.path.join(tpp.get_bert_path(lang=lang), &#34;bert_model.ckpt&#34;)
    check_keras_bert()
    model = keras_bert.load_trained_model_from_checkpoint(
        config_path, checkpoint_path, training=True, trainable=True, seq_len=maxlen
    )
    inputs = model.inputs[:2]
    dense = model.get_layer(&#34;NSP-Dense&#34;).output
    outputs = keras.layers.Dense(units=num_classes, activation=activation)(dense)
    model = keras.Model(inputs, outputs)
    model.compile(loss=loss_func, optimizer=U.DEFAULT_OPT, metrics=metrics)
    return model


def _build_transformer(
    num_classes,
    maxlen,
    max_features,
    features,
    loss_func=&#34;categorical_crossentropy&#34;,
    activation=&#34;softmax&#34;,
    metrics=[&#34;accuracy&#34;],
    verbose=1,
    preproc=None,
):
    if not isinstance(preproc, tpp.TransformersPreprocessor):
        raise ValueError(
            &#34;preproc must be instance of %s&#34; % (str(tpp.TransformersPreprocessor))
        )

    if loss_func == &#34;mse&#34;:
        if preproc.get_classes():
            raise Exception(
                &#34;This is supposed to be regression problem, but preproc.get_classes() is not empty. &#34;
                + &#34;Something went wrong.  Please open a GitHub issue.&#34;
            )
            if len(preproc.get_classes()) != num_classes:
                raise Exception(
                    &#34;Number of labels from preproc.get_classes() is not equal to num_classes. &#34;
                    + &#34;Something went wrong. Please open GitHub issue.&#34;
                )
    else:
        if not preproc.get_classes():
            raise Exception(
                &#34;This is supposed to be a classification problem, but preproc.get_classes() is empty. &#34;
                + &#34;Something went wrong.  Please open a GitHub issue.&#34;
            )
    return (
        preproc.get_regression_model(metrics=metrics)
        if loss_func == &#34;mse&#34;
        else preproc.get_classifier(metrics=metrics)
    )


def _build_nbsvm(
    num_classes,
    maxlen,
    max_features,
    features,
    loss_func=&#34;categorical_crossentropy&#34;,
    activation=&#34;softmax&#34;,
    metrics=[&#34;accuracy&#34;],
    verbose=1,
    train_data=None,
):
    if train_data is None:
        raise ValueError(&#34;train_data is required&#34;)
    x_train = train_data[0]
    y_train = train_data[1]
    Y = np.array([np.argmax(row) for row in y_train])
    num_columns = max(features) + 1
    num_rows = len(x_train)

    # set up document-term matrix
    X = csr_matrix((num_rows, num_columns), dtype=np.int8)
    # X = lil_matrix((num_rows, num_columns), dtype=np.int8)
    U.vprint(
        &#34;building document-term matrix... this may take a few moments...&#34;,
        verbose=verbose,
    )
    r_ids = []
    c_ids = []
    data = []
    for row_id, row in enumerate(x_train):
        trigger = 10000
        trigger_end = min(row_id + trigger, num_rows)
        if row_id % trigger == 0:
            U.vprint(&#34;rows: %s-%s&#34; % (row_id + 1, trigger_end), verbose=verbose)
        tmp_c_ids = [column_id for column_id in row if column_id &gt; 0]
        num = len(tmp_c_ids)
        c_ids.extend(tmp_c_ids)
        r_ids.extend([row_id] * num)
        data.extend([1] * num)
    X = csr_matrix((data, (r_ids, c_ids)), shape=(num_rows, num_columns))

    # compute Naive Bayes log-count ratios
    U.vprint(&#34;computing log-count ratios...&#34;, verbose=verbose)
    nbratios = np.stack([calc_r(i, X, Y).A1 for i in range(num_classes)])
    nbratios = nbratios.T
    embedding_matrix = np.zeros((num_columns, num_classes))
    for i in range(1, num_columns):
        for j in range(num_classes):
            embedding_matrix[i, j] = nbratios[i, j]

    # set up the model
    inp = keras.layers.Input(shape=(maxlen,))
    r = keras.layers.Embedding(
        num_columns,
        num_classes,
        input_length=maxlen,
        weights=[embedding_matrix],
        trainable=False,
    )(inp)
    x = keras.layers.Embedding(
        num_columns, 1, input_length=maxlen, embeddings_initializer=&#34;glorot_normal&#34;
    )(inp)
    x = keras.layers.dot([r, x], axes=1)
    x = keras.layers.Flatten()(x)
    x = keras.layers.Activation(activation)(x)
    model = keras.Model(inputs=inp, outputs=x)
    model.compile(loss=loss_func, optimizer=U.DEFAULT_OPT, metrics=metrics)
    return model


def _build_fasttext(
    num_classes,
    maxlen,
    max_features,
    features,
    loss_func=&#34;categorical_crossentropy&#34;,
    activation=&#34;softmax&#34;,
    metrics=[&#34;accuracy&#34;],
    verbose=1,
):
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(max_features, 64, input_length=maxlen))
    model.add(keras.layers.SpatialDropout1D(0.25))
    model.add(keras.layers.GlobalMaxPool1D())
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Dense(64, activation=&#34;relu&#34;, kernel_initializer=&#34;he_normal&#34;))
    model.add(keras.layers.Dropout(0.5))
    model.add(keras.layers.Dense(num_classes, activation=activation))
    model.compile(loss=loss_func, optimizer=U.DEFAULT_OPT, metrics=metrics)

    return model


def _build_standard_gru(
    num_classes,
    maxlen,
    max_features,
    features,
    loss_func=&#34;categorical_crossentropy&#34;,
    activation=&#34;softmax&#34;,
    metrics=[&#34;accuracy&#34;],
    verbose=1,
):
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(max_features, 256, input_length=maxlen))
    model.add(keras.layers.GRU(256, dropout=0.9, return_sequences=True))
    model.add(keras.layers.GRU(256, dropout=0.9))
    model.add(keras.layers.Dense(num_classes, activation=activation))
    model.compile(loss=loss_func, optimizer=U.DEFAULT_OPT, metrics=metrics)
    return model


def _build_bigru(
    num_classes,
    maxlen,
    max_features,
    features,
    loss_func=&#34;categorical_crossentropy&#34;,
    activation=&#34;softmax&#34;,
    metrics=[&#34;accuracy&#34;],
    verbose=1,
    tokenizer=None,
    preproc=None,
):
    if tokenizer is None:
        raise ValueError(&#34;bigru requires valid Tokenizer object&#34;)
    if preproc is None:
        raise ValueError(&#34;bigru requires valid preproc&#34;)
    if not hasattr(preproc, &#34;lang&#34;) or preproc.lang is None:
        lang = &#34;en&#34;
    else:
        lang = preproc.lang
    wv_url = (
        &#34;https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.%s.300.vec.gz&#34;
        % (lang.split(&#34;-&#34;)[0])
    )
    if verbose:
        print(&#34;word vectors will be loaded from: %s&#34; % (wv_url))

    # setup pre-trained word embeddings
    embed_size = 300
    U.vprint(&#34;processing pretrained word vectors...&#34;, verbose=verbose)
    embeddings_index = tpp.load_wv(wv_path_or_url=wv_url, verbose=verbose)
    word_index = tokenizer.word_index
    # nb_words = min(max_features, len(word_index))
    nb_words = max_features
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i &gt;= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # define model
    inp = keras.layers.Input(shape=(maxlen,))
    x = keras.layers.Embedding(max_features, embed_size, weights=[embedding_matrix])(
        inp
    )
    x = keras.layers.SpatialDropout1D(0.2)(x)
    x = keras.layers.Bidirectional(keras.layers.GRU(80, return_sequences=True))(x)
    avg_pool = keras.layers.GlobalAveragePooling1D()(x)
    max_pool = keras.layers.GlobalMaxPool1D()(x)
    conc = keras.layers.concatenate([avg_pool, max_pool])
    outp = keras.layers.Dense(num_classes, activation=activation)(conc)
    model = keras.Model(inputs=inp, outputs=outp)
    model.compile(loss=loss_func, optimizer=U.DEFAULT_OPT, metrics=metrics)
    return model


def text_classifier(
    name, train_data, preproc=None, multilabel=None, metrics=[&#34;accuracy&#34;], verbose=1
):
    &#34;&#34;&#34;
    ```
    Build and return a text classification model.

    Args:
        name (string): one of:
                      - &#39;fasttext&#39; for FastText model
                      - &#39;nbsvm&#39; for NBSVM model
                      - &#39;logreg&#39; for logistic regression using embedding layers
                      - &#39;bigru&#39; for Bidirectional GRU with pretrained word vectors
                      - &#39;bert&#39; for BERT Text Classification
                      - &#39;distilbert&#39; for Hugging Face DistilBert model

        train_data (tuple): a tuple of numpy.ndarrays: (x_train, y_train) or ktrain.Dataset instance
                            returned from one of the texts_from_* functions
        preproc: a ktrain.text.TextPreprocessor instance.
                 As of v0.8.0, this is required.
        multilabel (bool):  If True, multilabel model will be returned.
                            If false, binary/multiclass model will be returned.
                            If None, multilabel will be inferred from data.
        metrics(list): metrics to use
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    ```
    &#34;&#34;&#34;
    if name not in TEXT_CLASSIFIERS:
        raise ValueError(&#34;invalid name for text classification: %s&#34; % (name))
    if preproc is not None and not preproc.get_classes():
        raise ValueError(
            &#34;preproc.get_classes() is empty, but required for text classification&#34;
        )
    return _text_model(
        name,
        train_data,
        preproc=preproc,
        multilabel=multilabel,
        classification=True,
        metrics=metrics,
        verbose=verbose,
    )


def text_regression_model(name, train_data, preproc=None, metrics=[&#34;mae&#34;], verbose=1):
    &#34;&#34;&#34;
    ```
    Build and return a text regression model.

    Args:
        name (string): one of:
                      - &#39;fasttext&#39; for FastText model
                      - &#39;nbsvm&#39; for NBSVM model
                      - &#39;linreg&#39; for linear regression using embedding layers
                      - &#39;bigru&#39; for Bidirectional GRU with pretrained word vectors
                      - &#39;bert&#39; for BERT Text Classification
                      - &#39;distilbert&#39; for Hugging Face DistilBert model

        train_data (tuple): a tuple of numpy.ndarrays: (x_train, y_train)
        preproc: a ktrain.text.TextPreprocessor instance.
                 As of v0.8.0, this is required.
        metrics(list): metrics to use
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    ```
    &#34;&#34;&#34;
    if name not in TEXT_REGRESSION_MODELS:
        raise ValueError(&#34;invalid name for text classification: %s&#34; % (name))
    if preproc is not None and preproc.get_classes():
        raise ValueError(
            &#34;preproc.get_classes() is supposed to be empty for text regression tasks&#34;
        )
    return _text_model(
        name,
        train_data,
        preproc=preproc,
        multilabel=False,
        classification=False,
        metrics=metrics,
        verbose=verbose,
    )</code></pre>
</details>
</section>
<section>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="ktrain.text.models.calc_pr"><code class="name flex">
<span>def <span class="ident">calc_pr</span></span>(<span>y_i, x, y, b)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def calc_pr(y_i, x, y, b):
    idx = np.argwhere((y == y_i) == b)
    ct = x[idx[:, 0]].sum(0) + 1
    tot = ((y == y_i) == b).sum() + 1
    return ct / tot</code></pre>
</details>
</dd>
<dt id="ktrain.text.models.calc_r"><code class="name flex">
<span>def <span class="ident">calc_r</span></span>(<span>y_i, x, y)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def calc_r(y_i, x, y):
    return np.log(calc_pr(y_i, x, y, True) / calc_pr(y_i, x, y, False))</code></pre>
</details>
</dd>
<dt id="ktrain.text.models.print_text_classifiers"><code class="name flex">
<span>def <span class="ident">print_text_classifiers</span></span>(<span>)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def print_text_classifiers():
    for k, v in TEXT_CLASSIFIERS.items():
        print(&#34;%s: %s&#34; % (k, v))</code></pre>
</details>
</dd>
<dt id="ktrain.text.models.print_text_regression_models"><code class="name flex">
<span>def <span class="ident">print_text_regression_models</span></span>(<span>)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def print_text_regression_models():
    for k, v in TEXT_REGRESSION_MODELS.items():
        print(&#34;%s: %s&#34; % (k, v))</code></pre>
</details>
</dd>
<dt id="ktrain.text.models.text_classifier"><code class="name flex">
<span>def <span class="ident">text_classifier</span></span>(<span>name, train_data, preproc=None, multilabel=None, metrics=['accuracy'], verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Build and return a text classification model.

Args:
    name (string): one of:
                  - 'fasttext' for FastText model
                  - 'nbsvm' for NBSVM model
                  - 'logreg' for logistic regression using embedding layers
                  - 'bigru' for Bidirectional GRU with pretrained word vectors
                  - 'bert' for BERT Text Classification
                  - 'distilbert' for Hugging Face DistilBert model

    train_data (tuple): a tuple of numpy.ndarrays: (x_train, y_train) or ktrain.Dataset instance
                        returned from one of the texts_from_* functions
    preproc: a ktrain.text.TextPreprocessor instance.
             As of v0.8.0, this is required.
    multilabel (bool):  If True, multilabel model will be returned.
                        If False, binary/multiclass model will be returned.
                        If None, multilabel will be inferred from data.
    metrics (list): metrics to use
    verbose (boolean): verbosity of output
Return:
    model (Model): A Keras Model instance
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def text_classifier(
    name, train_data, preproc=None, multilabel=None, metrics=[&#34;accuracy&#34;], verbose=1
):
    &#34;&#34;&#34;
    ```
    Build and return a text classification model.

    Args:
        name (string): one of:
                      - &#39;fasttext&#39; for FastText model
                      - &#39;nbsvm&#39; for NBSVM model
                      - &#39;logreg&#39; for logistic regression using embedding layers
                      - &#39;bigru&#39; for Bidirectional GRU with pretrained word vectors
                      - &#39;bert&#39; for BERT Text Classification
                      - &#39;distilbert&#39; for Hugging Face DistilBert model

        train_data (tuple): a tuple of numpy.ndarrays: (x_train, y_train) or ktrain.Dataset instance
                            returned from one of the texts_from_* functions
        preproc: a ktrain.text.TextPreprocessor instance.
                 As of v0.8.0, this is required.
        multilabel (bool):  If True, multilabel model will be returned.
                            If false, binary/multiclass model will be returned.
                            If None, multilabel will be inferred from data.
        metrics(list): metrics to use
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    ```
    &#34;&#34;&#34;
    if name not in TEXT_CLASSIFIERS:
        raise ValueError(&#34;invalid name for text classification: %s&#34; % (name))
    if preproc is not None and not preproc.get_classes():
        raise ValueError(
            &#34;preproc.get_classes() is empty, but required for text classification&#34;
        )
    return _text_model(
        name,
        train_data,
        preproc=preproc,
        multilabel=multilabel,
        classification=True,
        metrics=metrics,
        verbose=verbose,
    )</code></pre>
</details>
</dd>
<dt id="ktrain.text.models.text_regression_model"><code class="name flex">
<span>def <span class="ident">text_regression_model</span></span>(<span>name, train_data, preproc=None, metrics=['mae'], verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Build and return a text regression model.

Args:
    name (string): one of:
                  - 'fasttext' for FastText model
                  - 'nbsvm' for NBSVM model
                  - 'linreg' for linear regression using embedding layers
                  - 'bigru' for Bidirectional GRU with pretrained word vectors
                  - 'bert' for BERT Text Regression
                  - 'distilbert' for Hugging Face DistilBert model

    train_data (tuple): a tuple of numpy.ndarrays: (x_train, y_train)
    preproc: a ktrain.text.TextPreprocessor instance.
             As of v0.8.0, this is required.
    metrics (list): metrics to use
    verbose (boolean): verbosity of output
Return:
    model (Model): A Keras Model instance
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def text_regression_model(name, train_data, preproc=None, metrics=[&#34;mae&#34;], verbose=1):
    &#34;&#34;&#34;
    ```
    Build and return a text regression model.

    Args:
        name (string): one of:
                      - &#39;fasttext&#39; for FastText model
                      - &#39;nbsvm&#39; for NBSVM model
                      - &#39;linreg&#39; for linear regression using embedding layers
                      - &#39;bigru&#39; for Bidirectional GRU with pretrained word vectors
                      - &#39;bert&#39; for BERT Text Classification
                      - &#39;distilbert&#39; for Hugging Face DistilBert model

        train_data (tuple): a tuple of numpy.ndarrays: (x_train, y_train)
        preproc: a ktrain.text.TextPreprocessor instance.
                 As of v0.8.0, this is required.
        metrics(list): metrics to use
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    ```
    &#34;&#34;&#34;
    if name not in TEXT_REGRESSION_MODELS:
        raise ValueError(&#34;invalid name for text classification: %s&#34; % (name))
    if preproc is not None and preproc.get_classes():
        raise ValueError(
            &#34;preproc.get_classes() is supposed to be empty for text regression tasks&#34;
        )
    return _text_model(
        name,
        train_data,
        preproc=preproc,
        multilabel=False,
        classification=False,
        metrics=metrics,
        verbose=verbose,
    )</code></pre>
</details>
</dd>
</dl>
</section>
<section>
</section>
</article>
<nav id="sidebar">
<h1>Index</h1>
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="ktrain.text" href="index.html">ktrain.text</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="ktrain.text.models.calc_pr" href="#ktrain.text.models.calc_pr">calc_pr</a></code></li>
<li><code><a title="ktrain.text.models.calc_r" href="#ktrain.text.models.calc_r">calc_r</a></code></li>
<li><code><a title="ktrain.text.models.print_text_classifiers" href="#ktrain.text.models.print_text_classifiers">print_text_classifiers</a></code></li>
<li><code><a title="ktrain.text.models.print_text_regression_models" href="#ktrain.text.models.print_text_regression_models">print_text_regression_models</a></code></li>
<li><code><a title="ktrain.text.models.text_classifier" href="#ktrain.text.models.text_classifier">text_classifier</a></code></li>
<li><code><a title="ktrain.text.models.text_regression_model" href="#ktrain.text.models.text_regression_model">text_regression_model</a></code></li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.10.0</a>.</p>
</footer>
</body>
</html>