<!-- docs/text/index.html -->

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1" />
<meta name="generator" content="pdoc 0.9.2" />
<title>ktrain.text API documentation</title>
<meta name="description" content="" />
<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/sanitize.min.css" integrity="sha256-PK9q560IAAa6WVRRh76LtCaI8pjTJ2z11v0miyNNjrs=" crossorigin>
<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/typography.min.css" integrity="sha256-7l/o7C8jubJiy74VsKTidCy1yBkRtiUGbVkYBylBqUg=" crossorigin>
<link rel="stylesheet preload" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/styles/github.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:30px;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:1em 0 .50em 0}h3{font-size:1.4em;margin:25px 0 10px 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:.8em;line-height:1.4em}code{background:#f2f2f1;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:0;border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0;padding:1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-weight:bold;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target .name{background:var(--highlight-color)}.name > 
span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em .5em;margin-bottom:1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.item .name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:1.5em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/highlight.min.js" integrity="sha256-Uv3H6lx7dJmRfRvH8TH6kJD1TSK1aFcwgx+mdg3epi8=" crossorigin></script>
<script>window.addEventListener('DOMContentLoaded', () => hljs.initHighlighting())</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>ktrain.text</code></h1>
</header>
<section id="section-intro">
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">from .models import print_text_classifiers, print_text_regression_models, text_classifier, text_regression_model
from .data import texts_from_folder, texts_from_csv, texts_from_df,  texts_from_array
from .ner.data import entities_from_gmb, entities_from_conll2003, entities_from_txt, entities_from_df, entities_from_array
from .ner.models import sequence_tagger, print_sequence_taggers
from .eda import get_topic_model
from .textutils import extract_filenames, load_text_files, filter_by_id
from .preprocessor import Transformer, TransformerEmbedding
from .summarization import TransformerSummarizer
from .zsl import ZeroShotClassifier
from .translation import EnglishTranslator, Translator
from . import shallownlp
from .qa import SimpleQA
from . import textutils
import pickle

__all__ = [
           &#39;text_classifier&#39;, &#39;text_regression_model&#39;,
           &#39;print_text_classifiers&#39;, &#39;print_text_regression_models&#39;,
           &#39;texts_from_folder&#39;, &#39;texts_from_csv&#39;, &#39;texts_from_df&#39;, &#39;texts_from_array&#39;,
           &#39;entities_from_gmb&#39;,
           &#39;entities_from_conll2003&#39;,
           &#39;entities_from_txt&#39;,
           &#39;entities_from_array&#39;,
           &#39;entities_from_df&#39;,
           &#39;sequence_tagger&#39;,
           &#39;print_sequence_taggers&#39;,
           &#39;get_topic_model&#39;,
           &#39;Transformer&#39;,
           &#39;TransformerEmbedding&#39;,
           &#39;shallownlp&#39;,
           &#39;TransformerSummarizer&#39;,
           &#39;ZeroShotClassifier&#39;,
           &#39;EnglishTranslator&#39;,
           &#39;Translator&#39;,
           &#39;SimpleQA&#39;,
           &#39;extract_filenames&#39;, 
           &#39;load_text_files&#39;,
           ]


def load_topic_model(fname):
    &#34;&#34;&#34;
    Load saved TopicModel object
    Args:
        fname(str): base filename for all saved files
    &#34;&#34;&#34;
    with open(fname+&#39;.tm_vect&#39;, &#39;rb&#39;) as f:
        vectorizer = pickle.load(f)
    with open(fname+&#39;.tm_model&#39;, &#39;rb&#39;) as f:
        model = pickle.load(f)
    with open(fname+&#39;.tm_params&#39;, &#39;rb&#39;) as f:
        params = pickle.load(f)
    tm = get_topic_model(n_topics=params[&#39;n_topics&#39;],
                         n_features = params[&#39;n_features&#39;],
                         verbose = params[&#39;verbose&#39;])
    tm.model = model
    tm.vectorizer = vectorizer
    return tm


seqlen_stats = Transformer.seqlen_stats</code></pre>
</details>
</section>
<section>
<h2 class="section-title" id="header-submodules">Sub-modules</h2>
<dl>
<dt><code class="name"><a title="ktrain.text.data" href="data.html">ktrain.text.data</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.eda" href="eda.html">ktrain.text.eda</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.learner" href="learner.html">ktrain.text.learner</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.models" href="models.html">ktrain.text.models</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.ner" href="ner/index.html">ktrain.text.ner</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.predictor" href="predictor.html">ktrain.text.predictor</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.preprocessor" href="preprocessor.html">ktrain.text.preprocessor</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.qa" href="qa/index.html">ktrain.text.qa</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.shallownlp" href="shallownlp/index.html">ktrain.text.shallownlp</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.summarization" href="summarization/index.html">ktrain.text.summarization</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.textutils" href="textutils.html">ktrain.text.textutils</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.translation" href="translation/index.html">ktrain.text.translation</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.zsl" href="zsl/index.html">ktrain.text.zsl</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
</dl>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="ktrain.text.entities_from_array"><code class="name flex">
<span>def <span class="ident">entities_from_array</span></span>(<span>x_train, y_train, x_test=None, y_test=None, use_char=False, val_pct=0.1, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><p>Load entities from arrays</p>
<h2 id="entities_from_array-args">Args</h2>
<p>x_train(list): list of list of entity tokens for training
Example: x_train = [['Hello', 'world'], ['Hello', 'Cher'], ['I', 'love', 'Chicago']]
y_train(list): list of list of tokens representing entity labels
Example:
y_train = [['O', 'O'], ['O', 'B-PER'], ['O', 'O', 'B-LOC']]
x_test(list): list of list of entity tokens for validation
Example: x_test = [['Hello', 'world'], ['Hello', 'Cher'], ['I', 'love', 'Chicago']]
y_test(list): list of list of tokens representing entity labels
Example:
y_test = [['O', 'O'], ['O', 'B-PER'], ['O', 'O', 'B-LOC']]
use_char(bool):
If True, data will be preprocessed to use character embeddings
in addition to word embeddings
val_pct(float):
percentage of training to use for validation if no validation data is supplied
verbose (boolean): verbosity</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def entities_from_array(x_train, y_train,
                        x_test=None, y_test=None,
                        use_char=False,
                        val_pct=0.1,
                        verbose=1):
    &#34;&#34;&#34;
    Load entities from arrays
    Args:
      x_train(list): list of list of entity tokens for training
                     Example: x_train = [[&#39;Hello&#39;, &#39;world&#39;], [&#39;Hello&#39;, &#39;Cher&#39;], [&#39;I&#39;, &#39;love&#39;, &#39;Chicago&#39;]]
      y_train(list): list of list of tokens representing entity labels
                     Example:  y_train = [[&#39;O&#39;, &#39;O&#39;], [&#39;O&#39;, &#39;B-PER&#39;], [&#39;O&#39;, &#39;O&#39;, &#39;B-LOC&#39;]]
      x_test(list): list of list of entity tokens for validation 
                     Example: x_train = [[&#39;Hello&#39;, &#39;world&#39;], [&#39;Hello&#39;, &#39;Cher&#39;], [&#39;I&#39;, &#39;love&#39;, &#39;Chicago&#39;]]
      y_test(list): list of list of tokens representing entity labels
                     Example:  y_train = [[&#39;O&#39;, &#39;O&#39;], [&#39;O&#39;, &#39;B-PER&#39;], [&#39;O&#39;, &#39;O&#39;, &#39;B-LOC&#39;]]
     use_char(bool):    If True, data will be preprocessed to use character embeddings  in addition to word embeddings
     val_pct(float):  percentage of training to use for validation if no validation data is supplied
     verbose (boolean): verbosity

    &#34;&#34;&#34;
    # TODO: converting to df to use entities_from_df - needs to be refactored
    train_df = pp.array_to_df(x_train, y_train) 
    val_df = None
    if x_test is not None and y_test is not None:
        val_df = pp.array_to_df(x_test, y_test)
    if verbose:
        print(&#39;training data sample:&#39;)
        print(train_df.head())
        if val_df is not None:
            print(&#39;validation data sample:&#39;)
            print(val_df.head())
    return entities_from_df(train_df, val_df=val_df, val_pct=val_pct, 
                            use_char=use_char, verbose=verbose)</code></pre>
</details>
</dd>
<dt id="ktrain.text.entities_from_conll2003"><code class="name flex">
<span>def <span class="ident">entities_from_conll2003</span></span>(<span>train_filepath, val_filepath=None, use_char=False, encoding=None, val_pct=0.1, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><p>Loads sequence-labeled data from a file in CoNLL2003 format.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def entities_from_conll2003(train_filepath, 
                            val_filepath=None,
                            use_char=False,
                            encoding=None,
                            val_pct=0.1, verbose=1):
    &#34;&#34;&#34;
    Loads sequence-labeled data from a file in CoNLL2003 format.
    &#34;&#34;&#34;
    return entities_from_txt(train_filepath=train_filepath,
                             val_filepath=val_filepath,
                             use_char=use_char,
                             data_format=&#39;conll2003&#39;,
                             encoding=encoding,
                             val_pct=val_pct, verbose=verbose)</code></pre>
</details>
</dd>
<dt id="ktrain.text.entities_from_df"><code class="name flex">
<span>def <span class="ident">entities_from_df</span></span>(<span>train_df, val_df=None, word_column='Word', tag_column='Tag', sentence_column='SentenceID', use_char=False, val_pct=0.1, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><p>Load entities from pandas DataFrame</p>
<h2 id="entities_from_df-args">Args</h2>
<dl>
<dt>train_df(pd.DataFrame): training data</dt>
<dt>val_df(pd.DataFrame): validation data</dt>
<dt>word_column(str): name of column containing the text</dt>
<dt>tag_column(str): name of column containing label</dt>
<dt>sentence_column(str): name of column containing Sentence IDs</dt>
<dt>use_char(bool):
If True, data will be preprocessed to use character embeddings
in addition to word embeddings</dt>
<dt><strong><code>verbose</code></strong> :&ensp;<code>boolean</code></dt>
<dd>verbosity</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def entities_from_df(train_df,
                     val_df=None,
                     word_column=WORD_COL,
                     tag_column=TAG_COL,
                     sentence_column=SENT_COL,
                     use_char=False,
                     val_pct=0.1, verbose=1):
    &#34;&#34;&#34;
    Load entities from pandas DataFrame
    Args:
      train_df(pd.DataFrame): training data
      val_df(pdf.DataFrame): validation data
      word_column(str): name of column containing the text
      tag_column(str): name of column containing lael
      sentence_column(str): name of column containing Sentence IDs
      use_char(bool):    If True, data will be preprocessed to use character embeddings  in addition to word embeddings
      verbose (boolean): verbosity

    &#34;&#34;&#34;
    # process dataframe and instantiate NERPreprocessor
    x, y  = pp.process_df(train_df, 
                          word_column=word_column,
                          tag_column=tag_column,
                          sentence_column=sentence_column,
                          verbose=verbose)

    # get validation set
    if val_df is None:
        x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=val_pct)
    else:
        x_train, y_train = x, y
        (x_valid, y_valid)  = pp.process_df(val_df,
                                            word_column=word_column,
                                            tag_column=tag_column,
                                            sentence_column=sentence_column,
                                            verbose=0)

    # preprocess and convert to generator
    p = IndexTransformer(use_char=use_char)
    preproc = NERPreprocessor(p)
    preproc.fit(x_train, y_train)
    trn = pp.NERSequence(x_train, y_train, batch_size=U.DEFAULT_BS, p=p)
    val = pp.NERSequence(x_valid, y_valid, batch_size=U.DEFAULT_BS, p=p)

    return ( trn, val, preproc)</code></pre>
</details>
</dd>
<dt id="ktrain.text.entities_from_gmb"><code class="name flex">
<span>def <span class="ident">entities_from_gmb</span></span>(<span>train_filepath, val_filepath=None, use_char=False, word_column='Word', tag_column='Tag', sentence_column='SentenceID', encoding=None, val_pct=0.1, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><p>Loads sequence-labeled data from text file in the
Groningen
Meaning Bank
(GMB) format.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def entities_from_gmb(train_filepath, 
                      val_filepath=None,
                      use_char=False,
                      word_column=WORD_COL,
                      tag_column=TAG_COL,
                      sentence_column=SENT_COL,
                       encoding=None,
                       val_pct=0.1, verbose=1):
    &#34;&#34;&#34;
    Loads sequence-labeled data from text file in the  Groningen
    Meaning Bank  (GMB) format.
    &#34;&#34;&#34;


    return entities_from_txt(train_filepath=train_filepath,
                             val_filepath=val_filepath,
                             use_char=use_char,
                             word_column=word_column,
                             tag_column=tag_column,
                             sentence_column=sentence_column,
                             data_format=&#39;gmb&#39;,
                             encoding=encoding,
                             val_pct=val_pct, verbose=verbose)</code></pre>
</details>
</dd>
<dt id="ktrain.text.entities_from_txt"><code class="name flex">
<span>def <span class="ident">entities_from_txt</span></span>(<span>train_filepath, val_filepath=None, use_char=False, word_column='Word', tag_column='Tag', sentence_column='SentenceID', data_format='conll2003', encoding=None, val_pct=0.1, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><p>Loads sequence-labeled data from comma or tab-delimited text file.
Format of file is either the CoNLL2003 format or Groningen Meaning
Bank (GMB) format - specified with data_format parameter.</p>
<p>In both formats, each word appears on a separate line along with
its associated tag (or label).<br>
The last item on each line should be the tag or label assigned to word.</p>
<p>In the CoNLL2003 format, there is an empty line after
each sentence.
In the GMB format, sentences are delineated
with a third column denoting the Sentence ID.</p>
<p>More information on CoNLL2003 format:
<a href="https://www.aclweb.org/anthology/W03-0419">https://www.aclweb.org/anthology/W03-0419</a></p>
<p>CoNLL Example (each column is typically separated by space or tab)
and
no column headings:</p>
<pre><code>Paul     B-PER
Newman   I-PER
is       O
a        O
great    O
actor    O
!        O
</code></pre>
<p>More information on GMB format:
Refer to ner_dataset.csv on Kaggle here:
<a href="https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/version/2">https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/version/2</a></p>
<p>GMB example (each column separated by comma or tab)
with column headings:</p>
<pre><code>SentenceID   Word     Tag
1            Paul     B-PER
1            Newman   I-PER
1            is       O
1            a        O
1            great    O
1            actor    O
1            !        O
</code></pre>
<h2 id="entities_from_txt-args">Args</h2>
<dl>
<dt>train_filepath(str): file path to training CSV</dt>
<dt><strong><code>val_filepath</code></strong> :&ensp;<code>str</code></dt>
<dd>file path to validation dataset</dd>
<dt>use_char(bool):
If True, data will be preprocessed to use character embeddings in addition to word embeddings</dt>
<dt>word_column(str): name of column containing the text</dt>
<dt>tag_column(str): name of column containing label</dt>
<dt>sentence_column(str): name of column containing Sentence IDs</dt>
<dt>data_format(str): one of conll2003 or gmb.
word_column, tag_column, and sentence_column are
ignored if 'conll2003'</dt>
<dt>encoding(str): the encoding to use.
If None, encoding is discovered automatically</dt>
<dt>val_pct(float): Proportion of training to use for validation.</dt>
<dt><strong><code>verbose</code></strong> :&ensp;<code>boolean</code></dt>
<dd>verbosity</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def entities_from_txt(train_filepath, 
                      val_filepath=None,
                      use_char=False,
                      word_column=WORD_COL,
                      tag_column=TAG_COL,
                      sentence_column=SENT_COL,
                      data_format=&#39;conll2003&#39;,
                      encoding=None,
                      val_pct=0.1, verbose=1):
    &#34;&#34;&#34;
    Loads sequence-labeled data from comma or tab-delmited text file.
    Format of file is either the CoNLL2003 format or Groningen Meaning
    Bank (GMB) format - specified with data_format parameter.

    In both formats, each word appars on a separate line along with
    its associated tag (or label).  
    The last item on each line should be the tag or label assigned to word.
    
    In the CoNLL2003 format, there is an empty line after
    each sentence.  In the GMB format, sentences are deliniated
    with a third column denoting the Sentence ID.


    More information on CoNLL2003 format: 
       https://www.aclweb.org/anthology/W03-0419

    CoNLL Example (each column is typically separated by space or tab)
    and  no column headings:

       Paul     B-PER
       Newman   I-PER
       is       O
       a        O
       great    O
       actor    O
       !        O

    More information on GMB format:
    Refer to ner_dataset.csv on Kaggle here:
       https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/version/2

    GMB example (each column separated by comma or tab)
    with column headings:

      SentenceID   Word     Tag    
      1            Paul     B-PER
      1            Newman   I-PER
      1            is       O
      1            a        O
      1            great    O
      1            actor    O
      1            !        O
    

    Args:
        train_filepath(str): file path to training CSV
        val_filepath (str): file path to validation dataset
        use_char(bool):    If True, data will be preprocessed to use character embeddings in addition to word embeddings
        word_column(str): name of column containing the text
        tag_column(str): name of column containing lael
        sentence_column(str): name of column containing Sentence IDs
        data_format(str): one of colnll2003 or gmb
                          word_column, tag_column, and sentence_column
                          ignored if &#39;conll2003&#39;
        encoding(str): the encoding to use.  If None, encoding is discovered automatically
        val_pct(float): Proportion of training to use for validation.
        verbose (boolean): verbosity
    &#34;&#34;&#34;


    # set dataframe converter
    if data_format == &#39;gmb&#39;:
        data_to_df = pp.gmb_to_df
    else:
        data_to_df = pp.conll2003_to_df
        word_column, tag_column, sentence_column = WORD_COL, TAG_COL, SENT_COL

    # detect encoding
    if encoding is None:
        with open(train_filepath, &#39;rb&#39;) as f:
            encoding = TU.detect_encoding(f.read())
            U.vprint(&#39;detected encoding: %s (if wrong, set manually)&#39; % (encoding), verbose=verbose)

    # create dataframe
    train_df = data_to_df(train_filepath, encoding=encoding)


    val_df = None if val_filepath is None else data_to_df(val_filepath, encoding=encoding)
    return entities_from_df(train_df,
                            val_df=val_df,
                            word_column=word_column,
                            tag_column=tag_column,
                            sentence_column=sentence_column,
                            use_char=use_char,
                            val_pct=val_pct, verbose=verbose)</code></pre>
</details>
</dd>
<dt id="ktrain.text.extract_filenames"><code class="name flex">
<span>def <span class="ident">extract_filenames</span></span>(<span>corpus_path, follow_links=False)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def extract_filenames(corpus_path, follow_links=False):
    if os.listdir(corpus_path) == []:
        raise ValueError(&#34;%s: path is empty&#34; % corpus_path)
    walk = os.walk
    for root, dirs, filenames in walk(corpus_path, followlinks=follow_links):
        for filename in filenames:
            try:
                yield os.path.join(root, filename)
            except:
                continue</code></pre>
</details>
</dd>
<dt id="ktrain.text.load_text_files"><code class="name flex">
<span>def <span class="ident">load_text_files</span></span>(<span>corpus_path, truncate_len=None, clean=True, return_fnames=False)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>load text files
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def load_text_files(corpus_path, truncate_len=None, 
                    clean=True, return_fnames=False):
    &#34;&#34;&#34;
    ```
    load text files
    ```
    &#34;&#34;&#34;
    
    texts = []
    filenames = []
    mb = master_bar(range(1))
    for i in mb:
        for filename in progress_bar(list(extract_filenames(corpus_path)), parent=mb):
            with open(filename, &#39;r&#39;) as f:
                text = f.read()
            if clean:
                text = strip_control_characters(text)
                text = to_ascii(text)
            if truncate_len is not None:
                text = &#34; &#34;.join(text.split()[:truncate_len])
            texts.append(text)
            filenames.append(filename)
        mb.write(&#39;done.&#39;)
    if return_fnames:
        return (texts, filenames)
    else:
        return texts</code></pre>
</details>
</dd>
<dt id="ktrain.text.print_sequence_taggers"><code class="name flex">
<span>def <span class="ident">print_sequence_taggers</span></span>(<span>)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def print_sequence_taggers():
    for k,v in SEQUENCE_TAGGERS.items():
        print(&#34;%s: %s&#34; % (k,v))</code></pre>
</details>
</dd>
<dt id="ktrain.text.print_text_classifiers"><code class="name flex">
<span>def <span class="ident">print_text_classifiers</span></span>(<span>)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def print_text_classifiers():
    for k,v in TEXT_CLASSIFIERS.items():
        print(&#34;%s: %s&#34; % (k,v))</code></pre>
</details>
</dd>
<dt id="ktrain.text.print_text_regression_models"><code class="name flex">
<span>def <span class="ident">print_text_regression_models</span></span>(<span>)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def print_text_regression_models():
    for k,v in TEXT_REGRESSION_MODELS.items():
        print(&#34;%s: %s&#34; % (k,v))</code></pre>
</details>
</dd>
<dt id="ktrain.text.sequence_tagger"><code class="name flex">
<span>def <span class="ident">sequence_tagger</span></span>(<span>name, preproc, wv_path_or_url=None, bert_model='bert-base-multilingual-cased', bert_layers_to_use=[-2], word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><p>Build and return a sequence tagger (i.e., named entity recognizer).</p>
<h2 id="sequence_tagger-args">Args</h2>
<dl>
<dt><strong><code>name</code></strong> :&ensp;<code>string</code></dt>
<dd>one of:
<ul>
<li>'bilstm-crf' for Bidirectional LSTM-CRF model</li>
<li>'bilstm' for Bidirectional LSTM (no CRF layer)</li>
</ul></dd>
</dl>
<p>preproc(NERPreprocessor):
an instance of NERPreprocessor
wv_path_or_url(str): either a URL or file path to a fasttext word vector file (.vec or .vec.zip or .vec.gz)
Example valid values for wv_path_or_url:</p>
<pre><code>                   Randomly-initialized word embeddings:
                     set wv_path_or_url=None
                   English pretrained word vectors:
                     &lt;https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip&gt;
                   Chinese pretrained word vectors:
                     &lt;https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.vec.gz&gt;
                   Russian pretrained word vectors:
                     &lt;https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz&gt;
                   Dutch pretrained word vectors:
                     &lt;https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz&gt;


                 See these two Web pages for a full list of URLs to word vector files for 
                 different languages:
                    1.  &lt;https://fasttext.cc/docs/en/english-vectors.html&gt; (for English)
                    2.  &lt;https://fasttext.cc/docs/en/crawl-vectors.html&gt; (for non-English languages)

                Default:None (randomly-initialized word embeddings are used)
</code></pre>
<p>bert_model(str):
the name of the BERT model.
default: 'bert-base-multilingual-cased'
This parameter is only used if bilstm-bert is selected for name parameter.
The value of this parameter is a name of BERT model from here:
<a href="https://huggingface.co/transformers/pretrained_models.html">https://huggingface.co/transformers/pretrained_models.html</a>
or a community-uploaded BERT model from here:
<a href="https://huggingface.co/models">https://huggingface.co/models</a>
Example values:
bert-base-multilingual-cased:
Multilingual BERT (157 languages) - this is the default
bert-base-cased:
English BERT
bert-base-chinese: Chinese BERT
distilbert-base-german-cased: German DistilBert
albert-base-v2: English ALBERT model
monologg/biobert_v1.1_pubmed: community uploaded BioBERT (pretrained on PubMed)</p>
<dl>
<dt><strong><code>bert_layers_to_use</code></strong> :&ensp;<code>list</code></dt>
<dd>indices of hidden layers to use.
default: [-2] (second-to-last layer).
To use the concatenation of last 4 layers: use [-1, -2, -3, -4]</dd>
<dt><strong><code>word_embedding_dim</code></strong> :&ensp;<code>int</code></dt>
<dd>word embedding dimensions.</dd>
<dt><strong><code>char_embedding_dim</code></strong> :&ensp;<code>int</code></dt>
<dd>character embedding dimensions.</dd>
<dt><strong><code>word_lstm_size</code></strong> :&ensp;<code>int</code></dt>
<dd>word tagger LSTM output dimensions.</dd>
<dt><strong><code>char_lstm_size</code></strong> :&ensp;<code>int</code></dt>
<dd>character LSTM feature extractor output dimensions.</dd>
<dt><strong><code>fc_dim</code></strong> :&ensp;<code>int</code></dt>
<dd>output fully-connected layer size.</dd>
<dt><strong><code>dropout</code></strong> :&ensp;<code>float</code></dt>
<dd>dropout rate.</dd>
<dt><strong><code>verbose</code></strong> :&ensp;<code>boolean</code></dt>
<dd>verbosity of output</dd>
</dl>
<h2 id="return">Return</h2>
<p>model (Model): A Keras Model instance</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def sequence_tagger(name, preproc, 
                    wv_path_or_url=None,
                    bert_model = &#39;bert-base-multilingual-cased&#39;,
                    bert_layers_to_use = U.DEFAULT_TRANSFORMER_LAYERS, 
                    word_embedding_dim=100,
                    char_embedding_dim=25,
                    word_lstm_size=100,
                    char_lstm_size=25,
                    fc_dim=100,
                    dropout=0.5,
                    verbose=1):
    &#34;&#34;&#34;
    Build and return a sequence tagger (i.e., named entity recognizer).

    Args:
        name (string): one of:
                      - &#39;bilstm-crf&#39; for Bidirectional LSTM-CRF model
                      - &#39;bilstm&#39; for Bidirectional LSTM (no CRF layer)
        preproc(NERPreprocessor):  an instance of NERPreprocessor
        wv_path_or_url(str): either a URL or file path toa fasttext word vector file (.vec or .vec.zip or .vec.gz)
                             Example valid values for wv_path_or_url:

                               Randomly-initialized word embeeddings:
                                 set wv_path_or_url=None
                               English pretrained word vectors:
                                 https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
                               Chinese pretrained word vectors:
                                 https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.vec.gz
                               Russian pretrained word vectors:
                                 https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz
                               Dutch pretrained word vectors:
                                 https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz


                             See these two Web pages for a full list of URLs to word vector files for 
                             different languages:
                                1.  https://fasttext.cc/docs/en/english-vectors.html (for English)
                                2.  https://fasttext.cc/docs/en/crawl-vectors.html (for non-English langages)

                            Default:None (randomly-initialized word embeddings are used)

        bert_model_name(str):  the name of the BERT model.  default: &#39;bert-base-multilingual-cased&#39;
                               This parameter is only used if bilstm-bert is selected for name parameter.
                               The value of this parameter is a name of BERT model from here:
                                        https://huggingface.co/transformers/pretrained_models.html
                               or a community-uploaded BERT model from here:
                                        https://huggingface.co/models
                               Example values:
                                 bert-base-multilingual-cased:  Multilingual BERT (157 languages) - this is the default
                                 bert-base-cased:  English BERT
                                 bert-base-chinese: Chinese BERT
                                 distilbert-base-german-cased: German DistilBert
                                 albert-base-v2: English ALBERT model
                                 monologg/biobert_v1.1_pubmed: community uploaded BioBERT (pretrained on PubMed)

        bert_layers_to_use(list): indices of hidden layers to use.  default:[-2] # second-to-last layer
                                  To use the concatenation of last 4 layers: use [-1, -2, -3, -4]
        word_embedding_dim (int): word embedding dimensions.
        char_embedding_dim (int): character embedding dimensions.
        word_lstm_size (int): character LSTM feature extractor output dimensions.
        char_lstm_size (int): word tagger LSTM output dimensions.
        fc_dim (int): output fully-connected layer size.
        dropout (float): dropout rate.

        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    &#34;&#34;&#34;
    
    if name not in SEQUENCE_TAGGERS:
        raise ValueError(&#39;invalid name: %s&#39; % (name))

    # check BERT
    if name in TRANSFORMER_MODELS and not bert_model:
        raise ValueError(&#39;bert_model is required for bilstm-bert models&#39;)
    if name in TRANSFORMER_MODELS and DISABLE_V2_BEHAVIOR:
        raise ValueError(&#39;BERT and other transformer models cannot be used with DISABLE_v2_BEHAVIOR&#39;)

    # check CRF
    if not DISABLE_V2_BEHAVIOR and name in V1_ONLY_MODELS:
        warnings.warn(&#39;Falling back to BiLSTM (no CRF) because DISABLE_V2_BEHAVIOR=False&#39;)
        msg = &#34;\nIMPORTANT NOTE: ktrain uses the CRF module from keras_contrib, which is not yet\n&#34; +\
              &#34;fully compatible with TensorFlow 2. You can still use the BiLSTM-CRF model\n&#34; +\
              &#34;in ktrain for sequence tagging with TensorFlow 2, but you must add the\n&#34; +\
              &#34;following to the top of your script or notebook BEFORE you import ktrain:\n\n&#34; +\
              &#34;import os\n&#34; +\
              &#34;os.environ[&#39;DISABLE_V2_BEHAVIOR&#39;] = &#39;1&#39;\n\n&#34; +\
              &#34;For this run, a vanilla BiLSTM model (with no CRF layer) will be used.\n&#34;
        print(msg)
        name = BILSTM if name == BILSTM_CRF else BILSTM_ELMO

    # check for use_char=True
    if not DISABLE_V2_BEHAVIOR and preproc.p._use_char:
        # turn off masking due to open TF2 issue ##33148: https://github.com/tensorflow/tensorflow/issues/33148
        warnings.warn(&#39;Setting use_char=False:  character embeddings cannot be used in TF2 due to open TensorFlow 2 bug (#33148).\n&#39; +\
                       &#39;Add os.environ[&#34;DISABLE_V2_BEHAVIOR&#34;] = &#34;1&#34; to the top of script if you really want to use it.&#39;)
        preproc.p._use_char=False

    if verbose:
        emb_names = []
        if wv_path_or_url is not None: 
            emb_names.append(&#39;word embeddings initialized with fasttext word vectors (%s)&#39; % (os.path.basename(wv_path_or_url)))
        else:
            emb_names.append(&#39;word embeddings initialized randomly&#39;)
        if name in TRANSFORMER_MODELS: emb_names.append(&#39;BERT embeddings with &#39; + bert_model)
        if name in ELMO_MODELS: emb_names.append(&#39;Elmo embeddings for English&#39;)
        if preproc.p._use_char:  emb_names.append(&#39;character embeddings&#39;)
        if len(emb_names) &gt; 1:
            print(&#39;Embedding schemes employed (combined with concatenation):&#39;)
        else:
            print(&#39;embedding schemes employed:&#39;)
        for emb_name in emb_names:
            print(&#39;\t%s&#39; % (emb_name))
        print()

    # setup embedding
    if wv_path_or_url is not None:
        wv_model, word_embedding_dim = preproc.get_wv_model(wv_path_or_url, verbose=verbose)
    else:
        wv_model = None
    if name == BILSTM_CRF:
        use_crf = False if not DISABLE_V2_BEHAVIOR else True # fallback to bilstm 
    elif name == BILSTM_CRF_ELMO:
        use_crf = False if not DISABLE_V2_BEHAVIOR else True # fallback to bilstm
        preproc.p.activate_elmo()
    elif name == BILSTM:
        use_crf = False
    elif name == BILSTM_ELMO:
        use_crf = False
        preproc.p.activate_elmo()
    elif name == BILSTM_TRANSFORMER:
        use_crf = False
        preproc.p.activate_transformer(bert_model, layers=bert_layers_to_use, force=True)
    else:
        raise ValueError(&#39;Unsupported model name&#39;)
    model = BiLSTMCRF(char_embedding_dim=char_embedding_dim,
                      word_embedding_dim=word_embedding_dim,
                      char_lstm_size=char_lstm_size,
                      word_lstm_size=word_lstm_size,
                      fc_dim=fc_dim,
                      char_vocab_size=preproc.p.char_vocab_size,
                      word_vocab_size=preproc.p.word_vocab_size,
                      num_labels=preproc.p.label_size,
                      dropout=dropout,
                      use_crf=use_crf,
                      use_char=preproc.p._use_char,
                      embeddings=wv_model,
                      use_elmo=preproc.p.elmo_is_activated(),
                      use_transformer_with_dim=preproc.p.get_transformer_dim())
    model, loss = model.build()
    model.compile(loss=loss, optimizer=U.DEFAULT_OPT)
    return model</code></pre>
</details>
</dd>
<dt id="ktrain.text.text_classifier"><code class="name flex">
<span>def <span class="ident">text_classifier</span></span>(<span>name, train_data, preproc=None, multilabel=None, metrics=['accuracy'], verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Build and return a text classification model.

Args:
    name (string): one of:
                  - 'fasttext' for FastText model
                  - 'nbsvm' for NBSVM model  
                  - 'logreg' for logistic regression using embedding layers
                  - 'bigru' for Bidirectional GRU with pretrained word vectors
                  - 'bert' for BERT Text Classification
                  - 'distilbert' for Hugging Face DistilBert model

    train_data (tuple): a tuple of numpy.ndarrays: (x_train, y_train) or ktrain.Dataset instance
                        returned from one of the texts_from_* functions
    preproc: a ktrain.text.TextPreprocessor instance.
             As of v0.8.0, this is required.
    multilabel (bool):  If True, multilabel model will be returned.
                        If false, binary/multiclass model will be returned.
                        If None, multilabel will be inferred from data.
    metrics(list): metrics to use
    verbose (boolean): verbosity of output
Return:
    model (Model): A Keras Model instance
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def text_classifier(name, train_data, preproc=None, multilabel=None, metrics=[&#39;accuracy&#39;], verbose=1):
    &#34;&#34;&#34;
    ```
    Build and return a text classification model.

    Args:
        name (string): one of:
                      - &#39;fasttext&#39; for FastText model
                      - &#39;nbsvm&#39; for NBSVM model  
                      - &#39;logreg&#39; for logistic regression using embedding layers
                      - &#39;bigru&#39; for Bidirectional GRU with pretrained word vectors
                      - &#39;bert&#39; for BERT Text Classification
                      - &#39;distilbert&#39; for Hugging Face DistilBert model

        train_data (tuple): a tuple of numpy.ndarrays: (x_train, y_train) or ktrain.Dataset instance
                            returned from one of the texts_from_* functions
        preproc: a ktrain.text.TextPreprocessor instance.
                 As of v0.8.0, this is required.
        multilabel (bool):  If True, multilabel model will be returned.
                            If false, binary/multiclass model will be returned.
                            If None, multilabel will be inferred from data.
        metrics(list): metrics to use
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    ```
    &#34;&#34;&#34;
    if name not in TEXT_CLASSIFIERS:
        raise ValueError(&#39;invalid name for text classification: %s&#39; % (name)) 
    if preproc is not None and not preproc.get_classes():
        raise ValueError(&#39;preproc.get_classes() is empty, but required for text classification&#39;)
    return _text_model(name, train_data, preproc=preproc,
                       multilabel=multilabel, classification=True, metrics=metrics, verbose=verbose)</code></pre>
</details>
</dd>
<dt id="ktrain.text.text_regression_model"><code class="name flex">
<span>def <span class="ident">text_regression_model</span></span>(<span>name, train_data, preproc=None, metrics=['mae'], verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Build and return a text regression model.

Args:
    name (string): one of:
                  - 'fasttext' for FastText model
                  - 'nbsvm' for NBSVM model  
                  - 'linreg' for linear regression using embedding layers
                  - 'bigru' for Bidirectional GRU with pretrained word vectors
                  - 'bert' for BERT Text Classification
                  - 'distilbert' for Hugging Face DistilBert model

    train_data (tuple): a tuple of numpy.ndarrays: (x_train, y_train)
    preproc: a ktrain.text.TextPreprocessor instance.
             As of v0.8.0, this is required.
    metrics(list): metrics to use
    verbose (boolean): verbosity of output
Return:
    model (Model): A Keras Model instance
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def text_regression_model(name, train_data, preproc=None, metrics=[&#39;mae&#39;],  verbose=1):
    &#34;&#34;&#34;
    ```
    Build and return a text regression model.

    Args:
        name (string): one of:
                      - &#39;fasttext&#39; for FastText model
                      - &#39;nbsvm&#39; for NBSVM model  
                      - &#39;linreg&#39; for linear regression using embedding layers
                      - &#39;bigru&#39; for Bidirectional GRU with pretrained word vectors
                      - &#39;bert&#39; for BERT Text Classification
                      - &#39;distilbert&#39; for Hugging Face DistilBert model

        train_data (tuple): a tuple of numpy.ndarrays: (x_train, y_train)
        preproc: a ktrain.text.TextPreprocessor instance.
                 As of v0.8.0, this is required.
        metrics(list): metrics to use
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    ```
    &#34;&#34;&#34;
    if name not in TEXT_REGRESSION_MODELS:
        raise ValueError(&#39;invalid name for text classification: %s&#39; % (name) )
    if preproc is not None and preproc.get_classes():
        raise ValueError(&#39;preproc.get_classes() is supposed to be empty for text regression tasks&#39;)
    return _text_model(name, train_data, preproc=preproc,
                      multilabel=False, classification=False, metrics=metrics, verbose=verbose)</code></pre>
</details>
</dd>
<dt id="ktrain.text.texts_from_array"><code class="name flex">
<span>def <span class="ident">texts_from_array</span></span>(<span>x_train, y_train, x_test=None, y_test=None, class_names=[], max_features=20000, maxlen=400, val_pct=0.1, ngram_range=1, preprocess_mode='standard', lang=None, random_state=None, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Loads and preprocesses text data from arrays.
texts_from_array can handle data for both text classification
and text regression.  If class_names is empty, a regression task is assumed.
Args:
    x_train(list): list of training texts 
    y_train(list): labels in one of the following forms:
                   1. list of integers representing classes (class_names is required)
                   2. list of strings representing classes (class_names is not needed and ignored.)
                   3. a one or multi hot encoded array representing classes (class_names is required)
                   4. numerical values for text regression (class_names should be left empty)
    x_test(list): list of test texts
    y_test(list): labels in one of the following forms:
                   1. list of integers representing classes (class_names is required)
                   2. list of strings representing classes (class_names is not needed and ignored.)
                   3. a one or multi hot encoded array representing classes (class_names is required)
                   4. numerical values for text regression (class_names should be left empty)
    class_names (list): list of strings representing class labels
                        shape should be (num_examples,1) or (num_examples,)
    max_features(int): max num of words to consider in vocabulary
                       Note: This is only used for preprocess_mode='standard'.
    maxlen(int): each document can be at most &lt;maxlen&gt; words. 0 is used as padding ID.
    ngram_range(int): size of multi-word phrases to consider
                      e.g., 2 will consider both 1-word phrases and 2-word phrases
                           limited by max_features
    val_pct(float): Proportion of training to use for validation.
                    Has no effect if x_test and y_test are supplied.
    preprocess_mode (str):  Either 'standard' (normal tokenization) or one of {'bert', 'distilbert'}
                            tokenization and preprocessing for use with 
                            BERT/DistilBert text classification model.
    lang (str):            language.  Auto-detected if None.
    random_state(int):      If integer is supplied, train/test split is reproducible.
                            If None, train/test split will be random.
    verbose (boolean): verbosity
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def texts_from_array(x_train, y_train, x_test=None, y_test=None, 
                   class_names = [],
                   max_features=MAX_FEATURES, maxlen=MAXLEN, 
                   val_pct=0.1, ngram_range=1, preprocess_mode=&#39;standard&#39;, 
                   lang=None, # auto-detected
                   random_state=None,
                   verbose=1):
    &#34;&#34;&#34;
    ```
    Loads and preprocesses text data from arrays.
    texts_from_array can handle data for both text classification
    and text regression.  If class_names is empty, a regression task is assumed.
    Args:
        x_train(list): list of training texts 
        y_train(list): labels in one of the following forms:
                       1. list of integers representing classes (class_names is required)
                       2. list of strings representing classes (class_names is not needed and ignored.)
                       3. a one or multi hot encoded array representing classes (class_names is required)
                       4. numerical values for text regresssion (class_names should be left empty)
        x_test(list): list of training texts 
        y_test(list): labels in one of the following forms:
                       1. list of integers representing classes (class_names is required)
                       2. list of strings representing classes (class_names is not needed and ignored.)
                       3. a one or multi hot encoded array representing classes (class_names is required)
                       4. numerical values for text regresssion (class_names should be left empty)
        class_names (list): list of strings representing class labels
                            shape should be (num_examples,1) or (num_examples,)
        max_features(int): max num of words to consider in vocabulary
                           Note: This is only used for preprocess_mode=&#39;standard&#39;.
        maxlen(int): each document can be of most &lt;maxlen&gt; words. 0 is used as padding ID.
        ngram_range(int): size of multi-word phrases to consider
                          e.g., 2 will consider both 1-word phrases and 2-word phrases
                               limited by max_features
        val_pct(float): Proportion of training to use for validation.
                        Has no effect if x_val and  y_val is supplied.
        preprocess_mode (str):  Either &#39;standard&#39; (normal tokenization) or one of {&#39;bert&#39;, &#39;distilbert&#39;}
                                tokenization and preprocessing for use with 
                                BERT/DistilBert text classification model.
        lang (str):            language.  Auto-detected if None.
        random_state(int):      If integer is supplied, train/test split is reproducible.
                                If None, train/test split will be random.
        verbose (boolean): verbosity
    ```
    &#34;&#34;&#34;
    U.check_array(x_train,  y=y_train, X_name=&#39;x_train&#39;, y_name=&#39;y_train&#39;)

    if x_test is None or y_test is None:
        x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, 
                                                            test_size=val_pct,
                                                            random_state=random_state)
    else:
        U.check_array(x_test,  y=y_test, X_name=&#39;x_test&#39;, y_name=&#39;y_test&#39;)


    # removed as TextPreprocessor now handles this.
    #if isinstance(y_train[0], str):
        #if not isinstance(y_test[0], str): 
            #raise ValueError(&#39;y_train contains strings, but y_test does not&#39;)
        #encoder = LabelEncoder()
        #encoder.fit(y_train)
        #y_train = encoder.transform(y_train)
        #y_test = encoder.transform(y_test)


    # detect language
    if lang is None: lang = TU.detect_lang(x_train)
    check_unsupported_lang(lang, preprocess_mode)

    # return preprocessed the texts
    preproc_type = tpp.TEXT_PREPROCESSORS.get(preprocess_mode, None)
    if None: raise ValueError(&#39;unsupported preprocess_mode&#39;)
    preproc = preproc_type(maxlen,
                           max_features,
                           class_names = class_names,
                           lang=lang, ngram_range=ngram_range)
    trn = preproc.preprocess_train(x_train, y_train, verbose=verbose)
    val = preproc.preprocess_test(x_test,  y_test, verbose=verbose)
    if not preproc.get_classes() and verbose:
        print(&#39;task: text regression (supply class_names argument if this is supposed to be classification task)&#39;)
    else:
        print(&#39;task: text classification&#39;)
    return (trn, val, preproc)</code></pre>
</details>
</dd>
<dt id="ktrain.text.texts_from_csv"><code class="name flex">
<span>def <span class="ident">texts_from_csv</span></span>(<span>train_filepath, text_column, label_columns=[], val_filepath=None, max_features=20000, maxlen=400, val_pct=0.1, ngram_range=1, preprocess_mode='standard', encoding=None, lang=None, sep=',', is_regression=False, random_state=None, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Loads text data from CSV or TSV file. Class labels are assumed to be
one of the following formats:
    1. one-hot-encoded or multi-hot-encoded arrays representing classes:
          Example with label_columns=['positive', 'negative'] and text_column='text':
            text|positive|negative
            I like this movie.|1|0
            I hated this movie.|0|1
        Classification will have a single one in each row: [[1,0,0], [0,1,0]]
        Multi-label classification will have one or more ones in each row: [[1,1,0], [0,1,1]]
    2. labels are in a single column of string or integer values representing class labels
           Example with label_columns=['label'] and text_column='text':
             text|label
             I like this movie.|positive
             I hated this movie.|negative
   3. labels are a single column of numerical values for text regression
      NOTE: Must supply is_regression=True for labels to be treated as numerical targets
             wine_description|wine_price
             Exquisite wine!|100
             Wine for budget shoppers|8

Args:
    train_filepath(str): file path to training CSV
    text_column(str): name of column containing the text
    label_columns(list): list of columns that are to be treated as labels
    val_filepath(string): file path to test CSV.  If not supplied,
                           10% of documents in training CSV will be
                           used for testing/validation.
    max_features(int): max num of words to consider in vocabulary
                       Note: This is only used for preprocess_mode='standard'.
    maxlen(int): each document can be at most &lt;maxlen&gt; words. 0 is used as padding ID.
    ngram_range(int): size of multi-word phrases to consider
                      e.g., 2 will consider both 1-word phrases and 2-word phrases
                           limited by max_features
    val_pct(float): Proportion of training to use for validation.
                    Has no effect if val_filepath is supplied.
    preprocess_mode (str):  Either 'standard' (normal tokenization) or one of {'bert', 'distilbert'}
                            tokenization and preprocessing for use with 
                            BERT/DistilBert text classification model.
    encoding (str):        character encoding to use. Auto-detected if None
    lang (str):            language.  Auto-detected if None.
    sep(str):              delimiter for CSV (comma is default)
    is_regression(bool):  If True, integer targets will be treated as numerical targets instead of class IDs
    random_state(int):      If integer is supplied, train/test split is reproducible.
                            If None, train/test split will be random
    verbose (boolean): verbosity
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def texts_from_csv(train_filepath, 
                   text_column,
                   label_columns = [],
                   val_filepath=None,
                   max_features=MAX_FEATURES, maxlen=MAXLEN, 
                   val_pct=0.1, ngram_range=1, preprocess_mode=&#39;standard&#39;, 
                   encoding=None,  # auto-detected
                   lang=None,      # auto-detected
                   sep=&#39;,&#39;, 
                   is_regression=False,
                   random_state=None,       
                   verbose=1):
    &#34;&#34;&#34;
    ```
    Loads text data from CSV or TSV file. Class labels are assumed to be
    one of the following formats:
        1. one-hot-encoded or multi-hot-encoded arrays representing classes:
              Example with label_columns=[&#39;positive&#39;, &#39;negative&#39;] and text_column=&#39;text&#39;:
                text|positive|negative
                I like this movie.|1|0
                I hated this movie.|0|1
            Classification will have a single one in each row: [[1,0,0], [0,1,0]]]
            Multi-label classification will have one more ones in each row: [[1,1,0], [0,1,1]]
        2. labels are in a single column of string or integer values representing classs labels
               Example with label_columns=[&#39;label&#39;] and text_column=&#39;text&#39;:
                 text|label
                 I like this movie.|positive
                 I hated this movie.|negative
       3. labels are a single column of numerical values for text regression
          NOTE: Must supply is_regression=True for labels to be treated as numerical targets
                 wine_description|wine_price
                 Exquisite wine!|100
                 Wine for budget shoppers|8

    Args:
        train_filepath(str): file path to training CSV
        text_column(str): name of column containing the text
        label_column(list): list of columns that are to be treated as labels
        val_filepath(string): file path to test CSV.  If not supplied,
                               10% of documents in training CSV will be
                               used for testing/validation.
        max_features(int): max num of words to consider in vocabulary
                           Note: This is only used for preprocess_mode=&#39;standard&#39;.
        maxlen(int): each document can be of most &lt;maxlen&gt; words. 0 is used as padding ID.
        ngram_range(int): size of multi-word phrases to consider
                          e.g., 2 will consider both 1-word phrases and 2-word phrases
                               limited by max_features
        val_pct(float): Proportion of training to use for validation.
                        Has no effect if val_filepath is supplied.
        preprocess_mode (str):  Either &#39;standard&#39; (normal tokenization) or one of {&#39;bert&#39;, &#39;distilbert&#39;}
                                tokenization and preprocessing for use with 
                                BERT/DistilBert text classification model.
        encoding (str):        character encoding to use. Auto-detected if None
        lang (str):            language.  Auto-detected if None.
        sep(str):              delimiter for CSV (comma is default)
        is_regression(bool):  If True, integer targets will be treated as numerical targets instead of class IDs
        random_state(int):      If integer is supplied, train/test split is reproducible.
                                If None, train/test split will be random
        verbose (boolean): verbosity
    ```
    &#34;&#34;&#34;
    if encoding is None:
        with open(train_filepath, &#39;rb&#39;) as f:
            #encoding = chardet.detect(f.read())[&#39;encoding&#39;]
            #encoding = &#39;utf-8&#39; if encoding.lower() in [&#39;ascii&#39;, &#39;utf8&#39;, &#39;utf-8&#39;] else encoding
            encoding = TU.detect_encoding(f.read())
            U.vprint(&#39;detected encoding: %s (if wrong, set manually)&#39; % (encoding), verbose=verbose)

    train_df = pd.read_csv(train_filepath, encoding=encoding,sep=sep)
    val_df = pd.read_csv(val_filepath, encoding=encoding,sep=sep) if val_filepath is not None else None
    return texts_from_df(train_df,
                         text_column,
                         label_columns=label_columns,
                         val_df = val_df,
                         max_features=max_features,
                         maxlen=maxlen,
                         val_pct=val_pct,
                         ngram_range=ngram_range, 
                         preprocess_mode=preprocess_mode,
                         lang=lang, is_regression=is_regression, random_state=random_state,
                         verbose=verbose)</code></pre>
</details>
</dd>
<dt id="ktrain.text.texts_from_df"><code class="name flex">
<span>def <span class="ident">texts_from_df</span></span>(<span>train_df, text_column, label_columns=[], val_df=None, max_features=20000, maxlen=400, val_pct=0.1, ngram_range=1, preprocess_mode='standard', lang=None, is_regression=False, random_state=None, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Loads text data from a Pandas dataframe. Class labels are assumed to be
one of the following formats:
    1. one-hot-encoded or multi-hot-encoded arrays representing classes:
          Example with label_columns=['positive', 'negative'] and text_column='text':
            text|positive|negative
            I like this movie.|1|0
            I hated this movie.|0|1
        Classification will have a single one in each row: [[1,0,0], [0,1,0]]
        Multi-label classification will have one or more ones in each row: [[1,1,0], [0,1,1]]
    2. labels are in a single column of string or integer values representing class labels
           Example with label_columns=['label'] and text_column='text':
             text|label
             I like this movie.|positive
             I hated this movie.|negative
   3. labels are a single column of numerical values for text regression
      NOTE: Must supply is_regression=True for integer labels to be treated as numerical targets
             wine_description|wine_price
             Exquisite wine!|100
             Wine for budget shoppers|8

Args:
    train_df(dataframe): Pandas dataframe
    text_column(str): name of column containing the text
    label_columns(list): list of columns that are to be treated as labels
    val_df(dataframe): Pandas dataframe to use for validation.  If not supplied,
                           val_pct (default: 10%) of documents in training df will be
                           used for testing/validation.
    max_features(int): max num of words to consider in vocabulary.
                       Note: This is only used for preprocess_mode='standard'.
    maxlen(int): each document can be at most &lt;maxlen&gt; words. 0 is used as padding ID.
    ngram_range(int): size of multi-word phrases to consider
                      e.g., 2 will consider both 1-word phrases and 2-word phrases
                           limited by max_features
    val_pct(float): Proportion of training to use for validation.
                    Has no effect if val_df is supplied.
    preprocess_mode (str):  Either 'standard' (normal tokenization) or one of {'bert', 'distilbert'}
                            tokenization and preprocessing for use with 
                            BERT/DistilBert text classification model.
    lang (str):            language.  Auto-detected if None.
    is_regression(bool):  If True, integer targets will be treated as numerical targets instead of class IDs
    random_state(int):      If integer is supplied, train/test split is reproducible.
                            If None, train/test split will be random
    verbose (boolean): verbosity
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def texts_from_df(train_df,
                  text_column,
                  label_columns=[],
                  val_df=None,
                  max_features=MAX_FEATURES, maxlen=MAXLEN,
                  val_pct=0.1, ngram_range=1, preprocess_mode=&#39;standard&#39;,
                  lang=None, # auto-detected
                  is_regression=False,
                  random_state=None,
                  verbose=1):
    &#34;&#34;&#34;
    ```
    Loads text data from a Pandas dataframe. Class labels are assumed to be
    one of the following formats:
        1. one-hot-encoded or multi-hot-encoded arrays representing classes:
              Example with label_columns=[&#39;positive&#39;, &#39;negative&#39;] and text_column=&#39;text&#39;:
                text|positive|negative
                I like this movie.|1|0
                I hated this movie.|0|1
            Classification will have a single one in each row: [[1,0,0], [0,1,0]]
            Multi-label classification will have one or more ones in each row: [[1,1,0], [0,1,1]]
        2. labels are in a single column of string or integer values representing class labels
               Example with label_columns=[&#39;label&#39;] and text_column=&#39;text&#39;:
                 text|label
                 I like this movie.|positive
                 I hated this movie.|negative
        3. labels are a single column of numerical values for text regression
           NOTE: Must supply is_regression=True for integer labels to be treated as numerical targets
                 wine_description|wine_price
                 Exquisite wine!|100
                 Wine for budget shoppers|8

    Args:
        train_df(dataframe): Pandas dataframe with the training data
        text_column(str): name of column containing the text
        label_columns(list): list of columns that are to be treated as labels
        val_df(dataframe): Pandas dataframe to use for validation.  If not supplied,
                           val_pct of the rows in train_df will be
                           used for testing/validation.
        max_features(int): max num of words to consider in vocabulary.
                           Note: This is only used for preprocess_mode=&#39;standard&#39;.
        maxlen(int): each document can be at most &lt;maxlen&gt; words. 0 is used as padding ID.
        ngram_range(int): size of multi-word phrases to consider
                          e.g., 2 will consider both 1-word phrases and 2-word phrases
                               limited by max_features
        val_pct(float): Proportion of training to use for validation.
                        Has no effect if val_df is supplied.
        preprocess_mode (str):  Either &#39;standard&#39; (normal tokenization) or one of {&#39;bert&#39;, &#39;distilbert&#39;}
                                tokenization and preprocessing for use with
                                BERT/DistilBert text classification model.
        lang (str):            language.  Auto-detected if None.
        is_regression(bool):  If True, integer targets will be treated as numerical targets instead of class IDs
        random_state(int):      If integer is supplied, train/test split is reproducible.
                                If None, train/test split will be random
        verbose (boolean): verbosity

    Returns:
        tuple: (trn, val, preproc) where trn and val are the outputs of
               preprocess_train and preprocess_test and preproc is the
               preprocessor instance that produced them.

    Raises:
        ValueError: if preprocess_mode is not a key of tpp.TEXT_PREPROCESSORS
    ```
    &#34;&#34;&#34;

    # work on copies so that the dataframes passed in are left untouched
    train_df = train_df.copy()
    # assign the filled column back rather than calling fillna(inplace=True) on
    # a column selection, which is chained assignment and not reliable in pandas
    train_df[text_column] = train_df[text_column].fillna(&#39;fillna&#39;)
    if val_df is not None:
        val_df = val_df.copy()
        val_df[text_column] = val_df[text_column].fillna(&#39;fillna&#39;)
    else:
        # no validation set supplied: hold out val_pct of the training rows
        train_df, val_df = train_test_split(train_df, test_size=val_pct, random_state=random_state)

    # transform labels
    ytransdf = U.YTransformDataFrame(label_columns, is_regression=is_regression)
    t_df = ytransdf.apply_train(train_df)
    v_df = ytransdf.apply_test(val_df)
    class_names = ytransdf.get_classes()
    new_lab_cols = ytransdf.get_label_columns(squeeze=True)
    x_train = t_df[text_column].values
    y_train = t_df[new_lab_cols].values
    x_test = v_df[text_column].values
    y_test = v_df[new_lab_cols].values

    # detect language
    if lang is None: lang = TU.detect_lang(x_train)
    check_unsupported_lang(lang, preprocess_mode)

    # look up the preprocessor class and fail clearly for an unknown mode
    preproc_type = tpp.TEXT_PREPROCESSORS.get(preprocess_mode, None)
    if preproc_type is None: raise ValueError(&#39;unsupported preprocess_mode&#39;)

    # return the preprocessed texts
    preproc = preproc_type(maxlen,
                           max_features,
                           class_names = class_names,
                           lang=lang, ngram_range=ngram_range)
    trn = preproc.preprocess_train(x_train, y_train, verbose=verbose)
    val = preproc.preprocess_test(x_test,  y_test, verbose=verbose)
    # QUICKFIX for #314
    preproc.ytransform.le = ytransdf.le
    return (trn, val, preproc)</code></pre>
</details>
</dd>
<dt id="ktrain.text.texts_from_folder"><code class="name flex">
<span>def <span class="ident">texts_from_folder</span></span>(<span>datadir, classes=None, max_features=20000, maxlen=400, ngram_range=1, train_test_names=['train', 'test'], preprocess_mode='standard', encoding=None, lang=None, val_pct=0.1, random_state=None, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Returns corpus as sequence of word IDs.
Assumes corpus is in the following folder structure:
├── datadir
│   ├── train
│   │   ├── class0       # folder containing documents of class 0
│   │   ├── class1       # folder containing documents of class 1
│   │   ├── class2       # folder containing documents of class 2
│   │   └── classN       # folder containing documents of class N
│   └── test 
│       ├── class0       # folder containing documents of class 0
│       ├── class1       # folder containing documents of class 1
│       ├── class2       # folder containing documents of class 2
│       └── classN       # folder containing documents of class N

Each subfolder should contain documents in plain text format.
If train and test contain additional subfolders that do not represent
classes, they can be ignored by explicitly listing the subfolders of
interest using the classes argument.
Args:
    datadir (str): path to folder
    classes (list): list of classes (subfolders to consider).
                    This is simply supplied as the categories argument
                    to sklearn's load_files function.
    max_features (int):  maximum number of unigrams to consider
                         Note: This is only used for preprocess_mode='standard'.
    maxlen (int):  maximum length of tokens in document
    ngram_range (int):  If &gt; 1, will include 2=bigrams, 3=trigrams and bigrams
    train_test_names (list):  list of strings representing the subfolder
                             name for train and validation sets
                             if test name is missing, &lt;val_pct&gt; of training
                             will be used for validation
    preprocess_mode (str):  Either 'standard' (normal tokenization) or one of {'bert', 'distilbert'}
                            tokenization and preprocessing for use with 
                            BERT/DistilBert text classification model.
    encoding (str):        character encoding to use. Auto-detected if None
    lang (str):            language.  Auto-detected if None.
    val_pct(float):        Only used if train_test_names has 1 and not 2 names
    random_state(int):      If integer is supplied, train/test split is reproducible.
                            If None, train/test split will be random
    verbose (bool):         verbosity

</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def texts_from_folder(datadir, classes=None,
                      max_features=MAX_FEATURES, maxlen=MAXLEN,
                      ngram_range=1,
                      train_test_names=[&#39;train&#39;, &#39;test&#39;],
                      preprocess_mode=&#39;standard&#39;,
                      encoding=None, # detected automatically
                      lang=None, # detected automatically
                      val_pct=0.1, random_state=None,
                      verbose=1):
    &#34;&#34;&#34;
    ```
    Returns corpus as sequence of word IDs.
    Assumes corpus is in the following folder structure:
    ├── datadir
    │   ├── train
    │   │   ├── class0       # folder containing documents of class 0
    │   │   ├── class1       # folder containing documents of class 1
    │   │   ├── class2       # folder containing documents of class 2
    │   │   └── classN       # folder containing documents of class N
    │   └── test
    │       ├── class0       # folder containing documents of class 0
    │       ├── class1       # folder containing documents of class 1
    │       ├── class2       # folder containing documents of class 2
    │       └── classN       # folder containing documents of class N

    Each subfolder should contain documents in plain text format.
    If train and test contain additional subfolders that do not represent
    classes, they can be ignored by explicitly listing the subfolders of
    interest using the classes argument.

    Args:
        datadir (str): path to folder
        classes (list): list of classes (subfolders to consider).
                        This is simply supplied as the categories argument
                        to sklearn&#39;s load_files function.
        max_features (int):  maximum number of unigrams to consider
                             Note: This is only used for preprocess_mode=&#39;standard&#39;.
        maxlen (int):  maximum length of tokens in document
        ngram_range (int):  If &gt; 1, will include 2=bigrams, 3=trigrams and bigrams
        train_test_names (list):  list of strings representing the subfolder
                                 name for train and validation sets
                                 if test name is missing, &lt;val_pct&gt; of training
                                 will be used for validation
        preprocess_mode (str):  Either &#39;standard&#39; (normal tokenization) or one of {&#39;bert&#39;, &#39;distilbert&#39;}
                                tokenization and preprocessing for use with
                                BERT/DistilBert text classification model.
        encoding (str):        character encoding to use. Auto-detected if None
        lang (str):            language.  Auto-detected if None.
        val_pct(float):        Only used if train_test_names has 1 and not 2 names
        random_state(int):      If integer is supplied, train/test split is reproducible.
                                If None, train/test split will be random
        verbose (bool):         verbosity

    Returns:
        tuple: (trn, val, preproc) where trn and val are the outputs of
               preprocess_train and preprocess_test and preproc is the
               preprocessor instance that produced them.

    Raises:
        ValueError: if train_test_names does not have 1 or 2 elements, or if
                    preprocess_mode is not a key of tpp.TEXT_PREPROCESSORS
    ```
    &#34;&#34;&#34;

    # check train_test_names
    if len(train_test_names) &lt; 1 or len(train_test_names) &gt; 2:
        raise ValueError(&#39;train_test_names must have 1 or two elements for train and optionally validation&#39;)

    # read in training and test corpora
    train_str = train_test_names[0]
    train_b = load_files(os.path.join(datadir, train_str), shuffle=True, categories=classes)
    if len(train_test_names) &gt; 1:
        test_str = train_test_names[1]
        test_b = load_files(os.path.join(datadir,  test_str), shuffle=False, categories=classes)
        x_train = train_b.data
        y_train = train_b.target
        x_test = test_b.data
        y_test = test_b.target
    else:
        # only a training folder was named: hold out val_pct of it for validation
        x_train, x_test, y_train, y_test = train_test_split(train_b.data,
                                                            train_b.target,
                                                            test_size=val_pct,
                                                            random_state=random_state)

    # decode based on supplied encoding
    if encoding is None:
        encoding = TU.detect_encoding(x_train)
        U.vprint(&#39;detected encoding: %s&#39; % (encoding), verbose=verbose)

    try:
        x_train = [x.decode(encoding) for x in x_train]
        x_test = [x.decode(encoding) for x in x_test]
    except Exception:
        # best-effort fallback; Exception (not a bare except) so that
        # KeyboardInterrupt and SystemExit are not swallowed here
        U.vprint(&#39;Decoding with %s failed 1st attempt - using %s with skips&#39; % (encoding,
                                                                                encoding),
                                                                                verbose=verbose)
        x_train = TU.decode_by_line(x_train, encoding=encoding, verbose=verbose)
        x_test = TU.decode_by_line(x_test, encoding=encoding, verbose=verbose)

    # detect language
    if lang is None: lang = TU.detect_lang(x_train)
    check_unsupported_lang(lang, preprocess_mode)

    # look up the preprocessor class and fail clearly for an unknown mode
    preproc_type = tpp.TEXT_PREPROCESSORS.get(preprocess_mode, None)
    if preproc_type is None: raise ValueError(&#39;unsupported preprocess_mode&#39;)

    # return the preprocessed texts
    preproc = preproc_type(maxlen,
                           max_features,
                           class_names = train_b.target_names,
                           lang=lang, ngram_range=ngram_range)
    trn = preproc.preprocess_train(x_train, y_train, verbose=verbose)
    val = preproc.preprocess_test(x_test,  y_test, verbose=verbose)
    return (trn, val, preproc)</code></pre>
</details>
</dd>
</dl>
</section>
<section>
<h2 class="section-title" id="header-classes">Classes</h2>
<dl>
<dt id="ktrain.text.EnglishTranslator"><code class="flex name class">
<span>class <span class="ident">EnglishTranslator</span></span>
<span>(</span><span>src_lang=None, device=None)</span>
</code></dt>
<dd>
<div class="desc"><p>Class to translate text in various languages to English.</p>
<pre><code>Constructor for English translator

Args:
  src_lang(str): language code of source language.
                 Must be one of SUPPORTED_SRC_LANGS:
                   'zh': Chinese (either traditional or simplified)
                   'ar': Arabic
                   'ru' : Russian
                   'de': German
                   'af': Afrikaans
                   'es': Spanish
                   'fr': French
                   'it': Italian
                   'pt': Portuguese
  device(str): device to use (e.g., 'cuda', 'cpu')
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class EnglishTranslator():
    &#34;&#34;&#34;
    Class to translate text in various languages to English.
    &#34;&#34;&#34;

    def __init__(self, src_lang=None, device=None):
        &#34;&#34;&#34;
        ```
        Constructor for English translator

        Args:
          src_lang(str): language code of source language.
                         Must be one of SUPPORTED_SRC_LANGS:
                           &#39;zh&#39;: Chinese (either traditional or simplified)
                           &#39;ar&#39;: Arabic
                           &#39;ru&#39; : Russian
                           &#39;de&#39;: German
                           &#39;af&#39;: Afrikaans
                           &#39;es&#39;: Spanish
                           &#39;fr&#39;: French
                           &#39;it&#39;: Italian
                           &#39;pt&#39;: Portuguese
          device(str): device to use (e.g., &#39;cuda&#39;, &#39;cpu&#39;)

        Raises:
          ValueError: if src_lang is None, is not in SUPPORTED_SRC_LANGS,
                      or has no translation model configured below
        ```
        &#34;&#34;&#34;

        if src_lang is None or src_lang not in SUPPORTED_SRC_LANGS:
            # one-element tuple so the message formats correctly whatever
            # container type SUPPORTED_SRC_LANGS is
            raise ValueError(&#39;A src_lang must be supplied and be one of: %s&#39; % (SUPPORTED_SRC_LANGS,))
        self.src_lang = src_lang
        # translators are applied in order by translate()
        self.translators = []
        if src_lang == &#39;ar&#39;:
            self.translators.append(Translator(model_name=&#39;Helsinki-NLP/opus-mt-ar-en&#39;, device=device))
        elif src_lang == &#39;ru&#39;:
            self.translators.append(Translator(model_name=&#39;Helsinki-NLP/opus-mt-ru-en&#39;, device=device))
        elif src_lang == &#39;de&#39;:
            self.translators.append(Translator(model_name=&#39;Helsinki-NLP/opus-mt-de-en&#39;, device=device))
        elif src_lang == &#39;af&#39;:
            self.translators.append(Translator(model_name=&#39;Helsinki-NLP/opus-mt-af-en&#39;, device=device))
        elif src_lang in [&#39;es&#39;, &#39;fr&#39;, &#39;it&#39;, &#39;pt&#39;]:
            # the Romance languages share a single multilingual model
            self.translators.append(Translator(model_name=&#39;Helsinki-NLP/opus-mt-ROMANCE-en&#39;, device=device))
        elif src_lang == &#39;zh&#39;:
            # direct zh-&gt;en model (replaces the earlier two-step zh-&gt;de-&gt;en translation)
            self.translators.append(Translator(model_name=&#39;Helsinki-NLP/opus-mt-zh-en&#39;, device=device))
        else:
            raise ValueError(&#39;lang:%s is currently not supported.&#39; % (src_lang))


    def translate(self, src_text, join_with=&#39;\n&#39;, num_beams=None, early_stopping=None):
        &#34;&#34;&#34;
        ```
        Translate source document to English.
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).

        Args:
          src_text(str): source text. Must be in language specified by src_lang (language code) supplied to constructor
                         The source text can either be a single sentence or an entire document with multiple sentences
                         and paragraphs.
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          join_with(str):  list of translated sentences will be delimited with this character.
                           default: each sentence on separate line
          num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1,
                          which means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                                 are finished per batch or not. Defaults to None.  If None, the transformers library
                                 sets this to False.
        Returns:
          str: translated text
        ```
        &#34;&#34;&#34;
        text = src_text
        # feed the output of each translator into the next one
        for t in self.translators:
            text = t.translate(text, join_with=join_with, num_beams=num_beams, early_stopping=early_stopping)
        return text</code></pre>
</details>
<h3>Methods</h3>
<dl>
<dt id="ktrain.text.EnglishTranslator.translate"><code class="name flex">
<span>def <span class="ident">translate</span></span>(<span>self, src_text, join_with='\n', num_beams=None, early_stopping=None)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Translate source document to English.
To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).

Args:
  src_text(str): source text. Must be in language specified by src_lang (language code) supplied to constructor
                 The source text can either be a single sentence or an entire document with multiple sentences
                 and paragraphs. 
                 IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                 If the input text is very large (e.g., an entire book), you should
                                 break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                 feed each chunk separately into translate to avoid out-of-memory issues.
  join_with(str):  list of translated sentences will be delimited with this character.
                   default: each sentence on separate line
  num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                  which means no beam search.
  early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                         are finished per batch or not. Defaults to None.  If None, the transformers library
                         sets this to False.
Returns:
  str: translated text
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def translate(self, src_text, join_with=&#39;\n&#39;, num_beams=None, early_stopping=None):
    &#34;&#34;&#34;
    ```
    Translate a source document into English by passing it through each
    translator held in self.translators, in order.
    Setting num_beams and early_stopping (e.g., num_beams=4, early_stopping=True)
    can speed up translation.

    Args:
      src_text(str): text to translate, written in the language given by the src_lang
                     code supplied to the constructor. May be a single sentence or a
                     whole document made up of several sentences and paragraphs.
                     IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                     For very large inputs (e.g., an entire book), split the text into
                                     reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and
                                     call translate on each chunk to avoid out-of-memory issues.
      join_with(str):  delimiter placed between the translated sentences.
                       default: each sentence on separate line
      num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers
                      library defaults this to 1, which means no beam search.
      early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                             are finished per batch or not. Defaults to None.  If None, the
                             transformers library sets this to False.
    Returns:
      str: translated text
    ```
    &#34;&#34;&#34;
    result = src_text
    # the output of one translator is the input of the next
    for translator in self.translators:
        result = translator.translate(result,
                                      join_with=join_with,
                                      num_beams=num_beams,
                                      early_stopping=early_stopping)
    return result</code></pre>
</details>
</dd>
</dl>
</dd>
<dt id="ktrain.text.SimpleQA"><code class="flex name class">
<span>class <span class="ident">SimpleQA</span></span>
<span>(</span><span>index_dir, bert_squad_model='bert-large-uncased-whole-word-masking-finetuned-squad', bert_emb_model='bert-base-uncased')</span>
</code></dt>
<dd>
<div class="desc"><p>SimpleQA: Question-Answering on a list of texts</p>
<pre><code>SimpleQA constructor
Args:
  index_dir(str):  path to index directory created by SimpleQA.initialize_index
  bert_squad_model(str): name of BERT SQUAD model to use
  bert_emb_model(str): BERT model to use to generate embeddings for semantic similarity
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class SimpleQA(QA):
    &#34;&#34;&#34;
    SimpleQA: Question-Answering on a list of texts
    &#34;&#34;&#34;
    def __init__(self, index_dir, 
                 bert_squad_model=&#39;bert-large-uncased-whole-word-masking-finetuned-squad&#39;,
                 bert_emb_model=&#39;bert-base-uncased&#39;):
        &#34;&#34;&#34;
        ```
        SimpleQA constructor
        Args:
          index_dir(str):  path to index directory created by SimpleQA.initialze_index
          bert_squad_model(str): name of BERT SQUAD model to use
          bert_emb_model(str): BERT model to use to generate embeddings for semantic similarity
        ```
        &#34;&#34;&#34;

        self.index_dir = index_dir
        try:
            ix = index.open_dir(self.index_dir)
        except:
            raise ValueError(&#39;index_dir has not yet been created - please call SimpleQA.initialize_index(&#34;%s&#34;)&#39; % (self.index_dir))
        super().__init__(bert_squad_model=bert_squad_model, bert_emb_model=bert_emb_model)


    def _open_ix(self):
        return index.open_dir(self.index_dir)


    @classmethod
    def initialize_index(cls, index_dir):
        schema = Schema(reference=ID(stored=True), content=TEXT, rawtext=TEXT(stored=True))
        if not os.path.exists(index_dir):
            os.makedirs(index_dir)
        else:
            raise ValueError(&#39;There is already an existing directory or file with path %s&#39; % (index_dir))
        ix = index.create_in(index_dir, schema)
        return ix

    @classmethod
    def index_from_list(cls, docs, index_dir, commit_every=1024, breakup_docs=True,
                        procs=1, limitmb=256, multisegment=False, min_words=20, references=None):
        &#34;&#34;&#34;
        ```
        index documents from list.
        The procs, limitmb, and especially multisegment arguments can be used to 
        speed up indexing, if it is too slow.  Please see the whoosh documentation
        for more information on these parameters:  https://whoosh.readthedocs.io/en/latest/batch.html
        Args:
          docs(list): list of strings representing documents
          index_dir(str): path to index directory (see initialize_index)
          commit_every(int): commet after adding this many documents
          breakup_docs(bool): break up documents into smaller paragraphs and treat those as the documents.
                              This can potentially improve the speed at which answers are returned by the ask method
                              when documents being searched are longer.
          procs(int): number of processors
          limitmb(int): memory limit in MB for each process
          multisegment(bool): new segments written instead of merging
          min_words(int):  minimum words for a document (or paragraph extracted from document when breakup_docs=True) to be included in index.
                           Useful for pruning contexts that are unlikely to contain useful answers
          references(list): List of strings containing a reference (e.g., file name) for each document in docs.
                            Each string is treated as a label for the document (e.g., file name, MD5 hash, etc.):
                               Example:  [&#39;some_file.pdf&#39;, &#39;some_other_file,pdf&#39;, ...]
                            Strings can also be hyperlinks in which case the label and URL should be separated by a single tab character:
                               Example: [&#39;ktrain_article\thttps://arxiv.org/pdf/2004.10703v4.pdf&#39;, ...]

                            These references will be returned in the output of the ask method.
                            If strings are  hyperlinks, then they will automatically be made clickable when the display_answers function
                            displays candidate answers in a pandas DataFRame.

                            If references is None, the index of element in docs is used as reference.
        ```
        &#34;&#34;&#34;
        if not isinstance(docs, (np.ndarray, list)): raise ValueError(&#39;docs must be a list of strings&#39;)
        if references is not None and not isinstance(references, (np.ndarray, list)): raise ValueError(&#39;references must be a list of strings&#39;)
        if references is not None and len(references) != len(docs): raise ValueError(&#39;lengths of docs and references must be equal&#39;)

        ix = index.open_dir(index_dir)
        writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)

        mb = master_bar(range(1))
        for i in mb:
            for idx, doc in enumerate(progress_bar(docs, parent=mb)):
                reference = &#34;%s&#34; % (idx) if references is None else references[idx]

                if breakup_docs:
                    small_docs = TU.paragraph_tokenize(doc, join_sentences=True, lang=&#39;en&#39;)
                    refs = [reference] * len(small_docs)
                    for i, small_doc in enumerate(small_docs):
                        if len(small_doc.split()) &lt; min_words: continue
                        content = small_doc
                        reference = refs[i]
                        writer.add_document(reference=reference, content=content, rawtext=content)
                else:
                    if len(doc.split()) &lt; min_words: continue
                    content = doc 
                    writer.add_document(reference=reference, content=content, rawtext=content)

                idx +=1
                if idx % commit_every == 0:
                    writer.commit()
                    #writer = ix.writer()
                    writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
                mb.child.comment = f&#39;indexing documents&#39;
            writer.commit()
            #mb.write(f&#39;Finished indexing documents&#39;)
        return


    @classmethod
    def index_from_folder(cls, folder_path, index_dir,  use_text_extraction=False, commit_every=1024, breakup_docs=True, 
                          min_words=20, encoding=&#39;utf-8&#39;, procs=1, limitmb=256, multisegment=False, verbose=1):
        &#34;&#34;&#34;
        ```
        index all plain text documents within a folder.
        The procs, limitmb, and especially multisegment arguments can be used to 
        speed up indexing, if it is too slow.  Please see the whoosh documentation
        for more information on these parameters:  https://whoosh.readthedocs.io/en/latest/batch.html

        Args:
          folder_path(str): path to folder containing plain text documents (e.g., .txt files)
          index_dir(str): path to index directory (see initialize_index)
          use_text_extraction(bool): If True, the  `textract` package will be used to index text from various
                                     file types including PDF, MS Word, and MS PowerPoint (in addition to plain text files).
                                     If False, only plain text files will be indexed.
          commit_every(int): commit after adding this many documents
          breakup_docs(bool): break up documents into smaller paragraphs and treat those as the documents.
                              This can potentially improve the speed at which answers are returned by the ask method
                              when documents being searched are longer.
          min_words(int):  minimum words for a document (or paragraph extracted from document when breakup_docs=True) to be included in index.
                           Useful for pruning contexts that are unlikely to contain useful answers
          encoding(str): encoding to use when reading document files from disk
          procs(int): number of processors
          limitmb(int): memory limit in MB for each process
          multisegment(bool): new segments written instead of merging
          verbose(bool): verbosity
        ```
        &#34;&#34;&#34;
        if use_text_extraction:
            try:
                import textract
            except ImportError:
                raise Exception(&#39;use_text_extraction=True requires textract:   pip install textract&#39;)


        if not os.path.isdir(folder_path): raise ValueError(&#39;folder_path is not a valid folder&#39;)
        if folder_path[-1] != os.sep: folder_path += os.sep
        ix = index.open_dir(index_dir)
        writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
        for idx, fpath in enumerate(TU.extract_filenames(folder_path)):
            reference = &#34;%s&#34; % (fpath.join(fpath.split(folder_path)[1:]))
            if TU.is_txt(fpath):
                with open(fpath, &#39;r&#39;, encoding=encoding) as f:
                    doc = f.read()
            else:
                if use_text_extraction:
                    try:
                        doc = textract.process(fpath)
                        doc = doc.decode(&#39;utf-8&#39;, &#39;ignore&#39;)
                    except:
                        if verbose:
                            warnings.warn(&#39;Could not extract text from %s&#39; % (fpath))
                        continue
                else:
                    continue

            if breakup_docs:
                small_docs = TU.paragraph_tokenize(doc, join_sentences=True, lang=&#39;en&#39;)
                refs = [reference] * len(small_docs)
                for i, small_doc in enumerate(small_docs):
                    if len(small_doc.split()) &lt; min_words: continue
                    content = small_doc
                    reference = refs[i]
                    writer.add_document(reference=reference, content=content, rawtext=content)
            else:
                if len(doc.split()) &lt; min_words: continue
                content = doc
                writer.add_document(reference=reference, content=content, rawtext=content)

            idx +=1
            if idx % commit_every == 0:
                writer.commit()
                writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
                if verbose: print(&#34;%s docs indexed&#34; % (idx))
        writer.commit()
        return


    def search(self, query, limit=10):
        &#34;&#34;&#34;
        ```
        search index for query
        Args:
          query(str): search query
          limit(int):  number of top search results to return
        Returns:
          list of dicts with keys: reference, rawtext
        ```
        &#34;&#34;&#34;
        ix = self._open_ix()
        with ix.searcher() as searcher:
            query_obj = QueryParser(&#34;content&#34;, ix.schema, group=qparser.OrGroup).parse(query)
            results = searcher.search(query_obj, limit=limit)
            docs = []
            output = [dict(r) for r in results]
            return output</code></pre>
</details>
<h3>Ancestors</h3>
<ul class="hlist">
<li><a title="ktrain.text.qa.core.QA" href="qa/core.html#ktrain.text.qa.core.QA">QA</a></li>
<li>abc.ABC</li>
</ul>
<h3>Static methods</h3>
<dl>
<dt id="ktrain.text.SimpleQA.index_from_folder"><code class="name flex">
<span>def <span class="ident">index_from_folder</span></span>(<span>folder_path, index_dir, use_text_extraction=False, commit_every=1024, breakup_docs=True, min_words=20, encoding='utf-8', procs=1, limitmb=256, multisegment=False, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>index all plain text documents within a folder.
The procs, limitmb, and especially multisegment arguments can be used to 
speed up indexing, if it is too slow.  Please see the whoosh documentation
for more information on these parameters:  https://whoosh.readthedocs.io/en/latest/batch.html

Args:
  folder_path(str): path to folder containing plain text documents (e.g., .txt files)
  index_dir(str): path to index directory (see initialize_index)
  use_text_extraction(bool): If True, the  `textract` package will be used to index text from various
                             file types including PDF, MS Word, and MS PowerPoint (in addition to plain text files).
                             If False, only plain text files will be indexed.
  commit_every(int): commit after adding this many documents
  breakup_docs(bool): break up documents into smaller paragraphs and treat those as the documents.
                      This can potentially improve the speed at which answers are returned by the ask method
                      when documents being searched are longer.
  min_words(int):  minimum words for a document (or paragraph extracted from document when breakup_docs=True) to be included in index.
                   Useful for pruning contexts that are unlikely to contain useful answers
  encoding(str): encoding to use when reading document files from disk
  procs(int): number of processors
  limitmb(int): memory limit in MB for each process
  multisegment(bool): new segments written instead of merging
  verbose(bool): verbosity
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">@classmethod
def index_from_folder(cls, folder_path, index_dir,  use_text_extraction=False, commit_every=1024, breakup_docs=True, 
                      min_words=20, encoding=&#39;utf-8&#39;, procs=1, limitmb=256, multisegment=False, verbose=1):
    &#34;&#34;&#34;
    ```
    index all plain text documents within a folder.
    The procs, limitmb, and especially multisegment arguments can be used to 
    speed up indexing, if it is too slow.  Please see the whoosh documentation
    for more information on these parameters:  https://whoosh.readthedocs.io/en/latest/batch.html

    Args:
      folder_path(str): path to folder containing plain text documents (e.g., .txt files)
      index_dir(str): path to index directory (see initialize_index)
      use_text_extraction(bool): If True, the  `textract` package will be used to index text from various
                                 file types including PDF, MS Word, and MS PowerPoint (in addition to plain text files).
                                 If False, only plain text files will be indexed.
      commit_every(int): commit after adding this many documents
      breakup_docs(bool): break up documents into smaller paragraphs and treat those as the documents.
                          This can potentially improve the speed at which answers are returned by the ask method
                          when documents being searched are longer.
      min_words(int):  minimum words for a document (or paragraph extracted from document when breakup_docs=True) to be included in index.
                       Useful for pruning contexts that are unlikely to contain useful answers
      encoding(str): encoding to use when reading document files from disk
      procs(int): number of processors
      limitmb(int): memory limit in MB for each process
      multisegment(bool): new segments written instead of merging
      verbose(bool): verbosity
    ```
    &#34;&#34;&#34;
    if use_text_extraction:
        try:
            import textract
        except ImportError:
            raise Exception(&#39;use_text_extraction=True requires textract:   pip install textract&#39;)


    if not os.path.isdir(folder_path): raise ValueError(&#39;folder_path is not a valid folder&#39;)
    if folder_path[-1] != os.sep: folder_path += os.sep
    ix = index.open_dir(index_dir)
    writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
    for idx, fpath in enumerate(TU.extract_filenames(folder_path)):
        reference = &#34;%s&#34; % (fpath.join(fpath.split(folder_path)[1:]))
        if TU.is_txt(fpath):
            with open(fpath, &#39;r&#39;, encoding=encoding) as f:
                doc = f.read()
        else:
            if use_text_extraction:
                try:
                    doc = textract.process(fpath)
                    doc = doc.decode(&#39;utf-8&#39;, &#39;ignore&#39;)
                except:
                    if verbose:
                        warnings.warn(&#39;Could not extract text from %s&#39; % (fpath))
                    continue
            else:
                continue

        if breakup_docs:
            small_docs = TU.paragraph_tokenize(doc, join_sentences=True, lang=&#39;en&#39;)
            refs = [reference] * len(small_docs)
            for i, small_doc in enumerate(small_docs):
                if len(small_doc.split()) &lt; min_words: continue
                content = small_doc
                reference = refs[i]
                writer.add_document(reference=reference, content=content, rawtext=content)
        else:
            if len(doc.split()) &lt; min_words: continue
            content = doc
            writer.add_document(reference=reference, content=content, rawtext=content)

        idx +=1
        if idx % commit_every == 0:
            writer.commit()
            writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
            if verbose: print(&#34;%s docs indexed&#34; % (idx))
    writer.commit()
    return</code></pre>
</details>
</dd>
<dt id="ktrain.text.SimpleQA.index_from_list"><code class="name flex">
<span>def <span class="ident">index_from_list</span></span>(<span>docs, index_dir, commit_every=1024, breakup_docs=True, procs=1, limitmb=256, multisegment=False, min_words=20, references=None)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>index documents from list.
The procs, limitmb, and especially multisegment arguments can be used to 
speed up indexing, if it is too slow.  Please see the whoosh documentation
for more information on these parameters:  https://whoosh.readthedocs.io/en/latest/batch.html
Args:
  docs(list): list of strings representing documents
  index_dir(str): path to index directory (see initialize_index)
  commit_every(int): commit after adding this many documents
  breakup_docs(bool): break up documents into smaller paragraphs and treat those as the documents.
                      This can potentially improve the speed at which answers are returned by the ask method
                      when documents being searched are longer.
  procs(int): number of processors
  limitmb(int): memory limit in MB for each process
  multisegment(bool): new segments written instead of merging
  min_words(int):  minimum words for a document (or paragraph extracted from document when breakup_docs=True) to be included in index.
                   Useful for pruning contexts that are unlikely to contain useful answers
  references(list): List of strings containing a reference (e.g., file name) for each document in docs.
                    Each string is treated as a label for the document (e.g., file name, MD5 hash, etc.):
                       Example:  ['some_file.pdf', 'some_other_file.pdf', ...]
                    Strings can also be hyperlinks in which case the label and URL should be separated by a single tab character:
                       Example: ['ktrain_article\thttps://arxiv.org/pdf/2004.10703v4.pdf', ...]

                    These references will be returned in the output of the ask method.
                    If strings are  hyperlinks, then they will automatically be made clickable when the display_answers function
                    displays candidate answers in a pandas DataFrame.

                    If references is None, the index of element in docs is used as reference.
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">@classmethod
def index_from_list(cls, docs, index_dir, commit_every=1024, breakup_docs=True,
                    procs=1, limitmb=256, multisegment=False, min_words=20, references=None):
    &#34;&#34;&#34;
    ```
    index documents from list.
    The procs, limitmb, and especially multisegment arguments can be used to 
    speed up indexing, if it is too slow.  Please see the whoosh documentation
    for more information on these parameters:  https://whoosh.readthedocs.io/en/latest/batch.html
    Args:
      docs(list): list of strings representing documents
      index_dir(str): path to index directory (see initialize_index)
      commit_every(int): commit after adding this many documents
      breakup_docs(bool): break up documents into smaller paragraphs and treat those as the documents.
                          This can potentially improve the speed at which answers are returned by the ask method
                          when documents being searched are longer.
      procs(int): number of processors
      limitmb(int): memory limit in MB for each process
      multisegment(bool): new segments written instead of merging
      min_words(int):  minimum words for a document (or paragraph extracted from document when breakup_docs=True) to be included in index.
                       Useful for pruning contexts that are unlikely to contain useful answers
      references(list): List of strings containing a reference (e.g., file name) for each document in docs.
                        Each string is treated as a label for the document (e.g., file name, MD5 hash, etc.):
                           Example:  [&#39;some_file.pdf&#39;, &#39;some_other_file.pdf&#39;, ...]
                        Strings can also be hyperlinks in which case the label and URL should be separated by a single tab character:
                           Example: [&#39;ktrain_article\thttps://arxiv.org/pdf/2004.10703v4.pdf&#39;, ...]

                        These references will be returned in the output of the ask method.
                        If strings are  hyperlinks, then they will automatically be made clickable when the display_answers function
                        displays candidate answers in a pandas DataFrame.

                        If references is None, the index of element in docs is used as reference.
    ```
    &#34;&#34;&#34;
    if not isinstance(docs, (np.ndarray, list)): raise ValueError(&#39;docs must be a list of strings&#39;)
    if references is not None and not isinstance(references, (np.ndarray, list)): raise ValueError(&#39;references must be a list of strings&#39;)
    if references is not None and len(references) != len(docs): raise ValueError(&#39;lengths of docs and references must be equal&#39;)

    ix = index.open_dir(index_dir)
    writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)

    mb = master_bar(range(1))
    for i in mb:
        for idx, doc in enumerate(progress_bar(docs, parent=mb)):
            reference = &#34;%s&#34; % (idx) if references is None else references[idx]

            if breakup_docs:
                small_docs = TU.paragraph_tokenize(doc, join_sentences=True, lang=&#39;en&#39;)
                refs = [reference] * len(small_docs)
                for i, small_doc in enumerate(small_docs):
                    if len(small_doc.split()) &lt; min_words: continue
                    content = small_doc
                    reference = refs[i]
                    writer.add_document(reference=reference, content=content, rawtext=content)
            else:
                if len(doc.split()) &lt; min_words: continue
                content = doc 
                writer.add_document(reference=reference, content=content, rawtext=content)

            idx +=1
            if idx % commit_every == 0:
                writer.commit()
                #writer = ix.writer()
                writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
            mb.child.comment = f&#39;indexing documents&#39;
        writer.commit()
        #mb.write(f&#39;Finished indexing documents&#39;)
    return</code></pre>
</details>
</dd>
<dt id="ktrain.text.SimpleQA.initialize_index"><code class="name flex">
<span>def <span class="ident">initialize_index</span></span>(<span>index_dir)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">@classmethod
def initialize_index(cls, index_dir):
    schema = Schema(reference=ID(stored=True), content=TEXT, rawtext=TEXT(stored=True))
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
    else:
        raise ValueError(&#39;There is already an existing directory or file with path %s&#39; % (index_dir))
    ix = index.create_in(index_dir, schema)
    return ix</code></pre>
</details>
</dd>
</dl>
<h3>Methods</h3>
<dl>
<dt id="ktrain.text.SimpleQA.search"><code class="name flex">
<span>def <span class="ident">search</span></span>(<span>self, query, limit=10)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>search index for query
Args:
  query(str): search query
  limit(int):  number of top search results to return
Returns:
  list of dicts with keys: reference, rawtext
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def search(self, query, limit=10):
    &#34;&#34;&#34;
    ```
    search index for query
    Args:
      query(str): search query
      limit(int):  number of top search results to return
    Returns:
      list of dicts with keys: reference, rawtext
    ```
    &#34;&#34;&#34;
    ix = self._open_ix()
    with ix.searcher() as searcher:
        query_obj = QueryParser(&#34;content&#34;, ix.schema, group=qparser.OrGroup).parse(query)
        results = searcher.search(query_obj, limit=limit)
        docs = []
        output = [dict(r) for r in results]
        return output</code></pre>
</details>
</dd>
</dl>
<h3>Inherited members</h3>
<ul class="hlist">
<li><code><b><a title="ktrain.text.qa.core.QA" href="qa/core.html#ktrain.text.qa.core.QA">QA</a></b></code>:
<ul class="hlist">
<li><code><a title="ktrain.text.qa.core.QA.ask" href="qa/core.html#ktrain.text.qa.core.QA.ask">ask</a></code></li>
<li><code><a title="ktrain.text.qa.core.QA.predict_squad" href="qa/core.html#ktrain.text.qa.core.QA.predict_squad">predict_squad</a></code></li>
</ul>
</li>
</ul>
</dd>
<dt id="ktrain.text.TopicModel"><code class="flex name class">
<span>class <span class="ident">get_topic_model</span></span>
<span>(</span><span>texts=None, n_topics=None, n_features=10000, min_df=5, max_df=0.5, stop_words='english', model_type='lda', lda_max_iter=5, lda_mode='online', token_pattern=None, verbose=1, hyperparam_kwargs=None)</span>
</code></dt>
<dd>
<div class="desc"><p>Fits a topic model to documents in &lt;texts&gt;.</p>
<h2 id="example">Example</h2>
<pre><code class="python">tm = ktrain.text.get_topic_model(docs, n_topics=20,
                                 n_features=1000, min_df=2, max_df=0.95)</code></pre>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>texts</code></strong> :&ensp;<code>list</code> of <code>str</code></dt>
<dd>list of texts</dd>
<dt><strong><code>n_topics</code></strong> :&ensp;<code>int</code></dt>
<dd>number of topics.
If None, n_topics = min{400, sqrt[# documents/2]}</dd>
<dt><strong><code>n_features</code></strong> :&ensp;<code>int</code></dt>
<dd>maximum words to consider</dd>
<dt><strong><code>max_df</code></strong> :&ensp;<code>float</code></dt>
<dd>words in more than max_df proportion of docs discarded</dd>
<dt><strong><code>stop_words</code></strong> :&ensp;<code>str</code> or <code>list</code></dt>
<dd>either 'english' for built-in stop words or
a list of stop words to ignore</dd>
<dt><strong><code>model_type</code></strong> :&ensp;<code>str</code></dt>
<dd>type of topic model to fit. One of {'lda', 'nmf'}.
Default:'lda'</dd>
<dt><strong><code>lda_max_iter</code></strong> :&ensp;<code>int</code></dt>
<dd>maximum iterations for 'lda'.
5 is default if using lda_mode='online'.
If lda_mode='batch', this should be increased (e.g., 1500).
Ignored if model_type != 'lda'</dd>
<dt><strong><code>lda_mode</code></strong> :&ensp;<code>str</code></dt>
<dd>one of {'online', 'batch'}. Ignored if model_type !='lda'</dd>
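<dt><strong><code>token_pattern</code></strong> :&ensp;<code>str</code></dt>
<dd>regex pattern to use to tokenize documents.</dd>
<dt><strong><code>verbose</code></strong> :&ensp;<code>bool</code></dt>
<dd>verbosity</dd>
</dl></div>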
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class TopicModel():


    def __init__(self,texts=None, n_topics=None, n_features=10000, 
                 min_df=5, max_df=0.5,  stop_words=&#39;english&#39;,
                 model_type=&#39;lda&#39;,
                 lda_max_iter=5, lda_mode=&#39;online&#39;,
                 token_pattern=None, verbose=1,
                 hyperparam_kwargs=None
    ):
        &#34;&#34;&#34;
        Fits a topic model to documents in &lt;texts&gt;.
        Example:
            tm = ktrain.text.get_topic_model(docs, n_topics=20, 
                                            n_features=1000, min_df=2, max_df=0.95)
        Args:
            texts (list of str): list of texts
            n_topics (int): number of topics.
                            If None, n_topics = min{400, sqrt[# documents/2]}
            n_features (int):  maximum words to consider
            max_df (float): words in more than max_df proportion of docs discarded
            stop_words (str or list): either &#39;english&#39; for built-in stop words or
                                      a list of stop words to ignore
            model_type(str): type of topic model to fit. One of {&#39;lda&#39;, &#39;nmf&#39;}.  Default:&#39;lda&#39;
            lda_max_iter (int): maximum iterations for &#39;lda&#39;.  5 is default if using lda_mode=&#39;online&#39;.
                                If lda_mode=&#39;batch&#39;, this should be increased (e.g., 1500).
                                Ignored if model_type != &#39;lda&#39;
            lda_mode (str):  one of {&#39;online&#39;, &#39;batch&#39;}. Ignored if model_type !=&#39;lda&#39;
            token_pattern(str): regex pattern to use to tokenize documents. 
            verbose(bool): verbosity

        &#34;&#34;&#34;
        self.verbose=verbose

        # estimate n_topics
        if n_topics is None:
            if texts is None:
                raise ValueError(&#39;If n_topics is None, texts must be supplied&#39;)
            estimated = max(1, int(math.floor(math.sqrt(len(texts) / 2))))
            n_topics = min(400, estimated)
            print(&#39;n_topics automatically set to %s&#39; % (n_topics))

        # train model
        if texts is not None:
            (model, vectorizer) = self.train(texts, model_type=model_type,
                                             n_topics=n_topics, n_features=n_features,
                                             min_df = min_df, max_df = max_df, 
                                             stop_words=stop_words,
                                             lda_max_iter=lda_max_iter, lda_mode=lda_mode,
                                             token_pattern=token_pattern,
                                             hyperparam_kwargs=hyperparam_kwargs)
        else:
            vectorizer = None
            model = None


        # save model and vectorizer and hyperparameter settings
        self.vectorizer = vectorizer
        self.model = model
        self.n_topics = n_topics
        self.n_features = n_features
        if verbose: print(&#39;done.&#39;)

        # these variables are set by self.build():
        self.topic_dict = None
        self.doc_topics = None
        self.bool_array = None

        self.scorer = None       # set by self.train_scorer()
        self.recommender = None  # set by self.train_recommender()
        return


    def train(self,texts, model_type=&#39;lda&#39;, n_topics=None, n_features=10000,
              min_df=5, max_df=0.5,  stop_words=&#39;english&#39;,
              lda_max_iter=5, lda_mode=&#39;online&#39;,
              token_pattern=None, hyperparam_kwargs=None):
        &#34;&#34;&#34;
        Fits a topic model to documents in &lt;texts&gt;.
        Example:
            tm = ktrain.text.get_topic_model(docs, n_topics=20, 
                                            n_features=1000, min_df=2, max_df=0.95)
        Args:
            texts (list of str): list of texts
            n_topics (int): number of topics.
                            If None, n_topics = min{400, sqrt[# documents/2]}
            n_features (int):  maximum words to consider
            max_df (float): words in more than max_df proportion of docs discarded
            stop_words (str or list): either &#39;english&#39; for built-in stop words or
                                      a list of stop words to ignore
            lda_max_iter (int): maximum iterations for &#39;lda&#39;.  5 is default if using lda_mode=&#39;online&#39;.
                                If lda_mode=&#39;batch&#39;, this should be increased (e.g., 1500).
                                Ignored if model_type != &#39;lda&#39;
            lda_mode (str):  one of {&#39;online&#39;, &#39;batch&#39;}. Ignored if model_type !=&#39;lda&#39;
            token_pattern(str): regex pattern to use to tokenize documents. 
                                If None, a default tokenizer will be used
            hyperparam_kwargs(dict): hyperparameters for LDA/NMF
                                     Keys in this dict can be any of the following:
                                         alpha: alpha for LDA  default: 5./n_topics
                                         beta: beta for LDA.  default:0.01
                                         nmf_alpha: alpha for NMF.  default:0
                                         l1_ratio: l1_ratio for NMF. default: 0
                                         ngram_range:  whether to consider bigrams, trigrams. default: (1,1) 
                                    
        Returns:
            tuple: (model, vectorizer)
        &#34;&#34;&#34;
        if hyperparam_kwargs is None:
            hyperparam_kwargs = {}
        alpha = hyperparam_kwargs.get(&#39;alpha&#39;, 5.0 / n_topics)
        beta = hyperparam_kwargs.get(&#39;beta&#39;, 0.01)
        nmf_alpha = hyperparam_kwargs.get(&#39;nmf_alpha&#39;, 0)
        l1_ratio = hyperparam_kwargs.get(&#39;l1_ratio&#39;, 0)
        ngram_range = hyperparam_kwargs.get(&#39;ngram_range&#39;, (1,1))

        # adjust defaults based on language detected
        if texts is not None:
            lang = TU.detect_lang(texts)
            if lang != &#39;en&#39;:
                stopwords = None if stop_words==&#39;english&#39; else stop_words
                token_pattern = r&#39;(?u)\b\w+\b&#39; if token_pattern is None else token_pattern
            if pp.is_nospace_lang(lang):
                text_list = []
                for t in texts:
                    text_list.append(&#39; &#39;.join(jieba.cut(t, HMM=False)))
                texts = text_list
            if self.verbose: print(&#39;lang: %s&#39; % (lang))


        # preprocess texts
        if self.verbose: print(&#39;preprocessing texts...&#39;)
        if token_pattern is None: token_pattern = TU.DEFAULT_TOKEN_PATTERN
        #if token_pattern is None: token_pattern = r&#39;(?u)\b\w\w+\b&#39;
        vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                                 max_features=n_features, stop_words=stop_words,
                                 token_pattern=token_pattern, ngram_range=ngram_range)
        

        x_train = vectorizer.fit_transform(texts)

        # fit model

        if self.verbose: print(&#39;fitting model...&#39;)
        if model_type == &#39;lda&#39;:
            model = LatentDirichletAllocation(n_components=n_topics, max_iter=lda_max_iter,
                                              learning_method=lda_mode, learning_offset=50.,
                                              doc_topic_prior=alpha,
                                              topic_word_prior=beta,
                                              verbose=self.verbose, random_state=0)
        elif model_type == &#39;nmf&#39;:
            model = NMF(
                n_components=n_topics,
                max_iter=lda_max_iter,
                verbose=self.verbose,
                alpha=nmf_alpha,
                l1_ratio=l1_ratio,
                random_state=0)
        else:
            raise ValueError(&#34;unknown model type:&#34;, str(model_type))
        model.fit(x_train)

        # save model and vectorizer and hyperparameter settings
        return (model, vectorizer)


    @property
    def topics(self):
        &#34;&#34;&#34;
        convenience method/property
        &#34;&#34;&#34;
        return self.get_topics()


    def get_document_topic_distribution(self):
        &#34;&#34;&#34;
        Gets the document-topic distribution.
        Each row is a document and each column is a topic
        The output of this method is equivalent to invoking get_doctopics with no arguments.
        &#34;&#34;&#34;
        self._check_build()
        return self.doc_topics


    def get_sorted_docs(self, topic_id):
        &#34;&#34;&#34;
        Returns all docs sorted by relevance to &lt;topic_id&gt;.
        Unlike get_docs, this ranks documents by the supplied topic_id rather
        than the topic_id to which document is most relevant.
        &#34;&#34;&#34;
        docs = self.get_docs()
        d = {}
        for doc in docs: d[doc[&#39;doc_id&#39;]] = doc
        m = self.get_document_topic_distribution()
        doc_ids = (-m[:,topic_id]).argsort()
        return [d[doc_id] for doc_id in doc_ids]


    def get_word_weights(self, topic_id, n_words=100):
        &#34;&#34;&#34;
        Returns a list of tuples of the form: (word, weight) for given topic_id.
        The weight can be interpreted as the number of times word was assigned to topic with given topic_id.
        REFERENCE: https://stackoverflow.com/a/48890889/13550699
        Args:
            topic_id(int): topic ID
            n_words(int): number of top words
        &#34;&#34;&#34;
        self._check_model()
        if topic_id+1 &gt; len(self.model.components_): 
            raise ValueError(&#39;topic_id must be less than %s&#39; % (len(self.model.components_)))
        feature_names = self.vectorizer.get_feature_names()
        word_probs = self.model.components_[topic_id]
        word_ids = [i for i in word_probs.argsort()[:-n_words - 1:-1]]
        words = [feature_names[i] for i in word_ids]
        probs = [word_probs[i] for i in word_ids]
        return list( zip(words, probs) )


    def get_topics(self, n_words=10, as_string=True):
        &#34;&#34;&#34;
        Returns a list of discovered topics
        Args:
            n_words(int): number of words to use in topic summary
            as_string(bool): If True, each summary is a space-delimited string instead of list of words
        &#34;&#34;&#34;
        self._check_model()
        feature_names = self.vectorizer.get_feature_names()
        topic_summaries = []
        for topic_idx, topic in enumerate(self.model.components_):
            summary = [feature_names[i] for i in topic.argsort()[:-n_words - 1:-1]]
            if as_string: summary = &#34; &#34;.join(summary)
            topic_summaries.append(summary)
        return topic_summaries


    def print_topics(self, n_words=10, show_counts=False):
        &#34;&#34;&#34;
        print topics
        n_words(int): number of words to describe each topic
        show_counts(bool): If True, print topics with document counts, where
                           the count is the number of documents with that topic as primary.
        &#34;&#34;&#34;
        topics = self.get_topics(n_words=n_words, as_string=True)
        if show_counts:
            self._check_build()
            topic_counts = sorted([ (k, topics[k], len(v)) for k,v in self.topic_dict.items()], 
                                    key=lambda kv:kv[-1], reverse=True)
            for (idx, topic, count) in topic_counts:
                print(&#34;topic:%s | count:%s | %s&#34; %(idx, count, topic))
        else:
            for i, t in enumerate(topics):
                print(&#39;topic %s | %s&#39; % (i, t))
        return


    def build(self, texts, threshold=None):
        &#34;&#34;&#34;
        Builds the document-topic distribution showing the topic probability distirbution
        for each document in &lt;texts&gt; with respect to the learned topic space.
        Args:
            texts (list of str): list of text documents
            threshold (float): If not None, documents with whose highest topic probability
                               is less than threshold are filtered out.
        &#34;&#34;&#34;
        if threshold is not None:
            doc_topics, bool_array = self.predict(texts, threshold=threshold)
        else:
            doc_topics = self.predict(texts)
            bool_array = np.array([True] * len(texts))

        self.doc_topics = doc_topics
        self.bool_array = bool_array

        texts = [text for i, text in enumerate(texts) if bool_array[i]]
        self.topic_dict = self._rank_documents(texts, doc_topics=doc_topics)
        return


    def filter(self, lst):
        &#34;&#34;&#34;
        The build method may prune documents based on threshold.
        This method prunes other lists based on how build pruned documents.
        This is useful to filter lists containing metadata associated with documents
        for use with visualize_documents.
        Args:
            lst(list): a list of data
        Returns:
            list:  a filtered list of data based on how build filtered the documents
        &#34;&#34;&#34;
        if len(lst) != self.bool_array.shape[0]:
            raise ValueError(&#39;Length of lst is not consistent with the number of documents &#39; +
                             &#39;supplied to get_topic_model&#39;)
        arr = np.array(lst)
        return list(arr[self.bool_array])
                           

    def get_docs(self, topic_ids=[], doc_ids=[], rank=False):
        &#34;&#34;&#34;
        Returns document entries for supplied topic_ids.
        Documents returned are those whose primary topic is topic with given topic_id
        Args:
            topic_ids(list of ints): list of topid IDs where each id is in the range
                                     of range(self.n_topics).
            doc_ids (list of ints): list of document IDs where each id is an index
                                    into self.doctopics
            rank(bool): If True, the list is sorted first by topic_id (ascending)
                        and then ty topic probability (descending).
                        Otherwise, list is sorted by doc_id (i.e., the order
                        of texts supplied to self.build (which is the order of self.doc_topics).

        Returns:
            list of dicts:  list of dicts with keys:
                            &#39;text&#39;: text of document
                            &#39;doc_id&#39;: ID of document
                            &#39;topic_proba&#39;: topic probability (or score)
                            &#39;topic_id&#39;: ID of topic
            
        &#34;&#34;&#34;
        self._check_build()
        if not topic_ids:
            topic_ids = list(range(self.n_topics))
        result_texts = []
        for topic_id in topic_ids:
            if topic_id not in self.topic_dict: continue
            texts = [{&#39;text&#39;:tup[0], &#39;doc_id&#39;:tup[1], &#39;topic_proba&#39;:tup[2], &#39;topic_id&#39;:topic_id} for tup in self.topic_dict[topic_id] 
                                                                                                     if not doc_ids or tup[1] in doc_ids]
            result_texts.extend(texts)
        if not rank:
            result_texts = sorted(result_texts, key=lambda x:x[&#39;doc_id&#39;])
        return result_texts


    def get_doctopics(self,  topic_ids=[], doc_ids=[]):
        &#34;&#34;&#34;
        Returns a topic probability distribution for documents
        with primary topic that is one of &lt;topic_ids&gt; and with doc_id in &lt;doc_ids&gt;.

        If no topic_ids or doc_ids are provided, then topic distributions for all documents
        are returned (which equivalent to the output of get_document_topic_distribution).

        Args:
            topic_ids(list of ints): list of topid IDs where each id is in the range
                                     of range(self.n_topics).
            doc_ids (list of ints): list of document IDs where each id is an index
                                    into self.doctopics
        Returns:
            np.ndarray: Each row is the topic probability distribution of a document.
                        Array is sorted in the order returned by self.get_docs.
                        
        &#34;&#34;&#34;
        docs = self.get_docs(topic_ids=topic_ids, doc_ids=doc_ids)
        return np.array([self.doc_topics[idx] for idx in [x[&#39;doc_id&#39;] for x in docs]])


    def get_texts(self,  topic_ids=[]):
        &#34;&#34;&#34;
        Returns texts for documents
        with primary topic that is one of &lt;topic_ids&gt;
        Args:
            topic_ids(list of ints): list of topic IDs
        Returns:
            list of str
        &#34;&#34;&#34;
        if not topic_ids: topic_ids = list(range(self.n_topics))
        docs = self.get_docs(topic_ids)
        return [x[0] for x in docs]


    def predict(self, texts, threshold=None, harden=False):
        &#34;&#34;&#34;
        Args:
            texts (list of str): list of texts
            threshold (float): If not None, documents with maximum topic scores
                                less than &lt;threshold&gt; are filtered out
            harden(bool): If True, each document is assigned to a single topic for which
                          it has the highest score
        Returns:
            if threshold is None:
                np.ndarray: topic distribution for each text document
            else:
                (np.ndarray, np.ndarray): topic distribution and boolean array
        &#34;&#34;&#34;
        self._check_model()
        transformed_texts = self.vectorizer.transform(texts)
        X_topics = self.model.transform(transformed_texts)
        #if self.model_type == &#39;nmf&#39;:
            #scores = np.matrix(X_topics)
            #scores_normalized= scores/scores.sum(axis=1)
            #X_topics = scores_normalized
        _idx = np.array([True] * len(texts))
        if threshold is not None:
            _idx = np.amax(X_topics, axis=1) &gt; threshold  # idx of doc that above the threshold
            _idx = np.array(_idx)
            X_topics = X_topics[_idx]
        if harden: X_topics = self._harden_topics(X_topics)
        if threshold is not None:
            return (X_topics, _idx)
        else:
            return X_topics


    def visualize_documents(self, texts=None, doc_topics=None, 
                            width=700, height=700, point_size=5, title=&#39;Document Visualization&#39;,
                            extra_info={},
                            colors=None,
                            filepath=None,):
        &#34;&#34;&#34;
        Generates a visualization of a set of documents based on model.
        If &lt;texts&gt; is supplied, raw documents will be first transformed into document-topic
        matrix.  If &lt;doc_topics&gt; is supplied, then this will be used for visualization instead.
        Args:
            texts(list of str): list of document texts.  Mutually-exclusive with &lt;doc_topics&gt;
            doc_topics(ndarray): pre-computed topic distribution for each document in texts.
                                 Mutually-exclusive with &lt;texts&gt;.
            width(int): width of image
            height(int): height of image
            point_size(int): size of circles in plot
            title(str):  title of visualization
            extra_info(dict of lists): A user-supplied information for each datapoint (attributes of the datapoint).
                                       The keys are field names.  The values are lists - each of which must
                                       be the same number of elements as &lt;texts&gt; or &lt;doc_topics&gt;. These fields are displayed
                                       when hovering over datapoints in the visualization.
            colors(list of str):  list of Hex color codes for each datapoint.
                                  Length of list must match either len(texts) or doc_topics.shape[0]
            filepath(str):             Optional filepath to save the interactive visualization
        &#34;&#34;&#34;

        # error-checking
        if texts is not None: length = len(texts)
        else: length = doc_topics.shape[0]
        if colors is not None and len(colors) != length:
            raise ValueError(&#39;length of colors is not consistent with length of texts or doctopics&#39;)
        if texts is not None and doc_topics is not None:
            raise ValueError(&#39;texts is mutually-exclusive with doc_topics&#39;)
        if texts is None and doc_topics is None:
            raise ValueError(&#39;One of texts or doc_topics is required.&#39;)
        if extra_info:
            invalid_keys = [&#39;x&#39;, &#39;y&#39;, &#39;topic&#39;, &#39;fill_color&#39;]
            for k in extra_info.keys():
                if k in invalid_keys:
                    raise ValueError(&#39;cannot use &#34;%s&#34; as key in extra_info&#39; %(k))
                lst = extra_info[k]
                if len(lst) != length:
                    raise ValueError(&#39;texts and extra_info lists must be same size&#39;)

        # check fo bokeh
        try:
            import bokeh.plotting as bp
            from bokeh.plotting import save
            from bokeh.models import HoverTool
            from bokeh.io import output_notebook
        except:
            warnings.warn(&#39;visualize_documents method requires bokeh package: pip install bokeh&#39;)
            return

        # prepare data
        if doc_topics is not None:
            X_topics = doc_topics
        else:
            if self.verbose:  print(&#39;transforming texts...&#39;, end=&#39;&#39;)
            X_topics = self.predict(texts, harden=False)
            if self.verbose: print(&#39;done.&#39;)

        # reduce to 2-D
        if self.verbose:  print(&#39;reducing to 2 dimensions...&#39;, end=&#39;&#39;)
        tsne_model = TSNE(n_components=2, verbose=self.verbose, random_state=0, angle=.99, init=&#39;pca&#39;)
        tsne_lda = tsne_model.fit_transform(X_topics)
        print(&#39;done.&#39;)

        # get random colormap
        colormap = U.get_random_colors(self.n_topics)

        # generate inline visualization in Jupyter notebook
        lda_keys = self._harden_topics(X_topics)
        if colors is None: colors = colormap[lda_keys]
        topic_summaries = self.get_topics(n_words=5)
        os.environ[&#34;BOKEH_RESOURCES&#34;]=&#34;inline&#34;
        output_notebook()
        dct = { 
                &#39;x&#39;:tsne_lda[:,0],
                &#39;y&#39;:tsne_lda[:, 1],
                &#39;topic&#39;:[topic_summaries[tid] for tid in lda_keys],
                &#39;fill_color&#39;:colors,}
        tool_tups = [(&#39;index&#39;, &#39;$index&#39;),
                     (&#39;(x,y)&#39;,&#39;($x,$y)&#39;),
                     (&#39;topic&#39;, &#39;@topic&#39;)]
        for k in extra_info.keys():
            dct[k] = extra_info[k]
            tool_tups.append((k, &#39;@&#39;+k))

        source = bp.ColumnDataSource(data=dct)
        hover = HoverTool( tooltips=tool_tups)
        p = bp.figure(plot_width=width, plot_height=height, 
                      tools=[hover, &#39;save&#39;, &#39;pan&#39;, &#39;wheel_zoom&#39;, &#39;box_zoom&#39;, &#39;reset&#39;],
                      #tools=&#34;pan,wheel_zoom,box_zoom,reset,hover,previewsave&#34;,
                      title=title)
        #plot_lda = bp.figure(plot_width=1400, plot_height=1100,
                           #title=title,
                           #tools=&#34;pan,wheel_zoom,box_zoom,reset,hover,previewsave&#34;,
                           #x_axis_type=None, y_axis_type=None, min_border=1)
        p.circle(&#39;x&#39;, &#39;y&#39;, size=point_size, source=source, fill_color= &#39;fill_color&#39;)
        bp.show(p)
        if filepath is not None:
            bp.output_file(filepath)
            bp.save(p)
        return


    def train_recommender(self, n_neighbors=20, metric=&#39;minkowski&#39;, p=2):
        &#34;&#34;&#34;
        Trains a recommender that, given a single document, will return
        documents in the corpus that are semantically similar to it.

        Args:
            n_neighbors (int): 
        Returns:
            None
        &#34;&#34;&#34;
        from sklearn.neighbors import NearestNeighbors
        rec = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, p=p)
        probs = self.get_doctopics()
        rec.fit(probs)
        self.recommender = rec
        return


    def recommend(self, text=None, doc_topic=None, n=5, n_neighbors=100):
        &#34;&#34;&#34;
        Given an example document, recommends documents similar to it
        from the set of documents supplied to build().
 
        Args:
            texts(list of str): list of document texts.  Mutually-exclusive with &lt;doc_topics&gt;
            doc_topics(ndarray): pre-computed topic distribution for each document in texts.
                                 Mutually-exclusive with &lt;texts&gt;.
            n (int): number of recommendations to return
        Returns:
            list of tuples: each tuple is of the form:
                            (text, doc_id, topic_probability, topic_id)

        &#34;&#34;&#34;
        # error-checks
        if text is not None and doc_topic is not None:
            raise ValueError(&#39;text is mutually-exclusive with doc_topic&#39;)
        if text is None and doc_topic is None:
            raise ValueError(&#39;One of text or doc_topic is required.&#39;)
        if text is not None and type(text) not in [str]:
            raise ValueError(&#39;text must be a str &#39;)
        if  doc_topic is not None and type(doc_topic) not in [np.ndarray]:
            raise ValueError(&#39;doc_topic must be a np.ndarray&#39;)

        if n &gt; n_neighbors: n_neighbors = n

        x_test = [doc_topic]
        if text:
            x_test = self.predict([text])
        docs = self.get_docs()
        indices = self.recommender.kneighbors(x_test, return_distance=False, n_neighbors=n_neighbors)
        results = [doc for i, doc in enumerate(docs) if i in indices]
        return results[:n]


    def train_scorer(self, topic_ids=[], doc_ids=[], n_neighbors=20):
        &#34;&#34;&#34;
        Trains a scorer that can score documents based on similarity to a
        seed set of documents represented by topic_ids and doc_ids.

        NOTE: The score method currently employs the use of LocalOutLierFactor, which
        means you should not try to score documents that were used in training. Only
        new, unseen documents should be scored for similarity. 
        REFERENCE: 
        https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor

        Args:
            topic_ids(list of ints): list of topid IDs where each id is in the range
                                     of range(self.n_topics).  Documents associated
                                     with these topic_ids will be used as seed set.
            doc_ids (list of ints): list of document IDs where each id is an index
                                    into self.doctopics.  Documents associated 
                                    with these doc_ids will be used as seed set.
        Returns:
            None
        &#34;&#34;&#34;
        from sklearn.neighbors import LocalOutlierFactor
        clf = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=True, contamination=0.1)
        probs = self.get_doctopics(topic_ids=topic_ids, doc_ids=doc_ids)
        clf.fit(probs)
        self.scorer = clf
        return


    def score(self, texts=None, doc_topics=None):
        &#34;&#34;&#34;
        Given a new set of documents (supplied as texts or doc_topics), the score method
        uses a One-Class classifier to score documents based on similarity to a
        seed set of documents (where seed set is computed by train_scorer() method).

        Higher scores indicate a higher degree of similarity.
        Positive values represent a binary decision of similar.
        Negative values represent a binary decision of dissimlar.
        In practice, negative scores closer to zer will also be simlar as One-Class
        classifiers are more strict than traditional binary classifiers.
        Documents with negative scores closer to zero are good candidates for
        inclusion in a training set for binary classification (e.g., active labeling).

        NOTE: The score method currently employs the use of LocalOutLierFactor, which
        means you should not try to score documents that were used in training. Only
        new, unseen documents should be scored for similarity.
 
        Args:
            texts(list of str): list of document texts.  Mutually-exclusive with &lt;doc_topics&gt;
            doc_topics(ndarray): pre-computed topic distribution for each document in texts.
                                 Mutually-exclusive with &lt;texts&gt;.
        Returns:
            list of floats:  larger values indicate higher degree of similarity
                             positive values indicate a binary decision of similar
                             negative values indicate binary decision of dissimilar
                             In practice, negative scores closer to zero will also 
                             be similar as One-class classifiers are more strict
                             than traditional binary classifiers.

        &#34;&#34;&#34;
        # error-checks
        if texts is not None and doc_topics is not None:
            raise ValueError(&#39;texts is mutually-exclusive with doc_topics&#39;)
        if texts is None and doc_topics is None:
            raise ValueError(&#39;One of texts or doc_topics is required.&#39;)
        if texts is not None and type(texts) not in [list, np.ndarray]:
            raise ValueError(&#39;texts must be either a list or numpy ndarray&#39;)
        if  doc_topics is not None and type(doc_topics) not in [np.ndarray]:
            raise ValueError(&#39;doc_topics must be a np.ndarray&#39;)

        x_test = doc_topics
        if texts:
            x_test = self.predict(texts)
        return self.scorer.decision_function(x_test)


    def search(self, query, topic_ids=[], doc_ids=[], case_sensitive=False):
        &#34;&#34;&#34;
        search documents for query string.
        Args:
            query(str):  the word or phrase to search
            topic_ids(list of ints): list of topid IDs where each id is in the range
                                     of range(self.n_topics).
            doc_ids (list of ints): list of document IDs where each id is an index
                                    into self.doctopics
            case_sensitive(bool):  If True, case sensitive search
        &#34;&#34;&#34;

        # setup pattern
        if not case_sensitive: query = query.lower()
        pattern = re.compile(r&#39;\b%s\b&#39; % query)

        # retrive docs
        docs = self.get_docs(topic_ids=topic_ids, doc_ids=doc_ids)

        # search
        mb = master_bar(range(1))
        results = []
        for i in mb:
            for doc in progress_bar(docs, parent=mb):
                text = doc[&#39;text&#39;]
                if not case_sensitive: text = text.lower()
                matches = pattern.findall(text)
                if matches: results.append(doc)
            if self.verbose: mb.write(&#39;done.&#39;)
        return results


    def _rank_documents(self, 
                       texts,
                       doc_topics=None):
        &#34;&#34;&#34;
        Rank documents by topic score.
        If topic_index is supplied, rank documents based on relevance to supplied topic.
        Otherwise, rank all texts by their highest topic score (for any topic).
        Args:
            texts(list of str): list of document texts.
            doc_topics(ndarray): pre-computed topic distribution for each document
                                 If None, re-computed from texts.
                              
        Returns:
            dict of lists: each element in list is a tuple of (doc_index, topic_index, score)
            ... where doc_index is an index into either texts 
        &#34;&#34;&#34;
        if doc_topics is not None:
            X_topics = doc_topics
        else:
            if self.verbose: print(&#39;transforming texts to topic space...&#39;)
            X_topics = self.predict(texts)
        topics = np.argmax(X_topics, axis=1)
        scores = np.amax(X_topics, axis=1)
        doc_ids = np.array([i for i, x in enumerate(texts)])
        result = list(zip(texts, doc_ids, topics, scores))
        if self.verbose: print(&#39;done.&#39;)
        result = sorted(result, key=lambda x: x[-1], reverse=True)
        result_dict = {}
        for r in result:
            text = r[0]
            doc_id = r[1]
            topic_id = r[2]
            score = r[3]
            lst = result_dict.get(topic_id, [])
            lst.append((text, doc_id, score))
            result_dict[topic_id] = lst
        return result_dict


    def _harden_topics(self, X_topics):
        &#34;&#34;&#34;
        Transforms soft-clustering to hard-clustering
        &#34;&#34;&#34;
        max_topics = []
        for i in range(X_topics.shape[0]):
            max_topics.append(X_topics[i].argmax())
        X_topics = np.array(max_topics)
        return X_topics


    def _check_build(self):
        self._check_model()
        if self.topic_dict is None: 
            raise Exception(&#39;Must call build() method.&#39;)

    def _check_scorer(self):
        if self.scorer is None:
            raise Exception(&#39;Must call train_scorer()&#39;)

    def _check_recommender(self):
        if self.recommender is None:
            raise Exception(&#39;Must call train_recommender()&#39;)


    def _check_model(self):
        if self.model is None or self.vectorizer is None:
            raise Exception(&#39;Must call train()&#39;)


    def save(self, fname):
        &#34;&#34;&#34;
        save TopicModel object
        &#34;&#34;&#34;

        
        with open(fname+&#39;.tm_vect&#39;, &#39;wb&#39;) as f:
            pickle.dump(self.vectorizer, f)
        with open(fname+&#39;.tm_model&#39;, &#39;wb&#39;) as f:
            pickle.dump(self.model, f)
        params = {&#39;n_topics&#39;: self.n_topics,
                  &#39;n_features&#39;: self.n_features,
                  &#39;verbose&#39;: self.verbose}
        with open(fname+&#39;.tm_params&#39;, &#39;wb&#39;) as f:
            pickle.dump(params, f)

        return</code></pre>
</details>
<h3>Instance variables</h3>
<dl>
<dt id="ktrain.text.TopicModel.topics"><code class="name">var <span class="ident">topics</span></code></dt>
<dd>
<div class="desc"><p>convenience method/property</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">@property
def topics(self):
    &#34;&#34;&#34;
    convenience method/property
    &#34;&#34;&#34;
    return self.get_topics()</code></pre>
</details>
</dd>
</dl>
<h3>Methods</h3>
<dl>
<dt id="ktrain.text.TopicModel.build"><code class="name flex">
<span>def <span class="ident">build</span></span>(<span>self, texts, threshold=None)</span>
</code></dt>
<dd>
<div class="desc"><p>Builds the document-topic distribution showing the topic probability distribution
for each document in &lt;texts&gt; with respect to the learned topic space.</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>texts</code></strong> :&ensp;<code>list</code> of <code>str</code></dt>
<dd>list of text documents</dd>
<dt><strong><code>threshold</code></strong> :&ensp;<code>float</code></dt>
<dd>If not None, documents whose highest topic probability
is less than threshold are filtered out.</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def build(self, texts, threshold=None):
    &#34;&#34;&#34;
    Builds the document-topic distribution showing the topic probability distirbution
    for each document in &lt;texts&gt; with respect to the learned topic space.
    Args:
        texts (list of str): list of text documents
        threshold (float): If not None, documents with whose highest topic probability
                           is less than threshold are filtered out.
    &#34;&#34;&#34;
    if threshold is not None:
        doc_topics, bool_array = self.predict(texts, threshold=threshold)
    else:
        doc_topics = self.predict(texts)
        bool_array = np.array([True] * len(texts))

    self.doc_topics = doc_topics
    self.bool_array = bool_array

    texts = [text for i, text in enumerate(texts) if bool_array[i]]
    self.topic_dict = self._rank_documents(texts, doc_topics=doc_topics)
    return</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.filter"><code class="name flex">
<span>def <span class="ident">filter</span></span>(<span>self, lst)</span>
</code></dt>
<dd>
<div class="desc"><p>The build method may prune documents based on threshold.
This method prunes other lists based on how build pruned documents.
This is useful to filter lists containing metadata associated with documents
for use with visualize_documents.</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>lst</code></strong> :&ensp;<code>list</code></dt>
<dd>a list of data</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>list</code></dt>
<dd>a filtered list of data based on how build filtered the documents</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def filter(self, lst):
    &#34;&#34;&#34;
    The build method may prune documents based on threshold.
    This method prunes other lists based on how build pruned documents.
    This is useful to filter lists containing metadata associated with documents
    for use with visualize_documents.
    Args:
        lst(list): a list of data
    Returns:
        list:  a filtered list of data based on how build filtered the documents
    &#34;&#34;&#34;
    if len(lst) != self.bool_array.shape[0]:
        raise ValueError(&#39;Length of lst is not consistent with the number of documents &#39; +
                         &#39;supplied to get_topic_model&#39;)
    arr = np.array(lst)
    return list(arr[self.bool_array])</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.get_docs"><code class="name flex">
<span>def <span class="ident">get_docs</span></span>(<span>self, topic_ids=[], doc_ids=[], rank=False)</span>
</code></dt>
<dd>
<div class="desc"><p>Returns document entries for supplied topic_ids.
Documents returned are those whose primary topic is topic with given topic_id</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>topic_ids</code></strong> :&ensp;<code>list</code> of <code>ints</code></dt>
<dd>list of topic IDs, where each ID is in range(self.n_topics).</dd>
<dt><strong><code>doc_ids</code></strong> :&ensp;<code>list</code> of <code>ints</code></dt>
<dd>list of document IDs where each id is an index
into self.doc_topics</dd>
<dt><strong><code>rank</code></strong> :&ensp;<code>bool</code></dt>
<dd>If True, the list is sorted first by topic_id (ascending)
and then by topic probability (descending).
Otherwise, the list is sorted by doc_id (i.e., the order
of texts supplied to self.build, which is the order of self.doc_topics).</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>list</code> of <code>dicts</code></dt>
<dd>list of dicts with keys:
'text': text of document
'doc_id': ID of document
'topic_proba': topic probability (or score)
'topic_id': ID of topic</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_docs(self, topic_ids=[], doc_ids=[], rank=False):
    &#34;&#34;&#34;
    Returns document entries for supplied topic_ids.
    Documents returned are those whose primary topic is topic with given topic_id
    Args:
        topic_ids(list of ints): list of topid IDs where each id is in the range
                                 of range(self.n_topics).
        doc_ids (list of ints): list of document IDs where each id is an index
                                into self.doctopics
        rank(bool): If True, the list is sorted first by topic_id (ascending)
                    and then ty topic probability (descending).
                    Otherwise, list is sorted by doc_id (i.e., the order
                    of texts supplied to self.build (which is the order of self.doc_topics).

    Returns:
        list of dicts:  list of dicts with keys:
                        &#39;text&#39;: text of document
                        &#39;doc_id&#39;: ID of document
                        &#39;topic_proba&#39;: topic probability (or score)
                        &#39;topic_id&#39;: ID of topic
        
    &#34;&#34;&#34;
    self._check_build()
    if not topic_ids:
        topic_ids = list(range(self.n_topics))
    result_texts = []
    for topic_id in topic_ids:
        if topic_id not in self.topic_dict: continue
        texts = [{&#39;text&#39;:tup[0], &#39;doc_id&#39;:tup[1], &#39;topic_proba&#39;:tup[2], &#39;topic_id&#39;:topic_id} for tup in self.topic_dict[topic_id] 
                                                                                                 if not doc_ids or tup[1] in doc_ids]
        result_texts.extend(texts)
    if not rank:
        result_texts = sorted(result_texts, key=lambda x:x[&#39;doc_id&#39;])
    return result_texts</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.get_doctopics"><code class="name flex">
<span>def <span class="ident">get_doctopics</span></span>(<span>self, topic_ids=[], doc_ids=[])</span>
</code></dt>
<dd>
<div class="desc"><p>Returns a topic probability distribution for documents
with primary topic that is one of &lt;topic_ids&gt; and with doc_id in &lt;doc_ids&gt;.</p>
<p>If no topic_ids or doc_ids are provided, then topic distributions for all documents
are returned (which is equivalent to the output of get_document_topic_distribution).</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>topic_ids</code></strong> :&ensp;<code>list</code> of <code>ints</code></dt>
<dd>list of topic IDs where each id is in the range
of range(self.n_topics).</dd>
<dt><strong><code>doc_ids</code></strong> :&ensp;<code>list</code> of <code>ints</code></dt>
<dd>list of document IDs where each id is an index
into self.doctopics</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>np.ndarray</code></dt>
<dd>Each row is the topic probability distribution of a document.
Array is sorted in the order returned by self.get_docs.</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_doctopics(self,  topic_ids=[], doc_ids=[]):
    &#34;&#34;&#34;
    Returns a topic probability distribution for documents
    with primary topic that is one of &lt;topic_ids&gt; and with doc_id in &lt;doc_ids&gt;.

    If no topic_ids or doc_ids are provided, then topic distributions for all documents
    are returned (which is equivalent to the output of get_document_topic_distribution).

    Args:
        topic_ids(list of ints): list of topic IDs where each id is in the range
                                 of range(self.n_topics).
        doc_ids (list of ints): list of document IDs where each id is an index
                                into self.doctopics
    Returns:
        np.ndarray: Each row is the topic probability distribution of a document.
                    Array is sorted in the order returned by self.get_docs.
                    
    &#34;&#34;&#34;
    docs = self.get_docs(topic_ids=topic_ids, doc_ids=doc_ids)
    return np.array([self.doc_topics[idx] for idx in [x[&#39;doc_id&#39;] for x in docs]])</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.get_document_topic_distribution"><code class="name flex">
<span>def <span class="ident">get_document_topic_distribution</span></span>(<span>self)</span>
</code></dt>
<dd>
<div class="desc"><p>Gets the document-topic distribution.
Each row is a document and each column is a topic
The output of this method is equivalent to invoking get_doctopics with no arguments.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_document_topic_distribution(self):
    &#34;&#34;&#34;
    Gets the document-topic distribution.
    Each row is a document and each column is a topic
    The output of this method is equivalent to invoking get_doctopics with no arguments.
    &#34;&#34;&#34;
    self._check_build()
    return self.doc_topics</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.get_sorted_docs"><code class="name flex">
<span>def <span class="ident">get_sorted_docs</span></span>(<span>self, topic_id)</span>
</code></dt>
<dd>
<div class="desc"><p>Returns all docs sorted by relevance to &lt;topic_id&gt;.
Unlike get_docs, this ranks documents by the supplied topic_id rather
than the topic_id to which document is most relevant.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_sorted_docs(self, topic_id):
    &#34;&#34;&#34;
    Returns all docs sorted by relevance to &lt;topic_id&gt;.
    Unlike get_docs, this ranks documents by the supplied topic_id rather
    than the topic_id to which document is most relevant.
    &#34;&#34;&#34;
    docs = self.get_docs()
    d = {}
    for doc in docs: d[doc[&#39;doc_id&#39;]] = doc
    m = self.get_document_topic_distribution()
    doc_ids = (-m[:,topic_id]).argsort()
    return [d[doc_id] for doc_id in doc_ids]</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.get_texts"><code class="name flex">
<span>def <span class="ident">get_texts</span></span>(<span>self, topic_ids=[])</span>
</code></dt>
<dd>
<div class="desc"><p>Returns texts for documents
with primary topic that is one of &lt;topic_ids&gt;</p>
<h2 id="args">Args</h2>
<p>topic_ids(list of ints): list of topic IDs</p>
<h2 id="returns">Returns</h2>
<p>list of str</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_texts(self,  topic_ids=[]):
    &#34;&#34;&#34;
    Returns texts for documents
    with primary topic that is one of &lt;topic_ids&gt;
    Args:
        topic_ids(list of ints): list of topic IDs
    Returns:
        list of str
    &#34;&#34;&#34;
    if not topic_ids: topic_ids = list(range(self.n_topics))
    docs = self.get_docs(topic_ids)
    return [x[0] for x in docs]</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.get_topics"><code class="name flex">
<span>def <span class="ident">get_topics</span></span>(<span>self, n_words=10, as_string=True)</span>
</code></dt>
<dd>
<div class="desc"><p>Returns a list of discovered topics</p>
<h2 id="args">Args</h2>
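<dl>
<dt><strong><code>n_words</code></strong> :&ensp;<code>int</code></dt>
<dd>number of words to use in topic summary</dd>
<dt><strong><code>as_string</code></strong> :&ensp;<code>bool</code></dt>
<dd>If True, each summary is a space-delimited string instead of list of words</dd>
</dl></div>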
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_topics(self, n_words=10, as_string=True):
    &#34;&#34;&#34;
    Returns a list of discovered topics
    Args:
        n_words(int): number of words to use in topic summary
        as_string(bool): If True, each summary is a space-delimited string instead of list of words
    &#34;&#34;&#34;
    self._check_model()
    feature_names = self.vectorizer.get_feature_names()
    topic_summaries = []
    for topic_idx, topic in enumerate(self.model.components_):
        summary = [feature_names[i] for i in topic.argsort()[:-n_words - 1:-1]]
        if as_string: summary = &#34; &#34;.join(summary)
        topic_summaries.append(summary)
    return topic_summaries</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.get_word_weights"><code class="name flex">
<span>def <span class="ident">get_word_weights</span></span>(<span>self, topic_id, n_words=100)</span>
</code></dt>
<dd>
<div class="desc"><p>Returns a list of tuples of the form: (word, weight) for given topic_id.
The weight can be interpreted as the number of times word was assigned to topic with given topic_id.
REFERENCE: <a href="https://stackoverflow.com/a/48890889/13550699">https://stackoverflow.com/a/48890889/13550699</a></p>
<h2 id="args">Args</h2>
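<dl>
<dt><strong><code>topic_id</code></strong> :&ensp;<code>int</code></dt>
<dd>topic ID</dd>
<dt><strong><code>n_words</code></strong> :&ensp;<code>int</code></dt>
<dd>number of top words</dd>
</dl></div>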
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_word_weights(self, topic_id, n_words=100):
    &#34;&#34;&#34;
    Returns a list of tuples of the form: (word, weight) for given topic_id.
    The weight can be interpreted as the number of times word was assigned to topic with given topic_id.
    REFERENCE: https://stackoverflow.com/a/48890889/13550699
    Args:
        topic_id(int): topic ID
        n_words(int): number of top words
    &#34;&#34;&#34;
    self._check_model()
    if topic_id+1 &gt; len(self.model.components_): 
        raise ValueError(&#39;topic_id must be less than %s&#39; % (len(self.model.components_)))
    feature_names = self.vectorizer.get_feature_names()
    word_probs = self.model.components_[topic_id]
    word_ids = [i for i in word_probs.argsort()[:-n_words - 1:-1]]
    words = [feature_names[i] for i in word_ids]
    probs = [word_probs[i] for i in word_ids]
    return list( zip(words, probs) )</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.predict"><code class="name flex">
<span>def <span class="ident">predict</span></span>(<span>self, texts, threshold=None, harden=False)</span>
</code></dt>
<dd>
<div class="desc"><h2 id="args">Args</h2>
<dl>
<dt><strong><code>texts</code></strong> :&ensp;<code>list</code> of <code>str</code></dt>
<dd>list of texts</dd>
<dt><strong><code>threshold</code></strong> :&ensp;<code>float</code></dt>
<dd>If not None, documents with maximum topic scores
less than &lt;threshold&gt; are filtered out</dd>
<dt><strong><code>harden</code></strong> :&ensp;<code>bool</code></dt>
<dd>If True, each document is assigned to a single topic for which
it has the highest score</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt>if threshold is None: <code>np.ndarray</code></dt>
<dd>topic distribution for each text document</dd>
<dt>else: <code>(np.ndarray, np.ndarray)</code></dt>
<dd>topic distribution and boolean array
(True for each document whose maximum topic score is above the threshold)</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def predict(self, texts, threshold=None, harden=False):
    &#34;&#34;&#34;
    Args:
        texts (list of str): list of texts
        threshold (float): If not None, documents with maximum topic scores
                            less than &lt;threshold&gt; are filtered out
        harden(bool): If True, each document is assigned to a single topic for which
                      it has the highest score
    Returns:
        if threshold is None:
            np.ndarray: topic distribution for each text document
        else:
            (np.ndarray, np.ndarray): topic distribution and boolean array
    &#34;&#34;&#34;
    self._check_model()
    transformed_texts = self.vectorizer.transform(texts)
    X_topics = self.model.transform(transformed_texts)
    #if self.model_type == &#39;nmf&#39;:
        #scores = np.matrix(X_topics)
        #scores_normalized= scores/scores.sum(axis=1)
        #X_topics = scores_normalized
    _idx = np.array([True] * len(texts))
    if threshold is not None:
        _idx = np.amax(X_topics, axis=1) &gt; threshold  # idx of doc that above the threshold
        _idx = np.array(_idx)
        X_topics = X_topics[_idx]
    if harden: X_topics = self._harden_topics(X_topics)
    if threshold is not None:
        return (X_topics, _idx)
    else:
        return X_topics</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.print_topics"><code class="name flex">
<span>def <span class="ident">print_topics</span></span>(<span>self, n_words=10, show_counts=False)</span>
</code></dt>
<dd>
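<div class="desc"><p>Prints topics.</p>
<dl>
<dt><strong><code>n_words</code></strong> :&ensp;<code>int</code></dt>
<dd>number of words to describe each topic</dd>
<dt><strong><code>show_counts</code></strong> :&ensp;<code>bool</code></dt>
<dd>If True, print topics with document counts, where
the count is the number of documents with that topic as primary.</dd>
</dl></div>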
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def print_topics(self, n_words=10, show_counts=False):
    &#34;&#34;&#34;
    print topics
    n_words(int): number of words to describe each topic
    show_counts(bool): If True, print topics with document counts, where
                       the count is the number of documents with that topic as primary.
    &#34;&#34;&#34;
    topics = self.get_topics(n_words=n_words, as_string=True)
    if show_counts:
        self._check_build()
        topic_counts = sorted([ (k, topics[k], len(v)) for k,v in self.topic_dict.items()], 
                                key=lambda kv:kv[-1], reverse=True)
        for (idx, topic, count) in topic_counts:
            print(&#34;topic:%s | count:%s | %s&#34; %(idx, count, topic))
    else:
        for i, t in enumerate(topics):
            print(&#39;topic %s | %s&#39; % (i, t))
    return</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.recommend"><code class="name flex">
<span>def <span class="ident">recommend</span></span>(<span>self, text=None, doc_topic=None, n=5, n_neighbors=100)</span>
</code></dt>
<dd>
<div class="desc"><p>Given an example document, recommends documents similar to it
from the set of documents supplied to build().</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>text</code></strong> :&ensp;<code>str</code></dt>
<dd>text of the example document.
Mutually-exclusive with &lt;doc_topic&gt;</dd>
<dt><strong><code>doc_topic</code></strong> :&ensp;<code>ndarray</code></dt>
<dd>pre-computed topic distribution for the example document.
Mutually-exclusive with &lt;text&gt;.</dd>
<dt><strong><code>n</code></strong> :&ensp;<code>int</code></dt>
<dd>number of recommendations to return</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>list</code> of <code>dicts</code></dt>
<dd>each dict is a document as returned by get_docs, with keys:
'text', 'doc_id', 'topic_proba', 'topic_id'</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def recommend(self, text=None, doc_topic=None, n=5, n_neighbors=100):
    &#34;&#34;&#34;
    Given an example document, recommends documents similar to it
    from the set of documents supplied to build().

    Args:
        texts(list of str): list of document texts.  Mutually-exclusive with &lt;doc_topics&gt;
        doc_topics(ndarray): pre-computed topic distribution for each document in texts.
                             Mutually-exclusive with &lt;texts&gt;.
        n (int): number of recommendations to return
    Returns:
        list of tuples: each tuple is of the form:
                        (text, doc_id, topic_probability, topic_id)

    &#34;&#34;&#34;
    # error-checks
    if text is not None and doc_topic is not None:
        raise ValueError(&#39;text is mutually-exclusive with doc_topic&#39;)
    if text is None and doc_topic is None:
        raise ValueError(&#39;One of text or doc_topic is required.&#39;)
    if text is not None and type(text) not in [str]:
        raise ValueError(&#39;text must be a str &#39;)
    if  doc_topic is not None and type(doc_topic) not in [np.ndarray]:
        raise ValueError(&#39;doc_topic must be a np.ndarray&#39;)

    if n &gt; n_neighbors: n_neighbors = n

    x_test = [doc_topic]
    if text:
        x_test = self.predict([text])
    docs = self.get_docs()
    indices = self.recommender.kneighbors(x_test, return_distance=False, n_neighbors=n_neighbors)
    results = [doc for i, doc in enumerate(docs) if i in indices]
    return results[:n]</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.save"><code class="name flex">
<span>def <span class="ident">save</span></span>(<span>self, fname)</span>
</code></dt>
<dd>
<div class="desc"><p>save TopicModel object</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def save(self, fname):
    &#34;&#34;&#34;
    save TopicModel object
    &#34;&#34;&#34;

    
    with open(fname+&#39;.tm_vect&#39;, &#39;wb&#39;) as f:
        pickle.dump(self.vectorizer, f)
    with open(fname+&#39;.tm_model&#39;, &#39;wb&#39;) as f:
        pickle.dump(self.model, f)
    params = {&#39;n_topics&#39;: self.n_topics,
              &#39;n_features&#39;: self.n_features,
              &#39;verbose&#39;: self.verbose}
    with open(fname+&#39;.tm_params&#39;, &#39;wb&#39;) as f:
        pickle.dump(params, f)

    return</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.score"><code class="name flex">
<span>def <span class="ident">score</span></span>(<span>self, texts=None, doc_topics=None)</span>
</code></dt>
<dd>
<div class="desc"><p>Given a new set of documents (supplied as texts or doc_topics), the score method
uses a One-Class classifier to score documents based on similarity to a
seed set of documents (where seed set is computed by train_scorer() method).</p>
<p>Higher scores indicate a higher degree of similarity.
Positive values represent a binary decision of similar.
Negative values represent a binary decision of dissimilar.
In practice, negative scores closer to zero will also be similar as One-Class
classifiers are more strict than traditional binary classifiers.
Documents with negative scores closer to zero are good candidates for
inclusion in a training set for binary classification (e.g., active labeling).</p>
<p>NOTE: The score method currently employs the use of LocalOutlierFactor, which
means you should not try to score documents that were used in training. Only
new, unseen documents should be scored for similarity.</p>
<h2 id="args">Args</h2>
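<dl>
<dt><strong><code>texts</code></strong> :&ensp;<code>list</code> of <code>str</code></dt>
<dd>list of document texts.
Mutually-exclusive with &lt;doc_topics&gt;</dd>
<dt><strong><code>doc_topics</code></strong> :&ensp;<code>ndarray</code></dt>
<dd>pre-computed topic distribution for each document in texts.
Mutually-exclusive with &lt;texts&gt;.</dd>
</dl>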
<h2 id="returns">Returns</h2>
<dl>
<dt><code>list</code> of <code>floats</code></dt>
<dd>larger values indicate higher degree of similarity
positive values indicate a binary decision of similar
negative values indicate binary decision of dissimilar
In practice, negative scores closer to zero will also
be similar as One-class classifiers are more strict
than traditional binary classifiers.</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def score(self, texts=None, doc_topics=None):
    &#34;&#34;&#34;
    Given a new set of documents (supplied as texts or doc_topics), the score method
    uses a One-Class classifier to score documents based on similarity to a
    seed set of documents (where seed set is computed by train_scorer() method).

    Higher scores indicate a higher degree of similarity.
    Positive values represent a binary decision of similar.
    Negative values represent a binary decision of dissimilar.
    In practice, negative scores closer to zero will also be similar as One-Class
    classifiers are more strict than traditional binary classifiers.
    Documents with negative scores closer to zero are good candidates for
    inclusion in a training set for binary classification (e.g., active labeling).

    NOTE: The score method currently employs the use of LocalOutlierFactor, which
    means you should not try to score documents that were used in training. Only
    new, unseen documents should be scored for similarity.

    Args:
        texts(list of str): list of document texts.  Mutually-exclusive with &lt;doc_topics&gt;
        doc_topics(ndarray): pre-computed topic distribution for each document in texts.
                             Mutually-exclusive with &lt;texts&gt;.
    Returns:
        list of floats:  larger values indicate higher degree of similarity
                         positive values indicate a binary decision of similar
                         negative values indicate binary decision of dissimilar
                         In practice, negative scores closer to zero will also 
                         be similar as One-class classifiers are more strict
                         than traditional binary classifiers.

    &#34;&#34;&#34;
    # error-checks
    if texts is not None and doc_topics is not None:
        raise ValueError(&#39;texts is mutually-exclusive with doc_topics&#39;)
    if texts is None and doc_topics is None:
        raise ValueError(&#39;One of texts or doc_topics is required.&#39;)
    if texts is not None and type(texts) not in [list, np.ndarray]:
        raise ValueError(&#39;texts must be either a list or numpy ndarray&#39;)
    if  doc_topics is not None and type(doc_topics) not in [np.ndarray]:
        raise ValueError(&#39;doc_topics must be a np.ndarray&#39;)

    x_test = doc_topics
    if texts:
        x_test = self.predict(texts)
    return self.scorer.decision_function(x_test)</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.search"><code class="name flex">
<span>def <span class="ident">search</span></span>(<span>self, query, topic_ids=[], doc_ids=[], case_sensitive=False)</span>
</code></dt>
<dd>
<div class="desc"><p>search documents for query string.</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>query</code></strong> :&ensp;<code>str</code></dt>
<dd>the word or phrase to search</dd>
<dt><strong><code>topic_ids</code></strong> :&ensp;<code>list</code> of <code>ints</code></dt>
<dd>list of topic IDs where each id is in the range
of range(self.n_topics).</dd>
<dt><strong><code>doc_ids</code></strong> :&ensp;<code>list</code> of <code>ints</code></dt>
<dd>list of document IDs where each id is an index
into self.doctopics</dd>
<dt><strong><code>case_sensitive</code></strong> :&ensp;<code>bool</code></dt>
<dd>If True, case sensitive search</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def search(self, query, topic_ids=[], doc_ids=[], case_sensitive=False):
    &#34;&#34;&#34;
    search documents for query string.
    Args:
        query(str):  the word or phrase to search
        topic_ids(list of ints): list of topic IDs where each id is in the range
                                 of range(self.n_topics).
        doc_ids (list of ints): list of document IDs where each id is an index
                                into self.doctopics
        case_sensitive(bool):  If True, case sensitive search
    &#34;&#34;&#34;

    # setup pattern
    if not case_sensitive: query = query.lower()
    pattern = re.compile(r&#39;\b%s\b&#39; % query)

    # retrieve docs
    docs = self.get_docs(topic_ids=topic_ids, doc_ids=doc_ids)

    # search
    mb = master_bar(range(1))
    results = []
    for i in mb:
        for doc in progress_bar(docs, parent=mb):
            text = doc[&#39;text&#39;]
            if not case_sensitive: text = text.lower()
            matches = pattern.findall(text)
            if matches: results.append(doc)
        if self.verbose: mb.write(&#39;done.&#39;)
    return results</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.train"><code class="name flex">
<span>def <span class="ident">train</span></span>(<span>self, texts, model_type='lda', n_topics=None, n_features=10000, min_df=5, max_df=0.5, stop_words='english', lda_max_iter=5, lda_mode='online', token_pattern=None, hyperparam_kwargs=None)</span>
</code></dt>
<dd>
<div class="desc"><p>Fits a topic model to documents in &lt;texts&gt;.</p>
<h2 id="example">Example</h2>
<p>tm = ktrain.text.get_topic_model(docs, n_topics=20,
n_features=1000, min_df=2, max_df=0.95)</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>texts</code></strong> :&ensp;<code>list</code> of <code>str</code></dt>
<dd>list of texts</dd>
<dt><strong><code>n_topics</code></strong> :&ensp;<code>int</code></dt>
<dd>number of topics.
If None, n_topics = min{400, sqrt[# documents/2]})</dd>
<dt><strong><code>n_features</code></strong> :&ensp;<code>int</code></dt>
<dd>maximum words to consider</dd>
<dt><strong><code>max_df</code></strong> :&ensp;<code>float</code></dt>
<dd>words in more than max_df proportion of docs discarded</dd>
<dt><strong><code>stop_words</code></strong> :&ensp;<code>str</code> or <code>list</code></dt>
<dd>either 'english' for built-in stop words or
a list of stop words to ignore</dd>
<dt><strong><code>lda_max_iter</code></strong> :&ensp;<code>int</code></dt>
<dd>maximum iterations for 'lda'.
5 is default if using lda_mode='online'.
If lda_mode='batch', this should be increased (e.g., 1500).
Ignored if model_type != 'lda'</dd>
<dt><strong><code>lda_mode</code></strong> :&ensp;<code>str</code></dt>
<dd>one of {'online', 'batch'}. Ignored if model_type != 'lda'</dd>
<dt><strong><code>token_pattern</code></strong> :&ensp;<code>str</code></dt>
<dd>regex pattern to use to tokenize documents.
If None, a default tokenizer will be used</dd>
<dt><strong><code>hyperparam_kwargs</code></strong> :&ensp;<code>dict</code></dt>
<dd>hyperparameters for LDA/NMF.
Keys in this dict can be any of the following:
<ul>
<li>alpha: alpha for LDA. default: 5./n_topics</li>
<li>beta: beta for LDA. default: 0.01</li>
<li>nmf_alpha: alpha for NMF. default: 0</li>
<li>l1_ratio: l1_ratio for NMF. default: 0</li>
<li>ngram_range: whether to consider bigrams, trigrams. default: (1,1)</li>
</ul></dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>tuple</code></dt>
<dd>(model, vectorizer)</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def train(self,texts, model_type=&#39;lda&#39;, n_topics=None, n_features=10000,
          min_df=5, max_df=0.5,  stop_words=&#39;english&#39;,
          lda_max_iter=5, lda_mode=&#39;online&#39;,
          token_pattern=None, hyperparam_kwargs=None):
    &#34;&#34;&#34;
    Fits a topic model to documents in &lt;texts&gt;.
    Example:
        tm = ktrain.text.get_topic_model(docs, n_topics=20, 
                                        n_features=1000, min_df=2, max_df=0.95)
    Args:
        texts (list of str): list of texts
        n_topics (int): number of topics.
                        If None, n_topics = min{400, sqrt[# documents/2]})
        n_features (int):  maximum words to consider
        max_df (float): words in more than max_df proportion of docs discarded
        stop_words (str or list): either &#39;english&#39; for built-in stop words or
                                  a list of stop words to ignore
        lda_max_iter (int): maximum iterations for &#39;lda&#39;.  5 is default if using lda_mode=&#39;online&#39;.
                            If lda_mode=&#39;batch&#39;, this should be increased (e.g., 1500).
                            Ignored if model_type != &#39;lda&#39;
        lda_mode (str):  one of {&#39;online&#39;, &#39;batch&#39;}. Ignored if model_type !=&#39;lda&#39;
        token_pattern(str): regex pattern to use to tokenize documents. 
                            If None, a default tokenizer will be used
        hyperparam_kwargs(dict): hyperparameters for LDA/NMF
                                 Keys in this dict can be any of the following:
                                     alpha: alpha for LDA  default: 5./n_topics
                                     beta: beta for LDA.  default:0.01
                                     nmf_alpha: alpha for NMF.  default:0
                                     l1_ratio: l1_ratio for NMF. default: 0
                                     ngram_range:  whether to consider bigrams, trigrams. default: (1,1) 
                                
    Returns:
        tuple: (model, vectorizer)
    &#34;&#34;&#34;
    if hyperparam_kwargs is None:
        hyperparam_kwargs = {}
    alpha = hyperparam_kwargs.get(&#39;alpha&#39;, 5.0 / n_topics)
    beta = hyperparam_kwargs.get(&#39;beta&#39;, 0.01)
    nmf_alpha = hyperparam_kwargs.get(&#39;nmf_alpha&#39;, 0)
    l1_ratio = hyperparam_kwargs.get(&#39;l1_ratio&#39;, 0)
    ngram_range = hyperparam_kwargs.get(&#39;ngram_range&#39;, (1,1))

    # adjust defaults based on language detected
    if texts is not None:
        lang = TU.detect_lang(texts)
        if lang != &#39;en&#39;:
            stopwords = None if stop_words==&#39;english&#39; else stop_words
            token_pattern = r&#39;(?u)\b\w+\b&#39; if token_pattern is None else token_pattern
        if pp.is_nospace_lang(lang):
            text_list = []
            for t in texts:
                text_list.append(&#39; &#39;.join(jieba.cut(t, HMM=False)))
            texts = text_list
        if self.verbose: print(&#39;lang: %s&#39; % (lang))


    # preprocess texts
    if self.verbose: print(&#39;preprocessing texts...&#39;)
    if token_pattern is None: token_pattern = TU.DEFAULT_TOKEN_PATTERN
    #if token_pattern is None: token_pattern = r&#39;(?u)\b\w\w+\b&#39;
    vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                             max_features=n_features, stop_words=stop_words,
                             token_pattern=token_pattern, ngram_range=ngram_range)
    

    x_train = vectorizer.fit_transform(texts)

    # fit model

    if self.verbose: print(&#39;fitting model...&#39;)
    if model_type == &#39;lda&#39;:
        model = LatentDirichletAllocation(n_components=n_topics, max_iter=lda_max_iter,
                                          learning_method=lda_mode, learning_offset=50.,
                                          doc_topic_prior=alpha,
                                          topic_word_prior=beta,
                                          verbose=self.verbose, random_state=0)
    elif model_type == &#39;nmf&#39;:
        model = NMF(
            n_components=n_topics,
            max_iter=lda_max_iter,
            verbose=self.verbose,
            alpha=nmf_alpha,
            l1_ratio=l1_ratio,
            random_state=0)
    else:
        raise ValueError(&#34;unknown model type:&#34;, str(model_type))
    model.fit(x_train)

    # save model and vectorizer and hyperparameter settings
    return (model, vectorizer)</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.train_recommender"><code class="name flex">
<span>def <span class="ident">train_recommender</span></span>(<span>self, n_neighbors=20, metric='minkowski', p=2)</span>
</code></dt>
<dd>
<div class="desc"><p>Trains a recommender that, given a single document, will return
documents in the corpus that are semantically similar to it.</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>n_neighbors</code></strong> :&ensp;<code>int</code></dt>
<dd>&nbsp;</dd>
</dl>
<h2 id="returns">Returns</h2>
<p>None</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def train_recommender(self, n_neighbors=20, metric=&#39;minkowski&#39;, p=2):
    &#34;&#34;&#34;
    Trains a recommender that, given a single document, will return
    documents in the corpus that are semantically similar to it.

    Args:
        n_neighbors (int): 
    Returns:
        None
    &#34;&#34;&#34;
    from sklearn.neighbors import NearestNeighbors
    rec = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, p=p)
    probs = self.get_doctopics()
    rec.fit(probs)
    self.recommender = rec
    return</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.train_scorer"><code class="name flex">
<span>def <span class="ident">train_scorer</span></span>(<span>self, topic_ids=[], doc_ids=[], n_neighbors=20)</span>
</code></dt>
<dd>
<div class="desc"><p>Trains a scorer that can score documents based on similarity to a
seed set of documents represented by topic_ids and doc_ids.</p>
<p>NOTE: The score method currently employs the use of LocalOutlierFactor, which
means you should not try to score documents that were used in training. Only
new, unseen documents should be scored for similarity.
REFERENCE:
<a href="https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor">https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor</a></p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>topic_ids</code></strong> :&ensp;<code>list</code> of <code>ints</code></dt>
<dd>list of topic IDs where each id is in the range
of range(self.n_topics).
Documents associated
with these topic_ids will be used as seed set.</dd>
<dt><strong><code>doc_ids</code></strong> :&ensp;<code>list</code> of <code>ints</code></dt>
<dd>list of document IDs where each id is an index
into self.doctopics.
Documents associated
with these doc_ids will be used as seed set.</dd>
</dl>
<h2 id="returns">Returns</h2>
<p>None</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def train_scorer(self, topic_ids=[], doc_ids=[], n_neighbors=20):
    &#34;&#34;&#34;
    Trains a scorer that can score documents based on similarity to a
    seed set of documents represented by topic_ids and doc_ids.

    NOTE: The score method currently employs the use of LocalOutlierFactor, which
    means you should not try to score documents that were used in training. Only
    new, unseen documents should be scored for similarity. 
    REFERENCE: 
    https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor

    Args:
        topic_ids(list of ints): list of topic IDs where each id is in the range
                                 of range(self.n_topics).  Documents associated
                                 with these topic_ids will be used as seed set.
        doc_ids (list of ints): list of document IDs where each id is an index
                                into self.doctopics.  Documents associated 
                                with these doc_ids will be used as seed set.
    Returns:
        None
    &#34;&#34;&#34;
    from sklearn.neighbors import LocalOutlierFactor
    clf = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=True, contamination=0.1)
    probs = self.get_doctopics(topic_ids=topic_ids, doc_ids=doc_ids)
    clf.fit(probs)
    self.scorer = clf
    return</code></pre>
</details>
</dd>
<dt id="ktrain.text.TopicModel.visualize_documents"><code class="name flex">
<span>def <span class="ident">visualize_documents</span></span>(<span>self, texts=None, doc_topics=None, width=700, height=700, point_size=5, title='Document Visualization', extra_info={}, colors=None, filepath=None)</span>
</code></dt>
<dd>
<div class="desc"><p>Generates a visualization of a set of documents based on model.
If &lt;texts&gt; is supplied, raw documents will be first transformed into document-topic
matrix.
If &lt;doc_topics&gt; is supplied, then this will be used for visualization instead.</p>
<h2 id="args">Args</h2>
<p>texts(list of str): list of document texts.
Mutually-exclusive with &lt;doc_topics&gt;.
doc_topics(ndarray): pre-computed topic distribution for each document in texts.
Mutually-exclusive with &lt;texts&gt;.
width(int): width of image
height(int): height of image
point_size(int): size of circles in plot
title(str):
title of visualization
extra_info(dict of lists): User-supplied information for each datapoint (attributes of the datapoint).
The keys are field names.
The values are lists - each of which must
have the same number of elements as &lt;texts&gt; or &lt;doc_topics&gt;. These fields are displayed
when hovering over datapoints in the visualization.
colors(list of str):
list of Hex color codes for each datapoint.
Length of list must match either len(texts) or doc_topics.shape[0]
filepath(str):
Optional filepath to save the interactive visualization</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def visualize_documents(self, texts=None, doc_topics=None, 
                        width=700, height=700, point_size=5, title=&#39;Document Visualization&#39;,
                        extra_info={},
                        colors=None,
                        filepath=None,):
    &#34;&#34;&#34;
    Generates a visualization of a set of documents based on model.
    If &lt;texts&gt; is supplied, raw documents will be first transformed into document-topic
    matrix.  If &lt;doc_topics&gt; is supplied, then this will be used for visualization instead.
    Args:
        texts(list of str): list of document texts.  Mutually-exclusive with &lt;doc_topics&gt;
        doc_topics(ndarray): pre-computed topic distribution for each document in texts.
                             Mutually-exclusive with &lt;texts&gt;.
        width(int): width of image
        height(int): height of image
        point_size(int): size of circles in plot
        title(str):  title of visualization
        extra_info(dict of lists): A user-supplied information for each datapoint (attributes of the datapoint).
                                   The keys are field names.  The values are lists - each of which must
                                   be the same number of elements as &lt;texts&gt; or &lt;doc_topics&gt;. These fields are displayed
                                   when hovering over datapoints in the visualization.
        colors(list of str):  list of Hex color codes for each datapoint.
                              Length of list must match either len(texts) or doc_topics.shape[0]
        filepath(str):             Optional filepath to save the interactive visualization
    &#34;&#34;&#34;

    # error-checking
    if texts is not None: length = len(texts)
    else: length = doc_topics.shape[0]
    if colors is not None and len(colors) != length:
        raise ValueError(&#39;length of colors is not consistent with length of texts or doctopics&#39;)
    if texts is not None and doc_topics is not None:
        raise ValueError(&#39;texts is mutually-exclusive with doc_topics&#39;)
    if texts is None and doc_topics is None:
        raise ValueError(&#39;One of texts or doc_topics is required.&#39;)
    if extra_info:
        invalid_keys = [&#39;x&#39;, &#39;y&#39;, &#39;topic&#39;, &#39;fill_color&#39;]
        for k in extra_info.keys():
            if k in invalid_keys:
                raise ValueError(&#39;cannot use &#34;%s&#34; as key in extra_info&#39; %(k))
            lst = extra_info[k]
            if len(lst) != length:
                raise ValueError(&#39;texts and extra_info lists must be same size&#39;)

    # check fo bokeh
    try:
        import bokeh.plotting as bp
        from bokeh.plotting import save
        from bokeh.models import HoverTool
        from bokeh.io import output_notebook
    except:
        warnings.warn(&#39;visualize_documents method requires bokeh package: pip install bokeh&#39;)
        return

    # prepare data
    if doc_topics is not None:
        X_topics = doc_topics
    else:
        if self.verbose:  print(&#39;transforming texts...&#39;, end=&#39;&#39;)
        X_topics = self.predict(texts, harden=False)
        if self.verbose: print(&#39;done.&#39;)

    # reduce to 2-D
    if self.verbose:  print(&#39;reducing to 2 dimensions...&#39;, end=&#39;&#39;)
    tsne_model = TSNE(n_components=2, verbose=self.verbose, random_state=0, angle=.99, init=&#39;pca&#39;)
    tsne_lda = tsne_model.fit_transform(X_topics)
    print(&#39;done.&#39;)

    # get random colormap
    colormap = U.get_random_colors(self.n_topics)

    # generate inline visualization in Jupyter notebook
    lda_keys = self._harden_topics(X_topics)
    if colors is None: colors = colormap[lda_keys]
    topic_summaries = self.get_topics(n_words=5)
    os.environ[&#34;BOKEH_RESOURCES&#34;]=&#34;inline&#34;
    output_notebook()
    dct = { 
            &#39;x&#39;:tsne_lda[:,0],
            &#39;y&#39;:tsne_lda[:, 1],
            &#39;topic&#39;:[topic_summaries[tid] for tid in lda_keys],
            &#39;fill_color&#39;:colors,}
    tool_tups = [(&#39;index&#39;, &#39;$index&#39;),
                 (&#39;(x,y)&#39;,&#39;($x,$y)&#39;),
                 (&#39;topic&#39;, &#39;@topic&#39;)]
    for k in extra_info.keys():
        dct[k] = extra_info[k]
        tool_tups.append((k, &#39;@&#39;+k))

    source = bp.ColumnDataSource(data=dct)
    hover = HoverTool( tooltips=tool_tups)
    p = bp.figure(plot_width=width, plot_height=height, 
                  tools=[hover, &#39;save&#39;, &#39;pan&#39;, &#39;wheel_zoom&#39;, &#39;box_zoom&#39;, &#39;reset&#39;],
                  #tools=&#34;pan,wheel_zoom,box_zoom,reset,hover,previewsave&#34;,
                  title=title)
    #plot_lda = bp.figure(plot_width=1400, plot_height=1100,
                       #title=title,
                       #tools=&#34;pan,wheel_zoom,box_zoom,reset,hover,previewsave&#34;,
                       #x_axis_type=None, y_axis_type=None, min_border=1)
    p.circle(&#39;x&#39;, &#39;y&#39;, size=point_size, source=source, fill_color= &#39;fill_color&#39;)
    bp.show(p)
    if filepath is not None:
        bp.output_file(filepath)
        bp.save(p)
    return</code></pre>
</details>
</dd>
</dl>
</dd>
<dt id="ktrain.text.Transformer"><code class="flex name class">
<span>class <span class="ident">Transformer</span></span>
<span>(</span><span>model_name, maxlen=128, class_names=[], classes=[], batch_size=None, use_with_learner=True)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>convenience class for text classification with Hugging Face transformers
Usage:
   t = Transformer('distilbert-base-uncased', maxlen=128, classes=['neg', 'pos'], batch_size=16)
   train_dataset = t.preprocess_train(train_texts, train_labels)
   model = t.get_classifier()
   model.fit(train_dataset)
</code></pre>
<pre><code>Args:
    model_name (str):  name of Hugging Face pretrained model
    maxlen (int):  sequence length
    class_names(list):  list of strings of class names (e.g., 'positive', 'negative').
                        The index position of string is the class ID.
                        Not required for:
                          - regression problems
                          - binary/multi classification problems where
                            labels in y_train/y_test are in string format.
                            In this case, classes will be populated automatically.
                            get_classes() can be called to view discovered class labels.
                        The class_names argument replaces the old classes argument.
    classes(list):  alias for class_names.  Included for backwards-compatibility.

    use_with_learner(bool):  If False, preprocess_train and preprocess_test
                             will return tf.Datasets for direct use with model.fit
                             in tf.Keras.
                             If True, preprocess_train and preprocess_test will
                             return a ktrain TransformerDataset object for use with
                             ktrain.get_learner.
    batch_size (int): batch_size - only required if use_with_learner=False


</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class Transformer(TransformersPreprocessor):
    &#34;&#34;&#34;
    ```
    convenience class for text classification Hugging Face transformers 
    Usage:
       t = Transformer(&#39;distilbert-base-uncased&#39;, maxlen=128, classes=[&#39;neg&#39;, &#39;pos&#39;], batch_size=16)
       train_dataset = t.preprocess_train(train_texts, train_labels)
       model = t.get_classifier()
       model.fit(train_dataset)
    ```
    &#34;&#34;&#34;

    def __init__(self, model_name, maxlen=128, class_names=[], classes=[],
                 batch_size=None, use_with_learner=True):
        &#34;&#34;&#34;
        ```
        Args:
            model_name (str):  name of Hugging Face pretrained model
            maxlen (int):  sequence length
            class_names(list):  list of strings of class names (e.g., &#39;positive&#39;, &#39;negative&#39;).
                                The index position of string is the class ID.
                                Not required for:
                                  - regression problems
                                  - binary/multi classification problems where
                                    labels in y_train/y_test are in string format.
                                    In this case, classes will be populated automatically.
                                    get_classes() can be called to view discovered class labels.
                                The class_names argument replaces the old classes argument.
            classes(list):  alias for class_names.  Included for backwards-compatiblity.

            use_with_learner(bool):  If False, preprocess_train and preprocess_test
                                     will return tf.Datasets for direct use with model.fit
                                     in tf.Keras.
                                     If True, preprocess_train and preprocess_test will
                                     return a ktrain TransformerDataset object for use with
                                     ktrain.get_learner.
            batch_size (int): batch_size - only required if use_with_learner=False


        ```
        &#34;&#34;&#34;
        multilabel = None # force discovery of multilabel task from data in preprocess_train-&gt;set_multilabel
        class_names = self.migrate_classes(class_names, classes)
        if not use_with_learner and batch_size is None:
            raise ValueError(&#39;batch_size is required when use_with_learner=False&#39;)
        if multilabel and (class_names is None or not class_names):
            raise ValueError(&#39;classes argument is required when multilabel=True&#39;)
        super().__init__(model_name,
                         maxlen, max_features=10000, class_names=class_names, multilabel=multilabel)
        self.batch_size = batch_size
        self.use_with_learner = use_with_learner
        self.lang = None


    def preprocess_train(self, texts, y=None, mode=&#39;train&#39;, verbose=1):
        &#34;&#34;&#34;
        ```
        Preprocess training set for A Transformer model

        Y values can be in one of the following forms:
        1) integers representing the class (index into array returned by get_classes)
           for binary and multiclass text classification.
           If labels are integers, class_names argument to Transformer constructor is required.
        2) strings representing the class (e.g., &#39;negative&#39;, &#39;positive&#39;).
           If labels are strings, class_names argument to Transformer constructor is ignored,
           as class labels will be extracted from y.
        3) multi-hot-encoded vector for multilabel text classification problems
           If labels are multi-hot-encoded, class_names argument to Transformer constructor is requird.
        4) Numerical values for regression problems.
           &lt;class_names&gt; argument to Transformer constructor should NOT be supplied

        Args:
            texts (list of strings): text of documents
            y: labels
            mode (str):  If &#39;train&#39; and prepare_for_learner=False,
                         a tf.Dataset will be returned with repeat enabled
                         for training with fit_generator
            verbose(bool): verbosity
        Returns:
          TransformerDataset if self.use_with_learner = True else tf.Dataset
        ```
        &#34;&#34;&#34;
        tseq = super().preprocess_train(texts, y=y, mode=mode, verbose=verbose)
        if self.use_with_learner: return tseq
        tseq.batch_size = self.batch_size
        train = (mode == &#39;train&#39;)
        return tseq.to_tfdataset(train=train)


    def preprocess_test(self, texts, y=None,  verbose=1):
        &#34;&#34;&#34;
        ```
        Preprocess the validation or test set for a Transformer model
        Y values can be in one of the following forms:
        1) integers representing the class (index into array returned by get_classes)
           for binary and multiclass text classification.
           If labels are integers, class_names argument to Transformer constructor is required.
        2) strings representing the class (e.g., &#39;negative&#39;, &#39;positive&#39;).
           If labels are strings, class_names argument to Transformer constructor is ignored,
           as class labels will be extracted from y.
        3) multi-hot-encoded vector for multilabel text classification problems
           If labels are multi-hot-encoded, class_names argument to Transformer constructor is requird.
        4) Numerical values for regression problems.
           &lt;class_names&gt; argument to Transformer constructor should NOT be supplied

        Args:
            texts (list of strings): text of documents
            y: labels
            verbose(bool): verbosity
        Returns:
            TransformerDataset if self.use_with_learner = True else tf.Dataset
        ```
        &#34;&#34;&#34;
        self.check_trained()
        return self.preprocess_train(texts, y=y, mode=&#39;test&#39;, verbose=verbose)</code></pre>
</details>
<h3>Ancestors</h3>
<ul class="hlist">
<li><a title="ktrain.text.preprocessor.TransformersPreprocessor" href="preprocessor.html#ktrain.text.preprocessor.TransformersPreprocessor">TransformersPreprocessor</a></li>
<li><a title="ktrain.text.preprocessor.TextPreprocessor" href="preprocessor.html#ktrain.text.preprocessor.TextPreprocessor">TextPreprocessor</a></li>
<li><a title="ktrain.preprocessor.Preprocessor" href="../preprocessor.html#ktrain.preprocessor.Preprocessor">Preprocessor</a></li>
<li>abc.ABC</li>
</ul>
<h3>Methods</h3>
<dl>
<dt id="ktrain.text.Transformer.preprocess_test"><code class="name flex">
<span>def <span class="ident">preprocess_test</span></span>(<span>self, texts, y=None, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Preprocess the validation or test set for a Transformer model
Y values can be in one of the following forms:
1) integers representing the class (index into array returned by get_classes)
   for binary and multiclass text classification.
   If labels are integers, class_names argument to Transformer constructor is required.
2) strings representing the class (e.g., 'negative', 'positive').
   If labels are strings, class_names argument to Transformer constructor is ignored,
   as class labels will be extracted from y.
3) multi-hot-encoded vector for multilabel text classification problems
   If labels are multi-hot-encoded, class_names argument to Transformer constructor is required.
4) Numerical values for regression problems.
   &lt;class_names&gt; argument to Transformer constructor should NOT be supplied

Args:
    texts (list of strings): text of documents
    y: labels
    verbose(bool): verbosity
Returns:
    TransformerDataset if self.use_with_learner = True else tf.Dataset
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def preprocess_test(self, texts, y=None,  verbose=1):
    &#34;&#34;&#34;
    ```
    Preprocess the validation or test set for a Transformer model
    Y values can be in one of the following forms:
    1) integers representing the class (index into array returned by get_classes)
       for binary and multiclass text classification.
       If labels are integers, class_names argument to Transformer constructor is required.
    2) strings representing the class (e.g., &#39;negative&#39;, &#39;positive&#39;).
       If labels are strings, class_names argument to Transformer constructor is ignored,
       as class labels will be extracted from y.
    3) multi-hot-encoded vector for multilabel text classification problems
       If labels are multi-hot-encoded, class_names argument to Transformer constructor is requird.
    4) Numerical values for regression problems.
       &lt;class_names&gt; argument to Transformer constructor should NOT be supplied

    Args:
        texts (list of strings): text of documents
        y: labels
        verbose(bool): verbosity
    Returns:
        TransformerDataset if self.use_with_learner = True else tf.Dataset
    ```
    &#34;&#34;&#34;
    self.check_trained()
    return self.preprocess_train(texts, y=y, mode=&#39;test&#39;, verbose=verbose)</code></pre>
</details>
</dd>
<dt id="ktrain.text.Transformer.preprocess_train"><code class="name flex">
<span>def <span class="ident">preprocess_train</span></span>(<span>self, texts, y=None, mode='train', verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Preprocess training set for a Transformer model

Y values can be in one of the following forms:
1) integers representing the class (index into array returned by get_classes)
   for binary and multiclass text classification.
   If labels are integers, class_names argument to Transformer constructor is required.
2) strings representing the class (e.g., 'negative', 'positive').
   If labels are strings, class_names argument to Transformer constructor is ignored,
   as class labels will be extracted from y.
3) multi-hot-encoded vector for multilabel text classification problems
   If labels are multi-hot-encoded, class_names argument to Transformer constructor is required.
4) Numerical values for regression problems.
   &lt;class_names&gt; argument to Transformer constructor should NOT be supplied

Args:
    texts (list of strings): text of documents
    y: labels
    mode (str):  If 'train' and use_with_learner=False,
                 a tf.Dataset will be returned with repeat enabled
                 for training with fit_generator
    verbose(bool): verbosity
Returns:
  TransformerDataset if self.use_with_learner = True else tf.Dataset
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def preprocess_train(self, texts, y=None, mode=&#39;train&#39;, verbose=1):
    &#34;&#34;&#34;
    ```
    Preprocess training set for A Transformer model

    Y values can be in one of the following forms:
    1) integers representing the class (index into array returned by get_classes)
       for binary and multiclass text classification.
       If labels are integers, class_names argument to Transformer constructor is required.
    2) strings representing the class (e.g., &#39;negative&#39;, &#39;positive&#39;).
       If labels are strings, class_names argument to Transformer constructor is ignored,
       as class labels will be extracted from y.
    3) multi-hot-encoded vector for multilabel text classification problems
       If labels are multi-hot-encoded, class_names argument to Transformer constructor is requird.
    4) Numerical values for regression problems.
       &lt;class_names&gt; argument to Transformer constructor should NOT be supplied

    Args:
        texts (list of strings): text of documents
        y: labels
        mode (str):  If &#39;train&#39; and prepare_for_learner=False,
                     a tf.Dataset will be returned with repeat enabled
                     for training with fit_generator
        verbose(bool): verbosity
    Returns:
      TransformerDataset if self.use_with_learner = True else tf.Dataset
    ```
    &#34;&#34;&#34;
    tseq = super().preprocess_train(texts, y=y, mode=mode, verbose=verbose)
    if self.use_with_learner: return tseq
    tseq.batch_size = self.batch_size
    train = (mode == &#39;train&#39;)
    return tseq.to_tfdataset(train=train)</code></pre>
</details>
</dd>
</dl>
<h3>Inherited members</h3>
<ul class="hlist">
<li><code><b><a title="ktrain.text.preprocessor.TransformersPreprocessor" href="preprocessor.html#ktrain.text.preprocessor.TransformersPreprocessor">TransformersPreprocessor</a></b></code>:
<ul class="hlist">
<li><code><a title="ktrain.text.preprocessor.TransformersPreprocessor.get_classifier" href="preprocessor.html#ktrain.text.preprocessor.TransformersPreprocessor.get_classifier">get_classifier</a></code></li>
<li><code><a title="ktrain.text.preprocessor.TransformersPreprocessor.get_regression_model" href="preprocessor.html#ktrain.text.preprocessor.TransformersPreprocessor.get_regression_model">get_regression_model</a></code></li>
<li><code><a title="ktrain.text.preprocessor.TransformersPreprocessor.load_model_and_configure_from_data" href="preprocessor.html#ktrain.text.preprocessor.TransformersPreprocessor.load_model_and_configure_from_data">load_model_and_configure_from_data</a></code></li>
<li><code><a title="ktrain.text.preprocessor.TransformersPreprocessor.print_seqlen_stats" href="preprocessor.html#ktrain.text.preprocessor.TextPreprocessor.print_seqlen_stats">print_seqlen_stats</a></code></li>
<li><code><a title="ktrain.text.preprocessor.TransformersPreprocessor.seqlen_stats" href="preprocessor.html#ktrain.text.preprocessor.TextPreprocessor.seqlen_stats">seqlen_stats</a></code></li>
<li><code><a title="ktrain.text.preprocessor.TransformersPreprocessor.undo" href="preprocessor.html#ktrain.text.preprocessor.TextPreprocessor.undo">undo</a></code></li>
</ul>
</li>
</ul>
</dd>
<dt id="ktrain.text.TransformerEmbedding"><code class="flex name class">
<span>class <span class="ident">TransformerEmbedding</span></span>
<span>(</span><span>model_name, layers=[-2])</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Args:
    model_name (str):  name of Hugging Face pretrained model.
                       Choose from here: https://huggingface.co/transformers/pretrained_models.html
    layers(list): list of indexes indicating which hidden layers to use when
                  constructing the embedding (e.g., last=[-1])

</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class TransformerEmbedding():
    def __init__(self, model_name, layers=U.DEFAULT_TRANSFORMER_LAYERS):
        &#34;&#34;&#34;
        ```
        Args:
            model_name (str):  name of Hugging Face pretrained model.
                               Choose from here: https://huggingface.co/transformers/pretrained_models.html
            layers(list): list of indexes indicating which hidden layers to use when
                          constructing the embedding (e.g., last=[-1])
                               
        ```
        &#34;&#34;&#34;
        self.layers = layers
        self.model_name = model_name
        if model_name.startswith(&#39;xlm-roberta&#39;):
            self.name = &#39;xlm_roberta&#39;
        else:
            self.name = model_name.split(&#39;-&#39;)[0]

        self.config = AutoConfig.from_pretrained(model_name)
        self.model_type = TFAutoModel
        self.tokenizer_type = AutoTokenizer

        if &#34;bert-base-japanese&#34; in model_name:
            self.tokenizer_type = transformers.BertJapaneseTokenizer

        self.tokenizer = self.tokenizer_type.from_pretrained(model_name)
        self.model = self._load_pretrained(model_name)
        try:
            self.embsize = self.embed(&#39;ktrain&#39;, word_level=False).shape[1] # (batch_size, embsize)
        except:
            warnings.warn(&#39;could not determine Embedding size&#39;)
        if type(self.model).__name__ not in [&#39;TFBertModel&#39;, &#39;TFDistilBertModel&#39;, &#39;TFAlbertModel&#39;]:
            raise ValueError(&#39;TransformerEmbedding class currently only supports BERT-style models: &#39; +\
                             &#39;Bert, DistilBert, and Albert and variants like BioBERT and SciBERT\n\n&#39; +\
                             &#39;model received: %s (%s))&#39; % (type(self.model).__name__, model_name))


    def _load_pretrained(self, model_name):
        &#34;&#34;&#34;
        ```
        load pretrained model
        ```
        &#34;&#34;&#34;
        if self.config is not None:
            self.config.output_hidden_states = True
            try:
                model = self.model_type.from_pretrained(model_name, config=self.config)
            except:
                warnings.warn(&#39;Could not find Tensorflow version of model.  Attempting to download/load PyTorch version as TensorFlow model using from_pt=True. &#39; +\
                              &#39;You will need PyTorch installed for this.&#39;)
                try:
                    model = self.model_type.from_pretrained(model_name, config=self.config, from_pt=True)
                except:
                    raise ValueError(&#39;could not load pretrained model %s using both from_pt=False and from_pt=True&#39; % (model_name))
        else:
            model = self.model_type.from_pretrained(model_name, output_hidden_states=True)
        return model


    def embed(self, texts, word_level=True, max_length=512):
        &#34;&#34;&#34;
        ```
        get embedding for word, phrase, or sentence
        Args:
          text(str|list): word, phrase, or sentence or list of them representing a batch
          word_level(bool): If True, returns embedding for each token in supplied texts.
                            If False, returns embedding for each text in texts
          max_length(int): max length of tokens
        Returns:
            np.ndarray : embeddings
        ```
        &#34;&#34;&#34;
        if isinstance(texts, str): texts = [texts]
        if not isinstance(texts[0], str): texts = [&#34; &#34;.join(text) for text in texts]

        sentences = []
        for text in texts:
            sentences.append(self.tokenizer.tokenize(text))
        maxlen = len(max([tokens for tokens in sentences], key=len,)) + 2
        if max_length is not None and maxlen &gt; max_length: maxlen = max_length # added due to issue #270
        sentences = []

        all_input_ids = []
        all_input_masks = []
        for text in texts:
            tokens = self.tokenizer.tokenize(text)
            if len(tokens) &gt; maxlen - 2:
                tokens = tokens[0 : (maxlen - 2)]
            sentences.append(tokens)
            tokens = [self.tokenizer.cls_token] + tokens + [self.tokenizer.sep_token]
            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)
            while len(input_ids) &lt; maxlen:
                input_ids.append(0)
                input_mask.append(0)
            all_input_ids.append(input_ids)
            all_input_masks.append(input_mask)

        all_input_ids = np.array(all_input_ids)
        all_input_masks = np.array(all_input_masks)
        outputs = self.model(all_input_ids, attention_mask=all_input_masks)
        hidden_states = outputs[-1] # output_hidden_states=True

        # compile raw embeddings
        if len(self.layers) == 1:
            #raw_embeddings = hidden_states[-1].numpy()
            raw_embeddings = hidden_states[self.layers[0]].numpy()
        else:
            raw_embeddings = []
            for batch_id in range(hidden_states[0].shape[0]):
                token_embeddings = []
                for token_id in range(hidden_states[0].shape[1]):
                    all_layers = []
                    for layer_id in self.layers:
                        all_layers.append(hidden_states[layer_id][batch_id][token_id].numpy())
                    token_embeddings.append(np.concatenate(all_layers) )  
                raw_embeddings.append(token_embeddings)
            raw_embeddings = np.array(raw_embeddings)

        if not word_level: # sentence-level embedding
            return np.mean(raw_embeddings, axis=1)
            #return np.squeeze(raw_embeddings[:,0:1,:], axis=1)

        # filter-out extra subword tokens and special tokens 
        # (using first subword of each token as embedding representations)
        filtered_embeddings = []
        for batch_idx, tokens in enumerate(sentences):
            embedding = []
            for token_idx, token in enumerate(tokens):
                if token in [self.tokenizer.cls_token, self.tokenizer.sep_token] or token.startswith(&#39;##&#39;): continue
                embedding.append(raw_embeddings[batch_idx][token_idx])
            filtered_embeddings.append(embedding)

        # pad embeddings with zeros
        max_length = max([len(e) for e in filtered_embeddings])
        embeddings = []
        for e in filtered_embeddings:
            for i in range(max_length-len(e)):
                e.append(np.zeros((self.embsize,)))
            embeddings.append(np.array(e))
        return np.array(embeddings)</code></pre>
</details>
<h3>Methods</h3>
<dl>
<dt id="ktrain.text.TransformerEmbedding.embed"><code class="name flex">
<span>def <span class="ident">embed</span></span>(<span>self, texts, word_level=True, max_length=512)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>get embedding for word, phrase, or sentence
Args:
  texts(str|list): word, phrase, or sentence or list of them representing a batch
  word_level(bool): If True, returns embedding for each token in supplied texts.
                    If False, returns embedding for each text in texts
  max_length(int): max length of tokens
Returns:
    np.ndarray : embeddings
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def embed(self, texts, word_level=True, max_length=512):
    &#34;&#34;&#34;
    ```
    get embedding for word, phrase, or sentence
    Args:
      text(str|list): word, phrase, or sentence or list of them representing a batch
      word_level(bool): If True, returns embedding for each token in supplied texts.
                        If False, returns embedding for each text in texts
      max_length(int): max length of tokens
    Returns:
        np.ndarray : embeddings
    ```
    &#34;&#34;&#34;
    if isinstance(texts, str): texts = [texts]
    if not isinstance(texts[0], str): texts = [&#34; &#34;.join(text) for text in texts]

    sentences = []
    for text in texts:
        sentences.append(self.tokenizer.tokenize(text))
    maxlen = len(max([tokens for tokens in sentences], key=len,)) + 2
    if max_length is not None and maxlen &gt; max_length: maxlen = max_length # added due to issue #270
    sentences = []

    all_input_ids = []
    all_input_masks = []
    for text in texts:
        tokens = self.tokenizer.tokenize(text)
        if len(tokens) &gt; maxlen - 2:
            tokens = tokens[0 : (maxlen - 2)]
        sentences.append(tokens)
        tokens = [self.tokenizer.cls_token] + tokens + [self.tokenizer.sep_token]
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) &lt; maxlen:
            input_ids.append(0)
            input_mask.append(0)
        all_input_ids.append(input_ids)
        all_input_masks.append(input_mask)

    all_input_ids = np.array(all_input_ids)
    all_input_masks = np.array(all_input_masks)
    outputs = self.model(all_input_ids, attention_mask=all_input_masks)
    hidden_states = outputs[-1] # output_hidden_states=True

    # compile raw embeddings
    if len(self.layers) == 1:
        #raw_embeddings = hidden_states[-1].numpy()
        raw_embeddings = hidden_states[self.layers[0]].numpy()
    else:
        raw_embeddings = []
        for batch_id in range(hidden_states[0].shape[0]):
            token_embeddings = []
            for token_id in range(hidden_states[0].shape[1]):
                all_layers = []
                for layer_id in self.layers:
                    all_layers.append(hidden_states[layer_id][batch_id][token_id].numpy())
                token_embeddings.append(np.concatenate(all_layers) )  
            raw_embeddings.append(token_embeddings)
        raw_embeddings = np.array(raw_embeddings)

    if not word_level: # sentence-level embedding
        return np.mean(raw_embeddings, axis=1)
        #return np.squeeze(raw_embeddings[:,0:1,:], axis=1)

    # filter-out extra subword tokens and special tokens 
    # (using first subword of each token as embedding representations)
    filtered_embeddings = []
    for batch_idx, tokens in enumerate(sentences):
        embedding = []
        for token_idx, token in enumerate(tokens):
            if token in [self.tokenizer.cls_token, self.tokenizer.sep_token] or token.startswith(&#39;##&#39;): continue
            embedding.append(raw_embeddings[batch_idx][token_idx])
        filtered_embeddings.append(embedding)

    # pad embeddings with zeros
    max_length = max([len(e) for e in filtered_embeddings])
    embeddings = []
    for e in filtered_embeddings:
        for i in range(max_length-len(e)):
            e.append(np.zeros((self.embsize,)))
        embeddings.append(np.array(e))
    return np.array(embeddings)</code></pre>
</details>
</dd>
</dl>
</dd>
<dt id="ktrain.text.TransformerSummarizer"><code class="flex name class">
<span>class <span class="ident">TransformerSummarizer</span></span>
<span>(</span><span>model_name='facebook/bart-large-cnn', device=None)</span>
</code></dt>
<dd>
<div class="desc"><p>interface to Transformer-based text summarization</p>
<pre><code>interface to BART-based text summarization using transformers library

Args:
  model_name(str): name of BART model for summarization
  device(str): device to use (e.g., 'cuda', 'cpu')
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class TransformerSummarizer():
    &#34;&#34;&#34;
    interface to Transformer-based text summarization
    &#34;&#34;&#34;

    def __init__(self, model_name=&#39;facebook/bart-large-cnn&#39;, device=None):
        &#34;&#34;&#34;
        ```
        interface to BART-based text summarization using transformers library

        Args:
          model_name(str): name of BART model for summarization
          device(str): device to use (e.g., &#39;cuda&#39;, &#39;cpu&#39;)
        ```
        &#34;&#34;&#34;
        if &#39;bart&#39; not in model_name:
            raise ValueError(&#39;TransformerSummarizer currently only accepts BART models&#39;)
        try:
            import torch
        except ImportError:
            raise Exception(&#39;TransformerSummarizer requires PyTorch to be installed.&#39;)
        self.torch_device = device
        if self.torch_device is None: self.torch_device = &#39;cuda&#39; if torch.cuda.is_available() else &#39;cpu&#39;
        from transformers import BartTokenizer, BartForConditionalGeneration
        self.tokenizer = BartTokenizer.from_pretrained(model_name)
        self.model = BartForConditionalGeneration.from_pretrained(model_name).to(self.torch_device)


    def summarize(self, doc):
        &#34;&#34;&#34;
        ```
        summarize document text
        Args:
          doc(str): text of document
        Returns:
          str: summary text
        ```
        &#34;&#34;&#34;
        import torch
        with torch.no_grad():
            answers_input_ids = self.tokenizer.batch_encode_plus([doc], 
                                                                 return_tensors=&#39;pt&#39;, truncation=True,
                                                                 max_length=1024)[&#39;input_ids&#39;].to(self.torch_device)
            summary_ids = self.model.generate(answers_input_ids,
                                              num_beams=4,
                                              length_penalty=2.0,
                                              max_length=142,
                                              min_length=56,
                                              no_repeat_ngram_size=3)

            exec_sum = self.tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
        return exec_sum</code></pre>
</details>
<h3>Methods</h3>
<dl>
<dt id="ktrain.text.TransformerSummarizer.summarize"><code class="name flex">
<span>def <span class="ident">summarize</span></span>(<span>self, doc)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>summarize document text
Args:
  doc(str): text of document
Returns:
  str: summary text
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def summarize(self, doc):
    &#34;&#34;&#34;
    ```
    summarize document text
    Args:
      doc(str): text of document
    Returns:
      str: summary text
    ```
    &#34;&#34;&#34;
    import torch
    with torch.no_grad():
        answers_input_ids = self.tokenizer.batch_encode_plus([doc], 
                                                             return_tensors=&#39;pt&#39;, truncation=True,
                                                             max_length=1024)[&#39;input_ids&#39;].to(self.torch_device)
        summary_ids = self.model.generate(answers_input_ids,
                                          num_beams=4,
                                          length_penalty=2.0,
                                          max_length=142,
                                          min_length=56,
                                          no_repeat_ngram_size=3)

        exec_sum = self.tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
    return exec_sum</code></pre>
</details>
</dd>
</dl>
</dd>
<dt id="ktrain.text.Translator"><code class="flex name class">
<span>class <span class="ident">Translator</span></span>
<span>(</span><span>model_name=None, device=None, half=False)</span>
</code></dt>
<dd>
<div class="desc"><p>Translator: basic wrapper around MarianMT model for language translation</p>
<pre><code>basic wrapper around MarianMT model for language translation

Args:
  model_name(str): Helsinki-NLP model
  device(str): device to use (e.g., 'cuda', 'cpu')
  half(bool): If True, use half precision.
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class Translator():
    &#34;&#34;&#34;
    Translator: basic wrapper around MarianMT model for language translation
    &#34;&#34;&#34;

    def __init__(self, model_name=None, device=None, half=False):
        &#34;&#34;&#34;
        ```
        basic wrapper around MarianMT model for language translation

        Args:
          model_name(str): Helsinki-NLP model
          device(str): device to use (e.g., &#39;cuda&#39;, &#39;cpu&#39;)
          half(bool): If True, use half precision.
        ```
        &#34;&#34;&#34;
        if &#39;Helsinki-NLP&#39; not in model_name:
            warnings.warn(&#39;Translator requires a Helsinki-NLP model: https://huggingface.co/Helsinki-NLP&#39;)
        try:
            import torch
        except ImportError:
            raise Exception(&#39;Translator requires PyTorch to be installed.&#39;)
        self.torch_device = device
        if self.torch_device is None: self.torch_device = &#39;cuda&#39; if torch.cuda.is_available() else &#39;cpu&#39;
        from transformers import MarianMTModel, MarianTokenizer
        self.tokenizer = MarianTokenizer.from_pretrained(model_name)
        self.model = MarianMTModel.from_pretrained(model_name).to(self.torch_device)
        if half: self.model = self.model.half()


    def translate(self, src_text, join_with=&#39;\n&#39;, num_beams=None, early_stopping=None):
        &#34;&#34;&#34;
        ```
        Translate document (src_text).
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
        Args:
          src_text(str): source text.
                         The source text can either be a single sentence or an entire document with multiple sentences
                         and paragraphs. 
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          join_with(str):  list of translated sentences will be delimited with this character.
                           default: each sentence on separate line
          num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                          which means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                                 are finished per batch or not. Defaults to None.  If None, the transformers library
                                 sets this to False.
        Returns:
          str: translated text
        ```
        &#34;&#34;&#34;
        sentences = TU.sent_tokenize(src_text)
        tgt_sentences = self.translate_sentences(sentences, num_beams=num_beams, early_stopping=early_stopping)
        return join_with.join(tgt_sentences)


    def translate_sentences(self, sentences, num_beams=None, early_stopping=None):
        &#34;&#34;&#34;
        ```
        Translate sentences using model_name as model.
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
        Args:
          sentences(list): list of strings representing sentences that need to be translated
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                          which means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                                 are finished per batch or not. Defaults to None.  If None, the transformers library
                                 sets this to False.
        Returns:
          list: translated sentences
        ```
        &#34;&#34;&#34;
        import torch
        with torch.no_grad():
            translated = self.model.generate(**self.tokenizer.prepare_seq2seq_batch(sentences, return_tensors=&#39;pt&#39;).to(self.torch_device), 
                                             num_beams=num_beams, early_stopping=early_stopping)
            tgt_sentences = [self.tokenizer.decode(t, skip_special_tokens=True) for t in translated]
        return tgt_sentences</code></pre>
</details>
<h3>Methods</h3>
<dl>
<dt id="ktrain.text.Translator.translate"><code class="name flex">
<span>def <span class="ident">translate</span></span>(<span>self, src_text, join_with='\n', num_beams=None, early_stopping=None)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Translate document (src_text).
To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
Args:
  src_text(str): source text.
                 The source text can either be a single sentence or an entire document with multiple sentences
                 and paragraphs. 
                 IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                 If the input text is very large (e.g., an entire book), you should
                                 break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                 feed each chunk separately into translate to avoid out-of-memory issues.
  join_with(str):  list of translated sentences will be delimited with this character.
                   default: each sentence on separate line
  num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                  which means no beam search.
  early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                         are finished per batch or not. Defaults to None.  If None, the transformers library
                         sets this to False.
Returns:
  str: translated text
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def translate(self, src_text, join_with=&#39;\n&#39;, num_beams=None, early_stopping=None):
    &#34;&#34;&#34;
    ```
    Translate document (src_text).
    To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
    Args:
      src_text(str): source text.
                     The source text can either be a single sentence or an entire document with multiple sentences
                     and paragraphs. 
                     IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                     If the input text is very large (e.g., an entire book), you should
                                     break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                     feed each chunk separately into translate to avoid out-of-memory issues.
      join_with(str):  list of translated sentences will be delimited with this character.
                       default: each sentence on separate line
      num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                      which means no beam search.
      early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                             are finished per batch or not. Defaults to None.  If None, the transformers library
                             sets this to False.
    Returns:
      str: translated text
    ```
    &#34;&#34;&#34;
    sentences = TU.sent_tokenize(src_text)
    tgt_sentences = self.translate_sentences(sentences, num_beams=num_beams, early_stopping=early_stopping)
    return join_with.join(tgt_sentences)</code></pre>
</details>
</dd>
<dt id="ktrain.text.Translator.translate_sentences"><code class="name flex">
<span>def <span class="ident">translate_sentences</span></span>(<span>self, sentences, num_beams=None, early_stopping=None)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Translate sentences using model_name as model.
To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
Args:
  sentences(list): list of strings representing sentences that need to be translated
                 IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                 If the input text is very large (e.g., an entire book), you should
                                 break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                 feed each chunk separately into translate to avoid out-of-memory issues.
  num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                  which means no beam search.
  early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                         are finished per batch or not. Defaults to None.  If None, the transformers library
                         sets this to False.
Returns:
  list: translated sentences
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def translate_sentences(self, sentences, num_beams=None, early_stopping=None):
    &#34;&#34;&#34;
    ```
    Translate sentences using model_name as model.
    To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
    Args:
      sentences(list): list of strings representing sentences that need to be translated
                     IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                     If the input text is very large (e.g., an entire book), you should
                                     break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                     feed each chunk separately into translate to avoid out-of-memory issues.
      num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                      which means no beam search.
      early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                             are finished per batch or not. Defaults to None.  If None, the transformers library
                             sets this to False.
    Returns:
      list: translated sentences
    ```
    &#34;&#34;&#34;
    import torch
    with torch.no_grad():
        translated = self.model.generate(**self.tokenizer.prepare_seq2seq_batch(sentences, return_tensors=&#39;pt&#39;).to(self.torch_device), 
                                         num_beams=num_beams, early_stopping=early_stopping)
        tgt_sentences = [self.tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    return tgt_sentences</code></pre>
</details>
</dd>
</dl>
</dd>
<dt id="ktrain.text.ZeroShotClassifier"><code class="flex name class">
<span>class <span class="ident">ZeroShotClassifier</span></span>
<span>(</span><span>model_name='facebook/bart-large-mnli', device=None)</span>
</code></dt>
<dd>
<div class="desc"><p>interface to Zero Shot Topic Classifier</p>
<pre><code>ZeroShotClassifier constructor

Args:
  model_name(str): name of a BART NLI model
  device(str): device to use (e.g., 'cuda', 'cpu')
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class ZeroShotClassifier():
    &#34;&#34;&#34;
    interface to Zero Shot Topic Classifier
    &#34;&#34;&#34;

    def __init__(self, model_name=&#39;facebook/bart-large-mnli&#39;, device=None):
        &#34;&#34;&#34;
        ```
        ZeroShotClassifier constructor

        Args:
          model_name(str): name of a BART NLI model
          device(str): device to use (e.g., &#39;cuda&#39;, &#39;cpu&#39;)
        ```
        &#34;&#34;&#34;
        if &#39;mnli&#39; not in model_name and &#39;xnli&#39; not in model_name:
            raise ValueError(&#39;ZeroShotClasifier requires an MNLI or XNLI model&#39;)
        try:
            import torch
        except ImportError:
            raise Exception(&#39;ZeroShotClassifier requires PyTorch to be installed.&#39;)
        self.torch_device = device
        if self.torch_device is None: self.torch_device = &#39;cuda&#39; if torch.cuda.is_available() else &#39;cpu&#39;
        from transformers import AutoModelForSequenceClassification, AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(self.torch_device)


    def predict(self, docs, labels=[], include_labels=False, multilabel=True,
               max_length=512, batch_size=8, nli_template=&#39;This text is about {}.&#39;,  topic_strings=[]):
        &#34;&#34;&#34;
        ```
        This method performs zero-shot text classification using Natural Language Inference (NLI).
        Args:
          docs(list|str): text of document or list of texts
          labels(list): a list of strings representing topics of your choice
                        Example:
                          labels=[&#39;political science&#39;, &#39;sports&#39;, &#39;science&#39;]
          include_labels(bool): If True, will return topic labels along with topic probabilities
          multilabel(bool): If True, labels are considered independent and multiple labels can be predicted true for a document and be close to 1.
                            If False, scores are normalized such that probabilities sum to 1.
          max_length(int): truncate long documents to this many tokens
          batch_size(int): batch_size to use. default:8
                           Increase this value to speed up predictions - especially
                           if len(topic_strings) is large.
          nli_template(str): labels are inserted into this template for use as hypotheses in natural language inference
          topic_strings(list): alias for labels parameter for backwards compatibility
        Returns:
          inferred probabilities or list of inferred probabilities if docs is a list
        ```
        &#34;&#34;&#34;

        # error checks
        is_str_input = False
        if not isinstance(docs, (list, np.ndarray)): 
            docs = [docs]
            is_str_input = True
        if not isinstance(docs[0], str): raise ValueError(&#39;docs must be string or a list of strings representing document(s)&#39;)
        if len(labels) &gt; 0 and len(topic_strings) &gt; 0: raise ValueError(&#39;labels and topic_strings are mutually exclusive&#39;)
        if not labels and not topic_strings: raise ValueError(&#39;labels must be a list of strings&#39;)
        if topic_strings: 
            labels = topic_strings


        # convert to sequences
        sequence_pairs = []
        for premise in docs:
            sequence_pairs.extend([[premise, nli_template.format(label)] for label in labels])
        if batch_size  &gt; len(sequence_pairs): batch_size = len(sequence_pairs)
        if len(sequence_pairs) &gt;= 100 and batch_size==8:
            warnings.warn(&#39;TIP: Try increasing batch_size to speedup ZeroShotClassifier predictions&#39;)
        num_chunks = math.ceil(len(sequence_pairs)/batch_size)
        sequence_chunks = list2chunks(sequence_pairs, n=num_chunks)

        # inference
        import torch
        with torch.no_grad():
            outputs = []
            for sequences in sequence_chunks:
                batch = self.tokenizer.batch_encode_plus(sequences, return_tensors=&#39;pt&#39;, max_length=max_length, truncation=&#39;only_first&#39;, padding=True).to(self.torch_device)
                logits = self.model(batch[&#39;input_ids&#39;], attention_mask=batch[&#39;attention_mask&#39;], return_dict=False)[0]
                outputs.extend(logits.cpu().detach().numpy())
                #entail_contradiction_logits = logits[:,[0,2]]

                #probs = entail_contradiction_logits.softmax(dim=1)
                #true_probs = list(probs[:,1].cpu().detach().numpy())
                #result.extend(true_probs)
        outputs = np.array(outputs)
        outputs = outputs.reshape((len(docs), len(labels), -1))

        # process outputs
        # 2020-08-24: modified based on transformers pipeline implementation
        if multilabel:
            # softmax over the entailment vs. contradiction dim for each label independently
            entail_contr_logits = outputs[..., [0, -1]]
            scores = np.exp(entail_contr_logits) / np.exp(entail_contr_logits).sum(-1, keepdims=True)
            scores = scores[..., 1]
        else:
            # softmax the &#34;entailment&#34; logits over all candidate labels
            entail_logits = outputs[..., -1]
            scores = np.exp(entail_logits) / np.exp(entail_logits).sum(-1, keepdims=True)
        scores = scores.tolist()
        if include_labels:
            scores = [list(zip(labels, s)) for s in scores]
        if is_str_input: scores = scores[0]
        return scores</code></pre>
</details>
<h3>Methods</h3>
<dl>
<dt id="ktrain.text.ZeroShotClassifier.predict"><code class="name flex">
<span>def <span class="ident">predict</span></span>(<span>self, docs, labels=[], include_labels=False, multilabel=True, max_length=512, batch_size=8, nli_template='This text is about {}.', topic_strings=[])</span>
</code></dt>
<dd>
<div class="desc"><pre><code>This method performs zero-shot text classification using Natural Language Inference (NLI).
Args:
  docs(list|str): text of document or list of texts
  labels(list): a list of strings representing topics of your choice
                Example:
                  labels=['political science', 'sports', 'science']
  include_labels(bool): If True, will return topic labels along with topic probabilities
  multilabel(bool): If True, labels are considered independent and multiple labels can be predicted true for a document and be close to 1.
                    If False, scores are normalized such that probabilities sum to 1.
  max_length(int): truncate long documents to this many tokens
  batch_size(int): batch_size to use. default:8
                   Increase this value to speed up predictions - especially
                   if len(topic_strings) is large.
  nli_template(str): labels are inserted into this template for use as hypotheses in natural language inference
  topic_strings(list): alias for labels parameter for backwards compatibility
Returns:
  inferred probabilities or list of inferred probabilities if docs is a list
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def predict(self, docs, labels=[], include_labels=False, multilabel=True,
           max_length=512, batch_size=8, nli_template=&#39;This text is about {}.&#39;,  topic_strings=[]):
    &#34;&#34;&#34;
    ```
    This method performs zero-shot text classification using Natural Language Inference (NLI).
    Args:
      docs(list|str): text of document or list of texts
      labels(list): a list of strings representing topics of your choice
                    Example:
                      labels=[&#39;political science&#39;, &#39;sports&#39;, &#39;science&#39;]
      include_labels(bool): If True, will return topic labels along with topic probabilities
      multilabel(bool): If True, labels are considered independent and multiple labels can be predicted true for a document and be close to 1.
                        If False, scores are normalized such that probabilities sum to 1.
      max_length(int): truncate long documents to this many tokens
      batch_size(int): batch_size to use. default:8
                       Increase this value to speed up predictions - especially
                       if len(topic_strings) is large.
      nli_template(str): labels are inserted into this template for use as hypotheses in natural language inference
      topic_strings(list): alias for labels parameter for backwards compatibility
    Returns:
      inferred probabilities or list of inferred probabilities if docs is a list
    ```
    &#34;&#34;&#34;

    # error checks
    is_str_input = False
    if not isinstance(docs, (list, np.ndarray)): 
        docs = [docs]
        is_str_input = True
    if not isinstance(docs[0], str): raise ValueError(&#39;docs must be string or a list of strings representing document(s)&#39;)
    if len(labels) &gt; 0 and len(topic_strings) &gt; 0: raise ValueError(&#39;labels and topic_strings are mutually exclusive&#39;)
    if not labels and not topic_strings: raise ValueError(&#39;labels must be a list of strings&#39;)
    if topic_strings: 
        labels = topic_strings


    # convert to sequences
    sequence_pairs = []
    for premise in docs:
        sequence_pairs.extend([[premise, nli_template.format(label)] for label in labels])
    if batch_size  &gt; len(sequence_pairs): batch_size = len(sequence_pairs)
    if len(sequence_pairs) &gt;= 100 and batch_size==8:
        warnings.warn(&#39;TIP: Try increasing batch_size to speedup ZeroShotClassifier predictions&#39;)
    num_chunks = math.ceil(len(sequence_pairs)/batch_size)
    sequence_chunks = list2chunks(sequence_pairs, n=num_chunks)

    # inference
    import torch
    with torch.no_grad():
        outputs = []
        for sequences in sequence_chunks:
            batch = self.tokenizer.batch_encode_plus(sequences, return_tensors=&#39;pt&#39;, max_length=max_length, truncation=&#39;only_first&#39;, padding=True).to(self.torch_device)
            logits = self.model(batch[&#39;input_ids&#39;], attention_mask=batch[&#39;attention_mask&#39;], return_dict=False)[0]
            outputs.extend(logits.cpu().detach().numpy())
            #entail_contradiction_logits = logits[:,[0,2]]

            #probs = entail_contradiction_logits.softmax(dim=1)
            #true_probs = list(probs[:,1].cpu().detach().numpy())
            #result.extend(true_probs)
    outputs = np.array(outputs)
    outputs = outputs.reshape((len(docs), len(labels), -1))

    # process outputs
    # 2020-08-24: modified based on transformers pipeline implementation
    if multilabel:
        # softmax over the entailment vs. contradiction dim for each label independently
        entail_contr_logits = outputs[..., [0, -1]]
        scores = np.exp(entail_contr_logits) / np.exp(entail_contr_logits).sum(-1, keepdims=True)
        scores = scores[..., 1]
    else:
        # softmax the &#34;entailment&#34; logits over all candidate labels
        entail_logits = outputs[..., -1]
        scores = np.exp(entail_logits) / np.exp(entail_logits).sum(-1, keepdims=True)
    scores = scores.tolist()
    if include_labels:
        scores = [list(zip(labels, s)) for s in scores]
    if is_str_input: scores = scores[0]
    return scores</code></pre>
</details>
</dd>
</dl>
</dd>
</dl>
</section>
</article>
<nav id="sidebar" aria-label="Index">
<!-- Sidebar index: a link to the parent module, links to each sub-module page, and in-page anchor links (#ktrain.text.*) for the module's functions, classes and class members. -->
<h1>Index</h1>
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="ktrain" href="../index.html">ktrain</a></code></li>
</ul>
</li>
<li><h3><a href="#header-submodules">Sub-modules</a></h3>
<ul>
<li><code><a title="ktrain.text.data" href="data.html">ktrain.text.data</a></code></li>
<li><code><a title="ktrain.text.eda" href="eda.html">ktrain.text.eda</a></code></li>
<li><code><a title="ktrain.text.learner" href="learner.html">ktrain.text.learner</a></code></li>
<li><code><a title="ktrain.text.models" href="models.html">ktrain.text.models</a></code></li>
<li><code><a title="ktrain.text.ner" href="ner/index.html">ktrain.text.ner</a></code></li>
<li><code><a title="ktrain.text.predictor" href="predictor.html">ktrain.text.predictor</a></code></li>
<li><code><a title="ktrain.text.preprocessor" href="preprocessor.html">ktrain.text.preprocessor</a></code></li>
<li><code><a title="ktrain.text.qa" href="qa/index.html">ktrain.text.qa</a></code></li>
<li><code><a title="ktrain.text.shallownlp" href="shallownlp/index.html">ktrain.text.shallownlp</a></code></li>
<li><code><a title="ktrain.text.summarization" href="summarization/index.html">ktrain.text.summarization</a></code></li>
<li><code><a title="ktrain.text.textutils" href="textutils.html">ktrain.text.textutils</a></code></li>
<li><code><a title="ktrain.text.translation" href="translation/index.html">ktrain.text.translation</a></code></li>
<li><code><a title="ktrain.text.zsl" href="zsl/index.html">ktrain.text.zsl</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul>
<li><code><a title="ktrain.text.entities_from_array" href="#ktrain.text.entities_from_array">entities_from_array</a></code></li>
<li><code><a title="ktrain.text.entities_from_conll2003" href="#ktrain.text.entities_from_conll2003">entities_from_conll2003</a></code></li>
<li><code><a title="ktrain.text.entities_from_df" href="#ktrain.text.entities_from_df">entities_from_df</a></code></li>
<li><code><a title="ktrain.text.entities_from_gmb" href="#ktrain.text.entities_from_gmb">entities_from_gmb</a></code></li>
<li><code><a title="ktrain.text.entities_from_txt" href="#ktrain.text.entities_from_txt">entities_from_txt</a></code></li>
<li><code><a title="ktrain.text.extract_filenames" href="#ktrain.text.extract_filenames">extract_filenames</a></code></li>
<li><code><a title="ktrain.text.load_text_files" href="#ktrain.text.load_text_files">load_text_files</a></code></li>
<li><code><a title="ktrain.text.print_sequence_taggers" href="#ktrain.text.print_sequence_taggers">print_sequence_taggers</a></code></li>
<li><code><a title="ktrain.text.print_text_classifiers" href="#ktrain.text.print_text_classifiers">print_text_classifiers</a></code></li>
<li><code><a title="ktrain.text.print_text_regression_models" href="#ktrain.text.print_text_regression_models">print_text_regression_models</a></code></li>
<li><code><a title="ktrain.text.sequence_tagger" href="#ktrain.text.sequence_tagger">sequence_tagger</a></code></li>
<li><code><a title="ktrain.text.text_classifier" href="#ktrain.text.text_classifier">text_classifier</a></code></li>
<li><code><a title="ktrain.text.text_regression_model" href="#ktrain.text.text_regression_model">text_regression_model</a></code></li>
<li><code><a title="ktrain.text.texts_from_array" href="#ktrain.text.texts_from_array">texts_from_array</a></code></li>
<li><code><a title="ktrain.text.texts_from_csv" href="#ktrain.text.texts_from_csv">texts_from_csv</a></code></li>
<li><code><a title="ktrain.text.texts_from_df" href="#ktrain.text.texts_from_df">texts_from_df</a></code></li>
<li><code><a title="ktrain.text.texts_from_folder" href="#ktrain.text.texts_from_folder">texts_from_folder</a></code></li>
</ul>
</li>
<li><h3><a href="#header-classes">Classes</a></h3>
<ul>
<li>
<h4><code><a title="ktrain.text.EnglishTranslator" href="#ktrain.text.EnglishTranslator">EnglishTranslator</a></code></h4>
<ul>
<li><code><a title="ktrain.text.EnglishTranslator.translate" href="#ktrain.text.EnglishTranslator.translate">translate</a></code></li>
</ul>
</li>
<li>
<h4><code><a title="ktrain.text.SimpleQA" href="#ktrain.text.SimpleQA">SimpleQA</a></code></h4>
<ul>
<li><code><a title="ktrain.text.SimpleQA.index_from_folder" href="#ktrain.text.SimpleQA.index_from_folder">index_from_folder</a></code></li>
<li><code><a title="ktrain.text.SimpleQA.index_from_list" href="#ktrain.text.SimpleQA.index_from_list">index_from_list</a></code></li>
<li><code><a title="ktrain.text.SimpleQA.initialize_index" href="#ktrain.text.SimpleQA.initialize_index">initialize_index</a></code></li>
<li><code><a title="ktrain.text.SimpleQA.search" href="#ktrain.text.SimpleQA.search">search</a></code></li>
</ul>
</li>
<li>
<h4><code><a title="ktrain.text.TopicModel" href="#ktrain.text.TopicModel">TopicModel</a></code></h4>
<ul>
<li><code><a title="ktrain.text.TopicModel.build" href="#ktrain.text.TopicModel.build">build</a></code></li>
<li><code><a title="ktrain.text.TopicModel.filter" href="#ktrain.text.TopicModel.filter">filter</a></code></li>
<li><code><a title="ktrain.text.TopicModel.get_docs" href="#ktrain.text.TopicModel.get_docs">get_docs</a></code></li>
<li><code><a title="ktrain.text.TopicModel.get_doctopics" href="#ktrain.text.TopicModel.get_doctopics">get_doctopics</a></code></li>
<li><code><a title="ktrain.text.TopicModel.get_document_topic_distribution" href="#ktrain.text.TopicModel.get_document_topic_distribution">get_document_topic_distribution</a></code></li>
<li><code><a title="ktrain.text.TopicModel.get_sorted_docs" href="#ktrain.text.TopicModel.get_sorted_docs">get_sorted_docs</a></code></li>
<li><code><a title="ktrain.text.TopicModel.get_texts" href="#ktrain.text.TopicModel.get_texts">get_texts</a></code></li>
<li><code><a title="ktrain.text.TopicModel.get_topics" href="#ktrain.text.TopicModel.get_topics">get_topics</a></code></li>
<li><code><a title="ktrain.text.TopicModel.get_word_weights" href="#ktrain.text.TopicModel.get_word_weights">get_word_weights</a></code></li>
<li><code><a title="ktrain.text.TopicModel.predict" href="#ktrain.text.TopicModel.predict">predict</a></code></li>
<li><code><a title="ktrain.text.TopicModel.print_topics" href="#ktrain.text.TopicModel.print_topics">print_topics</a></code></li>
<li><code><a title="ktrain.text.TopicModel.recommend" href="#ktrain.text.TopicModel.recommend">recommend</a></code></li>
<li><code><a title="ktrain.text.TopicModel.save" href="#ktrain.text.TopicModel.save">save</a></code></li>
<li><code><a title="ktrain.text.TopicModel.score" href="#ktrain.text.TopicModel.score">score</a></code></li>
<li><code><a title="ktrain.text.TopicModel.search" href="#ktrain.text.TopicModel.search">search</a></code></li>
<li><code><a title="ktrain.text.TopicModel.topics" href="#ktrain.text.TopicModel.topics">topics</a></code></li>
<li><code><a title="ktrain.text.TopicModel.train" href="#ktrain.text.TopicModel.train">train</a></code></li>
<li><code><a title="ktrain.text.TopicModel.train_recommender" href="#ktrain.text.TopicModel.train_recommender">train_recommender</a></code></li>
<li><code><a title="ktrain.text.TopicModel.train_scorer" href="#ktrain.text.TopicModel.train_scorer">train_scorer</a></code></li>
<li><code><a title="ktrain.text.TopicModel.visualize_documents" href="#ktrain.text.TopicModel.visualize_documents">visualize_documents</a></code></li>
</ul>
</li>
<li>
<h4><code><a title="ktrain.text.Transformer" href="#ktrain.text.Transformer">Transformer</a></code></h4>
<ul>
<li><code><a title="ktrain.text.Transformer.preprocess_test" href="#ktrain.text.Transformer.preprocess_test">preprocess_test</a></code></li>
<li><code><a title="ktrain.text.Transformer.preprocess_train" href="#ktrain.text.Transformer.preprocess_train">preprocess_train</a></code></li>
</ul>
</li>
<li>
<h4><code><a title="ktrain.text.TransformerEmbedding" href="#ktrain.text.TransformerEmbedding">TransformerEmbedding</a></code></h4>
<ul>
<li><code><a title="ktrain.text.TransformerEmbedding.embed" href="#ktrain.text.TransformerEmbedding.embed">embed</a></code></li>
</ul>
</li>
<li>
<h4><code><a title="ktrain.text.TransformerSummarizer" href="#ktrain.text.TransformerSummarizer">TransformerSummarizer</a></code></h4>
<ul>
<li><code><a title="ktrain.text.TransformerSummarizer.summarize" href="#ktrain.text.TransformerSummarizer.summarize">summarize</a></code></li>
</ul>
</li>
<li>
<h4><code><a title="ktrain.text.Translator" href="#ktrain.text.Translator">Translator</a></code></h4>
<ul>
<li><code><a title="ktrain.text.Translator.translate" href="#ktrain.text.Translator.translate">translate</a></code></li>
<li><code><a title="ktrain.text.Translator.translate_sentences" href="#ktrain.text.Translator.translate_sentences">translate_sentences</a></code></li>
</ul>
</li>
<li>
<h4><code><a title="ktrain.text.ZeroShotClassifier" href="#ktrain.text.ZeroShotClassifier">ZeroShotClassifier</a></code></h4>
<ul>
<li><code><a title="ktrain.text.ZeroShotClassifier.predict" href="#ktrain.text.ZeroShotClassifier.predict">predict</a></code></li>
</ul>
</li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<!-- Generator credit: links to the pdoc project site and states the pdoc version (0.9.2), matching the generator meta tag in the document head. -->
<p>Generated by <a href="https://pdoc3.github.io/pdoc"><cite>pdoc</cite> 0.9.2</a>.</p>
</footer>
</body>
</html>