docs/text/shallownlp/index.html

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1" />
<meta name="generator" content="pdoc 0.10.0" />
<title>ktrain.text.shallownlp API documentation</title>
<meta name="description" content="API documentation for the ktrain.text.shallownlp module." />
<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/sanitize.min.css" integrity="sha256-PK9q560IAAa6WVRRh76LtCaI8pjTJ2z11v0miyNNjrs=" crossorigin>
<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/typography.min.css" integrity="sha256-7l/o7C8jubJiy74VsKTidCy1yBkRtiUGbVkYBylBqUg=" crossorigin>
<link rel="stylesheet preload" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/styles/github.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:30px;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:1em 0 .50em 0}h3{font-size:1.4em;margin:25px 0 10px 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:.8em;line-height:1.4em}code{background:#f2f2f1;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:0;border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0;padding:1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-weight:bold;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target .name{background:var(--highlight-color)}.name > 
span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em .5em;margin-bottom:1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.item .name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:1.5em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/highlight.min.js" integrity="sha256-Uv3H6lx7dJmRfRvH8TH6kJD1TSK1aFcwgx+mdg3epi8=" crossorigin></script>
<script>window.addEventListener('DOMContentLoaded', () => hljs.initHighlighting())</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>ktrain.text.shallownlp</code></h1>
</header>
<section id="section-intro">
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">from .classifier import Classifier
from .ner import NER
from .searcher import *
from .utils import extract_filenames, read_text, sent_tokenize

__all__ = [
    &#34;Classifier&#34;,
    &#34;Searcher&#34;,
    &#34;search&#34;,
    &#34;find_chinese&#34;,
    &#34;find_arabic&#34;,
    &#34;find_russian&#34;,
    &#34;read_text&#34;,
    &#34;NER&#34;,
    &#34;sent_tokenize&#34;,
    &#34;extract_filenames&#34;,
    &#34;read_text&#34;,
]</code></pre>
</details>
</section>
<section>
<h2 class="section-title" id="header-submodules">Sub-modules</h2>
<dl>
<dt><code class="name"><a title="ktrain.text.shallownlp.classifier" href="classifier.html">ktrain.text.shallownlp.classifier</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.shallownlp.imports" href="imports.html">ktrain.text.shallownlp.imports</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.shallownlp.ner" href="ner.html">ktrain.text.shallownlp.ner</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.shallownlp.searcher" href="searcher.html">ktrain.text.shallownlp.searcher</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="ktrain.text.shallownlp.utils" href="utils.html">ktrain.text.shallownlp.utils</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
</dl>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="ktrain.text.shallownlp.extract_filenames"><code class="name flex">
<span>def <span class="ident">extract_filenames</span></span>(<span>corpus_path, follow_links=False)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def extract_filenames(corpus_path, follow_links=False):
    if os.listdir(corpus_path) == []:
        raise ValueError(&#34;%s: path is empty&#34; % corpus_path)
    for root, _, fnames in os.walk(corpus_path, followlinks=follow_links):
        for filename in fnames:
            try:
                yield os.path.join(root, filename)
            except Exception:
                continue</code></pre>
</details>
</dd>
<dt id="ktrain.text.shallownlp.find_arabic"><code class="name flex">
<span>def <span class="ident">find_arabic</span></span>(<span>s)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def find_arabic(s):
    return re.findall(r&#34;[\u0600-\u06FF]+&#34;, s)</code></pre>
</details>
</dd>
<dt id="ktrain.text.shallownlp.find_chinese"><code class="name flex">
<span>def <span class="ident">find_chinese</span></span>(<span>s)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def find_chinese(s):
    return re.findall(r&#34;[\u4e00-\u9fff]+&#34;, s)</code></pre>
</details>
</dd>
<dt id="ktrain.text.shallownlp.find_russian"><code class="name flex">
<span>def <span class="ident">find_russian</span></span>(<span>s)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def find_russian(s):
    return find_cyrillic(s)</code></pre>
</details>
</dd>
<dt id="ktrain.text.shallownlp.read_text"><code class="name flex">
<span>def <span class="ident">read_text</span></span>(<span>filename)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def read_text(filename):
    with open(filename, &#34;rb&#34;) as f:
        text = f.read()
    encoding = detect_encoding([text])
    try:
        decoded_text = text.decode(encoding)
    except:
        U.vprint(
            &#34;Decoding with %s failed 1st attempt - using %s with skips&#34;
            % (encoding, encoding),
            verbose=verbose,
        )
        decoded_text = decode_by_line(text, encoding=encoding)
    return decoded_text.strip()</code></pre>
</details>
</dd>
<dt id="ktrain.text.shallownlp.search"><code class="name flex">
<span>def <span class="ident">search</span></span>(<span>query, doc, case_sensitive=False, keys=[], progress=False)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def search(query, doc, case_sensitive=False, keys=[], progress=False):
    searcher = Searcher(query)
    return searcher.search(
        doc, case_sensitive=case_sensitive, keys=keys, progress=progress
    )</code></pre>
</details>
</dd>
<dt id="ktrain.text.shallownlp.sent_tokenize"><code class="name flex">
<span>def <span class="ident">sent_tokenize</span></span>(<span>text)</span>
</code></dt>
<dd>
<div class="desc"><p>segment text into sentences</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def sent_tokenize(text):
    &#34;&#34;&#34;
    segment text into sentences
    &#34;&#34;&#34;
    lang = detect_lang(text)
    sents = []
    if is_chinese(lang):
        for sent in re.findall(&#34;[^!?。\.\!\?]+[!?。\.\!\?]?&#34;, text, flags=re.U):
            sents.append(sent)
    else:
        for paragraph in segmenter.process(text):
            for sentence in paragraph:
                sents.append(&#34; &#34;.join([t.value for t in sentence]))
    return sents</code></pre>
</details>
</dd>
</dl>
</section>
<section>
<h2 class="section-title" id="header-classes">Classes</h2>
<dl>
<dt id="ktrain.text.shallownlp.Classifier"><code class="flex name class">
<span>class <span class="ident">Classifier</span></span>
<span>(</span><span>model=None)</span>
</code></dt>
<dd>
<div class="desc"><p>instantiate a classifier with an optional previously-saved model</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class Classifier:
    def __init__(self, model=None):
        &#34;&#34;&#34;
        instantiate a classifier with an optional previously-saved model
        &#34;&#34;&#34;
        self.model = None

    def create_model(self, ctype, texts, hp_dict={}, ngram_range=(1, 3), binary=True):
        &#34;&#34;&#34;
        ```
        create a model
        Args:
          ctype(str): one of {&#39;nbsvm&#39;, &#39;logreg&#39;, &#39;sgdclassifier&#39;}
          texts(list): list of texts
          hp_dict(dict): dictionary of hyperparameters to use for the ctype selected.
                         hp_dict can also be used to supply arguments to CountVectorizer
          ngram_range(tuple): default ngram_range.
                              overridden if &#39;ngram_range&#39; in hp_dict
          binary(bool): default value for binary argument to CountVectorizer.
                        overridden if &#39;binary&#39; key in hp_dict
        ```
        &#34;&#34;&#34;
        lang = U.detect_lang(texts)
        if U.is_chinese(lang):
            token_pattern = r&#34;(?u)\b\w+\b&#34;
        else:
            token_pattern = r&#34;\w+|[%s]&#34; % string.punctuation
        if ctype == &#34;nbsvm&#34;:
            clf = NBSVM(
                C=hp_dict.get(&#34;C&#34;, 0.01),
                alpha=hp_dict.get(&#34;alpha&#34;, 0.75),
                beta=hp_dict.get(&#34;beta&#34;, 0.25),
                fit_intercept=hp_dict.get(&#34;fit_intercept&#34;, False),
            )
        elif ctype == &#34;logreg&#34;:
            clf = LogisticRegression(
                C=hp_dict.get(&#34;C&#34;, 0.1),
                dual=hp_dict.get(&#34;dual&#34;, True),
                penalty=hp_dict.get(&#34;penalty&#34;, &#34;l2&#34;),
                tol=hp_dict.get(&#34;tol&#34;, 1e-4),
                intercept_scaling=hp_dict.get(&#34;intercept_scaling&#34;, 1),
                solver=hp_dict.get(&#34;solver&#34;, &#34;liblinear&#34;),
                max_iter=hp_dict.get(&#34;max_iter&#34;, 100),
                multi_class=hp_dict.get(&#34;multi_class&#34;, &#34;auto&#34;),
                warm_start=hp_dict.get(&#34;warm_start&#34;, False),
                n_jobs=hp_dict.get(&#34;n_jobs&#34;, None),
                l1_ratio=hp_dict.get(&#34;l1_ratio&#34;, None),
                random_state=hp_dict.get(&#34;random_state&#34;, 42),
                class_weight=hp_dict.get(&#34;class_weight&#34;, None),
            )
        elif ctype == &#34;sgdclassifier&#34;:
            clf = SGDClassifier(
                loss=hp_dict.get(&#34;loss&#34;, &#34;hinge&#34;),
                penalty=hp_dict.get(&#34;penalty&#34;, &#34;l2&#34;),
                alpha=hp_dict.get(&#34;alpha&#34;, 1e-3),
                random_state=hp_dict.get(&#34;random_state&#34;, 42),
                max_iter=hp_dict.get(&#34;max_iter&#34;, 5),  # scikit-learn default is 1000
                tol=hp_dict.get(&#34;tol&#34;, None),
                l1_ratio=hp_dict.get(&#34;l1_ratio&#34;, 0.15),
                fit_intercept=hp_dict.get(&#34;fit_intercept&#34;, True),
                episilon=hp_dict.get(&#34;epsilon&#34;, 0.1),
                n_jobs=hp_dict.get(&#34;n_jobs&#34;, None),
                learning_rate=hp_dict.get(&#34;learning_rate&#34;, &#34;optimal&#34;),
                eta0=hp_dict.get(&#34;eta0&#34;, 0.0),
                power_t=hp_dict.get(&#34;power_t&#34;, 0.5),
                early_stopping=hp_dict.get(&#34;early_stopping&#34;, False),
                validation_fraction=hp_dict.get(&#34;validation_fraction&#34;, 0.1),
                n_iter_no_change=hp_dict.get(&#34;n_iter_no_change&#34;, 5),
                warm_start=hp_dict.get(&#34;warm_start&#34;, False),
                average=hp_dict.get(&#34;average&#34;, False),
                class_weight=hp_dict.get(&#34;class_weight&#34;, None),
            )
        else:
            raise ValueError(&#34;Unknown ctype: %s&#34; % (ctype))

        self.model = Pipeline(
            [
                (
                    &#34;vect&#34;,
                    CountVectorizer(
                        ngram_range=hp_dict.get(&#34;ngram_range&#34;, ngram_range),
                        binary=hp_dict.get(&#34;binary&#34;, binary),
                        token_pattern=token_pattern,
                        max_features=hp_dict.get(&#34;max_features&#34;, None),
                        max_df=hp_dict.get(&#34;max_df&#34;, 1.0),
                        min_df=hp_dict.get(&#34;min_df&#34;, 1),
                        stop_words=hp_dict.get(&#34;stop_words&#34;, None),
                        lowercase=hp_dict.get(&#34;lowercase&#34;, True),
                        strip_accents=hp_dict.get(&#34;strip_accents&#34;, None),
                        encoding=hp_dict.get(&#34;encoding&#34;, &#34;utf-8&#34;),
                    ),
                ),
                (&#34;clf&#34;, clf),
            ]
        )
        return

    @classmethod
    def load_texts_from_folder(
        cls, folder_path, subfolders=None, shuffle=True, encoding=None
    ):
        &#34;&#34;&#34;
        ```
        load text files from folder

        Args:
          folder_path(str): path to folder containing documents
                            The supplied folder should contain a subfolder
                            for each category, which will be used as the class label
          subfolders(list): list of subfolders under folder_path to consider
                            Example: If folder_path contains subfolders pos, neg, and
                            unlabeled, then unlabeled folder can be ignored by
                            setting subfolders=[&#39;pos&#39;, &#39;neg&#39;]
          shuffle(bool):  If True, list of texts will be shuffled
          encoding(str): encoding to use.  default:None (auto-detected)
        Returns:
          tuple: (texts, labels, label_names)
        ```
        &#34;&#34;&#34;
        bunch = load_files(folder_path, categories=subfolders, shuffle=shuffle)
        texts = bunch.data
        labels = bunch.target
        label_names = bunch.target_names
        # print(&#39;target names:&#39;)
        # for idx, label_name in enumerate(bunch.target_names):
        # print(&#39;\t%s:%s&#39; % (idx, label_name))

        # decode based on supplied encoding
        if encoding is None:
            encoding = U.detect_encoding(texts)
            if encoding != &#34;utf-8&#34;:
                print(&#34;detected encoding: %s&#34; % (encoding))

        try:
            texts = [text.decode(encoding) for text in texts]
        except:
            print(
                &#34;Decoding with %s failed 1st attempt - using %s with skips&#34;
                % (encoding, encoding)
            )
            texts = U.decode_by_line(texts, encoding=encoding)
        return (texts, labels, label_names)

    @classmethod
    def load_texts_from_csv(
        cls,
        csv_filepath,
        text_column=&#34;text&#34;,
        label_column=&#34;label&#34;,
        sep=&#34;,&#34;,
        encoding=None,
    ):
        &#34;&#34;&#34;
        ```
        load text files from csv file
        CSV should have at least two columns.
        Example:
        Text               | Label
        I love this movie. | positive
        I hated this movie.| negative


        Args:
          csv_filepath(str): path to CSV file
          text_column(str): name of column containing the texts. default:&#39;text&#39;
          label_column(str): name of column containing the labels in string format
                             default:&#39;label&#39;
          sep(str): character that separates columns in CSV. default:&#39;,&#39;
          encoding(str): encoding to use. default:None (auto-detected)
        Returns:
          tuple: (texts, labels, label_names)
        ```
        &#34;&#34;&#34;
        if encoding is None:
            with open(csv_filepath, &#34;rb&#34;) as f:
                encoding = U.detect_encoding([f.read()])
                if encoding != &#34;utf-8&#34;:
                    print(&#34;detected encoding: %s (if wrong, set manually)&#34; % (encoding))
        import pandas as pd

        df = pd.read_csv(csv_filepath, encoding=encoding, sep=sep)
        texts = df[text_column].fillna(&#34;fillna&#34;).values
        labels = df[label_column].values
        le = LabelEncoder()
        le.fit(labels)
        labels = le.transform(labels)
        return (texts, labels, le.classes_)

    def fit(self, x_train, y_train, ctype=&#34;logreg&#34;):
        &#34;&#34;&#34;
        ```
        train a classifier
        Args:
          x_train(list or np.ndarray):  training texts
          y_train(np.ndarray):  training labels
          ctype(str):  One of {&#39;logreg&#39;, &#39;nbsvm&#39;, &#39;sgdclassifier&#39;}.  default:logreg
        ```
        &#34;&#34;&#34;
        lang = U.detect_lang(x_train)
        if U.is_chinese(lang):
            x_train = U.split_chinese(x_train)
        if self.model is None:
            self.create_model(ctype, x_train)
        self.model.fit(x_train, y_train)
        return self

    def predict(self, x_test, return_proba=False):
        &#34;&#34;&#34;
        ```
        make predictions on text data
        Args:
          x_test(list or np.ndarray or str): array of texts on which to make predictions or a string representing text
        ```
        &#34;&#34;&#34;
        if return_proba and not hasattr(self.model[&#34;clf&#34;], &#34;predict_proba&#34;):
            raise ValueError(
                &#34;%s does not support predict_proba&#34; % (type(self.model[&#34;clf&#34;]).__name__)
            )
        if isinstance(x_test, str):
            x_test = [x_test]
        lang = U.detect_lang(x_test)
        if U.is_chinese(lang):
            x_test = U.split_chinese(x_test)
        if self.model is None:
            raise ValueError(&#34;model is None - call fit or load to set the model&#34;)
        if return_proba:
            predicted = self.model.predict_proba(x_test)
        else:
            predicted = self.model.predict(x_test)
        if len(predicted) == 1:
            predicted = predicted[0]
        return predicted

    def predict_proba(self, x_test):
        &#34;&#34;&#34;
        predict_proba
        &#34;&#34;&#34;
        return self.predict(x_test, return_proba=True)

    def evaluate(self, x_test, y_test):
        &#34;&#34;&#34;
        ```
        evaluate
        Args:
          x_test(list or np.ndarray):  test texts
          y_test(np.ndarray):  test labels
        ```
        &#34;&#34;&#34;
        predicted = self.predict(x_test)
        return np.mean(predicted == y_test)

    def save(self, filename):
        &#34;&#34;&#34;
        save model
        &#34;&#34;&#34;
        dump(self.model, filename)

    def load(self, filename):
        &#34;&#34;&#34;
        load model
        &#34;&#34;&#34;
        self.model = load(filename)

    def grid_search(self, params, x_train, y_train, n_jobs=-1):
        &#34;&#34;&#34;
        ```
        Performs grid search to find optimal set of hyperparameters
        Args:
          params (dict):  A dictionary defining the space of the search.
                          Example for finding optimal value of alpha in NBSVM:
                        parameters = {
                                      #&#39;clf__C&#39;: (1e0, 1e-1, 1e-2),
                                      &#39;clf__alpha&#39;: (0.1, 0.2, 0.4, 0.5, 0.75, 0.9, 1.0),
                                      #&#39;clf__fit_intercept&#39;: (True, False),
                                      #&#39;clf__beta&#39; : (0.1, 0.25, 0.5, 0.9)
                                      }
          n_jobs(int): number of jobs to run in parallel.  default:-1 (use all processors)
        ```
        &#34;&#34;&#34;
        gs_clf = GridSearchCV(self.model, params, n_jobs=n_jobs)
        gs_clf = gs_clf.fit(x_train, y_train)
        # gs_clf.best_score_
        for param_name in sorted(params.keys()):
            print(&#34;%s: %r&#34; % (param_name, gs_clf.best_params_[param_name]))
        return</code></pre>
</details>
<h3>Class methods</h3>
<dl>
<dt id="ktrain.text.shallownlp.Classifier.load_texts_from_csv"><code class="name flex">
<span>def <span class="ident">load_texts_from_csv</span></span>(<span>csv_filepath, text_column='text', label_column='label', sep=',', encoding=None)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>load text files from csv file
CSV should have at least two columns.
Example:
Text               | Label
I love this movie. | positive
I hated this movie.| negative


Args:
  csv_filepath(str): path to CSV file
  text_column(str): name of column containing the texts. default:'text'
  label_column(str): name of column containing the labels in string format
                     default:'label'
  sep(str): character that separates columns in CSV. default:','
  encoding(str): encoding to use. default:None (auto-detected)
Returns:
  tuple: (texts, labels, label_names)
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">@classmethod
def load_texts_from_csv(
    cls,
    csv_filepath,
    text_column=&#34;text&#34;,
    label_column=&#34;label&#34;,
    sep=&#34;,&#34;,
    encoding=None,
):
    &#34;&#34;&#34;
    ```
    load text files from csv file
    CSV should have at least two columns.
    Example:
    Text               | Label
    I love this movie. | positive
    I hated this movie.| negative


    Args:
      csv_filepath(str): path to CSV file
      text_column(str): name of column containing the texts. default:&#39;text&#39;
      label_column(str): name of column containing the labels in string format
                         default:&#39;label&#39;
      sep(str): character that separates columns in CSV. default:&#39;,&#39;
      encoding(str): encoding to use. default:None (auto-detected)
    Returns:
      tuple: (texts, labels, label_names)
    ```
    &#34;&#34;&#34;
    if encoding is None:
        with open(csv_filepath, &#34;rb&#34;) as f:
            encoding = U.detect_encoding([f.read()])
            if encoding != &#34;utf-8&#34;:
                print(&#34;detected encoding: %s (if wrong, set manually)&#34; % (encoding))
    import pandas as pd

    df = pd.read_csv(csv_filepath, encoding=encoding, sep=sep)
    texts = df[text_column].fillna(&#34;fillna&#34;).values
    labels = df[label_column].values
    le = LabelEncoder()
    le.fit(labels)
    labels = le.transform(labels)
    return (texts, labels, le.classes_)</code></pre>
</details>
</dd>
<dt id="ktrain.text.shallownlp.Classifier.load_texts_from_folder"><code class="name flex">
<span>def <span class="ident">load_texts_from_folder</span></span>(<span>folder_path, subfolders=None, shuffle=True, encoding=None)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>load text files from folder

Args:
  folder_path(str): path to folder containing documents
                    The supplied folder should contain a subfolder
                    for each category, which will be used as the class label
  subfolders(list): list of subfolders under folder_path to consider
                    Example: If folder_path contains subfolders pos, neg, and
                    unlabeled, then unlabeled folder can be ignored by
                    setting subfolders=['pos', 'neg']
  shuffle(bool):  If True, list of texts will be shuffled
  encoding(str): encoding to use.  default:None (auto-detected)
Returns:
  tuple: (texts, labels, label_names)
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">@classmethod
def load_texts_from_folder(
    cls, folder_path, subfolders=None, shuffle=True, encoding=None
):
    &#34;&#34;&#34;
    ```
    load text files from folder

    Args:
      folder_path(str): path to folder containing documents
                        The supplied folder should contain a subfolder
                        for each category, which will be used as the class label
      subfolders(list): list of subfolders under folder_path to consider
                        Example: If folder_path contains subfolders pos, neg, and
                        unlabeled, then unlabeled folder can be ignored by
                        setting subfolders=[&#39;pos&#39;, &#39;neg&#39;]
      shuffle(bool):  If True, list of texts will be shuffled
      encoding(str): encoding to use.  default:None (auto-detected)
    Returns:
      tuple: (texts, labels, label_names)
    ```
    &#34;&#34;&#34;
    bunch = load_files(folder_path, categories=subfolders, shuffle=shuffle)
    texts = bunch.data
    labels = bunch.target
    label_names = bunch.target_names
    # print(&#39;target names:&#39;)
    # for idx, label_name in enumerate(bunch.target_names):
    # print(&#39;\t%s:%s&#39; % (idx, label_name))

    # decode based on supplied encoding
    if encoding is None:
        encoding = U.detect_encoding(texts)
        if encoding != &#34;utf-8&#34;:
            print(&#34;detected encoding: %s&#34; % (encoding))

    try:
        texts = [text.decode(encoding) for text in texts]
    except:
        print(
            &#34;Decoding with %s failed 1st attempt - using %s with skips&#34;
            % (encoding, encoding)
        )
        texts = U.decode_by_line(texts, encoding=encoding)
    return (texts, labels, label_names)</code></pre>
</details>
</dd>
</dl>
<h3>Methods</h3>
<dl>
<dt id="ktrain.text.shallownlp.Classifier.create_model"><code class="name flex">
<span>def <span class="ident">create_model</span></span>(<span>self, ctype, texts, hp_dict={}, ngram_range=(1, 3), binary=True)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>create a model
Args:
  ctype(str): one of {'nbsvm', 'logreg', 'sgdclassifier'}
  texts(list): list of texts
  hp_dict(dict): dictionary of hyperparameters to use for the ctype selected.
                 hp_dict can also be used to supply arguments to CountVectorizer
  ngram_range(tuple): default ngram_range.
                      overridden if 'ngram_range' in hp_dict
  binary(bool): default value for binary argument to CountVectorizer.
                overridden if 'binary' key in hp_dict
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def create_model(self, ctype, texts, hp_dict={}, ngram_range=(1, 3), binary=True):
    &#34;&#34;&#34;
    ```
    create a model
    Args:
      ctype(str): one of {&#39;nbsvm&#39;, &#39;logreg&#39;, &#39;sgdclassifier&#39;}
      texts(list): list of texts
      hp_dict(dict): dictionary of hyperparameters to use for the ctype selected.
                     hp_dict can also be used to supply arguments to CountVectorizer
      ngram_range(tuple): default ngram_range.
                          overridden if &#39;ngram_range&#39; in hp_dict
      binary(bool): default value for binary argument to CountVectorizer.
                    overridden if &#39;binary&#39; key in hp_dict
    ```
    &#34;&#34;&#34;
    lang = U.detect_lang(texts)
    if U.is_chinese(lang):
        token_pattern = r&#34;(?u)\b\w+\b&#34;
    else:
        token_pattern = r&#34;\w+|[%s]&#34; % string.punctuation
    if ctype == &#34;nbsvm&#34;:
        clf = NBSVM(
            C=hp_dict.get(&#34;C&#34;, 0.01),
            alpha=hp_dict.get(&#34;alpha&#34;, 0.75),
            beta=hp_dict.get(&#34;beta&#34;, 0.25),
            fit_intercept=hp_dict.get(&#34;fit_intercept&#34;, False),
        )
    elif ctype == &#34;logreg&#34;:
        clf = LogisticRegression(
            C=hp_dict.get(&#34;C&#34;, 0.1),
            dual=hp_dict.get(&#34;dual&#34;, True),
            penalty=hp_dict.get(&#34;penalty&#34;, &#34;l2&#34;),
            tol=hp_dict.get(&#34;tol&#34;, 1e-4),
            intercept_scaling=hp_dict.get(&#34;intercept_scaling&#34;, 1),
            solver=hp_dict.get(&#34;solver&#34;, &#34;liblinear&#34;),
            max_iter=hp_dict.get(&#34;max_iter&#34;, 100),
            multi_class=hp_dict.get(&#34;multi_class&#34;, &#34;auto&#34;),
            warm_start=hp_dict.get(&#34;warm_start&#34;, False),
            n_jobs=hp_dict.get(&#34;n_jobs&#34;, None),
            l1_ratio=hp_dict.get(&#34;l1_ratio&#34;, None),
            random_state=hp_dict.get(&#34;random_state&#34;, 42),
            class_weight=hp_dict.get(&#34;class_weight&#34;, None),
        )
    elif ctype == &#34;sgdclassifier&#34;:
        clf = SGDClassifier(
            loss=hp_dict.get(&#34;loss&#34;, &#34;hinge&#34;),
            penalty=hp_dict.get(&#34;penalty&#34;, &#34;l2&#34;),
            alpha=hp_dict.get(&#34;alpha&#34;, 1e-3),
            random_state=hp_dict.get(&#34;random_state&#34;, 42),
            max_iter=hp_dict.get(&#34;max_iter&#34;, 5),  # scikit-learn default is 1000
            tol=hp_dict.get(&#34;tol&#34;, None),
            l1_ratio=hp_dict.get(&#34;l1_ratio&#34;, 0.15),
            fit_intercept=hp_dict.get(&#34;fit_intercept&#34;, True),
            episilon=hp_dict.get(&#34;epsilon&#34;, 0.1),
            n_jobs=hp_dict.get(&#34;n_jobs&#34;, None),
            learning_rate=hp_dict.get(&#34;learning_rate&#34;, &#34;optimal&#34;),
            eta0=hp_dict.get(&#34;eta0&#34;, 0.0),
            power_t=hp_dict.get(&#34;power_t&#34;, 0.5),
            early_stopping=hp_dict.get(&#34;early_stopping&#34;, False),
            validation_fraction=hp_dict.get(&#34;validation_fraction&#34;, 0.1),
            n_iter_no_change=hp_dict.get(&#34;n_iter_no_change&#34;, 5),
            warm_start=hp_dict.get(&#34;warm_start&#34;, False),
            average=hp_dict.get(&#34;average&#34;, False),
            class_weight=hp_dict.get(&#34;class_weight&#34;, None),
        )
    else:
        raise ValueError(&#34;Unknown ctype: %s&#34; % (ctype))

    self.model = Pipeline(
        [
            (
                &#34;vect&#34;,
                CountVectorizer(
                    ngram_range=hp_dict.get(&#34;ngram_range&#34;, ngram_range),
                    binary=hp_dict.get(&#34;binary&#34;, binary),
                    token_pattern=token_pattern,
                    max_features=hp_dict.get(&#34;max_features&#34;, None),
                    max_df=hp_dict.get(&#34;max_df&#34;, 1.0),
                    min_df=hp_dict.get(&#34;min_df&#34;, 1),
                    stop_words=hp_dict.get(&#34;stop_words&#34;, None),
                    lowercase=hp_dict.get(&#34;lowercase&#34;, True),
                    strip_accents=hp_dict.get(&#34;strip_accents&#34;, None),
                    encoding=hp_dict.get(&#34;encoding&#34;, &#34;utf-8&#34;),
                ),
            ),
            (&#34;clf&#34;, clf),
        ]
    )
    return</code></pre>
</details>
</dd>
<dt id="ktrain.text.shallownlp.Classifier.evaluate"><code class="name flex">
<span>def <span class="ident">evaluate</span></span>(<span>self, x_test, y_test)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>evaluate
Args:
  x_test(list or np.ndarray):  test texts
  y_test(np.ndarray):  test labels
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def evaluate(self, x_test, y_test):
    &#34;&#34;&#34;
    ```
    evaluate
    Args:
      x_test(list or np.ndarray):  test texts
      y_test(np.ndarray):  test labels
    ```
    &#34;&#34;&#34;
    predicted = self.predict(x_test)
    return np.mean(predicted == y_test)</code></pre>
</details>
</dd>
<dt id="ktrain.text.shallownlp.Classifier.fit"><code class="name flex">
<span>def <span class="ident">fit</span></span>(<span>self, x_train, y_train, ctype='logreg')</span>
</code></dt>
<dd>
<div class="desc"><pre><code>train a classifier
Args:
  x_train(list or np.ndarray):  training texts
  y_train(np.ndarray):  training labels
  ctype(str):  One of {'logreg', 'nbsvm', 'sgdclassifier'}.  default:logreg
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def fit(self, x_train, y_train, ctype=&#34;logreg&#34;):
    &#34;&#34;&#34;
    ```
    train a classifier
    Args:
      x_train(list or np.ndarray):  training texts
      y_train(np.ndarray):  training labels
      ctype(str):  One of {&#39;logreg&#39;, &#39;nbsvm&#39;, &#39;sgdclassifier&#39;}.  default:logreg
    ```
    &#34;&#34;&#34;
    lang = U.detect_lang(x_train)
    if U.is_chinese(lang):
        x_train = U.split_chinese(x_train)
    if self.model is None:
        self.create_model(ctype, x_train)
    self.model.fit(x_train, y_train)
    return self</code></pre>
</details>
</dd>
<dt id="ktrain.text.shallownlp.Classifier.grid_search"><code class="name flex">
<span>def <span class="ident">grid_search</span></span>(<span>self, params, x_train, y_train, n_jobs=-1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Performs grid search to find optimal set of hyperparameters
Args:
  params (dict):  A dictionary defining the space of the search.
                  Example for finding optimal value of alpha in NBSVM:
                parameters = {
                              #'clf__C': (1e0, 1e-1, 1e-2),
                              'clf__alpha': (0.1, 0.2, 0.4, 0.5, 0.75, 0.9, 1.0),
                              #'clf__fit_intercept': (True, False),
                              #'clf__beta' : (0.1, 0.25, 0.5, 0.9)
                              }
  n_jobs(int): number of jobs to run in parallel.  default:-1 (use all processors)
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def grid_search(self, params, x_train, y_train, n_jobs=-1):
    &#34;&#34;&#34;
    ```
    Performs grid search to find optimal set of hyperparameters
    Args:
      params (dict):  A dictionary defining the space of the search.
                      Example for finding optimal value of alpha in NBSVM:
                    parameters = {
                                  #&#39;clf__C&#39;: (1e0, 1e-1, 1e-2),
                                  &#39;clf__alpha&#39;: (0.1, 0.2, 0.4, 0.5, 0.75, 0.9, 1.0),
                                  #&#39;clf__fit_intercept&#39;: (True, False),
                                  #&#39;clf__beta&#39; : (0.1, 0.25, 0.5, 0.9)
                                  }
      n_jobs(int): number of jobs to run in parallel.  default:-1 (use all processors)
    ```
    &#34;&#34;&#34;
    gs_clf = GridSearchCV(self.model, params, n_jobs=n_jobs)
    gs_clf = gs_clf.fit(x_train, y_train)
    # gs_clf.best_score_
    for param_name in sorted(params.keys()):
        print(&#34;%s: %r&#34; % (param_name, gs_clf.best_params_[param_name]))
    return</code></pre>
</details>
</dd>
<dt id="ktrain.text.shallownlp.Classifier.load"><code class="name flex">
<span>def <span class="ident">load</span></span>(<span>self, filename)</span>
</code></dt>
<dd>
<div class="desc"><p>load model</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def load(self, filename):
    &#34;&#34;&#34;
    load model
    &#34;&#34;&#34;
    self.model = load(filename)</code></pre>
</details>
</dd>
<dt id="ktrain.text.shallownlp.Classifier.predict"><code class="name flex">
<span>def <span class="ident">predict</span></span>(<span>self, x_test, return_proba=False)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>make predictions on text data
Args:
  x_test(list or np.ndarray or str): array of texts on which to make predictions or a string representing text
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def predict(self, x_test, return_proba=False):
    &#34;&#34;&#34;
    ```
    make predictions on text data
    Args:
      x_test(list or np.ndarray or str): array of texts on which to make predictions or a string representing text
    ```
    &#34;&#34;&#34;
    if return_proba and not hasattr(self.model[&#34;clf&#34;], &#34;predict_proba&#34;):
        raise ValueError(
            &#34;%s does not support predict_proba&#34; % (type(self.model[&#34;clf&#34;]).__name__)
        )
    if isinstance(x_test, str):
        x_test = [x_test]
    lang = U.detect_lang(x_test)
    if U.is_chinese(lang):
        x_test = U.split_chinese(x_test)
    if self.model is None:
        raise ValueError(&#34;model is None - call fit or load to set the model&#34;)
    if return_proba:
        predicted = self.model.predict_proba(x_test)
    else:
        predicted = self.model.predict(x_test)
    if len(predicted) == 1:
        predicted = predicted[0]
    return predicted</code></pre>
</details>
</dd>
<dt id="ktrain.text.shallownlp.Classifier.predict_proba"><code class="name flex">
<span>def <span class="ident">predict_proba</span></span>(<span>self, x_test)</span>
</code></dt>
<dd>
<div class="desc"><p>predict_proba</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def predict_proba(self, x_test):
    &#34;&#34;&#34;
    predict_proba
    &#34;&#34;&#34;
    return self.predict(x_test, return_proba=True)</code></pre>
</details>
</dd>
<dt id="ktrain.text.shallownlp.Classifier.save"><code class="name flex">
<span>def <span class="ident">save</span></span>(<span>self, filename)</span>
</code></dt>
<dd>
<div class="desc"><p>save model</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def save(self, filename):
    &#34;&#34;&#34;
    save model
    &#34;&#34;&#34;
    dump(self.model, filename)</code></pre>
</details>
</dd>
</dl>
</dd>
<dt id="ktrain.text.shallownlp.NER"><code class="flex name class">
<span>class <span class="ident">NER</span></span>
<span>(</span><span>lang='en', predictor_path=None)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>pretrained NER.
Only English, Chinese, and Russian are currently supported.

Args:
  lang(str): Currently, one of {'en', 'zh', 'ru'}: en=English , zh=Chinese, or ru=Russian
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class NER:
    def __init__(self, lang=&#34;en&#34;, predictor_path=None):
        &#34;&#34;&#34;
        ```
        pretrained NER.
        Only English, Chinese, and Russian are currently supported.

        Args:
          lang(str): Currently, one of {&#39;en&#39;, &#39;zh&#39;, &#39;ru&#39;}: en=English , zh=Chinese, or ru=Russian
        ```
        &#34;&#34;&#34;
        if lang is None:
            raise ValueError(
                &#39;lang is required (e.g., &#34;en&#34; for English, &#34;zh&#34; for Chinese, &#34;ru&#34; for Russian, etc.&#39;
            )
        if predictor_path is None and lang not in [&#34;en&#34;, &#34;zh&#34;, &#34;ru&#34;]:
            raise ValueError(
                &#34;Unsupported language: if predictor_path is None,  then lang must be &#34;
                + &#34;&#39;en&#39; for English, &#39;zh&#39; for Chinese, or &#39;ru&#39; for Chinese&#34;
            )
        self.lang = lang
        if os.environ.get(&#34;DISABLE_V2_BEHAVIOR&#34;, None) != &#34;1&#34;:
            warnings.warn(
                &#34;Please add os.environ[&#39;DISABLE_V2_BEHAVIOR&#39;] = &#39;1&#39; at top of your script or notebook&#34;
            )
            msg = (
                &#34;\nNER in ktrain uses the CRF module from keras_contrib, which is not yet\n&#34;
                + &#34;fully compatible with TensorFlow 2. To use NER, you must add the following to the top of your\n&#34;
                + &#34;script or notebook BEFORE you import ktrain (after restarting runtime):\n\n&#34;
                + &#34;import os\n&#34;
                + &#34;os.environ[&#39;DISABLE_V2_BEHAVIOR&#39;] = &#39;1&#39;\n&#34;
            )
            print(msg)
            return
        else:
            import tensorflow.compat.v1 as tf

            tf.disable_v2_behavior()

        if predictor_path is None and self.lang == &#34;zh&#34;:
            dirpath = os.path.dirname(os.path.abspath(__file__))
            fpath = os.path.join(dirpath, &#34;ner_models/ner_chinese&#34;)
        elif predictor_path is None and self.lang == &#34;ru&#34;:
            dirpath = os.path.dirname(os.path.abspath(__file__))
            fpath = os.path.join(dirpath, &#34;ner_models/ner_russian&#34;)
        elif predictor_path is None and self.lang == &#34;en&#34;:
            dirpath = os.path.dirname(os.path.abspath(__file__))
            fpath = os.path.join(dirpath, &#34;ner_models/ner_english&#34;)
        elif predictor_path is None:
            raise ValueError(
                &#34;Unsupported language: if predictor_path is None,  then lang must be &#34;
                + &#34;&#39;en&#39; for English, &#39;zh&#39; for Chinese, or &#39;ru&#39; for Chinese&#34;
            )
        else:
            if not os.path.isfile(predictor_path) or not os.path.isfile(
                predictor_path + &#34;.preproc&#34;
            ):
                raise ValueError(
                    &#34;could not find a valid predictor model &#34;
                    + &#34;%s or valid Preprocessor %s at specified path&#34;
                    % (predictor_path, predictor_path + &#34;.preproc&#34;)
                )
            fpath = predictor_path
        try:
            import io
            from contextlib import redirect_stdout

            f = io.StringIO()
            with redirect_stdout(f):
                import ktrain
        except:
            raise ValueError(
                &#34;ktrain could not be imported. Install with: pip install ktrain&#34;
            )
        self.predictor = ktrain.load_predictor(fpath)

    def predict(self, texts, merge_tokens=True, batch_size=32):
        &#34;&#34;&#34;
        ```
        Extract named entities from supplied text

        Args:
          texts (list of str or str): list of texts to annotate
          merge_tokens(bool):  If True, tokens will be merged together by the entity
                               to which they are associated:
                               (&#39;Paul&#39;, &#39;B-PER&#39;), (&#39;Newman&#39;, &#39;I-PER&#39;) becomes (&#39;Paul Newman&#39;, &#39;PER&#39;)
          batch_size(int):    Batch size to use for predictions (default:32)
        ```
        &#34;&#34;&#34;
        if isinstance(texts, str):
            texts = [texts]
        self.predictor.batch_size = batch_size
        texts = [t.strip() for t in texts]
        results = self.predictor.predict(texts, merge_tokens=merge_tokens)
        if len(results) == 1:
            results = results[0]
        return results

    # 2020-04-30: moved to text.ner.predictor
    # def merge_tokens(self, annotated_sentence):
    #    if self.lang.startswith(&#39;zh&#39;):
    #        sep = &#39;&#39;
    #    else:
    #        sep = &#39; &#39;
    #    current_token = &#34;&#34;
    #    current_tag = &#34;&#34;
    #    entities = []

    #    for tup in annotated_sentence:
    #        token = tup[0]
    #        entity = tup[1]
    #        tag = entity.split(&#39;-&#39;)[1] if &#39;-&#39; in entity else None
    #        prefix = entity.split(&#39;-&#39;)[0] if &#39;-&#39; in entity else None
    #        # not within entity
    #        if tag is None and not current_token:
    #            continue
    #        # beginning of entity
    #        #elif tag and prefix==&#39;B&#39;:
    #        elif tag and (prefix==&#39;B&#39; or prefix==&#39;I&#39; and not current_token):
    #            if current_token: # consecutive entities
    #                entities.append((current_token, current_tag))
    #                current_token = &#34;&#34;
    #                current_tag = None
    #            current_token = token
    #            current_tag = tag
    #        # end of entity
    #        elif tag is None and current_token:
    #            entities.append((current_token, current_tag))
    #            current_token = &#34;&#34;
    #            current_tag = None
    #            continue
    #        # within entity
    #        elif tag and current_token:  #  prefix I
    #            current_token = current_token + sep + token
    #            current_tag = tag
    #    return entities</code></pre>
</details>
<h3>Methods</h3>
<dl>
<dt id="ktrain.text.shallownlp.NER.predict"><code class="name flex">
<span>def <span class="ident">predict</span></span>(<span>self, texts, merge_tokens=True, batch_size=32)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Extract named entities from supplied text

Args:
  texts (list of str or str): list of texts to annotate
  merge_tokens(bool):  If True, tokens will be merged together by the entity
                       to which they are associated:
                       ('Paul', 'B-PER'), ('Newman', 'I-PER') becomes ('Paul Newman', 'PER')
  batch_size(int):    Batch size to use for predictions (default:32)
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def predict(self, texts, merge_tokens=True, batch_size=32):
    &#34;&#34;&#34;
    ```
    Extract named entities from supplied text

    Args:
      texts (list of str or str): list of texts to annotate
      merge_tokens(bool):  If True, tokens will be merged together by the entity
                           to which they are associated:
                           (&#39;Paul&#39;, &#39;B-PER&#39;), (&#39;Newman&#39;, &#39;I-PER&#39;) becomes (&#39;Paul Newman&#39;, &#39;PER&#39;)
      batch_size(int):    Batch size to use for predictions (default:32)
    ```
    &#34;&#34;&#34;
    if isinstance(texts, str):
        texts = [texts]
    self.predictor.batch_size = batch_size
    texts = [t.strip() for t in texts]
    results = self.predictor.predict(texts, merge_tokens=merge_tokens)
    if len(results) == 1:
        results = results[0]
    return results</code></pre>
</details>
</dd>
</dl>
</dd>
<dt id="ktrain.text.shallownlp.Searcher"><code class="flex name class">
<span>class <span class="ident">Searcher</span></span>
<span>(</span><span>queries, lang=None)</span>
</code></dt>
<dd>
<div class="desc"><p>Search for keywords in text documents</p>
<pre><code>Args:
  queries(list of str): list of text queries
  lang(str): language of queries.  default:None --&gt; auto-detected
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class Searcher:
    &#34;&#34;&#34;
    Search for keywords in text documents
    &#34;&#34;&#34;

    def __init__(self, queries, lang=None):
        &#34;&#34;&#34;
        ```
        Args:
          queries(list of str): list of text queries
          lang(str): language of queries.  default:None --&gt; auto-detected
        ```
        &#34;&#34;&#34;
        self.queries = queries
        if isinstance(self.queries, str):
            self.queries = [self.queries]
        self.lang = lang
        if self.lang is None:
            self.lang = U.detect_lang(queries)
        # print(&#34;lang:%s&#34; %(self.lang))

    def search(self, docs, case_sensitive=False, keys=[], min_matches=1, progress=True):
        &#34;&#34;&#34;
        ```
        executes self.queries on supplied list of documents
        Args:
          docs(list of str): list of texts
          case_sensitive(bool):  If True, case sensitive search
          keys(list): list keys for supplied docs (e.g., file paths).
                      default: key is index in range(len(docs))
          min_matches(int): results must have at least these many word matches
          progress(bool): whether or not to show progress bar
        Returns:
          list of tuples of results of the form:
            (key, query, no. of matches)
          For Chinese, no. of matches will be number of unique Jieba-extracted character sequences that match
        ```
        &#34;&#34;&#34;
        if isinstance(docs, str):
            docs = [docs]
        if keys and len(keys) != len(docs):
            raise ValueError(&#34;lengths of keys and docs must be the same&#34;)
        results = []
        l = len(docs)
        for idx, text in enumerate(docs):
            for q in self.queries:
                if U.is_chinese(self.lang):
                    r = self._search_chinese(
                        q, [text], min_matches=min_matches, parse=1, progress=False
                    )
                elif self.lang == &#34;ar&#34;:
                    r = self._search(
                        q,
                        [text],
                        case_sensitive=case_sensitive,
                        min_matches=min_matches,
                        progress=False,
                        substrings_on=True,
                    )
                else:
                    r = self._search(
                        q,
                        [text],
                        case_sensitive=case_sensitive,
                        min_matches=min_matches,
                        progress=False,
                        substrings_on=False,
                    )
                if not r:
                    continue
                r = r[0]
                k = idx
                if keys:
                    k = keys[idx]
                num_matches = len(set(r[2])) if U.is_chinese(self.lang) else len(r[2])
                results.append((k, q, num_matches))
            if progress:
                printProgressBar(
                    idx + 1, l, prefix=&#34;progress: &#34;, suffix=&#34;complete&#34;, length=50
                )
        return results

    def _search(
        self,
        query,
        docs,
        case_sensitive=False,
        substrings_on=False,
        min_matches=1,
        progress=True,
    ):
        &#34;&#34;&#34;
        ```
        search documents for query string.
        Args:
            query(str or list):  the word or phrase to search (or list of them)
                                 if list is provided, each element is combined using OR
            docs (list of str): list of text documents
            case_sensitive(bool):  If True, case sensitive search
            substrings_on(bool): if False, the pattern is wrapped in &#34;\b&#34; word boundaries. default:False
                                 If True, will find substrings
        returns:
            list:  one (index, text, matches) tuple for each document with at least min_matches matches
        ```
        &#34;&#34;&#34;
        if not isinstance(query, (list, tuple, str)):
            raise ValueError(&#34;query must be str or list of str&#34;)
        if isinstance(query, str):
            query = [query]
        if not isinstance(docs, (list, np.ndarray)):
            raise ValueError(&#34;docs must be list of str&#34;)

        flag = 0
        if not case_sensitive:
            flag = re.I
        qlist = []
        for q in query:
            qlist.append(&#34;\s+&#34;.join(q.split()))
        original_query = query
        query = &#34;|&#34;.join(qlist)
        bound = r&#34;\b&#34;
        if substrings_on:
            bound = &#34;&#34;
        pattern_str = r&#34;%s(?:%s)%s&#34; % (bound, query, bound)
        pattern = re.compile(pattern_str, flag)

        results = []
        l = len(docs)
        for idx, text in enumerate(docs):
            matches = pattern.findall(text)
            if matches and len(matches) &gt;= min_matches:
                results.append((idx, text, matches))
            if progress:
                printProgressBar(
                    idx + 1, l, prefix=&#34;progress: &#34;, suffix=&#34;complete&#34;, length=50
                )
        return results

    def _search_chinese(
        self, query, docs, substrings_on=True, parse=1, min_matches=1, progress=False
    ):
        &#34;&#34;&#34;
        convenience method to search chinese text
        &#34;&#34;&#34;
        original_query = query
        if not isinstance(query, str):
            raise ValueError(&#34;query must be str&#34;)
        if parse &gt; 0:
            q = U.split_chinese(query)[0]
            num_words = len(q.split())
            query = build_ngrams(q, n=parse)
            query = [&#34;&#34;.join(q) for q in query]
        return self._search(query, docs, substrings_on=substrings_on, progress=progress)</code></pre>
</details>
<h3>Methods</h3>
<dl>
<dt id="ktrain.text.shallownlp.Searcher.search"><code class="name flex">
<span>def <span class="ident">search</span></span>(<span>self, docs, case_sensitive=False, keys=[], min_matches=1, progress=True)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>executes self.queries on supplied list of documents
Args:
  docs(list of str): list of texts
  case_sensitive(bool):  If True, case sensitive search
  keys(list): list keys for supplied docs (e.g., file paths).
              default: key is index in range(len(docs))
  min_matches(int): results must have at least these many word matches
  progress(bool): whether or not to show progress bar
Returns:
  list of tuples of results of the form:
    (key, query, no. of matches)
  For Chinese, no. of matches will be number of unique Jieba-extracted character sequences that match
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def search(self, docs, case_sensitive=False, keys=[], min_matches=1, progress=True):
    &#34;&#34;&#34;
    ```
    executes self.queries on supplied list of documents
    Args:
      docs(list of str): list of texts
      case_sensitive(bool):  If True, case sensitive search
      keys(list): list keys for supplied docs (e.g., file paths).
                  default: key is index in range(len(docs))
      min_matches(int): results must have at least these many word matches
      progress(bool): whether or not to show progress bar
    Returns:
      list of tuples of results of the form:
        (key, query, no. of matches)
      For Chinese, no. of matches will be number of unique Jieba-extracted character sequences that match
    ```
    &#34;&#34;&#34;
    if isinstance(docs, str):
        docs = [docs]
    if keys and len(keys) != len(docs):
        raise ValueError(&#34;lengths of keys and docs must be the same&#34;)
    results = []
    l = len(docs)
    for idx, text in enumerate(docs):
        for q in self.queries:
            if U.is_chinese(self.lang):
                r = self._search_chinese(
                    q, [text], min_matches=min_matches, parse=1, progress=False
                )
            elif self.lang == &#34;ar&#34;:
                r = self._search(
                    q,
                    [text],
                    case_sensitive=case_sensitive,
                    min_matches=min_matches,
                    progress=False,
                    substrings_on=True,
                )
            else:
                r = self._search(
                    q,
                    [text],
                    case_sensitive=case_sensitive,
                    min_matches=min_matches,
                    progress=False,
                    substrings_on=False,
                )
            if not r:
                continue
            r = r[0]
            k = idx
            if keys:
                k = keys[idx]
            num_matches = len(set(r[2])) if U.is_chinese(self.lang) else len(r[2])
            results.append((k, q, num_matches))
        if progress:
            printProgressBar(
                idx + 1, l, prefix=&#34;progress: &#34;, suffix=&#34;complete&#34;, length=50
            )
    return results</code></pre>
</details>
</dd>
</dl>
</dd>
</dl>
</section>
</article>
<!-- Sidebar index: links to the parent module, the sub-modules, and the
     functions and classes documented on this page. -->
<nav id="sidebar">
<h1>Index</h1>
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="ktrain.text" href="../index.html">ktrain.text</a></code></li>
</ul>
</li>
<li><h3><a href="#header-submodules">Sub-modules</a></h3>
<ul>
<li><code><a title="ktrain.text.shallownlp.classifier" href="classifier.html">ktrain.text.shallownlp.classifier</a></code></li>
<li><code><a title="ktrain.text.shallownlp.imports" href="imports.html">ktrain.text.shallownlp.imports</a></code></li>
<li><code><a title="ktrain.text.shallownlp.ner" href="ner.html">ktrain.text.shallownlp.ner</a></code></li>
<li><code><a title="ktrain.text.shallownlp.searcher" href="searcher.html">ktrain.text.shallownlp.searcher</a></code></li>
<li><code><a title="ktrain.text.shallownlp.utils" href="utils.html">ktrain.text.shallownlp.utils</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="two-column">
<li><code><a title="ktrain.text.shallownlp.extract_filenames" href="#ktrain.text.shallownlp.extract_filenames">extract_filenames</a></code></li>
<li><code><a title="ktrain.text.shallownlp.find_arabic" href="#ktrain.text.shallownlp.find_arabic">find_arabic</a></code></li>
<li><code><a title="ktrain.text.shallownlp.find_chinese" href="#ktrain.text.shallownlp.find_chinese">find_chinese</a></code></li>
<li><code><a title="ktrain.text.shallownlp.find_russian" href="#ktrain.text.shallownlp.find_russian">find_russian</a></code></li>
<li><code><a title="ktrain.text.shallownlp.read_text" href="#ktrain.text.shallownlp.read_text">read_text</a></code></li>
<li><code><a title="ktrain.text.shallownlp.search" href="#ktrain.text.shallownlp.search">search</a></code></li>
<li><code><a title="ktrain.text.shallownlp.sent_tokenize" href="#ktrain.text.shallownlp.sent_tokenize">sent_tokenize</a></code></li>
</ul>
</li>
<li><h3><a href="#header-classes">Classes</a></h3>
<ul>
<li>
<h4><code><a title="ktrain.text.shallownlp.Classifier" href="#ktrain.text.shallownlp.Classifier">Classifier</a></code></h4>
<!-- Members of Classifier. -->
<ul>
<li><code><a title="ktrain.text.shallownlp.Classifier.create_model" href="#ktrain.text.shallownlp.Classifier.create_model">create_model</a></code></li>
<li><code><a title="ktrain.text.shallownlp.Classifier.evaluate" href="#ktrain.text.shallownlp.Classifier.evaluate">evaluate</a></code></li>
<li><code><a title="ktrain.text.shallownlp.Classifier.fit" href="#ktrain.text.shallownlp.Classifier.fit">fit</a></code></li>
<li><code><a title="ktrain.text.shallownlp.Classifier.grid_search" href="#ktrain.text.shallownlp.Classifier.grid_search">grid_search</a></code></li>
<li><code><a title="ktrain.text.shallownlp.Classifier.load" href="#ktrain.text.shallownlp.Classifier.load">load</a></code></li>
<li><code><a title="ktrain.text.shallownlp.Classifier.load_texts_from_csv" href="#ktrain.text.shallownlp.Classifier.load_texts_from_csv">load_texts_from_csv</a></code></li>
<li><code><a title="ktrain.text.shallownlp.Classifier.load_texts_from_folder" href="#ktrain.text.shallownlp.Classifier.load_texts_from_folder">load_texts_from_folder</a></code></li>
<li><code><a title="ktrain.text.shallownlp.Classifier.predict" href="#ktrain.text.shallownlp.Classifier.predict">predict</a></code></li>
<li><code><a title="ktrain.text.shallownlp.Classifier.predict_proba" href="#ktrain.text.shallownlp.Classifier.predict_proba">predict_proba</a></code></li>
<li><code><a title="ktrain.text.shallownlp.Classifier.save" href="#ktrain.text.shallownlp.Classifier.save">save</a></code></li>
</ul>
</li>
<li>
<h4><code><a title="ktrain.text.shallownlp.NER" href="#ktrain.text.shallownlp.NER">NER</a></code></h4>
<!-- Members of NER. -->
<ul>
<li><code><a title="ktrain.text.shallownlp.NER.predict" href="#ktrain.text.shallownlp.NER.predict">predict</a></code></li>
</ul>
</li>
<li>
<h4><code><a title="ktrain.text.shallownlp.Searcher" href="#ktrain.text.shallownlp.Searcher">Searcher</a></code></h4>
<!-- Members of Searcher. -->
<ul>
<li><code><a title="ktrain.text.shallownlp.Searcher.search" href="#ktrain.text.shallownlp.Searcher.search">search</a></code></li>
</ul>
</li>
</ul>
</li>
</ul>
</nav>
</main>
<!-- Page footer: credits the pdoc version that generated this page. -->
<footer id="footer">
  <p>
    Generated by
    <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.10.0</a>.
  </p>
</footer>
</body>
</html>