docs/text/ner/preprocessor.html

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1" />
<meta name="generator" content="pdoc 0.10.0" />
<title>ktrain.text.ner.preprocessor API documentation</title>
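<meta name="description" content="API documentation for ktrain.text.ner.preprocessor: the NERPreprocessor class plus helpers that load CoNLL-2003 and GMB data into DataFrames and extract token and tag sequences." />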
<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/sanitize.min.css" integrity="sha256-PK9q560IAAa6WVRRh76LtCaI8pjTJ2z11v0miyNNjrs=" crossorigin>
<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/typography.min.css" integrity="sha256-7l/o7C8jubJiy74VsKTidCy1yBkRtiUGbVkYBylBqUg=" crossorigin>
<link rel="stylesheet preload" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/styles/github.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:30px;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:1em 0 .50em 0}h3{font-size:1.4em;margin:25px 0 10px 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:.8em;line-height:1.4em}code{background:#f2f2f1;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:0;border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0;padding:1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-weight:bold;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target .name{background:var(--highlight-color)}.name > 
span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em .5em;margin-bottom:1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.item .name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:1.5em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/highlight.min.js" integrity="sha256-Uv3H6lx7dJmRfRvH8TH6kJD1TSK1aFcwgx+mdg3epi8=" crossorigin></script>
<script>
// Highlight the embedded source listings once the DOM is parsed. The deferred
// highlight.js script above runs before DOMContentLoaded, so `hljs` is normally
// defined here; the guard keeps the page free of a ReferenceError when that
// script could not be loaded (offline viewing, blocked CDN, integrity mismatch).
window.addEventListener('DOMContentLoaded', () => {
  if (window.hljs) hljs.initHighlighting();
});
</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>ktrain.text.ner.preprocessor</code></h1>
</header>
<section id="section-intro">
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">from ... import utils as U
from ...imports import *
from ...preprocessor import Preprocessor
from .. import preprocessor as tpp
from .. import textutils as TU

OTHER = &#34;O&#34;
W2V = &#34;word2vec&#34;
SUPPORTED_EMBEDDINGS = [W2V]

WORD_COL = &#34;Word&#34;
TAG_COL = &#34;Tag&#34;
SENT_COL = &#34;SentenceID&#34;


# tokenizer_filter = rs=&#39;!&#34;#$%&amp;()*+,-./:;&lt;=&gt;?@[\\]^_`{|}~\t\n&#39;
# re_tok = re.compile(f&#39;([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])&#39;)
# def tokenize(s): return re_tok.sub(r&#39; \1 &#39;, s).split()


class NERPreprocessor(Preprocessor):
    &#34;&#34;&#34;
    NER preprocessing base class
    &#34;&#34;&#34;

    def __init__(self, p):
        self.p = p
        self.c = p._label_vocab._id2token

    def get_preprocessor(self):
        return self.p

    def get_classes(self):
        return self.c

    def filter_embeddings(self, embeddings, vocab, dim):
        &#34;&#34;&#34;Loads word vectors in numpy array.

        Args:
            embeddings (dict or TransformerEmbedding): a dictionary of numpy array or Transformer Embedding instance
            vocab (dict): word_index lookup table.

        Returns:
            numpy array: an array of word embeddings.
        &#34;&#34;&#34;
        if not isinstance(embeddings, dict):
            return
        _embeddings = np.zeros([len(vocab), dim])
        for word in vocab:
            if word in embeddings:
                word_idx = vocab[word]
                _embeddings[word_idx] = embeddings[word]
        return _embeddings

    def get_wv_model(self, wv_path_or_url, verbose=1):
        if wv_path_or_url is None:
            raise ValueError(
                &#34;wordvector_path_or_url is empty: supply a file path or &#34;
                + &#34;URL to fasttext word vector file&#34;
            )
        if verbose:
            print(
                &#34;pretrained word embeddings will be loaded from:\n\t%s&#34;
                % (wv_path_or_url)
            )
        word_embedding_dim = 300  # all fasttext word vectors are of dim=300
        embs = tpp.load_wv(wv_path_or_url, verbose=verbose)
        wv_model = self.filter_embeddings(
            embs, self.p._word_vocab.vocab, word_embedding_dim
        )
        return (wv_model, word_embedding_dim)

    def preprocess(self, sentences, lang=None, custom_tokenizer=None):
        if type(sentences) != list:
            raise ValueError(&#34;Param sentences must be a list of strings&#34;)

        # language detection
        if lang is None:
            lang = TU.detect_lang(sentences)

        # set tokenizer
        if custom_tokenizer is not None:
            tokfunc = custom_tokenizer
        elif TU.is_chinese(
            lang, strict=False
        ):  # strict=False: workaround for langdetect bug on short chinese texts
            tokfunc = lambda text: [c for c in text]
        else:
            tokfunc = TU.tokenize

        # preprocess
        X = []
        y = []
        for s in sentences:
            tokens = tokfunc(s)
            X.append(tokens)
            y.append([OTHER] * len(tokens))
        from .dataset import NERSequence

        nerseq = NERSequence(X, y, p=self.p)
        return nerseq

    def preprocess_test(self, x_test, y_test, verbose=1):
        &#34;&#34;&#34;
        Args:
          x_test (list of lists of str): lists of token lists
          y_test (list of lists of str): lists of tag lists
          verbose (bool): verbosity
        Returns:
          NERSequence:  can be used as argument to NERLearner.validate() to evaluate test sets
        &#34;&#34;&#34;
        # array &gt; df &gt; array in order to print statistics more easily
        from .data import array_to_df

        test_df = array_to_df(x_test, y_test)
        (x_list, y_list) = process_df(test_df, verbose=verbose)
        from .dataset import NERSequence

        return NERSequence(x_list, y_list, batch_size=U.DEFAULT_BS, p=self.p)

    def preprocess_test_from_conll2003(self, filepath, verbose=1):
        df = conll2003_to_df(filepath)
        (x, y) = process_df(df)
        return self.preprocess_test(x, y, verbose=verbose)

    def undo(self, nerseq):
        &#34;&#34;&#34;
        undoes preprocessing and returns raw data by:
        converting a list or array of Word IDs back to words
        &#34;&#34;&#34;
        return [&#34; &#34;.join(e) for e in nerseq.x]

    def fit(self, X, y):
        &#34;&#34;&#34;
        Learn vocabulary from training set
        &#34;&#34;&#34;
        self.p.fit(X, y)
        return

    def transform(self, X, y=None):
        &#34;&#34;&#34;
        Transform documents to sequences of word IDs
        &#34;&#34;&#34;
        return self.p.transform(X, y=y)


def array_to_df(x_list, y_list):
    ids = []
    words = []
    tags = []
    for idx, lst in enumerate(x_list):
        length = len(lst)
        words.extend(lst)
        tags.extend(y_list[idx])
        ids.extend([idx] * length)
    return pd.DataFrame(zip(ids, words, tags), columns=[SENT_COL, WORD_COL, TAG_COL])


def conll2003_to_df(filepath, encoding=&#34;latin1&#34;):
    # read data and convert to dataframe
    sents, words, tags = [], [], []
    sent_id = 0
    docstart = False
    with open(filepath, encoding=encoding) as f:
        for line in f:
            line = line.rstrip()
            if line:
                if line.startswith(&#34;-DOCSTART-&#34;):
                    docstart = True
                    continue
                else:
                    docstart = False
                    parts = line.split()
                    words.append(parts[0])
                    tags.append(parts[-1])
                    sents.append(sent_id)
            else:
                if not docstart:
                    sent_id += 1
    df = pd.DataFrame({SENT_COL: sents, WORD_COL: words, TAG_COL: tags})
    df = df.fillna(method=&#34;ffill&#34;)
    return df


def gmb_to_df(filepath, encoding=&#34;latin1&#34;):
    df = pd.read_csv(filepath, encoding=encoding)
    df = df.fillna(method=&#34;ffill&#34;)
    return df


def process_df(
    df, sentence_column=&#34;SentenceID&#34;, word_column=&#34;Word&#34;, tag_column=&#34;Tag&#34;, verbose=1
):
    &#34;&#34;&#34;
    Extract words, tags, and sentences from dataframe
    &#34;&#34;&#34;

    # get words and tags
    words = list(set(df[word_column].values))
    n_words = len(words)
    tags = list(set(df[tag_column].values))
    n_tags = len(tags)
    if verbose:
        print(&#34;Number of sentences: &#34;, len(df.groupby([sentence_column])))
        print(&#34;Number of words in the dataset: &#34;, n_words)
        print(&#34;Tags:&#34;, tags)
        print(&#34;Number of Labels: &#34;, n_tags)

    # retrieve all sentences
    getter = SentenceGetter(df, word_column, tag_column, sentence_column)
    sentences = getter.sentences
    largest_sen = max(len(sen) for sen in sentences)
    if verbose:
        print(&#34;Longest sentence: {} words&#34;.format(largest_sen))
    data = [list(zip(*s)) for s in sentences]
    X = [list(e[0]) for e in data]
    y = [list(e[1]) for e in data]
    return (X, y)


class SentenceGetter(object):
    &#34;&#34;&#34;Class to get each sentence in this format:
    [(Token_1, Tag_1), ..., (Token_n, Tag_n)]&#34;&#34;&#34;

    def __init__(self, data, word_column, tag_column, sentence_column):
        &#34;&#34;&#34;Args:
        data is the pandas.DataFrame which contains the above dataset&#34;&#34;&#34;
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [
            (w, t)
            for w, t in zip(
                s[word_column].values.tolist(), s[tag_column].values.tolist()
            )
        ]
        self.grouped = self.data.groupby(sentence_column).apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        &#34;&#34;&#34;Return one sentence&#34;&#34;&#34;
        try:
            s = self.grouped[&#34;Sentence: {}&#34;.format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None</code></pre>
</details>
</section>
<section>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="ktrain.text.ner.preprocessor.array_to_df"><code class="name flex">
<span>def <span class="ident">array_to_df</span></span>(<span>x_list, y_list)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def array_to_df(x_list, y_list):
    ids = []
    words = []
    tags = []
    for idx, lst in enumerate(x_list):
        length = len(lst)
        words.extend(lst)
        tags.extend(y_list[idx])
        ids.extend([idx] * length)
    return pd.DataFrame(zip(ids, words, tags), columns=[SENT_COL, WORD_COL, TAG_COL])</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.preprocessor.conll2003_to_df"><code class="name flex">
<span>def <span class="ident">conll2003_to_df</span></span>(<span>filepath, encoding='latin1')</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def conll2003_to_df(filepath, encoding=&#34;latin1&#34;):
    # read data and convert to dataframe
    sents, words, tags = [], [], []
    sent_id = 0
    docstart = False
    with open(filepath, encoding=encoding) as f:
        for line in f:
            line = line.rstrip()
            if line:
                if line.startswith(&#34;-DOCSTART-&#34;):
                    docstart = True
                    continue
                else:
                    docstart = False
                    parts = line.split()
                    words.append(parts[0])
                    tags.append(parts[-1])
                    sents.append(sent_id)
            else:
                if not docstart:
                    sent_id += 1
    df = pd.DataFrame({SENT_COL: sents, WORD_COL: words, TAG_COL: tags})
    df = df.fillna(method=&#34;ffill&#34;)
    return df</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.preprocessor.gmb_to_df"><code class="name flex">
<span>def <span class="ident">gmb_to_df</span></span>(<span>filepath, encoding='latin1')</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def gmb_to_df(filepath, encoding=&#34;latin1&#34;):
    df = pd.read_csv(filepath, encoding=encoding)
    df = df.fillna(method=&#34;ffill&#34;)
    return df</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.preprocessor.process_df"><code class="name flex">
<span>def <span class="ident">process_df</span></span>(<span>df, sentence_column='SentenceID', word_column='Word', tag_column='Tag', verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><p>Extract words, tags, and sentences from dataframe</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def process_df(
    df, sentence_column=&#34;SentenceID&#34;, word_column=&#34;Word&#34;, tag_column=&#34;Tag&#34;, verbose=1
):
    &#34;&#34;&#34;
    Extract words, tags, and sentences from dataframe
    &#34;&#34;&#34;

    # get words and tags
    words = list(set(df[word_column].values))
    n_words = len(words)
    tags = list(set(df[tag_column].values))
    n_tags = len(tags)
    if verbose:
        print(&#34;Number of sentences: &#34;, len(df.groupby([sentence_column])))
        print(&#34;Number of words in the dataset: &#34;, n_words)
        print(&#34;Tags:&#34;, tags)
        print(&#34;Number of Labels: &#34;, n_tags)

    # retrieve all sentences
    getter = SentenceGetter(df, word_column, tag_column, sentence_column)
    sentences = getter.sentences
    largest_sen = max(len(sen) for sen in sentences)
    if verbose:
        print(&#34;Longest sentence: {} words&#34;.format(largest_sen))
    data = [list(zip(*s)) for s in sentences]
    X = [list(e[0]) for e in data]
    y = [list(e[1]) for e in data]
    return (X, y)</code></pre>
</details>
</dd>
</dl>
</section>
<section>
<h2 class="section-title" id="header-classes">Classes</h2>
<dl>
<dt id="ktrain.text.ner.preprocessor.NERPreprocessor"><code class="flex name class">
<span>class <span class="ident">NERPreprocessor</span></span>
<span>(</span><span>p)</span>
</code></dt>
<dd>
<div class="desc"><p>NER preprocessing base class</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class NERPreprocessor(Preprocessor):
    &#34;&#34;&#34;
    NER preprocessing base class
    &#34;&#34;&#34;

    def __init__(self, p):
        self.p = p
        self.c = p._label_vocab._id2token

    def get_preprocessor(self):
        return self.p

    def get_classes(self):
        return self.c

    def filter_embeddings(self, embeddings, vocab, dim):
        &#34;&#34;&#34;Loads word vectors in numpy array.

        Args:
            embeddings (dict or TransformerEmbedding): a dictionary of numpy array or Transformer Embedding instance
            vocab (dict): word_index lookup table.

        Returns:
            numpy array: an array of word embeddings.
        &#34;&#34;&#34;
        if not isinstance(embeddings, dict):
            return
        _embeddings = np.zeros([len(vocab), dim])
        for word in vocab:
            if word in embeddings:
                word_idx = vocab[word]
                _embeddings[word_idx] = embeddings[word]
        return _embeddings

    def get_wv_model(self, wv_path_or_url, verbose=1):
        if wv_path_or_url is None:
            raise ValueError(
                &#34;wordvector_path_or_url is empty: supply a file path or &#34;
                + &#34;URL to fasttext word vector file&#34;
            )
        if verbose:
            print(
                &#34;pretrained word embeddings will be loaded from:\n\t%s&#34;
                % (wv_path_or_url)
            )
        word_embedding_dim = 300  # all fasttext word vectors are of dim=300
        embs = tpp.load_wv(wv_path_or_url, verbose=verbose)
        wv_model = self.filter_embeddings(
            embs, self.p._word_vocab.vocab, word_embedding_dim
        )
        return (wv_model, word_embedding_dim)

    def preprocess(self, sentences, lang=None, custom_tokenizer=None):
        if type(sentences) != list:
            raise ValueError(&#34;Param sentences must be a list of strings&#34;)

        # language detection
        if lang is None:
            lang = TU.detect_lang(sentences)

        # set tokenizer
        if custom_tokenizer is not None:
            tokfunc = custom_tokenizer
        elif TU.is_chinese(
            lang, strict=False
        ):  # strict=False: workaround for langdetect bug on short chinese texts
            tokfunc = lambda text: [c for c in text]
        else:
            tokfunc = TU.tokenize

        # preprocess
        X = []
        y = []
        for s in sentences:
            tokens = tokfunc(s)
            X.append(tokens)
            y.append([OTHER] * len(tokens))
        from .dataset import NERSequence

        nerseq = NERSequence(X, y, p=self.p)
        return nerseq

    def preprocess_test(self, x_test, y_test, verbose=1):
        &#34;&#34;&#34;
        Args:
          x_test (list of lists of str): lists of token lists
          y_test (list of lists of str): lists of tag lists
          verbose (bool): verbosity
        Returns:
          NERSequence:  can be used as argument to NERLearner.validate() to evaluate test sets
        &#34;&#34;&#34;
        # array &gt; df &gt; array in order to print statistics more easily
        from .data import array_to_df

        test_df = array_to_df(x_test, y_test)
        (x_list, y_list) = process_df(test_df, verbose=verbose)
        from .dataset import NERSequence

        return NERSequence(x_list, y_list, batch_size=U.DEFAULT_BS, p=self.p)

    def preprocess_test_from_conll2003(self, filepath, verbose=1):
        df = conll2003_to_df(filepath)
        (x, y) = process_df(df)
        return self.preprocess_test(x, y, verbose=verbose)

    def undo(self, nerseq):
        &#34;&#34;&#34;
        undoes preprocessing and returns raw data by:
        converting a list or array of Word IDs back to words
        &#34;&#34;&#34;
        return [&#34; &#34;.join(e) for e in nerseq.x]

    def fit(self, X, y):
        &#34;&#34;&#34;
        Learn vocabulary from training set
        &#34;&#34;&#34;
        self.p.fit(X, y)
        return

    def transform(self, X, y=None):
        &#34;&#34;&#34;
        Transform documents to sequences of word IDs
        &#34;&#34;&#34;
        return self.p.transform(X, y=y)</code></pre>
</details>
<h3>Ancestors</h3>
<ul class="hlist">
<li><a title="ktrain.preprocessor.Preprocessor" href="../../preprocessor.html#ktrain.preprocessor.Preprocessor">Preprocessor</a></li>
<li>abc.ABC</li>
</ul>
<h3>Methods</h3>
<dl>
<dt id="ktrain.text.ner.preprocessor.NERPreprocessor.filter_embeddings"><code class="name flex">
<span>def <span class="ident">filter_embeddings</span></span>(<span>self, embeddings, vocab, dim)</span>
</code></dt>
<dd>
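<div class="desc"><p>Loads word vectors into a numpy array.</p>
<h2 id="args">Args</h2>
<dl>
<dt><strong><code>embeddings</code></strong> :&ensp;<code>dict</code> or <code>TransformerEmbedding</code></dt>
<dd>a dictionary mapping words to numpy arrays, or a Transformer Embedding instance</dd>
<dt><strong><code>vocab</code></strong> :&ensp;<code>dict</code></dt>
<dd>word_index lookup table.</dd>
<dt><strong><code>dim</code></strong> :&ensp;<code>int</code></dt>
<dd>dimensionality of each word vector.</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>numpy array</code></dt>
<dd>an array of word embeddings of shape <code>(len(vocab), dim)</code>, with all-zero rows for vocabulary words missing from <code>embeddings</code>; <code>None</code> if <code>embeddings</code> is not a dict.</dd>
</dl></div>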
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def filter_embeddings(self, embeddings, vocab, dim):
    &#34;&#34;&#34;Loads word vectors in numpy array.

    Args:
        embeddings (dict or TransformerEmbedding): a dictionary of numpy array or Transformer Embedding instance
        vocab (dict): word_index lookup table.

    Returns:
        numpy array: an array of word embeddings.
    &#34;&#34;&#34;
    if not isinstance(embeddings, dict):
        return
    _embeddings = np.zeros([len(vocab), dim])
    for word in vocab:
        if word in embeddings:
            word_idx = vocab[word]
            _embeddings[word_idx] = embeddings[word]
    return _embeddings</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.preprocessor.NERPreprocessor.fit"><code class="name flex">
<span>def <span class="ident">fit</span></span>(<span>self, X, y)</span>
</code></dt>
<dd>
<div class="desc"><p>Learn vocabulary from training set</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def fit(self, X, y):
    &#34;&#34;&#34;
    Learn vocabulary from training set
    &#34;&#34;&#34;
    self.p.fit(X, y)
    return</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.preprocessor.NERPreprocessor.get_classes"><code class="name flex">
<span>def <span class="ident">get_classes</span></span>(<span>self)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_classes(self):
    return self.c</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.preprocessor.NERPreprocessor.get_preprocessor"><code class="name flex">
<span>def <span class="ident">get_preprocessor</span></span>(<span>self)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_preprocessor(self):
    return self.p</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.preprocessor.NERPreprocessor.get_wv_model"><code class="name flex">
<span>def <span class="ident">get_wv_model</span></span>(<span>self, wv_path_or_url, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_wv_model(self, wv_path_or_url, verbose=1):
    if wv_path_or_url is None:
        raise ValueError(
            &#34;wordvector_path_or_url is empty: supply a file path or &#34;
            + &#34;URL to fasttext word vector file&#34;
        )
    if verbose:
        print(
            &#34;pretrained word embeddings will be loaded from:\n\t%s&#34;
            % (wv_path_or_url)
        )
    word_embedding_dim = 300  # all fasttext word vectors are of dim=300
    embs = tpp.load_wv(wv_path_or_url, verbose=verbose)
    wv_model = self.filter_embeddings(
        embs, self.p._word_vocab.vocab, word_embedding_dim
    )
    return (wv_model, word_embedding_dim)</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.preprocessor.NERPreprocessor.preprocess"><code class="name flex">
<span>def <span class="ident">preprocess</span></span>(<span>self, sentences, lang=None, custom_tokenizer=None)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def preprocess(self, sentences, lang=None, custom_tokenizer=None):
    if type(sentences) != list:
        raise ValueError(&#34;Param sentences must be a list of strings&#34;)

    # language detection
    if lang is None:
        lang = TU.detect_lang(sentences)

    # set tokenizer
    if custom_tokenizer is not None:
        tokfunc = custom_tokenizer
    elif TU.is_chinese(
        lang, strict=False
    ):  # strict=False: workaround for langdetect bug on short chinese texts
        tokfunc = lambda text: [c for c in text]
    else:
        tokfunc = TU.tokenize

    # preprocess
    X = []
    y = []
    for s in sentences:
        tokens = tokfunc(s)
        X.append(tokens)
        y.append([OTHER] * len(tokens))
    from .dataset import NERSequence

    nerseq = NERSequence(X, y, p=self.p)
    return nerseq</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.preprocessor.NERPreprocessor.preprocess_test"><code class="name flex">
<span>def <span class="ident">preprocess_test</span></span>(<span>self, x_test, y_test, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><h2 id="args_1">Args</h2>
<dl>
<dt><strong><code>x_test</code></strong> :&ensp;<code>list</code> of <code>lists</code> of <code>str</code></dt>
<dd>lists of token lists</dd>
<dt><strong><code>y_test</code></strong> :&ensp;<code>list</code> of <code>lists</code> of <code>str</code></dt>
<dd>lists of tag lists</dd>
<dt><strong><code>verbose</code></strong> :&ensp;<code>bool</code></dt>
<dd>verbosity</dd>
</dl>
<h2 id="returns_1">Returns</h2>
<dl>
<dt><code>NERSequence</code></dt>
<dd>can be used as argument to NERLearner.validate() to evaluate test sets</dd>
</dl></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def preprocess_test(self, x_test, y_test, verbose=1):
    &#34;&#34;&#34;
    Args:
      x_test (list of lists of str): lists of token lists
      y_test (list of lists of str): lists of tag lists
      verbose (bool): verbosity
    Returns:
      NERSequence:  can be used as argument to NERLearner.validate() to evaluate test sets
    &#34;&#34;&#34;
    # array &gt; df &gt; array in order to print statistics more easily
    from .data import array_to_df

    test_df = array_to_df(x_test, y_test)
    (x_list, y_list) = process_df(test_df, verbose=verbose)
    from .dataset import NERSequence

    return NERSequence(x_list, y_list, batch_size=U.DEFAULT_BS, p=self.p)</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.preprocessor.NERPreprocessor.preprocess_test_from_conll2003"><code class="name flex">
<span>def <span class="ident">preprocess_test_from_conll2003</span></span>(<span>self, filepath, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def preprocess_test_from_conll2003(self, filepath, verbose=1):
    df = conll2003_to_df(filepath)
    (x, y) = process_df(df)
    return self.preprocess_test(x, y, verbose=verbose)</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.preprocessor.NERPreprocessor.transform"><code class="name flex">
<span>def <span class="ident">transform</span></span>(<span>self, X, y=None)</span>
</code></dt>
<dd>
<div class="desc"><p>Transform documents to sequences of word IDs</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def transform(self, X, y=None):
    &#34;&#34;&#34;
    Transform documents to sequences of word IDs
    &#34;&#34;&#34;
    return self.p.transform(X, y=y)</code></pre>
</details>
</dd>
<dt id="ktrain.text.ner.preprocessor.NERPreprocessor.undo"><code class="name flex">
<span>def <span class="ident">undo</span></span>(<span>self, nerseq)</span>
</code></dt>
<dd>
<div class="desc"><p>Undoes preprocessing and returns raw data by
joining each token sequence in <code>nerseq.x</code> back into a space-separated string.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def undo(self, nerseq):
    &#34;&#34;&#34;
    undoes preprocessing and returns raw data by:
    converting a list or array of Word IDs back to words
    &#34;&#34;&#34;
    return [&#34; &#34;.join(e) for e in nerseq.x]</code></pre>
</details>
</dd>
</dl>
</dd>
<dt id="ktrain.text.ner.preprocessor.SentenceGetter"><code class="flex name class">
<span>class <span class="ident">SentenceGetter</span></span>
<span>(</span><span>data, word_column, tag_column, sentence_column)</span>
</code></dt>
<dd>
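<div class="desc"><p>Class to get each sentence in this format:
<code>[(Token_1, Tag_1), &hellip;, (Token_n, Tag_n)]</code></p>
<p>Args:
<code>data</code> is the pandas.DataFrame containing the dataset; <code>word_column</code>, <code>tag_column</code> and <code>sentence_column</code> name the columns holding the tokens, the tags and the sentence identifiers.</p></div>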
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class SentenceGetter(object):
    &#34;&#34;&#34;Class to get each sentence in this format:
    [(Token_1, Tag_1), ..., (Token_n, Tag_n)]&#34;&#34;&#34;

    def __init__(self, data, word_column, tag_column, sentence_column):
        &#34;&#34;&#34;Args:
        data is the pandas.DataFrame which contains the above dataset&#34;&#34;&#34;
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [
            (w, t)
            for w, t in zip(
                s[word_column].values.tolist(), s[tag_column].values.tolist()
            )
        ]
        self.grouped = self.data.groupby(sentence_column).apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        &#34;&#34;&#34;Return one sentence&#34;&#34;&#34;
        try:
            s = self.grouped[&#34;Sentence: {}&#34;.format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None</code></pre>
</details>
<h3>Methods</h3>
<dl>
<dt id="ktrain.text.ner.preprocessor.SentenceGetter.get_next"><code class="name flex">
<span>def <span class="ident">get_next</span></span>(<span>self)</span>
</code></dt>
<dd>
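<div class="desc"><p>Return the next sentence, looked up under the key <code>Sentence: N</code> (where N is the current sentence counter), and advance the counter. Returns <code>None</code> if the lookup fails.</p></div>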
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_next(self):
    &#34;&#34;&#34;Return one sentence&#34;&#34;&#34;
    try:
        s = self.grouped[&#34;Sentence: {}&#34;.format(self.n_sent)]
        self.n_sent += 1
        return s
    except:
        return None</code></pre>
</details>
</dd>
</dl>
</dd>
</dl>
</section>
</article>
<nav id="sidebar">
<h1>Index</h1>
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="ktrain.text.ner" href="index.html">ktrain.text.ner</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="ktrain.text.ner.preprocessor.array_to_df" href="#ktrain.text.ner.preprocessor.array_to_df">array_to_df</a></code></li>
<li><code><a title="ktrain.text.ner.preprocessor.conll2003_to_df" href="#ktrain.text.ner.preprocessor.conll2003_to_df">conll2003_to_df</a></code></li>
<li><code><a title="ktrain.text.ner.preprocessor.gmb_to_df" href="#ktrain.text.ner.preprocessor.gmb_to_df">gmb_to_df</a></code></li>
<li><code><a title="ktrain.text.ner.preprocessor.process_df" href="#ktrain.text.ner.preprocessor.process_df">process_df</a></code></li>
</ul>
</li>
<li><h3><a href="#header-classes">Classes</a></h3>
<ul>
<li>
<h4><code><a title="ktrain.text.ner.preprocessor.NERPreprocessor" href="#ktrain.text.ner.preprocessor.NERPreprocessor">NERPreprocessor</a></code></h4>
<ul class="">
<li><code><a title="ktrain.text.ner.preprocessor.NERPreprocessor.filter_embeddings" href="#ktrain.text.ner.preprocessor.NERPreprocessor.filter_embeddings">filter_embeddings</a></code></li>
<li><code><a title="ktrain.text.ner.preprocessor.NERPreprocessor.fit" href="#ktrain.text.ner.preprocessor.NERPreprocessor.fit">fit</a></code></li>
<li><code><a title="ktrain.text.ner.preprocessor.NERPreprocessor.get_classes" href="#ktrain.text.ner.preprocessor.NERPreprocessor.get_classes">get_classes</a></code></li>
<li><code><a title="ktrain.text.ner.preprocessor.NERPreprocessor.get_preprocessor" href="#ktrain.text.ner.preprocessor.NERPreprocessor.get_preprocessor">get_preprocessor</a></code></li>
<li><code><a title="ktrain.text.ner.preprocessor.NERPreprocessor.get_wv_model" href="#ktrain.text.ner.preprocessor.NERPreprocessor.get_wv_model">get_wv_model</a></code></li>
<li><code><a title="ktrain.text.ner.preprocessor.NERPreprocessor.preprocess" href="#ktrain.text.ner.preprocessor.NERPreprocessor.preprocess">preprocess</a></code></li>
<li><code><a title="ktrain.text.ner.preprocessor.NERPreprocessor.preprocess_test" href="#ktrain.text.ner.preprocessor.NERPreprocessor.preprocess_test">preprocess_test</a></code></li>
<li><code><a title="ktrain.text.ner.preprocessor.NERPreprocessor.preprocess_test_from_conll2003" href="#ktrain.text.ner.preprocessor.NERPreprocessor.preprocess_test_from_conll2003">preprocess_test_from_conll2003</a></code></li>
<li><code><a title="ktrain.text.ner.preprocessor.NERPreprocessor.transform" href="#ktrain.text.ner.preprocessor.NERPreprocessor.transform">transform</a></code></li>
<li><code><a title="ktrain.text.ner.preprocessor.NERPreprocessor.undo" href="#ktrain.text.ner.preprocessor.NERPreprocessor.undo">undo</a></code></li>
</ul>
</li>
<li>
<h4><code><a title="ktrain.text.ner.preprocessor.SentenceGetter" href="#ktrain.text.ner.preprocessor.SentenceGetter">SentenceGetter</a></code></h4>
<ul class="">
<li><code><a title="ktrain.text.ner.preprocessor.SentenceGetter.get_next" href="#ktrain.text.ner.preprocessor.SentenceGetter.get_next">get_next</a></code></li>
</ul>
</li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.10.0</a>.</p>
</footer>
</body>
</html>