<!-- docs/tabular/preprocessor.html -->

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1" />
<meta name="generator" content="pdoc 0.10.0" />
<title>ktrain.tabular.preprocessor API documentation</title>
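<meta name="description" content="API documentation for the ktrain.tabular.preprocessor module: the TabularPreprocessor class and helper functions for preparing pandas DataFrames." />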
<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/sanitize.min.css" integrity="sha256-PK9q560IAAa6WVRRh76LtCaI8pjTJ2z11v0miyNNjrs=" crossorigin>
<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/typography.min.css" integrity="sha256-7l/o7C8jubJiy74VsKTidCy1yBkRtiUGbVkYBylBqUg=" crossorigin>
<link rel="stylesheet preload" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/styles/github.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:30px;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:1em 0 .50em 0}h3{font-size:1.4em;margin:25px 0 10px 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:.8em;line-height:1.4em}code{background:#f2f2f1;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:0;border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0;padding:1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-weight:bold;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target .name{background:var(--highlight-color)}.name > 
span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em .5em;margin-bottom:1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.item .name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:1.5em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/highlight.min.js" integrity="sha256-Uv3H6lx7dJmRfRvH8TH6kJD1TSK1aFcwgx+mdg3epi8=" crossorigin></script>
<script>window.addEventListener('DOMContentLoaded', () => hljs.initHighlighting())</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>ktrain.tabular.preprocessor</code></h1>
</header>
<section id="section-intro">
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">from .. import utils as U
from ..imports import *
from ..preprocessor import Preprocessor


class TabularPreprocessor(Preprocessor):
    &#34;&#34;&#34;
    ```
    Tabular preprocessing base class
    ```
    &#34;&#34;&#34;

    def __init__(
        self,
        predictor_columns,
        label_columns,
        date_columns=[],
        is_regression=False,
        procs=[],
        max_card=20,
    ):
        self.is_regression = is_regression
        self.c = None
        self.pc = predictor_columns
        self.lc = label_columns
        self.lc = [self.lc] if isinstance(self.lc, str) else self.lc
        self.dc = date_columns
        self.label_columns = None
        self.cat_names = []
        self.cont_names = []
        self.date_names = []
        self.label_transform = None
        self.procs = procs
        self.max_card = max_card

    @property
    def na_names(self):
        return [n for n in self.cat_names if n[-3:] == &#34;_na&#34;]

    def get_preprocessor(self):
        return (self.label_transform, self.procs)

    def get_classes(self):
        return self.label_columns if not self.is_regression else []

    def preprocess(self, df):
        return self.preprocess_test(df)

    def _validate_columns(self, df):
        missing_columns = []
        for col in df.columns.values:
            if col not in self.lc and col not in self.pc:
                missing_columns.append(col)
        if len(missing_columns) &gt; 0:
            raise ValueError(&#34;df is missing columns: %s&#34; % (missing_columns))
        return

    def denormalize(self, df):
        normalizer = None
        for proc in self.procs:
            if type(proc).__name__ == &#34;Normalize&#34;:
                normalizer = proc
                break
        if normalizer is None:
            return df
        return normalizer.revert(df)

    # def codify(self, df):
    # df = df.copy()
    # for lab in self.lc:
    # df[lab] = df[lab].cat.codes
    # return df

    def preprocess_train(self, df, mode=&#34;train&#34;, verbose=1):
        &#34;&#34;&#34;
        ```
        preprocess training set
        ```
        &#34;&#34;&#34;
        df = df.copy()

        clean_df(df, pc=self.pc, lc=self.lc, check_labels=mode == &#34;train&#34;)

        if not isinstance(df, pd.DataFrame):
            raise ValueError(&#34;df must be a pd.DataFrame&#34;)

        # validate columns
        self._validate_columns(df)

        # validate mode
        # if mode != &#39;train&#39; and self.label_transform is None:
        # raise ValueError(&#39;self.label_transform is None but mode is %s: are you sure preprocess_train was invoked first?&#39; % (mode))

        # verbose
        if verbose:
            print(
                &#34;processing %s: %s rows x %s columns&#34; % (mode, df.shape[0], df.shape[1])
            )

        # convert date fields
        for field in self.dc:
            df = df.copy()  # TODO: fix this
            df, date_names = add_datepart(df, field)
            self.date_names = date_names

        # preprocess labels and data
        if mode == &#34;train&#34;:
            label_columns = self.lc[:]
            # label_columns.sort() # leave label columns sorted in same order as in DataFrame
            self.label_transform = U.YTransformDataFrame(
                label_columns, is_regression=self.is_regression
            )
            df = self.label_transform.apply_train(df)
            self.label_columns = (
                self.label_transform.get_classes()
                if not self.is_regression
                else self.label_transform.label_columns
            )
            self.cont_names, self.cat_names = cont_cat_split(
                df, label_columns=self.label_columns, max_card=self.max_card
            )
            self.procs = [
                proc(self.cat_names, self.cont_names) for proc in self.procs
            ]  # &#34;objectivy&#34;
        else:
            df = self.label_transform.apply_test(df)
        for proc in self.procs:
            proc(df, test=mode != &#34;train&#34;)  # apply processors
        from .dataset import TabularDataset

        return TabularDataset(df, self.cat_names, self.cont_names, self.label_columns)

    def preprocess_valid(self, df, verbose=1):
        &#34;&#34;&#34;
        ```
        preprocess validation set
        ```
        &#34;&#34;&#34;
        return self.preprocess_train(df, mode=&#34;valid&#34;, verbose=verbose)

    def preprocess_test(self, df, verbose=1):
        &#34;&#34;&#34;
        ```
        preprocess test set
        ```
        &#34;&#34;&#34;
        return self.preprocess_train(df, mode=&#34;test&#34;, verbose=verbose)


def pd_data_types(df, return_df=False):
    &#34;&#34;&#34;
    ```
    infers data type of each column in Pandas DataFrame
    Args:
      df(pd.DataFrame): pandas DataFrame
      return_df(bool): If True, returns columns and types in DataFrame.
                       Otherwise, a dictionary is returned.
    ```
    &#34;&#34;&#34;

    infer_type = lambda x: pd.api.types.infer_dtype(x, skipna=True)
    df.apply(infer_type, axis=0)

    # DataFrame with column names &amp; new types
    df_types = (
        pd.DataFrame(df.apply(pd.api.types.infer_dtype, axis=0))
        .reset_index()
        .rename(columns={&#34;index&#34;: &#34;column&#34;, 0: &#34;type&#34;})
    )
    if return_df:
        return df_types
    cols = list(df_types[&#34;column&#34;].values)
    col_types = list(df_types[&#34;type&#34;].values)
    return dict(list(zip(cols, col_types)))


def clean_df(
    train_df, val_df=None, pc=[], lc=[], check_labels=True, return_types=False
):
    train_type_dict = pd_data_types(train_df)
    for k, v in train_type_dict.items():
        if v != &#34;string&#34;:
            continue
        train_df[k] = train_df[k].str.strip()
        if val_df is not None:
            if k not in val_df.columns:
                raise ValueError(&#34;val_df is missing %s column&#34; % (k))
            val_df[k] = val_df[k].str.strip()
    if (pc and not lc) or (not pc and lc):
        raise ValueError(&#34;pc and lc: both or neither must exist&#34;)
    if pc and lc:
        inp_cols = (
            train_df.columns.values
            if check_labels
            else [col for col in train_df.columns.values if col not in lc]
        )
        original_cols = pc + lc if check_labels else pc
        if set(original_cols) != set(inp_cols):
            raise ValueError(
                &#34;DataFrame is either missing columns or includes extra columns: \n&#34;
                + &#34;expected: %s\nactual: %s&#34; % (original_cols, inp_cols)
            )
    if return_types:
        return train_type_dict
    return


# --------------------------------------------------------------------
# These are helper functions adapted from fastai:
# https://github.com/fastai/fastai
# -------------------------------------------------------------------


from numbers import Number
from types import SimpleNamespace
from typing import (
    Any,
    AnyStr,
    Callable,
    Collection,
    Dict,
    Hashable,
    Iterator,
    List,
    Mapping,
    NewType,
    Optional,
    Sequence,
    Tuple,
    TypeVar,
    Union,
)

from pandas.api.types import is_categorical_dtype, is_numeric_dtype


def ifnone(a, b):
    &#34;`a` if `a` is not None, otherwise `b`.&#34;
    return b if a is None else a


def make_date(df, date_field):
    &#34;&#34;&#34;
    Make sure `df[field_name]` is of the right date type.
    Reference: https://github.com/fastai/fastai
    &#34;&#34;&#34;
    field_dtype = df[date_field].dtype
    if isinstance(field_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        field_dtype = np.datetime64
    if not np.issubdtype(field_dtype, np.datetime64):
        df[date_field] = pd.to_datetime(df[date_field], infer_datetime_format=True)
    return


def cont_cat_split(df, max_card=20, label_columns=[]):
    &#34;Helper function that returns column names of cont and cat variables from given df.&#34;
    cont_names, cat_names = [], []
    for col in df:
        if col in label_columns:
            continue
        if (
            df[col].dtype == int
            and df[col].unique().shape[0] &gt; max_card
            or df[col].dtype == float
        ):
            cont_names.append(col)
        else:
            cat_names.append(col)
    return cont_names, cat_names


def add_datepart(
    df: pd.DataFrame,
    field_name: str,
    prefix: str = None,
    drop: bool = True,
    time: bool = False,
    return_added_columns=True,
):
    &#34;Helper function that adds columns relevant to a date in the column `field_name` of `df`.&#34;
    make_date(df, field_name)
    field = df[field_name]
    prefix = ifnone(prefix, re.sub(&#34;[Dd]ate$&#34;, &#34;&#34;, field_name))
    attr = [
        &#34;Year&#34;,
        &#34;Month&#34;,
        &#34;Week&#34;,
        &#34;Day&#34;,
        &#34;Dayofweek&#34;,
        &#34;Dayofyear&#34;,
        &#34;Is_month_end&#34;,
        &#34;Is_month_start&#34;,
        &#34;Is_quarter_end&#34;,
        &#34;Is_quarter_start&#34;,
        &#34;Is_year_end&#34;,
        &#34;Is_year_start&#34;,
    ]
    if time:
        attr = attr + [&#34;Hour&#34;, &#34;Minute&#34;, &#34;Second&#34;]
    added_columns = []
    for n in attr:
        df[prefix + n] = getattr(field.dt, n.lower())
        added_columns.append(prefix + n)
    df[prefix + &#34;Elapsed&#34;] = field.astype(np.int64) // 10**9
    if drop:
        df.drop(field_name, axis=1, inplace=True)
    if return_added_columns:
        return (df, added_columns)
    else:
        return df


def cyclic_dt_feat_names(time: bool = True, add_linear: bool = False) -&gt; List[str]:
    &#34;Return feature names of date/time cycles as produced by `cyclic_dt_features`.&#34;
    fs = [&#34;cos&#34;, &#34;sin&#34;]
    attr = [
        f&#34;{r}_{f}&#34; for r in &#34;weekday day_month month_year day_year&#34;.split() for f in fs
    ]
    if time:
        attr += [f&#34;{r}_{f}&#34; for r in &#34;hour clock min sec&#34;.split() for f in fs]
    if add_linear:
        attr.append(&#34;year_lin&#34;)
    return attr


def cyclic_dt_features(d, time: bool = True, add_linear: bool = False) -&gt; List[float]:
    &#34;Calculate the cos and sin of date/time cycles.&#34;
    tt, fs = d.timetuple(), [np.cos, np.sin]
    day_year, days_month = tt.tm_yday, calendar.monthrange(d.year, d.month)[1]
    days_year = 366 if calendar.isleap(d.year) else 365
    rs = (
        d.weekday() / 7,
        (d.day - 1) / days_month,
        (d.month - 1) / 12,
        (day_year - 1) / days_year,
    )
    feats = [f(r * 2 * np.pi) for r in rs for f in fs]
    if time and isinstance(d, datetime) and type(d) != date:
        rs = tt.tm_hour / 24, tt.tm_hour % 12 / 12, tt.tm_min / 60, tt.tm_sec / 60
        feats += [f(r * 2 * np.pi) for r in rs for f in fs]
    if add_linear:
        if type(d) == date:
            feats.append(d.year + rs[-1])
        else:
            secs_in_year = (
                datetime(d.year + 1, 1, 1) - datetime(d.year, 1, 1)
            ).total_seconds()
            feats.append(
                d.year + ((d - datetime(d.year, 1, 1)).total_seconds() / secs_in_year)
            )
    return feats


def add_cyclic_datepart(
    df: pd.DataFrame,
    field_name: str,
    prefix: str = None,
    drop: bool = True,
    time: bool = False,
    add_linear: bool = False,
):
    &#34;Helper function that adds trigonometric date/time features to a date in the column `field_name` of `df`.&#34;
    make_date(df, field_name)
    field = df[field_name]
    prefix = ifnone(prefix, re.sub(&#34;[Dd]ate$&#34;, &#34;&#34;, field_name))
    series = field.apply(partial(cyclic_dt_features, time=time, add_linear=add_linear))
    columns = [prefix + c for c in cyclic_dt_feat_names(time, add_linear)]
    df_feats = pd.DataFrame(
        [item for item in series], columns=columns, index=series.index
    )
    for column in columns:
        df[column] = df_feats[column]
    if drop:
        df.drop(field_name, axis=1, inplace=True)
    return df


class TabularProc:
    &#34;A processor for tabular dataframes.&#34;

    def __init__(self, cat_names, cont_names):
        self.cat_names = cat_names
        self.cont_names = cont_names

    def __call__(self, df, test=False):
        &#34;Apply the correct function to `df` depending on `test`.&#34;
        func = self.apply_test if test else self.apply_train
        func(df)

    def apply_train(self, df):
        &#34;Function applied to `df` if it&#39;s the train set.&#34;
        raise NotImplementedError

    def apply_test(self, df):
        &#34;Function applied to `df` if it&#39;s the test set.&#34;
        self.apply_train(df)


class Categorify(TabularProc):
    def __init__(self, cat_names, cont_names):
        super().__init__(cat_names, cont_names)
        self.categories = None

    def apply_train(self, df):
        self.categories = {}
        for n in self.cat_names:
            df.loc[:, n] = df.loc[:, n].astype(&#34;category&#34;).cat.as_ordered()
            self.categories[n] = df[n].cat.categories

    def apply_test(self, df):
        for n in self.cat_names:
            df.loc[:, n] = pd.Categorical(
                df[n], categories=self.categories[n], ordered=True
            )


FILL_MEDIAN = &#34;median&#34;
FILL_CONSTANT = &#34;constant&#34;


class FillMissing(TabularProc):
    &#34;Fill the missing values in continuous columns.&#34;

    def __init__(
        self,
        cat_names,
        cont_names,
        fill_strategy=FILL_MEDIAN,
        add_col=True,
        fill_val=0.0,
    ):
        super().__init__(cat_names, cont_names)
        self.fill_strategy = fill_strategy
        self.add_col = add_col
        self.fill_val = fill_val
        self.na_dict = None

    def apply_train(self, df):
        self.na_dict = {}
        self.filler_dict = {}
        for name in self.cont_names:
            if self.fill_strategy == FILL_MEDIAN:
                filler = df[name].median()
            elif self.fill_strategy == FILL_CONSTANT:
                filler = self.fill_val
            else:
                filler = df[name].dropna().value_counts().idxmax()
            self.filler_dict[name] = filler
            if pd.isnull(df[name]).sum():
                if self.add_col:
                    df[name + &#34;_na&#34;] = pd.isnull(df[name])
                    if name + &#34;_na&#34; not in self.cat_names:
                        self.cat_names.append(name + &#34;_na&#34;)
                df[name] = df[name].fillna(filler)
                self.na_dict[name] = True

    def apply_test(self, df):
        &#34;Fill missing values in `self.cont_names` like in `apply_train`.&#34;
        for name in self.cont_names:
            if name in self.na_dict:
                if self.add_col:
                    df[name + &#34;_na&#34;] = pd.isnull(df[name])
                    if name + &#34;_na&#34; not in self.cat_names:
                        self.cat_names.append(name + &#34;_na&#34;)
                df[name] = df[name].fillna(self.filler_dict[name])
            elif pd.isnull(df[name]).sum() != 0:
                warnings.warn(
                    f&#34;&#34;&#34;There are nan values in field {name} but there were none in the training set.
                Filled with {self.fill_strategy}.&#34;&#34;&#34;
                )
                df[name] = df[name].fillna(self.filler_dict[name])
                # raise Exception(f&#34;&#34;&#34;There are nan values in field {name} but there were none in the training set.
                # Please fix those manually.&#34;&#34;&#34;)


class Normalize(TabularProc):
    &#34;Normalize the continuous variables.&#34;

    def __init__(self, cat_names, cont_names):
        super().__init__(cat_names, cont_names)
        self.means = None
        self.stds = None

    def apply_train(self, df):
        &#34;Compute the means and stds of `self.cont_names` columns to normalize them.&#34;
        self.means, self.stds = {}, {}
        for n in self.cont_names:
            assert is_numeric_dtype(
                df[n]
            ), f&#34;&#34;&#34;Cannot normalize &#39;{n}&#39; column as it isn&#39;t numerical.
                Are you sure it doesn&#39;t belong in the categorical set of columns?&#34;&#34;&#34;
            self.means[n], self.stds[n] = df[n].mean(), df[n].std()
            df[n] = (df[n] - self.means[n]) / (1e-7 + self.stds[n])

    def apply_test(self, df):
        &#34;Normalize `self.cont_names` with the same statistics as in `apply_train`.&#34;
        for n in self.cont_names:
            df[n] = (df[n] - self.means[n]) / (1e-7 + self.stds[n])

    def revert(self, df):
        &#34;&#34;&#34;
        Undoes normalization and returns reverted dataframe
        &#34;&#34;&#34;
        out_df = df.copy()
        for n in self.cont_names:
            out_df[n] = (df[n] * (1e-7 + self.stds[n])) + self.means[n]
        return out_df</code></pre>
</details>
</section>
<section>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="ktrain.tabular.preprocessor.add_cyclic_datepart"><code class="name flex">
<span>def <span class="ident">add_cyclic_datepart</span></span>(<span>df: pandas.core.frame.DataFrame, field_name: str, prefix: str = None, drop: bool = True, time: bool = False, add_linear: bool = False)</span>
</code></dt>
<dd>
<div class="desc"><p>Helper function that adds trigonometric date/time features to a date in the column <code>field_name</code> of <code>df</code>.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def add_cyclic_datepart(
    df: pd.DataFrame,
    field_name: str,
    prefix: str = None,
    drop: bool = True,
    time: bool = False,
    add_linear: bool = False,
):
    &#34;Helper function that adds trigonometric date/time features to a date in the column `field_name` of `df`.&#34;
    make_date(df, field_name)
    field = df[field_name]
    prefix = ifnone(prefix, re.sub(&#34;[Dd]ate$&#34;, &#34;&#34;, field_name))
    series = field.apply(partial(cyclic_dt_features, time=time, add_linear=add_linear))
    columns = [prefix + c for c in cyclic_dt_feat_names(time, add_linear)]
    df_feats = pd.DataFrame(
        [item for item in series], columns=columns, index=series.index
    )
    for column in columns:
        df[column] = df_feats[column]
    if drop:
        df.drop(field_name, axis=1, inplace=True)
    return df</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.add_datepart"><code class="name flex">
<span>def <span class="ident">add_datepart</span></span>(<span>df: pandas.core.frame.DataFrame, field_name: str, prefix: str = None, drop: bool = True, time: bool = False, return_added_columns=True)</span>
</code></dt>
<dd>
<div class="desc"><p>Helper function that adds columns relevant to a date in the column <code>field_name</code> of <code>df</code>.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def add_datepart(
    df: pd.DataFrame,
    field_name: str,
    prefix: str = None,
    drop: bool = True,
    time: bool = False,
    return_added_columns=True,
):
    &#34;Helper function that adds columns relevant to a date in the column `field_name` of `df`.&#34;
    make_date(df, field_name)
    field = df[field_name]
    prefix = ifnone(prefix, re.sub(&#34;[Dd]ate$&#34;, &#34;&#34;, field_name))
    attr = [
        &#34;Year&#34;,
        &#34;Month&#34;,
        &#34;Week&#34;,
        &#34;Day&#34;,
        &#34;Dayofweek&#34;,
        &#34;Dayofyear&#34;,
        &#34;Is_month_end&#34;,
        &#34;Is_month_start&#34;,
        &#34;Is_quarter_end&#34;,
        &#34;Is_quarter_start&#34;,
        &#34;Is_year_end&#34;,
        &#34;Is_year_start&#34;,
    ]
    if time:
        attr = attr + [&#34;Hour&#34;, &#34;Minute&#34;, &#34;Second&#34;]
    added_columns = []
    for n in attr:
        df[prefix + n] = getattr(field.dt, n.lower())
        added_columns.append(prefix + n)
    df[prefix + &#34;Elapsed&#34;] = field.astype(np.int64) // 10**9
    if drop:
        df.drop(field_name, axis=1, inplace=True)
    if return_added_columns:
        return (df, added_columns)
    else:
        return df</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.clean_df"><code class="name flex">
<span>def <span class="ident">clean_df</span></span>(<span>train_df, val_df=None, pc=[], lc=[], check_labels=True, return_types=False)</span>
</code></dt>
<dd>
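<div class="desc"><p>Strips leading and trailing whitespace, in place, from the string-typed columns of <code>train_df</code> (and from the same columns of <code>val_df</code>, if given; a <code>ValueError</code> is raised if <code>val_df</code> lacks one of them). <code>pc</code> and <code>lc</code> must either both be supplied or both be empty, otherwise a <code>ValueError</code> is raised. When both are supplied, a <code>ValueError</code> is raised unless the columns of <code>train_df</code> match the expected predictor and label columns (label columns are left out of the comparison when <code>check_labels</code> is False). Returns the dictionary of inferred column types if <code>return_types</code> is True; otherwise returns <code>None</code>.</p></div>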
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def clean_df(
    train_df, val_df=None, pc=[], lc=[], check_labels=True, return_types=False
):
    train_type_dict = pd_data_types(train_df)
    for k, v in train_type_dict.items():
        if v != &#34;string&#34;:
            continue
        train_df[k] = train_df[k].str.strip()
        if val_df is not None:
            if k not in val_df.columns:
                raise ValueError(&#34;val_df is missing %s column&#34; % (k))
            val_df[k] = val_df[k].str.strip()
    if (pc and not lc) or (not pc and lc):
        raise ValueError(&#34;pc and lc: both or neither must exist&#34;)
    if pc and lc:
        inp_cols = (
            train_df.columns.values
            if check_labels
            else [col for col in train_df.columns.values if col not in lc]
        )
        original_cols = pc + lc if check_labels else pc
        if set(original_cols) != set(inp_cols):
            raise ValueError(
                &#34;DataFrame is either missing columns or includes extra columns: \n&#34;
                + &#34;expected: %s\nactual: %s&#34; % (original_cols, inp_cols)
            )
    if return_types:
        return train_type_dict
    return</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.cont_cat_split"><code class="name flex">
<span>def <span class="ident">cont_cat_split</span></span>(<span>df, max_card=20, label_columns=[])</span>
</code></dt>
<dd>
<div class="desc"><p>Helper function that returns column names of cont and cat variables from given df.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def cont_cat_split(df, max_card=20, label_columns=[]):
    &#34;Helper function that returns column names of cont and cat variables from given df.&#34;
    cont_names, cat_names = [], []
    for col in df:
        if col in label_columns:
            continue
        if (
            df[col].dtype == int
            and df[col].unique().shape[0] &gt; max_card
            or df[col].dtype == float
        ):
            cont_names.append(col)
        else:
            cat_names.append(col)
    return cont_names, cat_names</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.cyclic_dt_feat_names"><code class="name flex">
<span>def <span class="ident">cyclic_dt_feat_names</span></span>(<span>time: bool = True, add_linear: bool = False) ‑> List[str]</span>
</code></dt>
<dd>
<div class="desc"><p>Return feature names of date/time cycles as produced by <code><a title="ktrain.tabular.preprocessor.cyclic_dt_features" href="#ktrain.tabular.preprocessor.cyclic_dt_features">cyclic_dt_features()</a></code>.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def cyclic_dt_feat_names(time: bool = True, add_linear: bool = False) -&gt; List[str]:
    &#34;Return feature names of date/time cycles as produced by `cyclic_dt_features`.&#34;
    fs = [&#34;cos&#34;, &#34;sin&#34;]
    attr = [
        f&#34;{r}_{f}&#34; for r in &#34;weekday day_month month_year day_year&#34;.split() for f in fs
    ]
    if time:
        attr += [f&#34;{r}_{f}&#34; for r in &#34;hour clock min sec&#34;.split() for f in fs]
    if add_linear:
        attr.append(&#34;year_lin&#34;)
    return attr</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.cyclic_dt_features"><code class="name flex">
<span>def <span class="ident">cyclic_dt_features</span></span>(<span>d, time: bool = True, add_linear: bool = False) ‑> List[float]</span>
</code></dt>
<dd>
<div class="desc"><p>Calculate the cos and sin of date/time cycles.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def cyclic_dt_features(d, time: bool = True, add_linear: bool = False) -&gt; List[float]:
    &#34;Calculate the cos and sin of date/time cycles.&#34;
    tt, fs = d.timetuple(), [np.cos, np.sin]
    day_year, days_month = tt.tm_yday, calendar.monthrange(d.year, d.month)[1]
    days_year = 366 if calendar.isleap(d.year) else 365
    rs = (
        d.weekday() / 7,
        (d.day - 1) / days_month,
        (d.month - 1) / 12,
        (day_year - 1) / days_year,
    )
    feats = [f(r * 2 * np.pi) for r in rs for f in fs]
    if time and isinstance(d, datetime) and type(d) != date:
        rs = tt.tm_hour / 24, tt.tm_hour % 12 / 12, tt.tm_min / 60, tt.tm_sec / 60
        feats += [f(r * 2 * np.pi) for r in rs for f in fs]
    if add_linear:
        if type(d) == date:
            feats.append(d.year + rs[-1])
        else:
            secs_in_year = (
                datetime(d.year + 1, 1, 1) - datetime(d.year, 1, 1)
            ).total_seconds()
            feats.append(
                d.year + ((d - datetime(d.year, 1, 1)).total_seconds() / secs_in_year)
            )
    return feats</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.ifnone"><code class="name flex">
<span>def <span class="ident">ifnone</span></span>(<span>a, b)</span>
</code></dt>
<dd>
<div class="desc"><p><code>a</code> if <code>a</code> is not None, otherwise <code>b</code>.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def ifnone(a, b):
    &#34;`a` if `a` is not None, otherwise `b`.&#34;
    return b if a is None else a</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.make_date"><code class="name flex">
<span>def <span class="ident">make_date</span></span>(<span>df, date_field)</span>
</code></dt>
<dd>
<div class="desc"><p>Make sure <code>df[date_field]</code> is of the right date type.
Reference: <a href="https://github.com/fastai/fastai">https://github.com/fastai/fastai</a></p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def make_date(df, date_field):
    &#34;&#34;&#34;
    Make sure `df[field_name]` is of the right date type.
    Reference: https://github.com/fastai/fastai
    &#34;&#34;&#34;
    field_dtype = df[date_field].dtype
    if isinstance(field_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        field_dtype = np.datetime64
    if not np.issubdtype(field_dtype, np.datetime64):
        df[date_field] = pd.to_datetime(df[date_field], infer_datetime_format=True)
    return</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.pd_data_types"><code class="name flex">
<span>def <span class="ident">pd_data_types</span></span>(<span>df, return_df=False)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Infers the data type of each column in a pandas DataFrame.
Args:
  df(pd.DataFrame): pandas DataFrame
  return_df(bool): If True, returns a DataFrame with "column" and "type" columns.
                   Otherwise, a dictionary mapping column name to type is returned.
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def pd_data_types(df, return_df=False):
    &#34;&#34;&#34;
    ```
    infers data type of each column in Pandas DataFrame
    Args:
      df(pd.DataFrame): pandas DataFrame
      return_df(bool): If True, returns columns and types in DataFrame.
                       Otherwise, a dictionary is returned.
    ```
    &#34;&#34;&#34;

    infer_type = lambda x: pd.api.types.infer_dtype(x, skipna=True)
    df.apply(infer_type, axis=0)

    # DataFrame with column names &amp; new types
    df_types = (
        pd.DataFrame(df.apply(pd.api.types.infer_dtype, axis=0))
        .reset_index()
        .rename(columns={&#34;index&#34;: &#34;column&#34;, 0: &#34;type&#34;})
    )
    if return_df:
        return df_types
    cols = list(df_types[&#34;column&#34;].values)
    col_types = list(df_types[&#34;type&#34;].values)
    return dict(list(zip(cols, col_types)))</code></pre>
</details>
</dd>
</dl>
</section>
<section>
<h2 class="section-title" id="header-classes">Classes</h2>
<dl>
<dt id="ktrain.tabular.preprocessor.Categorify"><code class="flex name class">
<span>class <span class="ident">Categorify</span></span>
<span>(</span><span>cat_names, cont_names)</span>
</code></dt>
<dd>
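<div class="desc"><p>Convert the columns in <code>cat_names</code> to ordered pandas categoricals, reusing the categories recorded from the training set when applied to test data.</p></div>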
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class Categorify(TabularProc):
    def __init__(self, cat_names, cont_names):
        super().__init__(cat_names, cont_names)
        self.categories = None

    def apply_train(self, df):
        self.categories = {}
        for n in self.cat_names:
            df.loc[:, n] = df.loc[:, n].astype(&#34;category&#34;).cat.as_ordered()
            self.categories[n] = df[n].cat.categories

    def apply_test(self, df):
        for n in self.cat_names:
            df.loc[:, n] = pd.Categorical(
                df[n], categories=self.categories[n], ordered=True
            )</code></pre>
</details>
<h3>Ancestors</h3>
<ul class="hlist">
<li><a title="ktrain.tabular.preprocessor.TabularProc" href="#ktrain.tabular.preprocessor.TabularProc">TabularProc</a></li>
</ul>
<h3>Inherited members</h3>
<ul class="hlist">
<li><code><b><a title="ktrain.tabular.preprocessor.TabularProc" href="#ktrain.tabular.preprocessor.TabularProc">TabularProc</a></b></code>:
<ul class="hlist">
<li><code><a title="ktrain.tabular.preprocessor.TabularProc.apply_test" href="#ktrain.tabular.preprocessor.TabularProc.apply_test">apply_test</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.TabularProc.apply_train" href="#ktrain.tabular.preprocessor.TabularProc.apply_train">apply_train</a></code></li>
</ul>
</li>
</ul>
</dd>
<dt id="ktrain.tabular.preprocessor.FillMissing"><code class="flex name class">
<span>class <span class="ident">FillMissing</span></span>
<span>(</span><span>cat_names, cont_names, fill_strategy='median', add_col=True, fill_val=0.0)</span>
</code></dt>
<dd>
<div class="desc"><p>Fill the missing values in continuous columns.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class FillMissing(TabularProc):
    &#34;Fill the missing values in continuous columns.&#34;

    def __init__(
        self,
        cat_names,
        cont_names,
        fill_strategy=FILL_MEDIAN,
        add_col=True,
        fill_val=0.0,
    ):
        super().__init__(cat_names, cont_names)
        self.fill_strategy = fill_strategy
        self.add_col = add_col
        self.fill_val = fill_val
        self.na_dict = None

    def apply_train(self, df):
        self.na_dict = {}
        self.filler_dict = {}
        for name in self.cont_names:
            if self.fill_strategy == FILL_MEDIAN:
                filler = df[name].median()
            elif self.fill_strategy == FILL_CONSTANT:
                filler = self.fill_val
            else:
                filler = df[name].dropna().value_counts().idxmax()
            self.filler_dict[name] = filler
            if pd.isnull(df[name]).sum():
                if self.add_col:
                    df[name + &#34;_na&#34;] = pd.isnull(df[name])
                    if name + &#34;_na&#34; not in self.cat_names:
                        self.cat_names.append(name + &#34;_na&#34;)
                df[name] = df[name].fillna(filler)
                self.na_dict[name] = True

    def apply_test(self, df):
        &#34;Fill missing values in `self.cont_names` like in `apply_train`.&#34;
        for name in self.cont_names:
            if name in self.na_dict:
                if self.add_col:
                    df[name + &#34;_na&#34;] = pd.isnull(df[name])
                    if name + &#34;_na&#34; not in self.cat_names:
                        self.cat_names.append(name + &#34;_na&#34;)
                df[name] = df[name].fillna(self.filler_dict[name])
            elif pd.isnull(df[name]).sum() != 0:
                warnings.warn(
                    f&#34;&#34;&#34;There are nan values in field {name} but there were none in the training set.
                Filled with {self.fill_strategy}.&#34;&#34;&#34;
                )
                df[name] = df[name].fillna(self.filler_dict[name])</code></pre>
</details>
<h3>Ancestors</h3>
<ul class="hlist">
<li><a title="ktrain.tabular.preprocessor.TabularProc" href="#ktrain.tabular.preprocessor.TabularProc">TabularProc</a></li>
</ul>
<h3>Methods</h3>
<dl>
<dt id="ktrain.tabular.preprocessor.FillMissing.apply_test"><code class="name flex">
<span>def <span class="ident">apply_test</span></span>(<span>self, df)</span>
</code></dt>
<dd>
<div class="desc"><p>Fill missing values in <code>self.cont_names</code> like in <code>apply_train</code>.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def apply_test(self, df):
    &#34;Fill missing values in `self.cont_names` like in `apply_train`.&#34;
    for name in self.cont_names:
        if name in self.na_dict:
            if self.add_col:
                df[name + &#34;_na&#34;] = pd.isnull(df[name])
                if name + &#34;_na&#34; not in self.cat_names:
                    self.cat_names.append(name + &#34;_na&#34;)
            df[name] = df[name].fillna(self.filler_dict[name])
        elif pd.isnull(df[name]).sum() != 0:
            warnings.warn(
                f&#34;&#34;&#34;There are nan values in field {name} but there were none in the training set.
            Filled with {self.fill_strategy}.&#34;&#34;&#34;
            )
            df[name] = df[name].fillna(self.filler_dict[name])</code></pre>
</details>
</dd>
</dl>
<h3>Inherited members</h3>
<ul class="hlist">
<li><code><b><a title="ktrain.tabular.preprocessor.TabularProc" href="#ktrain.tabular.preprocessor.TabularProc">TabularProc</a></b></code>:
<ul class="hlist">
<li><code><a title="ktrain.tabular.preprocessor.TabularProc.apply_train" href="#ktrain.tabular.preprocessor.TabularProc.apply_train">apply_train</a></code></li>
</ul>
</li>
</ul>
</dd>
<dt id="ktrain.tabular.preprocessor.Normalize"><code class="flex name class">
<span>class <span class="ident">Normalize</span></span>
<span>(</span><span>cat_names, cont_names)</span>
</code></dt>
<dd>
<div class="desc"><p>Normalize the continuous variables.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class Normalize(TabularProc):
    &#34;Normalize the continuous variables.&#34;

    def __init__(self, cat_names, cont_names):
        super().__init__(cat_names, cont_names)
        self.means = None
        self.stds = None

    def apply_train(self, df):
        &#34;Compute the means and stds of `self.cont_names` columns to normalize them.&#34;
        self.means, self.stds = {}, {}
        for n in self.cont_names:
            assert is_numeric_dtype(
                df[n]
            ), f&#34;&#34;&#34;Cannot normalize &#39;{n}&#39; column as it isn&#39;t numerical.
                Are you sure it doesn&#39;t belong in the categorical set of columns?&#34;&#34;&#34;
            self.means[n], self.stds[n] = df[n].mean(), df[n].std()
            df[n] = (df[n] - self.means[n]) / (1e-7 + self.stds[n])

    def apply_test(self, df):
        &#34;Normalize `self.cont_names` with the same statistics as in `apply_train`.&#34;
        for n in self.cont_names:
            df[n] = (df[n] - self.means[n]) / (1e-7 + self.stds[n])

    def revert(self, df):
        &#34;&#34;&#34;
        Undoes normalization and returns reverted dataframe
        &#34;&#34;&#34;
        out_df = df.copy()
        for n in self.cont_names:
            out_df[n] = (df[n] * (1e-7 + self.stds[n])) + self.means[n]
        return out_df</code></pre>
</details>
<h3>Ancestors</h3>
<ul class="hlist">
<li><a title="ktrain.tabular.preprocessor.TabularProc" href="#ktrain.tabular.preprocessor.TabularProc">TabularProc</a></li>
</ul>
<h3>Methods</h3>
<dl>
<dt id="ktrain.tabular.preprocessor.Normalize.apply_test"><code class="name flex">
<span>def <span class="ident">apply_test</span></span>(<span>self, df)</span>
</code></dt>
<dd>
<div class="desc"><p>Normalize <code>self.cont_names</code> with the same statistics as in <code>apply_train</code>.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def apply_test(self, df):
    &#34;Normalize `self.cont_names` with the same statistics as in `apply_train`.&#34;
    for n in self.cont_names:
        df[n] = (df[n] - self.means[n]) / (1e-7 + self.stds[n])</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.Normalize.apply_train"><code class="name flex">
<span>def <span class="ident">apply_train</span></span>(<span>self, df)</span>
</code></dt>
<dd>
<div class="desc"><p>Compute the means and stds of <code>self.cont_names</code> columns to normalize them.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def apply_train(self, df):
    &#34;Compute the means and stds of `self.cont_names` columns to normalize them.&#34;
    self.means, self.stds = {}, {}
    for n in self.cont_names:
        assert is_numeric_dtype(
            df[n]
        ), f&#34;&#34;&#34;Cannot normalize &#39;{n}&#39; column as it isn&#39;t numerical.
            Are you sure it doesn&#39;t belong in the categorical set of columns?&#34;&#34;&#34;
        self.means[n], self.stds[n] = df[n].mean(), df[n].std()
        df[n] = (df[n] - self.means[n]) / (1e-7 + self.stds[n])</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.Normalize.revert"><code class="name flex">
<span>def <span class="ident">revert</span></span>(<span>self, df)</span>
</code></dt>
<dd>
<div class="desc"><p>Undoes normalization and returns reverted dataframe</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def revert(self, df):
    &#34;&#34;&#34;
    Undoes normalization and returns reverted dataframe
    &#34;&#34;&#34;
    out_df = df.copy()
    for n in self.cont_names:
        out_df[n] = (df[n] * (1e-7 + self.stds[n])) + self.means[n]
    return out_df</code></pre>
</details>
</dd>
</dl>
</dd>
<dt id="ktrain.tabular.preprocessor.TabularPreprocessor"><code class="flex name class">
<span>class <span class="ident">TabularPreprocessor</span></span>
<span>(</span><span>predictor_columns, label_columns, date_columns=[], is_regression=False, procs=[], max_card=20)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>Tabular preprocessing base class
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class TabularPreprocessor(Preprocessor):
    &#34;&#34;&#34;
    ```
    Tabular preprocessing base class
    ```
    &#34;&#34;&#34;

    def __init__(
        self,
        predictor_columns,
        label_columns,
        date_columns=[],
        is_regression=False,
        procs=[],
        max_card=20,
    ):
        self.is_regression = is_regression
        self.c = None
        self.pc = predictor_columns
        self.lc = label_columns
        self.lc = [self.lc] if isinstance(self.lc, str) else self.lc
        self.dc = date_columns
        self.label_columns = None
        self.cat_names = []
        self.cont_names = []
        self.date_names = []
        self.label_transform = None
        self.procs = procs
        self.max_card = max_card

    @property
    def na_names(self):
        return [n for n in self.cat_names if n[-3:] == &#34;_na&#34;]

    def get_preprocessor(self):
        return (self.label_transform, self.procs)

    def get_classes(self):
        return self.label_columns if not self.is_regression else []

    def preprocess(self, df):
        return self.preprocess_test(df)

    def _validate_columns(self, df):
        missing_columns = []
        for col in df.columns.values:
            if col not in self.lc and col not in self.pc:
                missing_columns.append(col)
        if len(missing_columns) &gt; 0:
            raise ValueError(&#34;df is missing columns: %s&#34; % (missing_columns))
        return

    def denormalize(self, df):
        normalizer = None
        for proc in self.procs:
            if type(proc).__name__ == &#34;Normalize&#34;:
                normalizer = proc
                break
        if normalizer is None:
            return df
        return normalizer.revert(df)

    # def codify(self, df):
    # df = df.copy()
    # for lab in self.lc:
    # df[lab] = df[lab].cat.codes
    # return df

    def preprocess_train(self, df, mode=&#34;train&#34;, verbose=1):
        &#34;&#34;&#34;
        ```
        preprocess training set
        ```
        &#34;&#34;&#34;
        df = df.copy()

        clean_df(df, pc=self.pc, lc=self.lc, check_labels=mode == &#34;train&#34;)

        if not isinstance(df, pd.DataFrame):
            raise ValueError(&#34;df must be a pd.DataFrame&#34;)

        # validate columns
        self._validate_columns(df)

        # validate mode
        # if mode != &#39;train&#39; and self.label_transform is None:
        # raise ValueError(&#39;self.label_transform is None but mode is %s: are you sure preprocess_train was invoked first?&#39; % (mode))

        # verbose
        if verbose:
            print(
                &#34;processing %s: %s rows x %s columns&#34; % (mode, df.shape[0], df.shape[1])
            )

        # convert date fields
        for field in self.dc:
            df = df.copy()  # TODO: fix this
            df, date_names = add_datepart(df, field)
            self.date_names = date_names

        # preprocess labels and data
        if mode == &#34;train&#34;:
            label_columns = self.lc[:]
            # label_columns.sort() # leave label columns sorted in same order as in DataFrame
            self.label_transform = U.YTransformDataFrame(
                label_columns, is_regression=self.is_regression
            )
            df = self.label_transform.apply_train(df)
            self.label_columns = (
                self.label_transform.get_classes()
                if not self.is_regression
                else self.label_transform.label_columns
            )
            self.cont_names, self.cat_names = cont_cat_split(
                df, label_columns=self.label_columns, max_card=self.max_card
            )
            self.procs = [
                proc(self.cat_names, self.cont_names) for proc in self.procs
            ]  # &#34;objectivy&#34;
        else:
            df = self.label_transform.apply_test(df)
        for proc in self.procs:
            proc(df, test=mode != &#34;train&#34;)  # apply processors
        from .dataset import TabularDataset

        return TabularDataset(df, self.cat_names, self.cont_names, self.label_columns)

    def preprocess_valid(self, df, verbose=1):
        &#34;&#34;&#34;
        ```
        preprocess validation set
        ```
        &#34;&#34;&#34;
        return self.preprocess_train(df, mode=&#34;valid&#34;, verbose=verbose)

    def preprocess_test(self, df, verbose=1):
        &#34;&#34;&#34;
        ```
        preprocess test set
        ```
        &#34;&#34;&#34;
        return self.preprocess_train(df, mode=&#34;test&#34;, verbose=verbose)</code></pre>
</details>
<h3>Ancestors</h3>
<ul class="hlist">
<li><a title="ktrain.preprocessor.Preprocessor" href="../preprocessor.html#ktrain.preprocessor.Preprocessor">Preprocessor</a></li>
<li>abc.ABC</li>
</ul>
<h3>Instance variables</h3>
<dl>
<dt id="ktrain.tabular.preprocessor.TabularPreprocessor.na_names"><code class="name">var <span class="ident">na_names</span></code></dt>
<dd>
<div class="desc"><p>Names in <code>cat_names</code> that end with <code>_na</code>.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">@property
def na_names(self):
    return [n for n in self.cat_names if n[-3:] == &#34;_na&#34;]</code></pre>
</details>
</dd>
</dl>
<h3>Methods</h3>
<dl>
<dt id="ktrain.tabular.preprocessor.TabularPreprocessor.denormalize"><code class="name flex">
<span>def <span class="ident">denormalize</span></span>(<span>self, df)</span>
</code></dt>
<dd>
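<div class="desc"><p>Undoes normalization of <code>df</code> using the first <code>Normalize</code> processor found in <code>procs</code>; returns <code>df</code> unchanged if there is none.</p></div>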
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def denormalize(self, df):
    normalizer = None
    for proc in self.procs:
        if type(proc).__name__ == &#34;Normalize&#34;:
            normalizer = proc
            break
    if normalizer is None:
        return df
    return normalizer.revert(df)</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.TabularPreprocessor.get_classes"><code class="name flex">
<span>def <span class="ident">get_classes</span></span>(<span>self)</span>
</code></dt>
<dd>
<div class="desc"><p>Returns <code>label_columns</code>, or an empty list when <code>is_regression</code> is set.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_classes(self):
    return self.label_columns if not self.is_regression else []</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.TabularPreprocessor.get_preprocessor"><code class="name flex">
<span>def <span class="ident">get_preprocessor</span></span>(<span>self)</span>
</code></dt>
<dd>
<div class="desc"><p>Returns the tuple <code>(label_transform, procs)</code>.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_preprocessor(self):
    return (self.label_transform, self.procs)</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.TabularPreprocessor.preprocess"><code class="name flex">
<span>def <span class="ident">preprocess</span></span>(<span>self, df)</span>
</code></dt>
<dd>
<div class="desc"><p>Equivalent to <code>preprocess_test(df)</code>.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def preprocess(self, df):
    return self.preprocess_test(df)</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.TabularPreprocessor.preprocess_test"><code class="name flex">
<span>def <span class="ident">preprocess_test</span></span>(<span>self, df, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>preprocess test set
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def preprocess_test(self, df, verbose=1):
    &#34;&#34;&#34;
    ```
    preprocess test set
    ```
    &#34;&#34;&#34;
    return self.preprocess_train(df, mode=&#34;test&#34;, verbose=verbose)</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.TabularPreprocessor.preprocess_train"><code class="name flex">
<span>def <span class="ident">preprocess_train</span></span>(<span>self, df, mode='train', verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>preprocess training set
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def preprocess_train(self, df, mode=&#34;train&#34;, verbose=1):
    &#34;&#34;&#34;
    ```
    preprocess training set
    ```
    &#34;&#34;&#34;
    df = df.copy()

    clean_df(df, pc=self.pc, lc=self.lc, check_labels=mode == &#34;train&#34;)

    if not isinstance(df, pd.DataFrame):
        raise ValueError(&#34;df must be a pd.DataFrame&#34;)

    # validate columns
    self._validate_columns(df)

    # validate mode
    # if mode != &#39;train&#39; and self.label_transform is None:
    # raise ValueError(&#39;self.label_transform is None but mode is %s: are you sure preprocess_train was invoked first?&#39; % (mode))

    # verbose
    if verbose:
        print(
            &#34;processing %s: %s rows x %s columns&#34; % (mode, df.shape[0], df.shape[1])
        )

    # convert date fields
    for field in self.dc:
        df = df.copy()  # TODO: fix this
        df, date_names = add_datepart(df, field)
        self.date_names = date_names

    # preprocess labels and data
    if mode == &#34;train&#34;:
        label_columns = self.lc[:]
        # label_columns.sort() # leave label columns sorted in same order as in DataFrame
        self.label_transform = U.YTransformDataFrame(
            label_columns, is_regression=self.is_regression
        )
        df = self.label_transform.apply_train(df)
        self.label_columns = (
            self.label_transform.get_classes()
            if not self.is_regression
            else self.label_transform.label_columns
        )
        self.cont_names, self.cat_names = cont_cat_split(
            df, label_columns=self.label_columns, max_card=self.max_card
        )
        self.procs = [
            proc(self.cat_names, self.cont_names) for proc in self.procs
        ]  # &#34;objectivy&#34;
    else:
        df = self.label_transform.apply_test(df)
    for proc in self.procs:
        proc(df, test=mode != &#34;train&#34;)  # apply processors
    from .dataset import TabularDataset

    return TabularDataset(df, self.cat_names, self.cont_names, self.label_columns)</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.TabularPreprocessor.preprocess_valid"><code class="name flex">
<span>def <span class="ident">preprocess_valid</span></span>(<span>self, df, verbose=1)</span>
</code></dt>
<dd>
<div class="desc"><pre><code>preprocess validation set
</code></pre></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def preprocess_valid(self, df, verbose=1):
    &#34;&#34;&#34;
    ```
    preprocess validation set
    ```
    &#34;&#34;&#34;
    return self.preprocess_train(df, mode=&#34;valid&#34;, verbose=verbose)</code></pre>
</details>
</dd>
</dl>
</dd>
<dt id="ktrain.tabular.preprocessor.TabularProc"><code class="flex name class">
<span>class <span class="ident">TabularProc</span></span>
<span>(</span><span>cat_names, cont_names)</span>
</code></dt>
<dd>
<div class="desc"><p>A processor for tabular dataframes.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class TabularProc:
    &#34;A processor for tabular dataframes.&#34;

    def __init__(self, cat_names, cont_names):
        self.cat_names = cat_names
        self.cont_names = cont_names

    def __call__(self, df, test=False):
        &#34;Apply the correct function to `df` depending on `test`.&#34;
        func = self.apply_test if test else self.apply_train
        func(df)

    def apply_train(self, df):
        &#34;Function applied to `df` if it&#39;s the train set.&#34;
        raise NotImplementedError

    def apply_test(self, df):
        &#34;Function applied to `df` if it&#39;s the test set.&#34;
        self.apply_train(df)</code></pre>
</details>
<h3>Subclasses</h3>
<ul class="hlist">
<li><a title="ktrain.tabular.preprocessor.Categorify" href="#ktrain.tabular.preprocessor.Categorify">Categorify</a></li>
<li><a title="ktrain.tabular.preprocessor.FillMissing" href="#ktrain.tabular.preprocessor.FillMissing">FillMissing</a></li>
<li><a title="ktrain.tabular.preprocessor.Normalize" href="#ktrain.tabular.preprocessor.Normalize">Normalize</a></li>
</ul>
<h3>Methods</h3>
<dl>
<dt id="ktrain.tabular.preprocessor.TabularProc.apply_test"><code class="name flex">
<span>def <span class="ident">apply_test</span></span>(<span>self, df)</span>
</code></dt>
<dd>
<div class="desc"><p>Function applied to <code>df</code> if it's the test set.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def apply_test(self, df):
    &#34;Function applied to `df` if it&#39;s the test set.&#34;
    self.apply_train(df)</code></pre>
</details>
</dd>
<dt id="ktrain.tabular.preprocessor.TabularProc.apply_train"><code class="name flex">
<span>def <span class="ident">apply_train</span></span>(<span>self, df)</span>
</code></dt>
<dd>
<div class="desc"><p>Function applied to <code>df</code> if it's the train set.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def apply_train(self, df):
    &#34;Function applied to `df` if it&#39;s the train set.&#34;
    raise NotImplementedError</code></pre>
</details>
</dd>
</dl>
</dd>
</dl>
</section>
</article>
<nav id="sidebar">
<h1>Index</h1>
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="ktrain.tabular" href="index.html">ktrain.tabular</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="ktrain.tabular.preprocessor.add_cyclic_datepart" href="#ktrain.tabular.preprocessor.add_cyclic_datepart">add_cyclic_datepart</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.add_datepart" href="#ktrain.tabular.preprocessor.add_datepart">add_datepart</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.clean_df" href="#ktrain.tabular.preprocessor.clean_df">clean_df</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.cont_cat_split" href="#ktrain.tabular.preprocessor.cont_cat_split">cont_cat_split</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.cyclic_dt_feat_names" href="#ktrain.tabular.preprocessor.cyclic_dt_feat_names">cyclic_dt_feat_names</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.cyclic_dt_features" href="#ktrain.tabular.preprocessor.cyclic_dt_features">cyclic_dt_features</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.ifnone" href="#ktrain.tabular.preprocessor.ifnone">ifnone</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.make_date" href="#ktrain.tabular.preprocessor.make_date">make_date</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.pd_data_types" href="#ktrain.tabular.preprocessor.pd_data_types">pd_data_types</a></code></li>
</ul>
</li>
<li><h3><a href="#header-classes">Classes</a></h3>
<ul>
<li>
<h4><code><a title="ktrain.tabular.preprocessor.Categorify" href="#ktrain.tabular.preprocessor.Categorify">Categorify</a></code></h4>
</li>
<li>
<h4><code><a title="ktrain.tabular.preprocessor.FillMissing" href="#ktrain.tabular.preprocessor.FillMissing">FillMissing</a></code></h4>
<ul class="">
<li><code><a title="ktrain.tabular.preprocessor.FillMissing.apply_test" href="#ktrain.tabular.preprocessor.FillMissing.apply_test">apply_test</a></code></li>
</ul>
</li>
<li>
<h4><code><a title="ktrain.tabular.preprocessor.Normalize" href="#ktrain.tabular.preprocessor.Normalize">Normalize</a></code></h4>
<ul class="">
<li><code><a title="ktrain.tabular.preprocessor.Normalize.apply_test" href="#ktrain.tabular.preprocessor.Normalize.apply_test">apply_test</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.Normalize.apply_train" href="#ktrain.tabular.preprocessor.Normalize.apply_train">apply_train</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.Normalize.revert" href="#ktrain.tabular.preprocessor.Normalize.revert">revert</a></code></li>
</ul>
</li>
<li>
<h4><code><a title="ktrain.tabular.preprocessor.TabularPreprocessor" href="#ktrain.tabular.preprocessor.TabularPreprocessor">TabularPreprocessor</a></code></h4>
<ul class="two-column">
<li><code><a title="ktrain.tabular.preprocessor.TabularPreprocessor.denormalize" href="#ktrain.tabular.preprocessor.TabularPreprocessor.denormalize">denormalize</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.TabularPreprocessor.get_classes" href="#ktrain.tabular.preprocessor.TabularPreprocessor.get_classes">get_classes</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.TabularPreprocessor.get_preprocessor" href="#ktrain.tabular.preprocessor.TabularPreprocessor.get_preprocessor">get_preprocessor</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.TabularPreprocessor.na_names" href="#ktrain.tabular.preprocessor.TabularPreprocessor.na_names">na_names</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.TabularPreprocessor.preprocess" href="#ktrain.tabular.preprocessor.TabularPreprocessor.preprocess">preprocess</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.TabularPreprocessor.preprocess_test" href="#ktrain.tabular.preprocessor.TabularPreprocessor.preprocess_test">preprocess_test</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.TabularPreprocessor.preprocess_train" href="#ktrain.tabular.preprocessor.TabularPreprocessor.preprocess_train">preprocess_train</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.TabularPreprocessor.preprocess_valid" href="#ktrain.tabular.preprocessor.TabularPreprocessor.preprocess_valid">preprocess_valid</a></code></li>
</ul>
</li>
<li>
<h4><code><a title="ktrain.tabular.preprocessor.TabularProc" href="#ktrain.tabular.preprocessor.TabularProc">TabularProc</a></code></h4>
<ul class="">
<li><code><a title="ktrain.tabular.preprocessor.TabularProc.apply_test" href="#ktrain.tabular.preprocessor.TabularProc.apply_test">apply_test</a></code></li>
<li><code><a title="ktrain.tabular.preprocessor.TabularProc.apply_train" href="#ktrain.tabular.preprocessor.TabularProc.apply_train">apply_train</a></code></li>
</ul>
</li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.10.0</a>.</p>
</footer>
</body>
</html>