# Imputation Plugins

Missing data is a crucial issue when applying machine learning algorithms to real-world datasets.

**HyperImpute** provides a set of default imputation plugins and can be extended with any number of other plugins.

### Setup

In [1]:
import sys
import warnings
import time
from tqdm import tqdm
from math import sqrt

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from hyperimpute.plugins.utils.metrics import RMSE
from hyperimpute.plugins.utils.simulate import simulate_nan

import xgboost as xgb

from IPython.display import HTML, display
import tabulate

if not sys.warnoptions:
    warnings.simplefilter("ignore")

### Loading the Imputation plugins

Make sure that you have installed HyperImpute in your workspace.

You can do that by running `pip install .` in the root of the repository.

In [2]:
from hyperimpute.plugins.imputers import Imputers, ImputerPlugin

imputers = Imputers()

### List the existing plugins

In [3]:
imputers.list()

['mean',
 'EM',
 'sinkhorn',
 'softimpute',
 'nop',
 'miwae',
 'missforest',
 'median',
 'gain',
 'most_frequent',
 'sklearn_ice',
 'mice',
 'hyperimpute',
 'sklearn_missforest',
 'miracle',
 'ice']

### Adding a new Imputation plugin

By default, HyperImpute automatically loads the imputation plugins with the pattern `hyperimpute/plugins/imputers/plugin_*`. 

Alternatively, you can call `Imputers().add(<name>, <ImputerPlugin derived class>)` at runtime.

Next, we show an example of a custom imputation plugin.

In [4]:
custom_ice_plugin = "custom_ice"


class NewPlugin(ImputerPlugin):
    """Example imputation plugin backed by scikit-learn's ``IterativeImputer``.

    The wrapped imputer uses a ``LinearRegression`` estimator and is
    configured with ``max_iter=500``, ``tol=1e-10`` and
    ``imputation_order="roman"``. Fitting and transforming are delegated to
    it unchanged. Serialization is intentionally left unimplemented.
    """

    def __init__(self):
        super().__init__()
        lr = LinearRegression()
        self._model = IterativeImputer(
            estimator=lr, max_iter=500, tol=1e-10, imputation_order="roman"
        )

    @staticmethod
    def name():
        """Return the name under which this plugin is registered."""
        return custom_ice_plugin

    @staticmethod
    def hyperparameter_space():
        """Return the tunable hyperparameters; this example exposes none."""
        return []

    def _fit(self, *args, **kwargs) -> "NewPlugin":
        """Fit the wrapped ``IterativeImputer`` and return ``self``."""
        self._model.fit(*args, **kwargs)
        return self

    def _transform(self, *args, **kwargs):
        """Impute the given data with the fitted ``IterativeImputer``."""
        return self._model.transform(*args, **kwargs)

    def save(self) -> bytes:
        """Serialization is not supported by this example plugin.

        Raises:
            NotImplementedError: always.
        """
        # ``NotImplemented`` is a non-callable sentinel for binary operators;
        # calling it raises TypeError. The exception class is the right one.
        raise NotImplementedError("placeholder")

    @classmethod
    def load(cls, buff: bytes) -> "NewPlugin":
        """Deserialization is not supported by this example plugin.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("placeholder")


imputers.add(custom_ice_plugin, NewPlugin)

assert imputers.get(custom_ice_plugin) is not None

### List the existing plugins

Now we should see the new plugins loaded.

In [5]:
imputers.list()

['miwae',
 'ice',
 'miracle',
 'hyperimpute',
 'missforest',
 'mean',
 'gain',
 'sklearn_ice',
 'nop',
 'median',
 'EM',
 'sinkhorn',
 'sklearn_missforest',
 'mice',
 'softimpute',
 'most_frequent',
 'custom_ice']

### Testing the performance

We simulate some testing datasets using 3 amputation strategies:
- **Missing Completely At Random** (MCAR) if the probability of being missing is the same for all observations.
- **Missing At Random** (MAR) if the probability of being missing only depends on observed values.
- **Missing Not At Random** (MNAR) if the unavailability of the data depends on both observed and unobserved data such as its value itself.

#### Load the dataset

In [4]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

preproc = MinMaxScaler()


def dataset(test_size=0.2, random_state=None):
    """Load the breast-cancer dataset, min-max scale it and split it.

    Args:
        test_size: Share of the samples held out for the test split;
            forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the split unseeded; pass an int to get the same
            split on every run.

    Returns:
        The ``X_train, X_test, y_train, y_test`` sequence produced by
        ``train_test_split``, with features as a ``pd.DataFrame`` and labels
        as a ``pd.Series``.
    """
    X, y = load_breast_cancer(return_X_y=True)
    # NOTE: the scaler is fitted on the full dataset, i.e. before the split.
    X = pd.DataFrame(preproc.fit_transform(X, y))
    y = pd.Series(y)

    return train_test_split(X, y, test_size=test_size, random_state=random_state)


def ampute(x, mechanism, p_miss):
    """Simulate missing values in ``x`` and return the resulting frames.

    Args:
        x: Complete data; converted with ``np.asarray`` before simulation.
        mechanism: Missingness mechanism name passed to ``simulate_nan``.
        p_miss: Missingness ratio passed to ``simulate_nan``.

    Returns:
        A 3-tuple of DataFrames: the original data, the data with simulated
        missing values (``"X_incomp"``) and the missingness mask (``"mask"``).
    """
    simulated = simulate_nan(np.asarray(x), p_miss, mechanism)
    missing_mask, incomplete = simulated["mask"], simulated["X_incomp"]

    return tuple(pd.DataFrame(part) for part in (x, incomplete, missing_mask))

In [8]:
pct = 0.3

mechanisms = ["MAR", "MNAR", "MCAR"]
percentages = [pct]

plugins = ["gain"]  # imputers.list()  # default plugins

X_train, X_test, y_train, y_test = dataset()

# `headers` gets one column per (mechanism, missingness ratio) pair after the
# plugin name; `datasets` maps mechanism -> ratio -> ampute() result.
headers = ["Plugin"]
datasets = {}

for ampute_mechanism in mechanisms:
    for p_miss in percentages:
        per_ratio = datasets.setdefault(ampute_mechanism, {})

        headers.append(f"{ampute_mechanism}-{p_miss}")
        per_ratio[p_miss] = ampute(X_train, ampute_mechanism, p_miss)

print(datasets)

{'MAR': {0.3: (           0         1         2         3         4         5         6   \
338  0.145251  0.264457  0.142492  0.070965  0.433962  0.165266  0.058833   
427  0.180747  0.414948  0.172759  0.091792  0.319401  0.116711  0.084677   
406  0.433480  0.174163  0.418147  0.278473  0.382053  0.201307  0.128866   
96   0.246060  0.274941  0.234953  0.130477  0.468268  0.157015  0.058341   
490  0.249373  0.430504  0.237648  0.137010  0.264422  0.100055  0.040159   
..        ...       ...       ...       ...       ...       ...       ...   
277  0.559847  0.347311  0.532859  0.406575  0.330414  0.121036  0.187910   
9    0.259312  0.484613  0.277659  0.140997  0.595558  0.675480  0.532568   
359  0.116191  0.291173  0.110773  0.057306  0.435768  0.123244  0.063496   
192  0.129632  0.287792  0.117062  0.061336  0.152298  0.012453  0.000000   
559  0.214350  0.480893  0.212356  0.110286  0.360928  0.253727  0.260544   

           7         8         9   ...        20        21  

#### Evaluation

We compare the methods in terms of root mean squared error (RMSE) to the initial dataset.

In [9]:
# One row per plugin: [plugin name, value for each (mechanism, ratio) pair].
results = []
duration = []

for plugin in tqdm(plugins):
    plugin_results = [plugin]
    plugin_duration = [plugin]

    for ampute_mechanism in mechanisms:
        for p_miss in percentages:
            ctx = imputers.get(plugin)
            x, x_miss, mask = datasets[ampute_mechanism][p_miss]

            # Impute a copy (made outside the timed region) so the shared
            # amputed frame stays intact for the next plugin, in case an
            # imputer modifies its input in place.
            x_input = x_miss.copy()

            # perf_counter() is monotonic and meant for measuring intervals;
            # time.time() can jump when the system clock is adjusted.
            start = time.perf_counter()
            x_imp = ctx.fit_transform(x_input)
            elapsed_ms = (time.perf_counter() - start) * 1000

            plugin_duration.append(round(elapsed_ms, 4))
            plugin_results.append(RMSE(x_imp.values, x.values, mask.values))

    results.append(plugin_results)
    duration.append(plugin_duration)
print("Results:", results, duration)

100%|██████████| 1/1 [00:03<00:00,  3.43s/it]

Results: [['gain', 0.11888193205704875, 0.12658364937170588, 0.07270601462137391]] [['gain', 1494.2852, 965.8999, 961.4333]]





### Reconstruction error (RMSE)

__Interpretation__: The following table shows the reconstruction error, i.e. the __Root Mean Square Error (RMSE)__ between the original full dataset and the imputed dataset, for each method.

In [None]:
# Render the RMSE rows as an HTML table, one column per entry in `headers`.
rmse_table = tabulate.tabulate(results, headers=headers, tablefmt="html")
display(HTML(rmse_table))

Plugin,MAR-0.3,MNAR-0.3,MCAR-0.3
mean,0.195269,0.170658,0.146133


: 

### XGBoost test score after imputation

__Interpretation__: The following table shows different metrics on the test set for an XGBoost classifier, after imputing the training set with each method.

Metrics:
 - accuracy
 - AUROC (area under the ROC curve)
 - AURPC (area under the precision-recall curve)

In [None]:
from sklearn import metrics


def get_metrics(X_train, y_train, X_test, y_test):
    """Train an XGBoost classifier and evaluate it on the test split.

    Args:
        X_train: Training features.
        y_train: Training labels (binary).
        X_test: Test features.
        y_test: Test labels (binary).

    Returns:
        A ``(accuracy, auroc, aurpc)`` tuple: the classifier's ``score`` on
        the test split, the area under the ROC curve and the area under the
        precision-recall curve.
    """
    xgb_clf = xgb.XGBClassifier(verbosity=0)
    xgb_clf = xgb_clf.fit(X_train, y_train)

    score = xgb_clf.score(X_test, y_test)

    # Curve-based metrics need continuous scores. With hard 0/1 predictions
    # each curve collapses to a single operating point, so use the predicted
    # probability of the positive class (column 1) instead.
    y_score = xgb_clf.predict_proba(X_test)[:, 1]

    fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
    auroc = metrics.auc(fpr, tpr)

    prec, recall, _ = metrics.precision_recall_curve(y_test, y_score)
    aurpc = metrics.auc(recall, prec)

    return score, auroc, aurpc


metrics_headers = ["Plugin", "Accuracy", "AUROC", "AURPC"]

# Baseline row: classifier trained on the complete (never amputed) training set.
xgboost_test_score = [
    ["original dataset", *get_metrics(X_train, y_train, X_test, y_test)]
]

x, x_miss, mask = datasets["MAR"][pct]

# One further row per plugin: impute a copy of the MAR training frame, then
# train and score the classifier on the imputed data.
for plugin in plugins:
    X_train_imp = imputers.get(plugin).fit_transform(x_miss.copy())

    xgboost_test_score.append(
        [plugin, *get_metrics(X_train_imp, y_train, X_test, y_test)]
    )

In [None]:
# Render the classifier metrics as an HTML table.
xgboost_table = tabulate.tabulate(
    xgboost_test_score, headers=metrics_headers, tablefmt="html"
)
display(HTML(xgboost_table))

### Duration (ms) results

__Info__ : Here we measure the duration of imputing the dataset with each method.

In [None]:
# Render the per-plugin imputation durations (milliseconds) as an HTML table.
duration_table = tabulate.tabulate(duration, headers=headers, tablefmt="html")
display(HTML(duration_table))

## Debugging

HyperImpute supports **debug** logging. __WARNING__: Don't use it for release builds. 

In [None]:
from hyperimpute import logger

imputers = Imputers()

# Send DEBUG-level log records to stderr so they appear in the cell output.
logger.add(sink=sys.stderr, level="DEBUG")

x, x_miss, mask = datasets["MAR"][pct]

# NOTE(review): EM is fitted on the complete frame `x`, not on `x_miss` —
# confirm this is intended.
em_imputer = imputers.get("EM")
x_imp = em_imputer.fit_transform(x)

softimpute_imputer = imputers.get("softimpute")
softimpute_imputer.fit_transform(x_miss)