In [1]:
import numpy as np
import pandas as pd

In [10]:
def mcar(dataset: pd.DataFrame, column: str, p: float = 0.2) -> pd.DataFrame:
    """
    Missing Completely at Random -- there is no dependency of the rate of
    missingness on other values or the value itself.

    dataset: the dataframe; it is modified in place and also returned
    column: the column in which missingness will be produced
    p: the probability that each value has of being missing (0 <= p <= 1)

    Every row independently has probability p of having its value in
    `column` replaced with NaN. Introducing NaN may upcast the column
    (for example an integer column becomes float).

    Raises ValueError if p is outside [0, 1].
    """
    if not 0 <= p <= 1:
        raise ValueError('probability of missing values must be between 0 and 1')
    # One uniform draw per row. Draws lie in [0, 1), so p == 0 never blanks a
    # value and p == 1 always does.
    to_blank = np.random.random(len(dataset)) < p
    # Build the new column with Series.mask and assign it back as a whole:
    # chained `dataset[column].iloc[i] = ...` writes into a temporary Series
    # and is not guaranteed to reach `dataset`.
    dataset[column] = dataset[column].mask(to_blank)
    return dataset

In [11]:
def mar(dataset: pd.DataFrame, miss_column: str, dep_column: str, b: float = 0.5) -> pd.DataFrame:
    """
    Missing at Random -- data is missing due to dependency on another
    variable.

    dataset: the dataframe; it is modified in place and also returned
    miss_column: the column in which missingness will be produced
    dep_column: the column on which missingness depends
    b: tuning parameter between 0 and 1; the higher the beta, the more missingness

    Missingness is generated differently depending on the type of the dependent column.
    If dep_column is categorical (boolean or any non-numeric dtype), a random z-score is
    drawn from a standard normal distribution for each unique value and passed through a
    sigmoid, giving one probability per category; missing values in dep_column are treated
    as one additional category. If dep_column is numeric, the z-score of each value (using
    the population standard deviation) is passed through a sigmoid. Rows whose dep_column
    value is missing, and every row when dep_column is constant, get a z-score of 0.
    Each row's value in miss_column is then replaced with NaN with probability
    (sigmoid probability * b). The higher the beta value (between 0 and 1), the more
    missingness is generated. Introducing NaN may upcast miss_column.

    Raises ValueError if b is outside [0, 1].
    """
    if not 0 <= b <= 1:
        raise ValueError('beta value must be between 0 and 1')

    def sig(x):
        return 1 / (1 + np.exp(-x))

    dep_values = dataset[dep_column]
    dep_dtype = dep_values.dtype
    # is_numeric_dtype reports True for bool, so bool is checked explicitly.
    is_categorical = (pd.api.types.is_bool_dtype(dep_dtype)
                      or not pd.api.types.is_numeric_dtype(dep_dtype))

    if is_categorical:
        codes, uniques = pd.factorize(dep_values)
        # factorize labels missing values with code -1. Drawing one extra
        # probability at the end lets -1 index that last slot, so missing
        # values form their own category.
        cat_probs = sig(np.random.normal(size=len(uniques) + 1))
        probs = cat_probs[codes]
    else:
        numeric = dep_values.to_numpy(dtype=float, na_value=np.nan)
        observed = numeric[~np.isnan(numeric)]
        z_scores = np.zeros(len(numeric))
        if observed.size:
            std = observed.std()  # population std (ddof=0)
            # A zero std (constant column) would make every z-score 0/0;
            # keep the neutral z-score of 0 in that case.
            if std > 0:
                z_scores = (numeric - observed.mean()) / std
        # Rows with a missing dependency value get the neutral z-score too.
        z_scores = np.where(np.isnan(z_scores), 0.0, z_scores)
        probs = sig(z_scores)

    # One uniform draw per row, compared against that row's scaled probability.
    to_blank = np.random.random(len(dataset)) < probs * b
    # Assign the masked column back as a whole rather than through chained
    # `dataset[miss_column].iloc[i] = ...`, which writes into a temporary Series.
    dataset[miss_column] = dataset[miss_column].mask(to_blank)
    return dataset

In [13]:
def nmar(dataset: pd.DataFrame, column: str, b: float = 0.5) -> pd.DataFrame:
    """
    Not Missing at Random -- data is missing due to the value of the variable
    itself. For example, sales not recorded for the stores that produced less
    than a specific amount of revenue.

    dataset: the dataframe; it is modified in place and also returned
    column: the column in which missingness will be produced
    b: tuning parameter between 0 and 1; the higher the beta, the more missingness

    Missingness is generated differently depending on the type of the column.
    If the column is categorical (boolean or any non-numeric dtype), a random z-score is
    drawn from a standard normal distribution for each unique value and passed through a
    sigmoid, giving one probability per category. If the column is numeric, the z-score
    of each value (using the population standard deviation) is passed through a sigmoid;
    when the column is constant every z-score is taken to be 0. Each value is then
    replaced with NaN with probability (sigmoid probability * b). Values that are
    already missing stay missing. The higher the beta value (between 0 and 1), the more
    missingness is generated. Introducing NaN may upcast the column.

    Raises ValueError if b is outside [0, 1].
    """
    if not 0 <= b <= 1:
        raise ValueError('beta value must be between 0 and 1')

    def sig(x):
        return 1 / (1 + np.exp(-x))

    values = dataset[column]
    col_dtype = values.dtype
    # is_numeric_dtype reports True for bool, so bool is checked explicitly.
    is_categorical = (pd.api.types.is_bool_dtype(col_dtype)
                      or not pd.api.types.is_numeric_dtype(col_dtype))

    if is_categorical:
        codes, uniques = pd.factorize(values)
        # factorize labels missing values with code -1; the extra trailing
        # slot keeps that index valid (those rows are already missing, so the
        # probability drawn for them has no visible effect).
        cat_probs = sig(np.random.normal(size=len(uniques) + 1))
        probs = cat_probs[codes]
    else:
        numeric = values.to_numpy(dtype=float, na_value=np.nan)
        observed = numeric[~np.isnan(numeric)]
        z_scores = np.zeros(len(numeric))
        if observed.size:
            std = observed.std()  # population std (ddof=0)
            # A zero std (constant column) would make every z-score 0/0;
            # keep the neutral z-score of 0 in that case.
            if std > 0:
                z_scores = (numeric - observed.mean()) / std
        # Already-missing entries have no z-score; give them a finite
        # placeholder so the comparison below is well defined.
        z_scores = np.where(np.isnan(z_scores), 0.0, z_scores)
        probs = sig(z_scores)

    # One uniform draw per row, compared against that row's scaled probability.
    to_blank = np.random.random(len(dataset)) < probs * b
    # Assign the masked column back as a whole rather than through chained
    # `dataset[column].iloc[i] = ...`, which writes into a temporary Series.
    dataset[column] = values.mask(to_blank)
    return dataset