In [5]:
import numpy as np
import pandas as pd
from scipy.stats import bernoulli

import matplotlib.pylab as plt

In [6]:
# Load the allegations table; the first CSV column is used as the row index.
data = pd.read_csv('data/allegations.csv', index_col=0)

In [7]:
# Preview the first rows of the raw table.
data.head()

Unnamed: 0,unique_mos_id,shield_no,complaint_id,mos_ethnicity,mos_gender,mos_age_incident,complainant_ethnicity,complainant_gender,complainant_age_incident,fado_type,allegation,precinct,contact_reason,Substantiated
0,10004,8409.0,42835,Hispanic,M,32,Black,Female,38.0,Abuse of Authority,Failure to provide RTKA card,78.0,Report-domestic dispute,True
1,10007,5952.0,24601,White,M,24,Black,Male,26.0,Discourtesy,Action,67.0,Moving violation,True
2,10007,5952.0,24601,White,M,24,Black,Male,26.0,Offensive Language,Race,67.0,Moving violation,True
3,10007,5952.0,26146,White,M,25,Black,Male,45.0,Abuse of Authority,Question,67.0,PD suspected C/V of violation/crime - street,True
4,10009,24058.0,40253,Hispanic,F,39,,,16.0,Force,Physical force,67.0,Report-dispute,True


In [8]:
# Count the values that are already missing in each column of the raw table.
data.isna().sum()

unique_mos_id                  0
shield_no                   5392
complaint_id                   0
mos_ethnicity                  0
mos_gender                     0
mos_age_incident               0
complainant_ethnicity       5505
complainant_gender          4195
complainant_age_incident    4829
fado_type                      0
allegation                     1
precinct                      48
contact_reason               199
Substantiated                  0
dtype: int64

In [9]:
# Lower-case the outcome column so every column name follows the same style.
relevant = data.rename(columns={'Substantiated': 'substantiated'})

In [10]:
# Keep only the complaint id, the complainant attributes, the allegation
# details and the outcome; every other column is dropped.
kept_columns = [
    'complaint_id',
    'complainant_ethnicity',
    'complainant_gender',
    'complainant_age_incident',
    'allegation',
    'contact_reason',
    'substantiated',
]
relevant = relevant[kept_columns]

In [11]:
# Preview the first ten rows of the reduced table.
relevant.head(10)

Unnamed: 0,complaint_id,complainant_ethnicity,complainant_gender,complainant_age_incident,allegation,contact_reason,substantiated
0,42835,Black,Female,38.0,Failure to provide RTKA card,Report-domestic dispute,True
1,24601,Black,Male,26.0,Action,Moving violation,True
2,24601,Black,Male,26.0,Race,Moving violation,True
3,26146,Black,Male,45.0,Question,PD suspected C/V of violation/crime - street,True
4,40253,,,16.0,Physical force,Report-dispute,True
5,37256,White,Male,31.0,Refusal to process civilian complaint,C/V telephoned PCT,True
6,33969,White,Male,34.0,Sexual orientation,PD suspected C/V of violation/crime - street,True
7,40070,Asian,Male,60.0,Word,Moving violation,False
8,41927,Asian,Male,39.0,Refusal to provide shield number,Moving violation,False
9,41927,Asian,Male,39.0,Retaliatory summons,Moving violation,True


In [12]:
# Mean of `substantiated` within each complainant ethnicity group.
relevant.groupby("complainant_ethnicity")[['substantiated']].mean()

Unnamed: 0_level_0,substantiated
complainant_ethnicity,Unnamed: 1_level_1
American Indian,0.359375
Asian,0.291353
Black,0.237992
Hispanic,0.244707
Other Race,0.268833
Refused,0.27027
White,0.279195


In [13]:
# Overall mean of `substantiated` across all rows, for comparison with the per-group means.
relevant['substantiated'].mean()

0.24884585406798967

In [14]:
def mcar(dataset : pd.DataFrame, column : str, p = 0.2):
    """
    Missing Completely at Random -- the chance of a value being missing does
    not depend on any other value or on the value itself.

    dataset: the dataframe; it is modified in place
    column: the column in which missingness will be produced
    p: the probability (between 0 and 1) that each value becomes missing

    Each value of `column` is independently replaced by NaN with probability p,
    using numpy's global random state (so np.random.seed makes it repeatable).

    Returns the same dataframe object that was passed in.
    Raises an Exception if p is not between 0 and 1.
    """
    # check for p (written so that a NaN p is rejected as well)
    if not (0 <= p <= 1):
        raise Exception('probability of missing values must be between 0 and 1')

    # one uniform draw in [0, 1) per row; a row is blanked when its draw is below p
    to_blank = np.random.random(len(dataset)) < p

    # Write the whole column back rather than using chained
    # `dataset[column].iloc[i] = ...`: pandas does not guarantee that a chained
    # assignment reaches `dataset`, and under copy-on-write it never does.
    # Series.mask also upcasts the column when it cannot hold NaN.
    dataset[column] = dataset[column].mask(to_blank)

    # return edited dataset
    return dataset

In [15]:
def mar(dataset : pd.DataFrame, miss_column : str, dep_column : str, b = 0.5):
    """
    Missing at Random -- data is missing due to dependency on another
    variable.

    dataset: the dataframe; it is modified in place
    miss_column: the column in which missingness will be produced
    dep_column: the column on which missingness depends
    b: tuning parameter between 0 and 1; the higher the beta, the more missingness

    Missingness is generated differently depending on the type of the dependent column.
    If dep_column is categorical (object, string, category or bool dtype), a random z-score
    is drawn from a standard normal distribution for each unique value and passed through a
    sigmoid, giving one probability per category; missing dep_column values are treated as
    one extra category. Otherwise the z-score of each dep_column value (population standard
    deviation) is passed through the sigmoid. Rows whose z-score cannot be computed (the
    dep_column value is missing, or the column has zero variance) use z = 0, i.e. the
    probability of an average row. For each row, the value in miss_column becomes NaN with
    probability (sigmoid probability * b), so a higher beta produces more missingness.

    Random draws come from numpy's global random state.

    Returns the same dataframe object that was passed in.
    Raises an Exception if b is not between 0 and 1.
    """
    # check for beta (written so that a NaN beta is rejected as well)
    if not (0 <= b <= 1):
        raise Exception('beta value must be between 0 and 1')

    # helper sigmoid function
    def sig(x):
        return 1 / (1 + np.exp(-x))

    dep_values = dataset[dep_column]
    dep_dtype = dep_values.dtype

    # check for categorical
    is_categorical = (
        pd.api.types.is_bool_dtype(dep_dtype)
        or pd.api.types.is_object_dtype(dep_dtype)
        or pd.api.types.is_string_dtype(dep_dtype)
        or isinstance(dep_dtype, pd.CategoricalDtype)
    )

    if is_categorical:
        # factorize labels each row with the position of its category and uses
        # -1 for missing values
        codes, categories = pd.factorize(dep_values)

        # one probability per category plus one trailing slot: indexing with
        # code -1 picks that last slot, so missing values form their own category
        cat_probs = sig(np.random.normal(size=len(categories) + 1))
        row_probs = cat_probs[codes]
    else:
        # take mean and (population) standard deviation for the Z-value
        mean = dep_values.mean()
        std = dep_values.std(ddof=0)

        z = ((dep_values - mean) / std).to_numpy(dtype=float, na_value=np.nan)

        # a NaN or infinite z-score (missing value, zero variance) would make the
        # probability undefined; fall back to the z-score of an average row
        z = np.where(np.isfinite(z), z, 0.0)
        row_probs = sig(z)

    # one uniform draw in [0, 1) per row; blank the row when the draw is below
    # its scaled probability
    to_blank = np.random.random(len(dataset)) < row_probs * b

    # Write the whole column back rather than using chained
    # `dataset[miss_column].iloc[i] = ...`, which pandas does not guarantee to
    # propagate to `dataset` (and which is a no-op under copy-on-write).
    dataset[miss_column] = dataset[miss_column].mask(to_blank)

    return dataset

In [16]:
def nmar(dataset : pd.DataFrame, column : str, b = 0.5):
    """
    Not Missing at Random -- data is missing due to the value of the variable
    itself. For example, sales not recorded for the stores that produced less
    than a specific amount of revenue.

    dataset: the dataframe; it is modified in place
    column: the column in which missingness will be produced
    b: tuning parameter between 0 and 1; the higher the beta, the more missingness

    Missingness is generated differently depending on the type of the column.
    If the column is categorical (object, string, category or bool dtype), a random z-score
    is drawn from a standard normal distribution for each unique value and passed through a
    sigmoid, giving one probability per category. If the column is numerical, the z-score of
    each value (population standard deviation) is passed through the sigmoid; when a z-score
    cannot be computed (the value is already missing, or the column has zero variance) z = 0
    is used. Each value becomes NaN with probability (sigmoid probability * b), so a higher
    beta produces more missingness. Values that are already missing stay missing.

    Random draws come from numpy's global random state.

    Returns the same dataframe object that was passed in.
    Raises an Exception if b is not between 0 and 1.
    """
    # check for beta (written so that a NaN beta is rejected as well)
    if not (0 <= b <= 1):
        raise Exception('beta value must be between 0 and 1')

    # helper sigmoid function
    def sig(x):
        return 1 / (1 + np.exp(-x))

    values = dataset[column]
    col_dtype = values.dtype

    # check for categorical
    is_categorical = (
        pd.api.types.is_bool_dtype(col_dtype)
        or pd.api.types.is_object_dtype(col_dtype)
        or pd.api.types.is_string_dtype(col_dtype)
        or isinstance(col_dtype, pd.CategoricalDtype)
    )

    if is_categorical:
        # factorize labels each row with the position of its category and uses
        # -1 for values that are already missing
        codes, categories = pd.factorize(values)

        # one probability per category plus one trailing slot that code -1
        # indexes, so already-missing rows never cause a lookup failure
        cat_probs = sig(np.random.normal(size=len(categories) + 1))
        row_probs = cat_probs[codes]
    else:
        # find mean and (population) std for normalization
        mean = values.mean()
        std = values.std(ddof=0)

        z = ((values - mean) / std).to_numpy(dtype=float, na_value=np.nan)

        # a NaN or infinite z-score (already-missing value, zero variance) would
        # make the probability undefined; fall back to the z-score of an average row
        z = np.where(np.isfinite(z), z, 0.0)
        row_probs = sig(z)

    # one uniform draw in [0, 1) per row; blank the row when the draw is below
    # its scaled probability
    to_blank = np.random.random(len(dataset)) < row_probs * b

    # Write the whole column back rather than using chained
    # `dataset[column].iloc[i] = ...`, which pandas does not guarantee to
    # propagate to `dataset` (and which is a no-op under copy-on-write).
    dataset[column] = values.mask(to_blank)

    return dataset

## Plotting

In [None]:


# plt.matshow() needs the matrix to draw; calling it with no argument raises
# a TypeError. Draw the missingness pattern of `relevant`: one row per record,
# one column per field, with missing cells as 1 and present cells as 0.
# NOTE(review): plotting `relevant` is an assumption about the intended input -- confirm.
plt.matshow(relevant.isna().to_numpy(dtype=float), aspect='auto')
plt.xticks(range(relevant.shape[1]), relevant.columns, rotation=90)
plt.ylabel('row')
plt.title('Missing values in relevant')
plt.show()