In [2]:
import numpy as np
import pandas as pd

In [3]:
data = pd.read_csv('data/allegations.csv', index_col=0)

In [4]:
data.head()

Unnamed: 0,unique_mos_id,shield_no,complaint_id,mos_ethnicity,mos_gender,mos_age_incident,complainant_ethnicity,complainant_gender,complainant_age_incident,fado_type,allegation,precinct,contact_reason,Substantiated
0,10004,8409.0,42835,Hispanic,M,32,Black,Female,38.0,Abuse of Authority,Failure to provide RTKA card,78.0,Report-domestic dispute,True
1,10007,5952.0,24601,White,M,24,Black,Male,26.0,Discourtesy,Action,67.0,Moving violation,True
2,10007,5952.0,24601,White,M,24,Black,Male,26.0,Offensive Language,Race,67.0,Moving violation,True
3,10007,5952.0,26146,White,M,25,Black,Male,45.0,Abuse of Authority,Question,67.0,PD suspected C/V of violation/crime - street,True
4,10009,24058.0,40253,Hispanic,F,39,,,16.0,Force,Physical force,67.0,Report-dispute,True


In [5]:
data.isna().sum()

unique_mos_id                  0
shield_no                   5392
complaint_id                   0
mos_ethnicity                  0
mos_gender                     0
mos_age_incident               0
complainant_ethnicity       5505
complainant_gender          4195
complainant_age_incident    4829
fado_type                      0
allegation                     1
precinct                      48
contact_reason               199
Substantiated                  0
dtype: int64

In [6]:
relevant = data.rename({'Substantiated': 'substantiated'}, axis = 1)

In [7]:
relevant = relevant[[
    'complaint_id', 'complainant_ethnicity', 'complainant_gender', 'complainant_age_incident', 'allegation', 'contact_reason', 'substantiated'
]]

In [8]:
relevant.head(10)

Unnamed: 0,complaint_id,complainant_ethnicity,complainant_gender,complainant_age_incident,allegation,contact_reason,substantiated
0,42835,Black,Female,38.0,Failure to provide RTKA card,Report-domestic dispute,True
1,24601,Black,Male,26.0,Action,Moving violation,True
2,24601,Black,Male,26.0,Race,Moving violation,True
3,26146,Black,Male,45.0,Question,PD suspected C/V of violation/crime - street,True
4,40253,,,16.0,Physical force,Report-dispute,True
5,37256,White,Male,31.0,Refusal to process civilian complaint,C/V telephoned PCT,True
6,33969,White,Male,34.0,Sexual orientation,PD suspected C/V of violation/crime - street,True
7,40070,Asian,Male,60.0,Word,Moving violation,False
8,41927,Asian,Male,39.0,Refusal to provide shield number,Moving violation,False
9,41927,Asian,Male,39.0,Retaliatory summons,Moving violation,True


In [9]:
relevant[['complainant_ethnicity', 'substantiated']].groupby("complainant_ethnicity").mean()

Unnamed: 0_level_0,substantiated
complainant_ethnicity,Unnamed: 1_level_1
American Indian,0.359375
Asian,0.291353
Black,0.237992
Hispanic,0.244707
Other Race,0.268833
Refused,0.27027
White,0.279195


In [10]:
relevant['substantiated'].mean()

0.24884585406798967

In [11]:
def mcar(dataset : pd.DataFrame, n=10000):
    """
    Missing Completely at Random -- there is no dependency of the rate of
    missingness on other values or the value itself.

    Generated by simple sampling.
    """
    sampled = dataset.sample(n, replace=True)
    return sampled

In [12]:
mcar(relevant).substantiated.mean()

0.2475

In [13]:
def mar(dataset : pd.DataFrame, n=10000):
    """
    Missing at Random -- data is missing due to dependency on another
    variable.

    New data created by dropping values of the complainant's ethnicity 
    where their ages are less than 30
    """
    dataset.dropna()
    sample = dataset.sample(n,replace = True)
    sample.loc[sample['complainant_age_incident'] < 30, 'complainant_ethnicity'] = np.nan
    return sample

In [79]:
def nmar(dataset : pd.DataFrame, n=10000):
    """
    Not Missing at Random -- data is missing due to the value of the variable
    itself. For example, sales not recorded for the stores that produced less
    than a specific amount of revenue.

    Generated by fancy math that I don't remember. Alec save me please.
    """
    pass

In [19]:
new_data = mar(relevant)

In [20]:
new_data

Unnamed: 0,complaint_id,complainant_ethnicity,complainant_gender,complainant_age_incident,allegation,contact_reason,substantiated
20802,1143,,,,Other - Force,Others,True
11357,19993,Hispanic,Male,36.0,Stop,PD suspected C/V of violation/crime - bldg,True
27157,27579,,Male,22.0,Stop,PD suspected C/V of violation/crime - auto,True
20694,18938,,Female,20.0,Stop,PD suspected C/V of violation/crime - street,False
6569,10296,Hispanic,Male,53.0,Physical force,C/V intervened on behalf of/observed encounter...,True
...,...,...,...,...,...,...,...
20695,21366,,Male,19.0,Word,Report-possession/sale of narcotics,False
27496,20041,,Male,21.0,Refusal to process civilian complaint,PD suspected C/V of violation/crime - street,True
8990,5986,White,Male,41.0,Refusal to process civilian complaint,Other violation of VTL,True
25281,10914,,Male,20.0,Question and/or stop,PD suspected C/V of violation/crime - street,False


In [24]:
new_data.loc[new_data['complainant_age_incident'] < 30].isna().sum()

4022

In [25]:
len(new_data.loc[new_data['complainant_age_incident'] < 30])

4022