# goal

- [ ] demonstrate how to use the prepared modules for generating various statistics accompanied by structured data (as "magic numbers") and plain text or HTML components.

# setup

In [1]:
# dependencies
import sys
import re
import pandas as pd
from ipywidgets import HTML

sys.path.append(".")
import Summary
import Relative_risk
import Chi_square

In [2]:
# support methods
def formatprop_num(num, den, aspercent, dec):
    if num == 0: return 0
    v = num/den
    if not aspercent: out = round(v, dec)
    else: out = round(v * 100, dec)
    return out


def formatprop_str(num, den, aspercent, dec):
    if num == 0: return '0'
    v = num/den
    if not aspercent: out = f"%.{dec}f" % v
    else: out = f"%.{dec}f" % (v*100)
    if int(float(out)) == 0: return 'less than 1'
    return out


def formatprop(num, den, numeric, aspercent=False, dec=1):
    assert (den >= num)
    if numeric: out = formatprop_num(num=num, den=den, aspercent=aspercent, dec=dec)
    else: out = formatprop_str(num=num, den=den, aspercent=aspercent, dec=dec)
    return out


def get_census():
    # county numbers from USCB DHC https://data.census.gov/table/DECENNIALDHC2020.P9?t=Race%20and%20Ethnicity&g=050XX00US06075
    magic = {}
    magic['county'] = {
        'total': {'n': 873965},
        'Black': {'n': 45071}, # 'Black or African American alone'
        'White': {'n': 341306}, # 'White alone'
        'Latine': {'n': 136761}, # 'Hispanic or Latino'
        'Asian': {'n': 294220 + 3244 + 1570}, # 'Asian alone' + 'Native Hawaiian and Other Pacific Islander alone' + 'American Indian and Alaska Native alone'
        'Other/Unknown': {'n': 6347 + 45446}, # 'Some Other Race alone' + 'Population of two or more races'
    }
    realgroups = [k for k in magic['county'].keys() if k != 'total']
    assert magic['county']['total']['n'] == sum([magic['county'][group]['n'] for group in realgroups]), f"\
    Census counts by race group should add up to the total population count.\
    Found {magic['county']['total']['n']} for total population and {sum([magic['county'][group]['n'] for group in realgroups])} group total."
    for race_group in realgroups:
        magic['county'][race_group]['prop'] = formatprop(
            num=magic['county'][race_group]['n'],
            den=magic['county']['total']['n'],
            numeric=True, aspercent=False, dec=5)
    for race_group in realgroups:
        magic['county'][race_group]['perc'] = formatprop(
            num=magic['county'][race_group]['n'],
            den=magic['county']['total']['n'],
            numeric=False, aspercent=True, dec=1)
    return magic

In [3]:
# main
"""This is the processed version of publicly available data requested by the ACLU from the San Francisco District Attorney's Office ("SFDA") covering 2015-2022. Note that the SFDA did not necessarily produce the data themselves and it may also reflect SFPD or SF Sheriff entries."""
sfda = pd.read_parquet("sfda.parquet")
magic = get_census()

In [4]:
# support for breaking down data
cols = {
    'meta': [
        'source', 'filename', 'sheet', 'incident_number', 'court_number',],
    'person': [
        'age_at_arrest', 'gender', 'race', 'ethnicity', 'ethnicity_group',],
    'booking': [
        'incident_number', 'arrest_date', 'age_at_arrest',
        'booked_case_type', 'booked_charge_list',],
    'filing': [
        'court_number', 'filing_date',
        'filed_case_type', 'filed_charge_list', 'description',
        'case_dispo_date', 'case_dispo', 'dispo_description', 'dispo_description_group',
        'status_ctnum', 'status_ctnum_agg', 'status_ctnum_group',],
}
grouped = [c for collist in cols.values() for c in collist]
lost = [c for c in sfda.columns if c not in grouped]
assert not any(lost), f"\
previously had grouped all available columns, however {
lost} have not been grouped."

# preview data

In [5]:
sfda[cols['meta']].sample().T

Unnamed: 0,73223
source,ACLU_PRA
filename,2015-2022_San Francisco County_Proseuction Dat...
sheet,Arrests & DA Actions|Cases Filed
incident_number,200412198
court_number,20007770


In [6]:
sfda[cols['booking']].sample().T

Unnamed: 0,49552
incident_number,180394204
arrest_date,2018-05-27 00:00:00
age_at_arrest,30.0
booked_case_type,Felony
booked_charge_list,"21310,1009.22DHC"


In [7]:
sfda[cols['filing']].sample().T

Unnamed: 0,54773
court_number,18017115
filing_date,2018-11-15 00:00:00
filed_case_type,Misdemeanor
filed_charge_list,"23152A/M/0, 23152B/M/0"
description,DUI
case_dispo_date,2019-06-13 00:00:00
case_dispo,152.0
dispo_description,Finding - Not Guilty
dispo_description_group,acquittal
status_ctnum,Acquittal


# coverage

this dataset covers cases filed by the SFDA from January 1, 2015 to December 30, 2021.

In [8]:
sfda.filing_date.describe().reset_index().rename(columns={'index': 'statistic'})

Unnamed: 0,statistic,filing_date
0,count,46613
1,mean,2018-05-09 18:49:05.770493184
2,min,2015-01-02 00:00:00
3,25%,2016-09-12 00:00:00
4,50%,2018-05-21 00:00:00
5,75%,2019-10-31 00:00:00
6,max,2021-12-30 00:00:00


In [9]:
assert sfda.filename.str.contains('San Francisco County').all()

# setting up labels

`LABELS` are plain text phrases that explain what a unit of a variable represents in this context. For example, we often describe a `case_filed` value as 'a case against a defendant'.

These phrases are laced into the analytical template for presenting findings in plain text, so pay close attention to how the sentences read in the results to confirm it makes sense and appears correct.

In [10]:
any([[c for c in sfda.columns if sfda[c].dtype == bool]])

False

In [11]:
# add case-based indicators
sfda['case_filed'] = sfda.court_number.notna()
sfda['conviction_any'] = sfda.dispo_description_group.isin((
    'imprisonment', 'jail_probation', 'other_conviction'))

# add charge-based indicators
pcs = {
    148: '148[A-Z]*|148\\.',
    211: '211',
}
for code, patt in pcs.items():
    for stage in ('booked', 'filed'): # note lack of a `convicted_charge_list` field
        chargecol = f'{stage}_charge_list'
        sfda[f'pc_{code}_{stage}'] = sfda[chargecol].str.contains(patt, na=False, flags=re.I)

In [12]:
LABELS = {
    'ethnicity_group': 'someone with a recorded race/ethnicity of',
    'def_black': 'a Black defendant',
    'def_white': 'a White defendant',
    'case_filed': 'a case against a defendant',
    'conviction_any': 'conviction on at least one charge', # the data are not more specific than this
    'pc_148_booked': 'a booked charge for resisting arrest',
    'pc_148_filed': 'a filed charge for resisting arrest',
    'pc_211_booked': 'a booked charge for robbery',
    'pc_211_filed': 'a filed charge for robbery',
}

# sample charge indicator: `pc_148_booked`

## default `value_counts()` presentation

In [13]:
sfda.pc_148_booked.value_counts().reset_index()

Unnamed: 0,pc_148_booked,count
0,False,76840
1,True,8243


## `Summary` module

### background

In [14]:
Summary.Summary?

[31mInit signature:[39m Summary.Summary(df, params, labels)
[31mDocstring:[39m     
Calculation:
- Table: df[[INDICATOR_COL, GROUP_COL]].groupby(GROUP_COL)[INDICATOR_COL].sum()
- Description: Summarize by {GROUP_COL} all records where {INDICATOR_COL} is True
Present:
- Count/Percent: '{GROUP_COL.sum()/INDICATOR_COL.sum()*100}% ({GROUP_COL.sum()} of {INDICATOR_COL.sum()})'
- Finding:
    - Of the {INDICATOR_COL.sum()},
        - {magic['GROUP_COUNTS'][GROUP_LABEL]} were for {GROUP_LABEL}
        - (repeated for each group appearing in GROUP_COL)
[31mFile:[39m           ~/git/tool-suite/templates/analysis/Summary.py
[31mType:[39m           type
[31mSubclasses:[39m     

In [15]:
Summary.PARAMS

['INDICATOR_COL', 'INDICATOR_OP', 'GROUP_COL', 'RENAMER']

### setup

In [16]:
pc148_sum = Summary.Summary(
    df=sfda,
    params={
        'INDICATOR_COL': 'pc_148_booked',
        'INDICATOR_OP': 'against',
        'GROUP_COL': 'ethnicity_group',
        'RENAMER': {
            'ethnicity_group': 'Race/Ethnicity',
            True: 'PC 148 booked',
            False: 'No PC 148'
        }
    },
    labels=LABELS,
)

### text based summary: distribution across groups

In [17]:
print(pc148_sum.getinfo())

Of the 8243 a booked charge for resisting arrest,
-  442 or 5.4% against someone with a recorded race/ethnicity of Asian.
-  3508 or 42.6% against someone with a recorded race/ethnicity of Black.
-  1940 or 23.5% against someone with a recorded race/ethnicity of Latine.
-  227 or 2.8% against someone with a recorded race/ethnicity of Other/Unknown.
-  2126 or 25.8% against someone with a recorded race/ethnicity of White.


### magic numbers: distribution across groups

In [18]:
pc148_sum_gc = pc148_sum.getmagic()['GROUP_COUNTS'].copy()

In [19]:
pc148_sum.getmagic()

{'INDICATOR_COL': 'pc_148_booked',
 'INDICATOR_OP': 'against',
 'GROUP_COL': 'ethnicity_group',
 'RENAMER': {'ethnicity_group': 'Race/Ethnicity',
  True: 'PC 148 booked',
  False: 'No PC 148'},
 'INDICATOR_COUNT': np.int64(8243),
 'GROUP_COUNTS': {'Asian': 442,
  'Black': 3508,
  'Latine': 1940,
  'Other/Unknown': 227,
  'White': 2126},
 'GROUP_PERCENTS': {'Asian': 5.362125439767075,
  'Black': 42.55732136358122,
  'Latine': 23.535120708479923,
  'Other/Unknown': 2.7538517530025475,
  'White': 25.791580735169234}}

### table: distribution across groups

In [20]:
HTML("\n".join(pc148_sum.gettable_winevent()))

HTML(value='Of the 85,083 cases considered, there are 8243 a booked charge for resisting arrest, with the foll…

### table: distribution within groups

In [21]:
HTML("\n".join(pc148_sum.gettable_wingroup()))

HTML(value='Of the 85,083 cases considered, there are 8243 a booked charge for resisting arrest, with the foll…

## `Relative_risk` module

For the contingency table to work, you need to setup a supplemental variable that is a version of the race/ethnicity field for each comparison group (ie. Black vs. White).

You could avoid setting up this variable by providing a filtered `df` that only includes records related to the comparison, as long as your comparison only involves groups in the data (ie. not comparing to non-Black defendants when you don't have that label encoded in the group variable).

### background

In [22]:
Relative_risk.Ratio?

[31mInit signature:[39m Relative_risk.Ratio(df, params, labels)
[31mDocstring:[39m     
Calculation:
- Table: 
- Description: 
Present:
- Ratio: 
- Finding: 
[31mFile:[39m           ~/git/tool-suite/templates/analysis/Relative_risk.py
[31mType:[39m           type
[31mSubclasses:[39m     

In [23]:
Relative_risk.CONTINGENCY_PARAMS

['COMPARISON_GROUP_COL', 'OUTCOME_EVENT_COL', 'GIVEN_EVENT_COL']

In [24]:
Relative_risk.RATIO_PARAMS

['TREAT_GROUP_COL',
 'CONTROL_GROUP_COL',
 'OUTCOME_EVENT_OP',
 'OUTCOME_EVENT_COL',
 'GIVEN_EVENT_COL']

### setup

In [25]:
sfda['def_black'] = sfda.ethnicity_group == 'Black'
sfda['def_white'] = sfda.ethnicity_group == 'White'
sfda['def_black_or_hispanic'] = sfda.ethnicity_group.isin(('Black', 'Latine'))
sfda['def_not_black'] = ~sfda.def_black

# setup comparison groups based on race data
sfda.loc[(sfda.def_black) | (sfda.def_white), 'comparison_group_1'] = sfda.ethnicity_group
sfda.loc[sfda.def_black, 'comparison_group_2'] = sfda.ethnicity_group
sfda.loc[sfda.def_not_black, 'comparison_group_2'] = 'Race other than Black'
sfda.loc[sfda.def_white, 'comparison_group_3'] = sfda.ethnicity_group
sfda.loc[sfda.def_black_or_hispanic, 'comparison_group_3'] = 'Black or Hispanic'

In [26]:
pc148_rr_tbl = Relative_risk.Contingency(
    df=sfda,
    params={
        'COMPARISON_GROUP_COL': 'comparison_group_1',
        'OUTCOME_EVENT_COL': 'pc_148_filed',
        'GIVEN_EVENT_COL': 'pc_148_booked',
    },
    labels=LABELS,
)
pc148_rr_sum = Relative_risk.Ratio(
    df=sfda,
    params={
        'TREAT_GROUP_COL': 'def_black',
        'CONTROL_GROUP_COL': 'def_white',
        'OUTCOME_EVENT_OP': 'faces',
        'OUTCOME_EVENT_COL': 'pc_148_filed',
        'GIVEN_EVENT_COL': 'pc_148_booked',
    },
    labels=LABELS,
)

In [27]:
HTML(pc148_rr_tbl.gettable())

HTML(value='<p><table class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th style = "b…

In [28]:
print(pc148_rr_sum.getinfo())

The ratio of the probability that a Black defendant faces a filed charge for resisting arrest compared to the probability that a White defendant faces a filed charge for resisting arrest is 1.178.

	In other words, the relative risk that a Black defendant faces a filed charge for resisting arrest is **17.8% greater than** a White defendant.




## `Chi_square` module

This module has been setup to use the numbers exported from the `Summary` module.

### background

In [29]:
Chi_square.DEFAULTS

{'SIG': 0.05, 'DDOF': 0}

In [30]:
Chi_square.CENSUS_PARAMS

['CENSUS_DICT', 'OBSERVED_DICT', 'NULL_PHRASE', 'SIG', 'DDOF']

In [31]:
Chi_square.EQUAL_PARAMS

['OBSERVED_DICT', 'NULL_PHRASE', 'SIG', 'DDOF']

In [34]:
params = {
    'Census': {
        'OBSERVED_DICT': pc148_sum_gc,
        'CENSUS_DICT': {k:v for k,v in magic['county'].items() if k != 'total'},
        'NULL_PHRASE': f"the distribution by race of {
            LABELS['pc_148_booked']} follows the distribution of the general population"
    },
    'Equal': {
        'OBSERVED_DICT': pc148_sum_gc,
        'NULL_PHRASE': f"the distribution by race of {
            LABELS['pc_148_booked']} is equal across all groups"
    },
}

In [36]:
print(Chi_square.Census(params=params['Census']).getinfo())

This test compares whether observed racial proportions     match the proportion of each racial group in the general population.    In interpreting the results, a p-value below 0.05 will be considered statistically significant.

	This test results in a **p-value of < 0.0001**, which is a statistically significant difference and rejects the null hypothesis that the distribution by race of a booked charge for resisting arrest follows the distribution of the general population. 


In [37]:
print(Chi_square.Equal(params=params['Equal']).getinfo())

This test compares whether observed racial proportions match the proportion of each racial group in the general population.    In interpreting the results, a p-value below 0.05 will be considered statistically significant.

	This test results in a **p-value of < 0.0001**, which is a statistically significant difference and rejects the null hypothesis that the distribution by race of a booked charge for resisting arrest is equal across all groups. 
