In [16]:
import pandas as pd
import numpy as np
from statsmodels.stats.proportion import proportions_ztest

In [57]:
# Load the recidivism dataset (the file carries both the PRS and PRS8 columns).
# NOTE(review): a later cell reads from '../../../Project/data/' (lowercase 'data');
# confirm the directory casing, which matters on case-sensitive filesystems.
df = pd.read_csv('../../../Project/Data/recidivism_dataset_w_prs8.csv')

In [60]:
# Fold the bare 'RFEL' label into the combined 'RFEL/REVOC' category (in place on df).
df.loc[df['PRS'].eq('RFEL'), 'PRS'] = 'RFEL/REVOC'

In [61]:
# Distinct PRS levels in ascending order.
prs7 = np.sort(df['PRS'].unique())

In [63]:
# Distinct PRS8 levels in ascending order.
prs8 = np.sort(df['PRS8'].unique())

In [64]:
# Show the sorted level sets of both score versions side by side.
print(prs7, prs8)

['0' '1' '2' '3' '4' '5' 'RFEL/REVOC'] [1. 2. 3. 4.]


In [66]:
def diff_in_proportions(data, prs_vers):
    """Test whether 3-year recidivism rises between adjacent score levels.

    The distinct levels of the selected score column are taken from ``data``
    itself and sorted. For every pair of consecutive levels, the share of
    rows with ``RECIDIVISM_3Y == 1`` at the higher level is compared with the
    share at the lower level. Pairs whose share does not increase are left
    out; for the remaining pairs a one-sided two-sample proportions z-test is
    run (alternative: previous-level proportion < current-level proportion).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'RECIDIVISM_3Y' and the score column chosen by
        ``prs_vers``. May be any subset of the full dataset; levels absent
        from the subset are simply not compared.
    prs_vers : int
        7 selects column 'PRS'; 8 selects column 'PRS8'.

    Returns
    -------
    dict
        Maps ``(prev_level, curr_level)`` to a dict with keys 'difference'
        (current minus previous proportion), 'p-val' and 'significance'
        ('***' < 0.001, '**' < 0.01, '*' < 0.05, otherwise '').

    Raises
    ------
    ValueError
        If ``prs_vers`` is neither 7 nor 8.
    """
    score_columns = {7: 'PRS', 8: 'PRS8'}
    if prs_vers not in score_columns:
        raise ValueError(
            'prs_vers must be 7 or 8, got {!r}'.format(prs_vers)
        )
    var = score_columns[prs_vers]

    # Take the levels from the frame that was passed in rather than from
    # notebook globals: every level compared then has at least one row, so
    # the proportions below can never divide by zero.
    levels = np.sort(data[var].dropna().unique())

    # Count once per level instead of re-filtering the frame for every pair.
    nobs_by_level = data[var].value_counts()
    recid_by_level = data.loc[data['RECIDIVISM_3Y'] == 1, var].value_counts()

    results = {}
    for prev, curr in zip(levels[:-1], levels[1:]):
        prev_nobs = int(nobs_by_level.loc[prev])
        curr_nobs = int(nobs_by_level.loc[curr])
        # A level with no recidivists is missing from recid_by_level.
        prev_recid = int(recid_by_level.get(prev, 0))
        curr_recid = int(recid_by_level.get(curr, 0))

        # difference is current minus previous
        diff = curr_recid / curr_nobs - prev_recid / prev_nobs

        # Only test pairs where the proportion actually increased.
        if diff <= 0:
            continue

        # 'smaller' tests H1: p(prev) < p(curr), matching the sample order below.
        z_test = proportions_ztest(
            np.array([prev_recid, curr_recid]),
            np.array([prev_nobs, curr_nobs]),
            alternative='smaller',
        )
        pval = z_test[1]
        if pval < 0.001:
            sig = '***'
        elif pval < 0.01:
            sig = '**'
        elif pval < 0.05:
            sig = '*'
        else:
            sig = ''
        results[(prev, curr)] = {'difference': diff, 'p-val': pval, 'significance': sig}
    return results


In [78]:
# Adjacent-level recidivism comparison on the full dataset, PRS (version 7) levels.
overall_recid_ztest_p7 = diff_in_proportions(data=df, prs_vers=7)

In [81]:
# Adjacent-level recidivism comparison on the full dataset, PRS8 (version 8) levels.
overall_recid_ztest_p8 = diff_in_proportions(data=df, prs_vers=8)

In [79]:
# Convert the results dict to a DataFrame: one column per (previous, current) level pair,
# one row per statistic.
# NOTE(review): this rebinds the same name from dict to DataFrame, and the execution
# counts in this notebook are out of order — verify with Restart & Run All.
overall_recid_ztest_p7 = pd.DataFrame(overall_recid_ztest_p7)

In [73]:
# Convert the results dict to a DataFrame: one column per (previous, current) level pair,
# one row per statistic.
# NOTE(review): this cell's execution count (73) precedes the cell that computes the
# dict (81), so the displayed table may come from an earlier run — verify with
# Restart & Run All.
overall_recid_ztest_p8 = pd.DataFrame(overall_recid_ztest_p8)

In [80]:
# Display the PRS (version 7) results. Level pairs whose recidivism proportion did not
# increase are omitted by diff_in_proportions, so not every adjacent pair appears.
overall_recid_ztest_p7

Unnamed: 0_level_0,0,1,2,4
Unnamed: 0_level_1,1,2,3,5
difference,0.051967,0.01283,0.010522,0.007613
p-val,0.0,0.0,0.0,0.000068
significance,***,***,***,***


In [74]:
# Display the PRS8 (version 8) results. Level pairs whose recidivism proportion did not
# increase are omitted by diff_in_proportions, so not every adjacent pair appears.
overall_recid_ztest_p8

Unnamed: 0_level_0,1.0,2.0
Unnamed: 0_level_1,2.0,3.0
difference,0.050246,0.026023
p-val,0.0,0.0
significance,***,***


In [83]:
# Number of White offenders who recidivated within 3 years.
c1 = int((df['OFF_RACE'].eq('White') & df['RECIDIVISM_3Y'].eq(1)).sum())

In [84]:
# Total number of rows for White offenders.
n1 = int(df['OFF_RACE'].eq('White').sum())

In [87]:
# Number of Black offenders who recidivated within 3 years.
c2 = int((df['OFF_RACE'].eq('Black') & df['RECIDIVISM_3Y'].eq(1)).sum())

In [88]:
# Total number of rows for Black offenders.
n2 = int(df['OFF_RACE'].eq('Black').sum())

In [95]:
# Two-sided z-test for equal 3-year recidivism proportions (sample 1: White, sample 2: Black).
res = proportions_ztest(
    count=np.array([c1, c2]),
    nobs=np.array([n1, n2]),
    alternative='two-sided',
)

In [91]:
# Raw inputs to the test: recidivists and group size for White, then for Black.
print(c1, n1, c2, n2)

133944 767518 53214 287902


In [96]:
# Display the (z statistic, p-value) pair returned by proportions_ztest.
res

(-12.360702813833472, 4.2641219767152864e-35)

In [53]:
# Load the trimmed dataset so its PRS8 levels can be inspected.
# NOTE(review): this path uses '../../../Project/data/' (lowercase) while the main load
# uses '../../../Project/Data/' — confirm the casing. The read also emitted a warning
# in the saved output; consider passing explicit dtypes if it is a mixed-type warning.
temp = pd.read_csv('../../../Project/data/trimmed_w_prs8.csv')

  temp = pd.read_csv('../../../Project/data/trimmed_w_prs8.csv')


In [54]:
# Distinct PRS8 values in the trimmed dataset (unsorted, in order of first appearance).
temp['PRS8'].unique()

array([1., 2., 3., 4.])

In [98]:
# Print the p-value (second element of the z-test result) with five decimal places.
# NOTE(review): very small p-values render as 0.00000 in fixed-point; scientific
# notation would keep the magnitude visible.
print('{:.5f}'.format(res[1]))

0.00000


In [99]:
# Difference in 3-year recidivism proportion, White minus Black; a negative value means
# the White proportion is the lower of the two.
c1/n1 - c2/n2

-0.010317951314917667

In [100]:
# Preview the first rows of the dataset.
# NOTE(review): the rendered rows include per-person fields (ID, DOB, county) — consider
# clearing this output before sharing the notebook.
df.head()

Unnamed: 0.1,Unnamed: 0,ID_VARIABLE,NEW_DOS,NEW_DOF,PRS,PRS8,NEW_INC_SANCTION_EXISTS,ADJ_JPMIN,AT_RISK_DT,COUNTY,NEXT_DOF,TIME_TO_RECIDIVATE,RECIDIVISM_3Y,RECIDIVISM_5Y,OGS,DOB,OFF_RACE,OFF_SEX
0,0,1000001,2010-02-18,2009-06-25,0,1.0,Y,16.0,2010-03-06,Schuylkill,,,0,0,3.0,1990-05-28,White,M
1,1,1000002,2017-01-31,2015-09-01,4,2.0,Y,120.0,2017-05-31,Somerset,,,0,0,3.0,1958-07-12,White,F
2,2,1000003,2002-05-08,2001-09-07,0,2.0,N,0.0,2002-05-08,Northampton,2009-03-04,2492.0,0,0,3.0,1961-08-14,White,F
3,3,1000003,2009-03-04,2009-03-04,3,3.0,Y,92.0,2009-06-04,Northampton,,,0,0,3.0,1961-08-14,White,F
4,4,1000004,2013-12-10,2013-09-19,0,2.0,N,0.0,2013-12-10,Franklin,2018-07-09,1672.0,0,1,1.0,1993-09-21,Unknown,M


In [102]:
# List the column names available in the dataset.
df.columns

Index(['Unnamed: 0', 'ID_VARIABLE', 'NEW_DOS', 'NEW_DOF', 'PRS', 'PRS8',
       'NEW_INC_SANCTION_EXISTS', 'ADJ_JPMIN', 'AT_RISK_DT', 'COUNTY',
       'NEXT_DOF', 'TIME_TO_RECIDIVATE', 'RECIDIVISM_3Y', 'RECIDIVISM_5Y',
       'OGS', 'DOB', 'OFF_RACE', 'OFF_SEX'],
      dtype='object')