# Load Data

In [1]:
#this jupyter notebook is essentially the same as the "recidivism-check" notebook, just cleaned up a bit (hence the name)
#import required libraries
import os
import pandas as pd
import numpy as np
import datetime

#get the folder path for this data
pa_sentencing_path = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))

## **WILL NEED TO EDIT THE BELOW TO MAKE IT GENERALIZABLE FOR THE PA SENTENCING COMISSION**

In [2]:
#read in trimmed version WITH 8th edition PRS score implementation
psc_trimmed = pd.read_csv(os.path.join(pa_sentencing_path, "Project", "data", "PSC8_CRIMECAT_MERGED_w_prs8.csv"))


  psc_trimmed = pd.read_csv(os.path.join(pa_sentencing_path, "Project", "data", "PSC8_CRIMECAT_MERGED_w_prs8.csv"))


# Description of the Dataset

Because the original dataset provided by the PA Sentencing Commission included **547** columns, and took anywhere from 20-30 minutes to load, we selected the columns of the original dataset that we most relevant for our analysis and created a new, "trimmed" dataset. This new trimmed dataset (loaded in above) has **93** columns. In the next cell, we display the names of the columns in the dataset.

In [3]:
print("The names of the columns in the trimmed dataset are: \n {}".format(psc_trimmed.columns.tolist()))

The names of the columns in the trimmed dataset are: 
 ['Unnamed: 0.1', 'Unnamed: 0', 'INC_END', 'MOSTSERIOUS', 'MND_MINIMUM', 'OFF_AGE', 'PRS_MANUAL', 'JPS_DRUG_DEPENDENT', 'INC_TYPE', 'JPO_ID', 'PROB_SANCTION_EXISTS', 'JPR_ID', 'DOSAGE', 'IP_START', 'GUILTY_NO_PENALTY', 'OFN_LABEL', 'STAT_MIN', 'F2TOT', 'DOB2', 'SGR_LVL', 'OFF_SEX', 'F3TOT', 'DOS', 'F1TOT', 'JP_MIN', 'INC_RELATIONSHIP', 'OFN_TITLE', 'OGS', 'JMIN', 'F1F2', 'REASON_THREE', 'JPR_LASTUPDATE', 'OFN_COUNT', 'MS_OFFBODY', 'OFN_GRADE', 'PCS_OFF_ID', 'JPS_DA_EVAL', 'SGR_LVL_OGS_PRS', 'INCMAX', 'REASON_ONE', 'M1TOT', 'PMIN', 'JPS_SVP', 'PRS_OTHER_MISD', 'DOF', 'GRADE', 'GLEPOCH', 'PRS8', 'IP_SANCTION_EXISTS', 'INC_RELATEDTO', 'IP_END', 'INC_RELATEDOTN', 'PRS_LAPSING', 'SIP', 'CONFORMITY', 'INC_SANCTION_EXISTS', 'DOB', 'MS_OFFINJP', 'INC_START', 'CONFORM', 'OFF_RACE', 'PRS', 'LABEL', 'PRS_NONLAPSING', 'JP_CC_BUG', 'MORE_REASONS', 'MS_SANCTION', 'STATE_IP', 'MS_SENTJP', 'OFN_LIFE_DEATH', 'OTN', 'DISPOSITION', 'COUNTY', 'DOFAGE',

In [4]:
#DELETE

# Copying a this column needed from the main dataset
# !!! AZ: I don't think we actually need ofn-id, i used it in my code to do a count so really any other column is fine! 
# usecols = ['OFN_ID']
# main_df = pd.read_csv('../../../Project/data/Main.csv', usecols=usecols) 

In [5]:
# DELETE

#ofn_df = main_df.copy()


In [293]:
df = psc_trimmed.copy()  #copy the trimmed dataset into a new dataframe

df.columns = df.columns.str.upper() #change all column names to uppercase

In [294]:
df.head() #inspect the dataset

Unnamed: 0,UNNAMED: 0.1,UNNAMED: 0,INC_END,MOSTSERIOUS,MND_MINIMUM,OFF_AGE,PRS_MANUAL,JPS_DRUG_DEPENDENT,INC_TYPE,JPO_ID,...,F2TOT.1,F3TOT.1,M1TOT.1,M2TOT.1,F1F2.1,MOSTSERIOUS.1,OFN_TITLE.1,OFN_SECTION,OFN_SUBSECTION,JPS_WALSHACTASSESSMENT
0,0,0,,7.0,,36.0,,N,,938145,...,0.0,0.0,0.0,0.0,0.0,7.0,18,6301,(a)(1)*,
1,1,1,02 Dec 02,7.0,48 Hours,18.0,,N,County Facility,942201,...,0.0,0.0,0.0,0.0,0.0,7.0,75,3731,,
2,2,2,27 Apr 01,7.0,48 Hours,36.0,,N,County Facility,971901,...,0.0,0.0,0.0,0.0,0.0,7.0,75,3731,,
3,3,3,,7.0,,22.0,,N,,919472,...,0.0,0.0,0.0,0.0,0.0,7.0,18,2701,(b),
4,4,4,01 Jan 03,6.0,30 Days,40.0,,N,County Facility,870391,...,0.0,0.0,0.0,0.0,0.0,6.0,75,3731,,


# Table of Contents  EDIT!!!!!

1. Data Cleaning
    1. Convert Dates 
    2. [Combine REVOC and RFEL Categories](#combining-revoc-into-rfel)
    3. Clean DOF
    1. [Clean Missing PRS Scores in the 7th Edition](#clean-missing-prs-score)
    4. Clean Missing PRS Scores in the 8th Edition
    1. [Clean JP_CC_BUG Issue](#clean-jp-cc-bug)
2. Adding New Columns
    1. Crime Categories
    2. Handing Philadelphia Cases Vs. Other Counties
2. [At-Risk Date Calculation](#implement-at-risk-date-calculation-logic)
    1. Group Data at JPR_ID Level
        1. [Address Mutiple Dates of Sentencing](#multiple-dos-for-one-jprid)
        2. [Create Adjusted JP_MIN Value]
        3. [Check INC_SANCTION EXISTS](#incsanctionexists-check)
    2. Group Data at the ID_VARIABLE, DOS LEVEL
    3. Implement At-Risk Date Logic
3. Calculate Recidivism
    1. [Calculate Next Date of Offense](#populate-next-dof)
    2. [Check for Free Time](#check-for-"free-time")
    3. [Calculate Time to Recidivate & 3-Year and 5-Year Recidivism Variables](#create-time-to-recidivate-and-recidivsm-variables)

Note: the links aove may not work well in VSCode but wold work better in Jupyter Notebooks via Anaconda
    


# Data Cleaning

## Convert Dates

In [295]:
## Convert Dates#convert date strings to datetime variable
df[['DOF','DOS']] = df[['DOF','DOS']].apply(pd.to_datetime,format="%d %b %y")

In [296]:
# extracting out the just the year from the date to be used later 
df['DOF_YEAR'] = pd.DatetimeIndex(df['DOF']).year
df['DOS_YEAR'] = pd.DatetimeIndex(df['DOS']).year

In [297]:
#checking the range of values for the DOF and DOS variables
print("The minimum date of offense in the dataset is: {}".format(df[["DOF"]].min()[0]))
print("The maximum date of offense in the dataset is: {}".format(df[["DOF"]].max()[0]))
print("The minimum date of sentencing in the dataset is: {}".format(df[["DOS"]].min()[0]))
print("The maximum date of sentencing in the dataset is: {}".format(df[["DOS"]].max()[0]))

The minimum date of offense in the dataset is: 1984-11-14 00:00:00
The maximum date of offense in the dataset is: 2020-05-08 00:00:00
The minimum date of sentencing in the dataset is: 2001-01-01 00:00:00
The maximum date of sentencing in the dataset is: 2019-12-31 00:00:00


## Combine REVOC into RFEL for 7th Edition PRS Scores

In [298]:
# Values before conversion
df['PRS'].unique()

array(['0', '1', '2', '3', '5', '4', 'RFEL', 'REVOC', 'rfel', 'RFEl', nan,
       'Rfel', 'revoc'], dtype=object)

In [299]:
def refl_combine(x):
    if x in ['RFEL', 'REVOC', 'rfel', 'RFEl', 'Rfel', 'revoc']:
        return('RFEL/REVOC')
    else:
        return(x)   

In [300]:
df['PRS'] = df['PRS'].apply(refl_combine)

In [301]:
df['PRS8'].unique()

array([1., 2., 3., 4.])

In [302]:
df['PRS'].unique()

array(['0', '1', '2', '3', '5', '4', 'RFEL/REVOC', nan], dtype=object)

## Clean DOF

Note: group offense by ID_VAR, JPR_ID, MIN(DOF) to get the first DOF associated for a single JPR_ID

### Step 1: Get **minimum** value for the DOF across all of the charges associated with **one** JPR_ID, ID VARIABLE COMBO. 

Note: This is the procedure because we don't wan't to count a DOF as an instance of recidivism if it occurs BEFORE the date of sentencing. Additionally, we group here by id_var and jpr_id because multiple id variables can be associated with one JPR_ID.

In [303]:
#at the JPR_ID level we only want ONE DOF because becuase we don't want to take into account DOF's that occur
#BEFORE the DOS (associated with the JPR_ID) as an instance of recidivism. -- each JPR_ID should have only ONE DOS

#here we will group by id_Variable as well as jpr_id
df["NEW_DOF"] = df.groupby(["JPR_ID", "ID_VARIABLE"])["DOF"].transform("min")



In [304]:
#testing code
df_test = df.copy()

df_jprid = df_test[df_test["JPR_ID"] == 5499834]
df_test = df_test[df_test["ID_VARIABLE"] == 1468038]

df_test[["JPR_ID", "ID_VARIABLE", "DOS", "DOF", "NEW_DOF","OFN_LABEL"]]


Unnamed: 0,JPR_ID,ID_VARIABLE,DOS,DOF,NEW_DOF,OFN_LABEL
676640,465584,1468038,2006-09-07,2006-04-15,2005-12-31,DUI: High Rate of Alcohol - (BAC .10 - < .16) ...
676641,465584,1468038,2006-09-07,2005-12-31,2005-12-31,DUI: High Rate of Alcohol - (BAC .10 - < .16) ...
1964753,5499834,1468038,2015-03-24,2014-08-18,2014-08-18,Theft by unlaw taking-movable property ($200-$...
2104401,5646230,1468038,2016-05-23,2016-01-28,2016-01-28,Possession-drug paraphernalia


In [305]:
df.head()[["JPR_ID", "ID_VARIABLE","DOF", "NEW_DOF"]]

Unnamed: 0,JPR_ID,ID_VARIABLE,DOF,NEW_DOF
0,640001,1904581,2000-04-01,2000-04-01
1,642480,1157226,1999-12-31,1999-12-31
2,660434,1467650,2000-12-23,2000-12-23
3,628940,1746031,2000-06-26,2000-06-26
4,594048,1374131,2000-10-15,2000-10-15


In [306]:
dof_missing = df[df['NEW_DOF'].isnull()]

percent_missing = len(dof_missing)/len(df)
print("After cleaning, there are {:,} ({:%}) rows with missing DOFs in the dataset.".format(len(dof_missing), percent_missing))

After cleaning, there are 11,785 (0.454381%) rows with missing DOFs in the dataset.


### Step 2: Subset the data to just include those rows where NEW_DOF <= DOS

In [307]:
#make sure the sentencing 
before_length = len(df)
df = df[df.NEW_DOF <= df.DOS] 
after_length = len(df)


In [308]:
print("Before DOF <= DOS correction, there were {:,} rows and after cleaning there were {:,} rows. A change of {:,}.".format(before_length, after_length, before_length - after_length))


Before DOF <= DOS correction, there were 2,593,636 rows and after cleaning there were 2,581,813 rows. A change of 11,823.


## Clean Missing PRS Score 

In [309]:
before_length = len(df)
#subset to just the id variables with a PRS score missing
id_varswith_prsmissing= set(df[df.PRS.isnull()].ID_VARIABLE)

#remove id vars with missing PRS
df_prs_notaffected = df[~df.ID_VARIABLE.isin(id_varswith_prsmissing)]

#reassign to working dataframe
df = df_prs_notaffected 

after_length = len(df)
print("Before PRS correction there were {:,} rows and after cleaning there were {:,} rows. A change of {:,} rows and {} people.".format(before_length, after_length, before_length - after_length, len(id_varswith_prsmissing)))


Before PRS correction there were 2,581,813 rows and after cleaning there were 2,581,750 rows. A change of 63 rows and 18 people.


## Clean Missing PRS8 (8th Edition Sentencing Guidelines) Score

In [310]:
before_length = len(df)
#subset to just the id variables with a PRS score missing
id_varswith_prs8missing= set(df[df.PRS8.isnull()].ID_VARIABLE)

#remove id vars with missing PRS
df_prs8_notaffected = df[~df.ID_VARIABLE.isin(id_varswith_prs8missing)]

#reassign to working dataframe
df = df_prs8_notaffected 

after_length = len(df)
print("Before PRS8 correction there were {:,} rows and after cleaning there were {:,} rows. A change of {:,} rows and {} people.".format(before_length, after_length, before_length - after_length, len(id_varswith_prs8missing)))


Before PRS8 correction there were 2,581,750 rows and after cleaning there were 2,581,750 rows. A change of 0 rows and 0 people.


## Clean JP CC Bug

## Steps followed in cleaning JP_CC Bug
1. It is evident that there are JPR_ID's with DOS from 2016 to 2019 that were impacted by the JP_CC_BUG 
2. The first step was to extract out the ID variables that were impacted out by the bug. 
3. Next, we removed the Judicial proceedings of these JPR_ID's where the DOS is in 2017,2018, and 2019. However, the JPR_ID's associated with the first occurence of the JP_CC Bug is kept- In other words, the JPR_ID's where the DOS was in 2016 is kept. 


In [311]:
#confirming the years that impacted the JP_CC_BUG
set(df[df.JP_CC_BUG=='Y'].DOS_YEAR)


{2016, 2017, 2018, 2019}

In [312]:
# Obtaining the id variables with jp_bug
id_varswith_jpbug= set(df[df.JP_CC_BUG=='Y'].ID_VARIABLE) #pull out both id_variable and DOS

In [313]:
# assigning all the rows associated with the jp bugs to a seperate dataframe 
df_with_jpbug=  df[df.ID_VARIABLE.isin(id_varswith_jpbug)]  #want to remove the charges that come after the DOS associated with the JP_CC_BUG row (want to eliminate the problemative date of sentencing)

In [314]:
# Removing the JPR'IDS's that have DOS in 2017,2018 and 2019
df_jp_bug_cleaned = df_with_jpbug[df.DOS_YEAR<2017]

  df_jp_bug_cleaned = df_with_jpbug[df.DOS_YEAR<2017]


In [315]:
# Isolating the rows associated with id_vars in the original dataframe that is not associated with the bug
df_jpbug_notaffected = df[~df.ID_VARIABLE.isin(id_varswith_jpbug)]

In [316]:
# Rejoining the rows affected by the JP_CC_bug after cleaning them to the rows not affected by the bug
df_cleaned_1 = pd.concat([df_jpbug_notaffected,df_jp_bug_cleaned])  #new working df

df = df_cleaned_1

In [317]:
after_length = len(df)

print("After the JP_CC_BUG correction there are {:,} rows. ".format(after_length))


After the JP_CC_BUG correction there are 2,574,269 rows. 


# Adding New Columns

## Crime Categories

In [318]:
# Sex crimes
df['SEXCRIME'] = [True if (a == 'Tier I' or a == 'Tier II' or a == 'Tier III') else False for a in df['JPS_WALSHACTASSESSMENT']]

In [319]:
df.JPS_WALSHACTASSESSMENT.unique()

array([nan, 'Tier I', 'Tier II', 'Tier III'], dtype=object)

## Create a Numeric column for the Categorical Sex Crime tiers in JPS_WALSHACTASSESSMENT

In [320]:
def numeric_sextier(x):
    if x =='Tier I':
        return(1)
    elif x=='Tier II':
        return(2)
    elif x=='Tier III':
        return(3)
    else:
        return(x)      

In [321]:
df['SEXTIER_NUMERIC']= df['JPS_WALSHACTASSESSMENT'].apply(numeric_sextier)

In [322]:
df['OFN_SECTION_61'] = [True if a[:2]=='61' else False for a in df['OFN_SECTION']]
df['FIREARMS'] = np.where((df['OFN_SECTION_61'] == True) & (df['OFN_TITLE'] ==
18), True, False)


In [323]:
def conditions(s):
    if (s['OFN_TITLE']==18):
        if (s['OFN_SECTION']=='2502') & (s['OFN_SUBSECTION'] == 'C'):
            return True
        elif (s['OFN_SECTION']=='2502') & (s['OFN_SUBSECTION'] == '(c)'):
            return True
        elif (s['OFN_SECTION']=='2503'):
            return True
        elif (s['OFN_SECTION']=='2506'):
            return True
        elif (s['OFN_SECTION']=='2507') & (s['OFN_SUBSECTION'] == 'C'):
            return True
        elif (s['OFN_SECTION']=='2507') & (s['OFN_SUBSECTION'] == 'C1'):
            return True
        elif (s['OFN_SECTION']=='2507') & (s['OFN_SUBSECTION'] == 'C1I'):
            return True
        elif (s['OFN_SECTION']=='2507') & (s['OFN_SUBSECTION'] == 'C1II'):
            return True
        elif (s['OFN_SECTION']=='2507') & (s['OFN_SUBSECTION'] == 'D'):
            return True
        elif (s['OFN_SECTION']=='2604') & (s['OFN_SUBSECTION'] == 'C'):
            return True
        elif (s['OFN_SECTION']=='2604') & (s['OFN_SUBSECTION'] == 'C1'):
            return True
        elif (s['OFN_SECTION']=='2606'):
            return True
        elif (s['OFN_SECTION']=='2702') & (s['OFN_SUBSECTION'] == '(a)(1)'):
            return True
        elif (s['OFN_SECTION']=='2702') & (s['OFN_SUBSECTION'] == '(a)(1)*'):
            return True
        elif (s['OFN_SECTION']=='2702') & (s['OFN_SUBSECTION'] == 'A1'):
            return True
        elif (s['OFN_SECTION']=='2702') & (s['OFN_SUBSECTION'] == '(a)(2)'):
            return True
        elif (s['OFN_SECTION']=='2702') & (s['OFN_SUBSECTION'] == '(a)(2)*'):
            return True
        elif (s['OFN_SECTION']=='2702') & (s['OFN_SUBSECTION'] == 'A2'):
            return True
        elif (s['OFN_SECTION']=='2702.1'):
            return True
        elif (s['OFN_SECTION']=='2716') & (s['OFN_SUBSECTION'] == 'B'):
            return True
        elif (s['OFN_SECTION']=='2716') & (s['OFN_SUBSECTION'] == 'B1'):
            return True
        elif (s['OFN_SECTION']=='2716') & (s['OFN_SUBSECTION'] == '(b)'):
            return True
        elif (s['OFN_SECTION']=='2716') & (s['OFN_SUBSECTION'] == '(b)1'):
            return True
        elif (s['OFN_SECTION']=='2717') & (s['OFN_GRADE'] == 'F-1'):
            return True
        elif (s['OFN_SECTION']=='2718') & (s['OFN_GRADE'] == 'F-1'):
            return True
        elif (s['OFN_SECTION']=='2901'):
            return True
        elif (s['OFN_SECTION']=='3002') & (s['OFN_GRADE'] == 'F-1'):
            return True
        elif (s['OFN_SECTION']=='3011') & (s['OFN_GRADE'] == 'F-1'):
            return True
        elif (s['OFN_SECTION']=='3012') & (s['OFN_GRADE'] == 'F-1'):
            return True
        elif (s['OFN_SECTION']=='3121'):
            return True
        elif (s['OFN_SECTION']=='3123'):
            return True
        elif (s['OFN_SECTION']=='3124.1'):
            return True
        elif (s['OFN_SECTION']=='3125'):
            return True
        elif (s['OFN_SECTION']=='4302'):
            return True
        elif (s['OFN_SECTION']=='3301'):
            if (s['OFN_SUBSECTION'] == 'A'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A1'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A1I'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A1II'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A2'):
                return True
            elif (s['OFN_SUBSECTION'] == '(a)(1)*'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A.1'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A.11'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A.11I'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A.11II'):
                return True
        elif (s['OFN_SECTION']=='3311') & (s['OFN_GRADE'] == 'F-1'):     
            return True
        elif (s['OFN_SECTION']=='3502') & (s['OFN_SUBSECTION'] == 'A1'):
            return True
        elif (s['OFN_SECTION']=='3701'):
            if (s['OFN_SUBSECTION'] == '(a)(1)(i)'):
                return True
            elif (s['OFN_SUBSECTION'] == '(a)(1)(ii)'):
                return True
            elif (s['OFN_SUBSECTION'] == '(a)(1)(iii)'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A1I'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A1II'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A1III'):
                return True
        elif (s['OFN_SECTION']=='3702'):
            return True
        else:
            return False
    else:
        return False

In [324]:
df['VIOLENCE'] = df.apply(conditions, axis=1)

In [325]:
# Drug
df['DRUGOFFENSE'] = np.where((df['OFN_TITLE'] == 35), True, False)

In [326]:
# DUI
df['DUI'] = np.where((df['OFN_TITLE'] == 75) & (df['OFN_SECTION']=='3802'), True, False)

# Handling Cases where sentencing was in Philly and another county 

In [327]:
# creating a temp Philly indicator column
df['CTY_PHL'] = np.where(df['COUNTY']=='Philadelphia', 1, 0)

In [328]:
# check whether there are cases in which a single JPR_ID is sentenced in multiple counties 
df.groupby(by=['ID_VARIABLE', 'JPR_ID'])['COUNTY'].nunique().sort_values(ascending=False)

ID_VARIABLE  JPR_ID 
1000001      4915383    1
1610861      2154476    1
1610865      5560041    1
1610864      4918949    1
1610863      5741168    1
                       ..
1305561      2724282    1
             2707509    1
             634482     1
             337354     1
1916197      5830496    1
Name: COUNTY, Length: 1571856, dtype: int64

In [329]:
# !!! AZ EDIT 
# As seen above - this is not an issue. so taking this out 
#df['CTY_PHL'] = df.groupby(by=['JPR_ID', 'ID_VARIABLE'])['_CTY_PHL_'].transform('max') # TJ EDIT = changed the order of these columns

# Adding Concurrent and Consecutive Sentencing Info

In [330]:
# !!! AZ changed the order per Matt's comment 

# Use the following order to map numeric values to INC_RELATIONSHIP (i.e., 0 for NAs, 1 for Concurrent and 2 for Consecutive). 
# By taking the max() value later, this will assign all cases with at least one consecutive sentence in the consecutive category. 
order = [np.nan, 'Concurrent', 'Consecutive']

In [331]:
df['INC_REL_NUMERIC'] = df['INC_RELATIONSHIP'].map(dict(zip(order, np.arange(len(order)))))

In [332]:
df[df['INC_RELATIONSHIP'].notna()][['INC_REL_NUMERIC', 'INC_RELATIONSHIP']].head()

Unnamed: 0,INC_REL_NUMERIC,INC_RELATIONSHIP
49,2.0,Consecutive
54,1.0,Concurrent
57,1.0,Concurrent
80,1.0,Concurrent
81,1.0,Concurrent


In [333]:
df[df['INC_RELATIONSHIP'].isna()][['INC_REL_NUMERIC', 'INC_RELATIONSHIP']].head()

Unnamed: 0,INC_REL_NUMERIC,INC_RELATIONSHIP
0,0.0,
1,0.0,
2,0.0,
3,0.0,
4,0.0,


In [334]:
# Again, by taking the maximum of the INC_REL_NUMERIC variable, we isolate cases that have only concurrent or only consecutive sentences. 
# cases with both consecutive and concurrent cases are grouped as consecutive
df['INC_REL_NUMERIC'] = df.groupby(by=['ID_VARIABLE', 'JPR_ID'])['INC_REL_NUMERIC'].transform('max')

In [335]:
# !!! AZ Edit

# get the number of charges sentenced per JP. Because there are cases where one JP has multiple DOS, we also group by unique DOS here to get the number of charges for each unique hearing for each JP
df['CHARGE_COUNT'] = df.groupby(by=['ID_VARIABLE', 'JPR_ID', 'DOS'])['JPO_ID'].transform('count')

In [336]:
# test code remove later
df[(df['ID_VARIABLE']==1884187) & (df['JPR_ID']==5199873)][['CHARGE_COUNT', 'JPR_ID', 'DOS']].sort_values(by='DOS')

Unnamed: 0,CHARGE_COUNT,JPR_ID,DOS
1591799,6,5199873,2012-12-06
1591800,6,5199873,2012-12-06
1591801,6,5199873,2012-12-06
1591802,6,5199873,2012-12-06
1591803,6,5199873,2012-12-06
1591804,6,5199873,2012-12-06
1885851,6,5199873,2014-10-09
1885850,6,5199873,2014-10-09
1885849,6,5199873,2014-10-09
1885848,6,5199873,2014-10-09


In [337]:
# FOR NOW: we will take the charge count associated with most severe sentence (largest JP_MIN) and highest OGS for a given DOS, after dealing with NANs
# TEAM: review later when incorporating into pipeline as to whether this logic makes sense 
#charge_counts['OGS'] = charge_counts['OGS'].fillna(0)
#charge_counts = charge_counts.sort_values(by=['ID_VARIABLE', 'DOS', 'JP_MIN', 'OGS', 'CHARGE_COUNT'], ascending=[True, True, False, False, False])

# Implement At Risk Date Calculation Logic

## Step 1: Address Issue with Multiple DOS for one JPR_ID

In [338]:
#dos_vals = df.groupby(["JPR_ID"])['DOS'].agg(lambda x: set(x)) #> 1 #how many rows have two UNIQUE DOS for the same JPR_ID (set)

In [343]:
#num_dos = df.groupby(["JPR_ID"])['DOS'].agg(lambda x: len(set(x))) #how many dates of sentencing does each jpr_id have?
# !!! AZ edit. also need to groupby ID_VAR and JPR_ID here

num_dos = df.groupby(["ID_VARIABLE", "JPR_ID"])['DOS'].nunique()
#num_dos[2847193]

In [345]:
#more_than_one_dos.reset_index()
#more_than_one_dos = list(num_dos[num_dos > 1].index)

print("There are {:,} JPR_IDS in the dataset with more than one date of sentence.".format((num_dos>1).sum()))

There are 353 JPR_IDS in the dataset with more than one date of sentence.


In [358]:
#subset the data JUST to those ID's in the more_than_one dos bucket
df['NUM_DOS'] = df.groupby(by=['ID_VARIABLE', 'JPR_ID'])['DOS'].transform('nunique')
only_one_dos = df[df['NUM_DOS']==1]
more_than_one_dos_df = df[df['NUM_DOS']>1]


In [359]:
more_than_one_dos_df = more_than_one_dos_df.sort_values(["JPR_ID", "DOS"]) #sort by jpr_id AND DOS

more_than_one_dos_df.head()[["JPR_ID", "DOS", "OFN_LABEL", "PRS", "JP_MIN", "MS_SENTJP", "INC_SANCTION_EXISTS", "INC_REL_NUMERIC", "CHARGE_COUNT"]]


Unnamed: 0,JPR_ID,DOS,OFN_LABEL,PRS,JP_MIN,MS_SENTJP,INC_SANCTION_EXISTS,INC_REL_NUMERIC,CHARGE_COUNT
569776,92399,2005-01-06,Murder of The Second Degree,0,30346.0,,Y,0.0,1
2131297,92399,2016-06-27,Murder of The Second Degree,0,,Yes,Y,0.0,1
620698,117010,2005-02-01,Aggravated Assault - Cause or Att B.I. w/Deadl...,5,8127.0,,Y,2.0,4
620699,117010,2005-02-01,Burglary - Home: Person Present,5,8127.0,,Y,2.0,4
620700,117010,2005-02-01,Murder Inchoate - Attempt with S.B.I.,5,8127.0,,Y,2.0,4


In [360]:
#create a new column with the NEW_DOS value
more_than_one_dos_df["MAX_DOS"] = more_than_one_dos_df.groupby("JPR_ID")["DOS"].transform("max") #take the latest date of sentencing
more_than_one_dos_df["MIN_DOS"] = more_than_one_dos_df.groupby("JPR_ID")["DOS"].transform("min") #take the earliest date of sentencing

# #create a new time served column
more_than_one_dos_df["TIME_SERVED"] = more_than_one_dos_df["MAX_DOS"] - more_than_one_dos_df["MIN_DOS"]
more_than_one_dos_df["TIME_SERVED"] = more_than_one_dos_df["TIME_SERVED"].dt.days

#finds the JP_MIN associated with the latest DOS (because the data is already sorted by JPR_ID and DOS)
more_than_one_dos_df["LATEST_JPMIN"] = more_than_one_dos_df.groupby("JPR_ID")["JP_MIN"].transform("last")

#calculate an adjusted JP_MIN from the logic provided by Miranda
# ORIG: more_than_one_dos_df['ADJ_JPMIN'] = more_than_one_dos_df["LATEST_JPMIN"] - more_than_one_dos_df["TIME_SERVED"]
#!!! AZ edit, to remove the issue where ADJ_JPMIN ends up negative 
more_than_one_dos_df['ADJ_JPMIN'] = np.where(more_than_one_dos_df['LATEST_JPMIN']>0, more_than_one_dos_df["LATEST_JPMIN"] - more_than_one_dos_df["TIME_SERVED"], more_than_one_dos_df['LATEST_JPMIN'])

# # more_than_one_dos_df[:20][["JPR_ID", "DOS", "OFN_LABEL", "PRS", "JP_MIN", "ADJ_JPMIN", "INC_SANCTION_EXISTS", "NEW_DOS"]]
more_than_one_dos_df[:20][["JPR_ID", "ID_VARIABLE", "DOS", "OFN_LABEL", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN", "INC_REL_NUMERIC", "CHARGE_COUNT"]]


Unnamed: 0,JPR_ID,ID_VARIABLE,DOS,OFN_LABEL,MIN_DOS,JP_MIN,MAX_DOS,TIME_SERVED,LATEST_JPMIN,ADJ_JPMIN,INC_REL_NUMERIC,CHARGE_COUNT
569776,92399,1877126,2005-01-06,Murder of The Second Degree,2005-01-06,30346.0,2016-06-27,4190,30346.0,26156.0,0.0,1
2131297,92399,1877126,2016-06-27,Murder of The Second Degree,2005-01-06,,2016-06-27,4190,30346.0,26156.0,0.0,1
620698,117010,1325462,2005-02-01,Aggravated Assault - Cause or Att B.I. w/Deadl...,2005-02-01,8127.0,2007-10-17,988,5570.0,4582.0,2.0,4
620699,117010,1325462,2005-02-01,Burglary - Home: Person Present,2005-02-01,8127.0,2007-10-17,988,5570.0,4582.0,2.0,4
620700,117010,1325462,2005-02-01,Murder Inchoate - Attempt with S.B.I.,2005-02-01,8127.0,2007-10-17,988,5570.0,4582.0,2.0,4
620701,117010,1325462,2005-02-01,Aggravated Assault - Cause or Att B.I. w/Deadl...,2005-02-01,8127.0,2007-10-17,988,5570.0,4582.0,2.0,4
895508,117010,1325462,2007-10-17,Murder Inchoate - Attempt with S.B.I.,2005-02-01,5570.0,2007-10-17,988,5570.0,4582.0,2.0,4
895509,117010,1325462,2007-10-17,Aggravated Assault - Cause or Att B.I. w/Deadl...,2005-02-01,5570.0,2007-10-17,988,5570.0,4582.0,2.0,4
895510,117010,1325462,2007-10-17,Aggravated Assault - Cause or Att B.I. w/Deadl...,2005-02-01,5570.0,2007-10-17,988,5570.0,4582.0,2.0,4
895511,117010,1325462,2007-10-17,Burglary - Home: Person Present,2005-02-01,5570.0,2007-10-17,988,5570.0,4582.0,2.0,4


In [361]:
# There are several cases in which the number of charges heard on different DOS for a given JPR_ID is not the same
more_than_one_dos_df.groupby(by=['ID_VARIABLE', 'JPR_ID'])['CHARGE_COUNT'].nunique().sort_values(ascending=False)

ID_VARIABLE  JPR_ID 
1235359      5288591    2
1759010      2869652    2
1712533      2876997    2
1716167      5104218    2
1607882      5917531    2
                       ..
1345319      5335810    1
1343888      3698897    1
1326751      5667281    1
1325462      117010     1
1913828      4833136    1
Name: CHARGE_COUNT, Length: 353, dtype: int64

In [362]:
# As an example: this is a case in which the second hearing of the same JP had more charges than the previous
more_than_one_dos_df[(more_than_one_dos_df['ID_VARIABLE']==1697895) & (more_than_one_dos_df['JPR_ID']==5311207)][["JPR_ID", "ID_VARIABLE", "DOS", "OFN_LABEL", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN", "INC_REL_NUMERIC", "CHARGE_COUNT"]]

Unnamed: 0,JPR_ID,ID_VARIABLE,DOS,OFN_LABEL,MIN_DOS,JP_MIN,MAX_DOS,TIME_SERVED,LATEST_JPMIN,ADJ_JPMIN,INC_REL_NUMERIC,CHARGE_COUNT
1651785,5311207,1697895,2013-10-15,Firearms-persons not to possess: convicted of ...,2013-10-15,2739.0,2016-07-07,996,2740.0,1744.0,2.0,2
1651786,5311207,1697895,2013-10-15,Firearms-persons not to possess: convicted of ...,2013-10-15,2739.0,2016-07-07,996,2740.0,1744.0,2.0,2
2081313,5311207,1697895,2016-07-07,Firearms-carried w/o license: ineligible (load...,2013-10-15,2740.0,2016-07-07,996,2740.0,1744.0,2.0,3
2081314,5311207,1697895,2016-07-07,Firearms-persons not to possess: convicted of ...,2013-10-15,2740.0,2016-07-07,996,2740.0,1744.0,2.0,3
2081315,5311207,1697895,2016-07-07,Firearms-carried w/o license: ineligible (unlo...,2013-10-15,2740.0,2016-07-07,996,2740.0,1744.0,2.0,3


In [363]:
# This is an example where the earlier sentencing date had more charges than the later one 
# !!! AZ NOTE TO TEAM: there is a negative ADJ_JPMIN value here - should be 0 if there is no sentence. should be an easy fix and doesn't impact any downstream analyses since we use ADJ_JPMIN > 0 for analyses

more_than_one_dos_df[(more_than_one_dos_df['ID_VARIABLE']==1668975) & (more_than_one_dos_df['JPR_ID']==5369358)][["JPR_ID", "ID_VARIABLE", "DOS", "OFN_LABEL", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN", "INC_REL_NUMERIC", "CHARGE_COUNT"]]

Unnamed: 0,JPR_ID,ID_VARIABLE,DOS,OFN_LABEL,MIN_DOS,JP_MIN,MAX_DOS,TIME_SERVED,LATEST_JPMIN,ADJ_JPMIN,INC_REL_NUMERIC,CHARGE_COUNT
1829036,5369358,1668975,2014-03-11,Firearms-Persons Not To Possess - Convicted of...,2014-03-11,1826.0,2015-03-26,380,365.0,-15.0,1.0,2
1829037,5369358,1668975,2014-03-11,Possession With Intent to Deliver: Cocaine (2....,2014-03-11,1826.0,2015-03-26,380,365.0,-15.0,1.0,2
1971514,5369358,1668975,2015-03-26,Possession With Intent to Deliver: Cocaine (2....,2014-03-11,365.0,2015-03-26,380,365.0,-15.0,1.0,1


In [364]:
# !!! AZ NOTE TO TEAM: there is a negative ADJ_JPMIN value here - should be 0 if there is no sentence. should be an easy fix and doesn't impact any downstream analyses since we use ADJ_JPMIN > 0 for analyses
more_than_one_dos_df[(more_than_one_dos_df['ID_VARIABLE']==1268610) & (more_than_one_dos_df['JPR_ID']==5579677)][["JPR_ID", "ID_VARIABLE", "DOS", "OFN_LABEL", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN", "INC_REL_NUMERIC", "CHARGE_COUNT"]]

Unnamed: 0,JPR_ID,ID_VARIABLE,DOS,OFN_LABEL,MIN_DOS,JP_MIN,MAX_DOS,TIME_SERVED,LATEST_JPMIN,ADJ_JPMIN,INC_REL_NUMERIC,CHARGE_COUNT
2045097,5579677,1268610,2015-12-18,Theft by unlaw taking-movable property ($50-<$...,2015-12-18,0.0,2019-09-16,1368,0.0,0.0,0.0,12
2045098,5579677,1268610,2015-12-18,"Theft by unlaw taking-movable property (<=$2,0...",2015-12-18,0.0,2019-09-16,1368,0.0,0.0,0.0,12
2045099,5579677,1268610,2015-12-18,Theft by unlaw taking-movable property ($50-<$...,2015-12-18,0.0,2019-09-16,1368,0.0,0.0,0.0,12
2045100,5579677,1268610,2015-12-18,Access device fraud-unauthorized use of device...,2015-12-18,0.0,2019-09-16,1368,0.0,0.0,0.0,12
2045101,5579677,1268610,2015-12-18,Access device fraud-unauthorized use of device...,2015-12-18,0.0,2019-09-16,1368,0.0,0.0,0.0,12
2045102,5579677,1268610,2015-12-18,"Theft by unlaw taking-movable property (<=$2,0...",2015-12-18,0.0,2019-09-16,1368,0.0,0.0,0.0,12
2045103,5579677,1268610,2015-12-18,Access device fraud-unauthorized use of device...,2015-12-18,0.0,2019-09-16,1368,0.0,0.0,0.0,12
2045104,5579677,1268610,2015-12-18,Access device fraud-unauthorized use of device...,2015-12-18,0.0,2019-09-16,1368,0.0,0.0,0.0,12
2045105,5579677,1268610,2015-12-18,Access device fraud-unauthorized use of device...,2015-12-18,0.0,2019-09-16,1368,0.0,0.0,0.0,12
2045106,5579677,1268610,2015-12-18,Theft by unlaw taking-movable property ($50-<$...,2015-12-18,0.0,2019-09-16,1368,0.0,0.0,0.0,12


In [365]:
# Using the latest DOS, we want to get the number of charges sentenced for the most recent hearing of each JP
# !!! AZ Edit
more_than_one_dos_df['CHARGE_COUNT'] = more_than_one_dos_df.sort_values(by=['ID_VARIABLE', 'JPR_ID', 'DOS'], ascending=[True, True, True]).groupby(by=['ID_VARIABLE', 'JPR_ID'])['CHARGE_COUNT'].transform('last')

In [366]:
# using the latest JP_MIN information (i.e. the most 'final' incarceration length), get the inc_relationship associated with that JP 
# we also want to take the max(IND_REL_NUMERIC), so we also sort in ascending order by INC_REL_NUMERIC and take the last observation
# !!!! AZ edit
more_than_one_dos_df['INC_REL_NUMERIC'] = more_than_one_dos_df.sort_values(by=['ID_VARIABLE', 'JPR_ID', 'DOS', 'INC_REL_NUMERIC'], ascending=[True, True, True, True]).groupby(by=['ID_VARIABLE', 'JPR_ID'])['INC_REL_NUMERIC'].transform('last')

In [367]:
#combine the data back together

df_combo_dos = pd.concat([only_one_dos, more_than_one_dos_df])

df = df_combo_dos

## **Step 2:** Create a New JP_MIN variable that takes the Max(JP_MIN) for a given JPR_ID, for cases that were not sentenced multiple times (EDIT & EXAMINE)

In [368]:
#inspect the results

#impute missing values for the ADJ_JPMIN values that are currently null -- if there are multiple JP_MINS for an instance that DOESNT have multiple sentencing dates, 
# just take the maximum value of the JP_MIN available and set it equal to the adj_jpmin value

# !!! AZ shouldn't this also be grouping by ID_VAR and JPR_ID? -- TJ ADDRESSED THIS

df.loc[df["ADJ_JPMIN"].isnull(), "ADJ_JPMIN"] =  df.groupby(["JPR_ID", "ID_VARIABLE"])["JP_MIN"].transform("max")

#examine if for a jpr_id, individiuals have the same jp_min in the same jpr_id
# 1. first just subset to those jpr_ids with multiple id_variables


#2. inspect if the jp_min values are the same

#df.sort_values(["JPR_ID"])[:20][["JPR_ID", "DOS", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN"]]


#[["JPR_ID", "DOS", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN"]]





In [369]:
#IN THE BELOW, WE CHECK TO SEE HOW MANY CASES THERE ARE WHERE THERE ARE MULTIPLE ID_VARS FOR A GIVEN JPR_ID AND CAN OCCUR IN THE CASE THAT THE SAME JPR_ID IS RE-USED FOR MULTIPLE CASES
# THIS NUMBER IS NECESSARY TO KNOW BC IT SERVES AS THE JUSTIFICATION FOR GROUPING AT THE JPR_ID, ID_VARIABLE LEVEL IN OUR DATASET
# num_id_vars = df.groupby(["JPR_ID"])['ID_VARIABLE'].agg(lambda x: len(set(x)))
# more_than_one_id_var = list(num_id_vars[num_id_vars > 1].index)
# #more_than_one_id_var_df = df.loc[df["JPR_ID"].isin(more_than_one_id_var)]

# len(more_than_one_id_var) #TJ EDIT


In [370]:
# !!! AZ edit: much faster way to do the above 
(df.groupby(by=['JPR_ID'])['ID_VARIABLE'].nunique()>1).sum()

11

In [371]:
df[["JPR_ID", "DOS", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN"]]

df.loc[df["MAX_DOS"].isnull()][["JPR_ID", "DOS", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN"]]


Unnamed: 0,JPR_ID,DOS,MIN_DOS,JP_MIN,MAX_DOS,TIME_SERVED,LATEST_JPMIN,ADJ_JPMIN
0,640001,2001-06-12,NaT,,NaT,,,
1,642480,2001-12-03,NaT,2.0,NaT,,,2.0
2,660434,2001-04-26,NaT,2.0,NaT,,,2.0
3,628940,2001-05-22,NaT,,NaT,,,
4,594048,2001-01-03,NaT,183.0,NaT,,,183.0
...,...,...,...,...,...,...,...,...
2189215,5614645,2016-03-29,NaT,,NaT,,,
2189216,5614645,2016-03-29,NaT,,NaT,,,
2189217,5640865,2016-06-22,NaT,2.0,NaT,,,2.0
2189219,5663216,2016-12-08,NaT,90.0,NaT,,,90.0


## !!! AZ added here - moved up code box for new DOS

In [372]:
#create a new date of sentence variable

# !!!!  AZ NOTE TO TEAM: moved code block up. if we're using new_DOS as the unit of analysis later on, we should create this variable early so we end up collapsing data using this column 

#if max_dos is null, then there is only one DOS associated with a given JPR_ID OTHERWISE, the new_dos becomes the max dos (meaning there were multiple dates of sentence associated with that JPR_ID)

df.loc[df["MAX_DOS"].notna(), "NEW_DOS"] = df["MAX_DOS"]

df.loc[df["MAX_DOS"].isnull(), "NEW_DOS"] = df["DOS"]

## Step 3: Ensure that the INC_SANCTION_EXISTS Flag Is Aggregated Correctly

# !!! NOTE TO TEAM: AZ edit to this entire section is at the end of this section

In [373]:
# #figure out how many different values INC_SANCTION_EXISTS takes on for each JPR_ID
# #if at least 1 charge is = Y (JPR_ID might have Yes and NO) -- then inc_sanction_exists for the ENTIRE JPR_ID should be "Y"

# num_inc_sanc_vals = df.groupby(["JPR_ID"])['INC_SANCTION_EXISTS'].agg(lambda x: len(set(x))) #INC_SANCTION_EXISTS values does each jpr_id have?

# #give back the list of JPR_IDs that have more than one inc_sanction_exists value
# more_than_one_inc_sanc = list(num_inc_sanc_vals[num_inc_sanc_vals > 1].index)


In [374]:
# only_one_inc_sanc = df.loc[~df["JPR_ID"].isin(more_than_one_inc_sanc)]
# more_than_one_inc_sanc_df = df.loc[df["JPR_ID"].isin(more_than_one_inc_sanc)]


In [375]:
# df[df['JPR_ID']==622266][['ID_VARIABLE', 'JPR_ID', 'INC_SANCTION_EXISTS', 'INC_TYPE']]

In [376]:
# #more_than_one_inc_sanc_df[["JPR_ID", "DOS", "INC_SANCTION_EXISTS"]]

# more_than_one_inc_sanc_df["NEW_INC_SANCTION_EXISTS"] = "Y"
# more_than_one_inc_sanc_df[["JPR_ID", "DOS", "INC_SANCTION_EXISTS", "NEW_INC_SANCTION_EXISTS"]]

In [377]:
# inc_sanc_combined = pd.concat([only_one_inc_sanc, more_than_one_inc_sanc_df])

In [378]:
# inc_sanc_combined[["JPR_ID", "DOS", "INC_SANCTION_EXISTS", "NEW_INC_SANCTION_EXISTS"]]

In [379]:
# df = inc_sanc_combined

# df.loc[df["NEW_INC_SANCTION_EXISTS"].isnull(), "NEW_INC_SANCTION_EXISTS"] =  df["INC_SANCTION_EXISTS"]

# df[["JPR_ID", "DOS", "INC_SANCTION_EXISTS", "NEW_INC_SANCTION_EXISTS"]]

##### !!! AZ edit of this entire section: 

In [380]:
# First, check whether there are cases with inconsistent INC_SANCTION_EXISTS information
df.groupby(by=['ID_VARIABLE', 'JPR_ID'])['INC_SANCTION_EXISTS'].nunique().sort_values(ascending=False)

ID_VARIABLE  JPR_ID 
1000001      4915383    2
1581070      5061825    2
1794157      5601285    2
1136551      5443606    2
1136552      5387805    2
                       ..
1823010      677199     0
1906251      446285     0
1543836      2697016    0
1610636      1846646    0
1173315      275370     0
Name: INC_SANCTION_EXISTS, Length: 1571856, dtype: int64

In [381]:
# Looking at one example
df[(df['ID_VARIABLE']==1000001) & (df['JPR_ID']==4915383)][["ID_VARIABLE", "JPR_ID", "DOS", "INC_SANCTION_EXISTS", "INC_TYPE"]]

Unnamed: 0,ID_VARIABLE,JPR_ID,DOS,INC_SANCTION_EXISTS,INC_TYPE
1312343,1000001,4915383,2010-02-18,N,
1312344,1000001,4915383,2010-02-18,Y,County Facility
1312345,1000001,4915383,2010-02-18,N,


In [382]:
# There are also cases with null INC_SANCTION_EXISTS values
df['INC_SANCTION_EXISTS'].isna().sum()

67685

In [383]:
df[df['INC_SANCTION_EXISTS'].isna()][["ID_VARIABLE", "JPR_ID", "DOS", "INC_SANCTION_EXISTS", "INC_TYPE", "OFN_LABEL"]]

Unnamed: 0,ID_VARIABLE,JPR_ID,DOS,INC_SANCTION_EXISTS,INC_TYPE,OFN_LABEL
42,1079152,622266,2001-11-19,,,Indecent Exposure
43,1079152,622266,2001-11-19,,,Indecent Exposure
44,1079152,622266,2001-11-19,,,Corruption of Minors - when of a sexual nature
62,1869842,654996,2001-02-26,,,Possession With Intent to Deliver: Drug Unknown
187,1601637,625637,2001-12-03,,,Burglary - Not of a Home: No One Present
...,...,...,...,...,...,...
1149953,1621749,1070583,2009-05-20,,,Theft - Deception; $50 - < $200
1149958,1621749,1519136,2009-05-20,,,"Theft - Unlawful Taking; $200 - $2,000"
1183671,1102408,2739046,2009-06-16,,,Criminal Use of Communication Facility
1130151,1563544,3540976,2009-06-09,,,Simple Assault


In [384]:
# Are there cases where INC_SANCTION_EXISTS is N or NA but INC_TYPE is filled out? No
df[(df['INC_SANCTION_EXISTS'].isna()) & (df['INC_TYPE'].notna())]

Unnamed: 0,UNNAMED: 0.1,UNNAMED: 0,INC_END,MOSTSERIOUS,MND_MINIMUM,OFF_AGE,PRS_MANUAL,JPS_DRUG_DEPENDENT,INC_TYPE,JPO_ID,...,CTY_PHL,INC_REL_NUMERIC,CHARGE_COUNT,NUM_DOS,MAX_DOS,MIN_DOS,TIME_SERVED,LATEST_JPMIN,ADJ_JPMIN,NEW_DOS


In [385]:
# Are there cases where INC_SANCTION_EXISTS is N or NA but INC_MIN is filled out? No
df[(df['INC_SANCTION_EXISTS'].isna()) & (df['INCMIN'].notna())]

Unnamed: 0,UNNAMED: 0.1,UNNAMED: 0,INC_END,MOSTSERIOUS,MND_MINIMUM,OFF_AGE,PRS_MANUAL,JPS_DRUG_DEPENDENT,INC_TYPE,JPO_ID,...,CTY_PHL,INC_REL_NUMERIC,CHARGE_COUNT,NUM_DOS,MAX_DOS,MIN_DOS,TIME_SERVED,LATEST_JPMIN,ADJ_JPMIN,NEW_DOS


In [386]:
# In this case, we can fill out all NA values in INC_SANCTION_EXISTS with "N"
df['INC_SANCTION_EXISTS'].fillna("N", inplace=True)

In [387]:
# Next, by taking the MAX() within each ID_VARIABLE and JPR_ID group, we can flag a case as INC_SANCTION_EXISTS = Y if at least one charge in the JPR_ID has an incarceration sanction 
df['NEW_INC_SANCTION_EXISTS'] = np.where(df['INC_SANCTION_EXISTS']=='Y', 1, 0)
df['NEW_INC_SANCTION_EXISTS'] = df.groupby(by=['JPR_ID', 'ID_VARIABLE'])['NEW_INC_SANCTION_EXISTS'].transform('max')

In [388]:
print("Note: As shown below, there are two distinct JPs before we collapse at the ID_VAR, DOS-LEVEL with the adj_jpmin & new_inc_sanction_exists mismatch")
df.loc[(df["ADJ_JPMIN"] > 0) & (df["NEW_INC_SANCTION_EXISTS"] == 0)][["ID_VARIABLE", "JPR_ID", "DOS", "NEW_DOF", "NEW_INC_SANCTION_EXISTS", "INC_SANCTION_EXISTS", "INCMIN", "ADJ_JPMIN"]] #, "OFN_LIFE_DEATH", "JP_LIFE_DEATH"]] 


Note: As shown below, there are two distinct JPs before we collapse at the ID_VAR, DOS-LEVEL with the adj_jpmin & new_inc_sanction_exists mismatch


Unnamed: 0,ID_VARIABLE,JPR_ID,DOS,NEW_DOF,NEW_INC_SANCTION_EXISTS,INC_SANCTION_EXISTS,INCMIN,ADJ_JPMIN
2208408,1501571,5794095,2017-09-07,2016-02-18,0,N,,349.0
2208409,1501571,5794095,2017-09-07,2016-02-18,0,N,,349.0
2347266,1644095,5480997,2018-07-25,2014-07-29,0,N,,148.0


In [389]:
# check out these cases 
df[(df['ID_VARIABLE']==1501571) & (df['JPR_ID']==5794095)][["ID_VARIABLE", "JPR_ID", "DOS", "NEW_DOF", "NEW_INC_SANCTION_EXISTS", "INC_SANCTION_EXISTS", "INCMIN", "JP_MIN", "ADJ_JPMIN"]]

Unnamed: 0,ID_VARIABLE,JPR_ID,DOS,NEW_DOF,NEW_INC_SANCTION_EXISTS,INC_SANCTION_EXISTS,INCMIN,JP_MIN,ADJ_JPMIN
2208408,1501571,5794095,2017-09-07,2016-02-18,0,N,,349.0,349.0
2208409,1501571,5794095,2017-09-07,2016-02-18,0,N,,349.0,349.0


In [390]:
df[(df['ID_VARIABLE']==1644095) & (df['JPR_ID']==5480997)][["ID_VARIABLE", "JPR_ID", "DOS", "NEW_DOF", "NEW_INC_SANCTION_EXISTS", "INC_SANCTION_EXISTS", "INCMIN", "JP_MIN", "ADJ_JPMIN"]]

Unnamed: 0,ID_VARIABLE,JPR_ID,DOS,NEW_DOF,NEW_INC_SANCTION_EXISTS,INC_SANCTION_EXISTS,INCMIN,JP_MIN,ADJ_JPMIN
2347266,1644095,5480997,2018-07-25,2014-07-29,0,N,,148.0,148.0


In [391]:
# To resolve the above, we will mark NEW_INC_SANCTION_EXISTS as "Y" if ADJ_JPMIN is > 0
df['NEW_INC_SANCTION_EXISTS'] = np.where(df['ADJ_JPMIN']>0, 1, df['NEW_INC_SANCTION_EXISTS'])

In [392]:
print("Note: double check that this issue is resolved. As seen below, there are no more conflicts")
df.loc[(df["ADJ_JPMIN"] > 0) & (df["NEW_INC_SANCTION_EXISTS"] == 0)][["ID_VARIABLE", "JPR_ID", "DOS", "NEW_DOF", "NEW_INC_SANCTION_EXISTS", "INC_SANCTION_EXISTS", "INCMIN", "ADJ_JPMIN"]] #, "OFN_LIFE_DEATH", "JP_LIFE_DEATH"]] 


Note: double check that this issue is resolved. As seen below, there are no more conflicts


Unnamed: 0,ID_VARIABLE,JPR_ID,DOS,NEW_DOF,NEW_INC_SANCTION_EXISTS,INC_SANCTION_EXISTS,INCMIN,ADJ_JPMIN


# Aggregate Select Columns at the JPR_ID Level

## Get MAX PRS and OGS Scores at the JPR_ID level

In [393]:
df['PRS'].unique()

array(['0', '1', '2', '3', '5', '4', 'RFEL/REVOC'], dtype=object)

In [394]:
# convert PRS scores to int to allow for taking the maximum value when collapsing
df['PRS'] = np.where(df['PRS']=='RFEL/REVOC', '6', df['PRS'])
df['PRS'] = df['PRS'].astype('int')

In [395]:
# !!! AZ shouldn't this be grouping by ID_VAR and JPR_ID level?? 
# original code: df['PRS8'] = df.groupby(["JPR_ID"])['PRS8'].transform("max")

# new code:
df['PRS8'] = df.groupby(["ID_VARIABLE", "JPR_ID"])['PRS8'].transform("max")
df['PRS'] = df.groupby(["ID_VARIABLE", "JPR_ID"])['PRS'].transform('max')
df['OGS'] = df.groupby(["ID_VARIABLE", "JPR_ID"])['OGS'].transform('max')

## Collapsing Crime Type Variables at the JPR_ID Level

In [396]:
crime_cats = ['SEXCRIME', 'SEXTIER_NUMERIC', 'FIREARMS', 'VIOLENCE', 'DRUGOFFENSE', 'DUI']

In [397]:
for i in crime_cats:
    df[i] = np.where(df[i]==True, 1, 0)

In [398]:
for i in crime_cats:
    df[i] = df.groupby(["ID_VARIABLE", "JPR_ID"])[i].transform("max")

## Collapsing the county information at the JPR_ID Level

In [399]:
# !!! AZ edit: get CTY at JPR_ID level 
df['CTY_PHL'] = df.groupby(by=['ID_VARIABLE', 'JPR_ID'])['CTY_PHL'].transform('max')

### Collapse additional columns !!! AZ added

In [400]:
# !!! AZ Edit
df["ADJ_JPMIN"] = df.groupby(by=["ID_VARIABLE", "JPR_ID"])["ADJ_JPMIN"].transform(max)
df["LATEST_JPMIN"] = df.groupby(by=["ID_VARIABLE", "JPR_ID"])["LATEST_JPMIN"].transform(max)
df["NEW_INC_SANCTION_EXISTS"] = df.groupby(by=["ID_VARIABLE", "JPR_ID"])["NEW_INC_SANCTION_EXISTS"].transform(max)


In [401]:
df["INCMIN"] = df.groupby(by=["ID_VARIABLE", "JPR_ID"])["INCMIN"].transform(max)

In [402]:
df['OFN_LIFE_DEATH'].unique()

array([nan, 'LIFE', 'DEATH'], dtype=object)

In [403]:
df['JP_LIFE_DEATH'].unique()

array([nan, 'Yes', 'No'], dtype=object)

In [404]:
# replace "No" values in JP_LIFE_DEATH column with NAs 
df['JP_LIFE_DEATH'] = np.where(df['JP_LIFE_DEATH']=='No', np.nan, df['JP_LIFE_DEATH'])

In [405]:
life_death = ['OFN_LIFE_DEATH', 'JP_LIFE_DEATH']

# replace with numeric values
for i in life_death:
    df[i] = np.where(df[i].isna(), 0, 1)

In [406]:
for i in life_death:
    df[i] = df.groupby(by=["ID_VARIABLE", "JPR_ID"])[i].transform(max)

# Collapse the data at the ID_VARIABLE, DOS-LEVEL 



### Adjust the following columns to be at the id var, dos level

In [407]:
# !!! AZ Edit - COMMENTED OUT FOR NOW BY TJ
df['LATEST_JPMIN'] = np.where(df['LATEST_JPMIN'].isna(), df['JP_MIN'], df['LATEST_JPMIN']) # TJ edit: why is this added here? 
# AZ response: so we can output the latest JPMIN for analysis later on for the incarceration sentence comparisons 

In [409]:
df_collapsed = df[['ID_VARIABLE', 'NEW_DOS', 'NEW_DOF', 'PRS', 'OGS', 'PRS8', 'ADJ_JPMIN', 
    'LATEST_JPMIN', 'CTY_PHL', 'INCMIN', 'NEW_INC_SANCTION_EXISTS', 
    'CHARGE_COUNT', 'INC_REL_NUMERIC', 'OFN_LIFE_DEATH', 'JP_LIFE_DEATH'] + crime_cats].copy()


#get the max values of the OGS and JP_MIN values -- possibly further adjustments need to be at this level
df_collapsed['OGS'] = df_collapsed.groupby(by=["ID_VARIABLE", "NEW_DOS"])["OGS"].transform(max)

#for the same id_variable, DOS pairing, get the "MIN" (i.e. first) DOF
df_collapsed['NEW_DOF'] = df_collapsed.groupby(by=["ID_VARIABLE", "NEW_DOS"])["NEW_DOF"].transform(min)

#get the max PRS and PRS8 scores at the id_variable, date of sentencing level
df_collapsed['PRS'] = df_collapsed.groupby(by=["ID_VARIABLE", "NEW_DOS"])["PRS"].transform(max)
df_collapsed["PRS8"] = df_collapsed.groupby(by=["ID_VARIABLE", "NEW_DOS"])["PRS8"].transform(max)

#collapse each crime type variable at the id var, dos level
for i in crime_cats:
    df_collapsed[i] = df_collapsed.groupby(by=["ID_VARIABLE", "NEW_DOS"])[i].transform(max)

# collapse the adjusted JPMIN and latest JPMIN variables 
df_collapsed["ADJ_JPMIN"] = df_collapsed.groupby(by=["ID_VARIABLE", "NEW_DOS"])["ADJ_JPMIN"].transform(max)
df_collapsed["LATEST_JPMIN"] = df_collapsed.groupby(by=["ID_VARIABLE", "NEW_DOS"])["LATEST_JPMIN"].transform(max)

# also collapse INCMIN values for use in edge cases later
df_collapsed["INCMIN"] = df_collapsed.groupby(by=["ID_VARIABLE", "NEW_DOS"])["INCMIN"].transform(max)

# collapse Philly county flag information
df_collapsed["CTY_PHL"] = df_collapsed.groupby(by=["ID_VARIABLE", "NEW_DOS"])["CTY_PHL"].transform(max)

# collapse incarceration sanction information
df_collapsed["NEW_INC_SANCTION_EXISTS"] = df_collapsed.groupby(by=["ID_VARIABLE", "NEW_DOS"])["NEW_INC_SANCTION_EXISTS"].transform(max)

# collapse life/death information
for i in life_death:
    df_collapsed[i] = df_collapsed.groupby(by=["ID_VARIABLE", "NEW_DOS"])[i].transform(max)

# for charge counts and inc relationship, use the data observed for the most serious sanction (e.g. max latest_jp_min)
# we use LATEST_JPMIN here instead of adjusted JPMIN to consider the actual total length of sentence
# rather than the remaining sentence that needs to be served, for cases with multiple DOS per JP
# we first sort by ID_VAR (ascending), DOS (ascending), and LATEST_JPMIN (descending), 
# and grab the first value for CHARGE_COUNT and INC_REL_NUMERIC within each group 
df_collapsed['CHARGE_COUNT'] = df_collapsed.sort_values(by=['ID_VARIABLE', 'NEW_DOS', 'LATEST_JPMIN'], ascending=[True, True, False]
    ).groupby(['ID_VARIABLE', 'NEW_DOS'])['CHARGE_COUNT'].transform('first')

# note that skipna is set to False here since we want to keep N
df_collapsed['INC_REL_NUMERIC'] = df_collapsed.sort_values(by=['ID_VARIABLE', 'NEW_DOS', 'LATEST_JPMIN'], ascending=[True, True, False]
    ).groupby(['ID_VARIABLE', 'NEW_DOS'])['INC_REL_NUMERIC'].transform('first')


In [None]:
# test case !!! AZ
#df_collapsed[df_collapsed["ID_VARIABLE"]==1884187][['ID_VARIABLE', 'JPR_ID', 'DOS', 'JP_MIN', 'LATEST_JPMIN', 'CHARGE_COUNT', 'INC_REL_NUMERIC_CLEANED', 'INC_RELATIONSHIP']]

In [None]:
#df_collapsed[df_collapsed["ID_VARIABLE"]==1001234][['ID_VARIABLE', 'JPR_ID', 'DOS', 'JP_MIN', 'LATEST_JPMIN', 'CHARGE_COUNT', 'INC_REL_NUMERIC_CLEANED', 'INC_RELATIONSHIP']]

In [None]:
# note the last two rows above have INC_REL as 0 even though there are multiple charges with JP_MIN
# in the orig dataset this is also missing
#psc_trimmed[psc_trimmed["ID_VARIABLE"]==1001234][['ID_VARIABLE', 'JPR_ID', 'DOS', 'JP_MIN', 'INC_RELATIONSHIP']]

## Actually Collapse the Dataset Now 

In [410]:
#collapse data to be at the id variable, DOS level (need to ungroup the data for the at_risk date calculation to work)
df_collapsed = df_collapsed.copy().groupby(["ID_VARIABLE", "NEW_DOS"]).first().reset_index()

#inspect the results
df_collapsed.head() # [["ID_VARIABLE", "DOS", "NEW_DOF", "NEW_INC_SANCTION_EXISTS", "ADJ_JPMIN", "LATEST_JPMIN", "CHARGE_COUNT", "CTY_PHL", "CHARGE_COUNT", "INC_REL_NUMERIC"] + crime_cats] #, "OFN_LIFE_DEATH", "JP_LIFE_DEATH"]] 

Unnamed: 0,ID_VARIABLE,NEW_DOS,NEW_DOF,PRS,OGS,PRS8,ADJ_JPMIN,LATEST_JPMIN,CTY_PHL,INCMIN,...,CHARGE_COUNT,INC_REL_NUMERIC,OFN_LIFE_DEATH,JP_LIFE_DEATH,SEXCRIME,SEXTIER_NUMERIC,FIREARMS,VIOLENCE,DRUGOFFENSE,DUI
0,1000001,2010-02-18,2009-06-25,0,3.0,1.0,16.0,16.0,0,0.526316,...,3,0.0,0,0,0,0,0,0,0,0
1,1000002,2017-01-31,2015-09-01,4,3.0,2.0,120.0,120.0,0,4.0,...,2,1.0,0,0,0,0,0,0,0,1
2,1000003,2002-05-08,2001-09-07,0,3.0,2.0,0.0,0.0,0,,...,1,0.0,0,0,0,0,0,0,1,0
3,1000003,2009-03-04,2009-03-04,3,3.0,3.0,92.0,92.0,0,3.0,...,1,0.0,0,0,0,0,0,0,0,0
4,1000004,2013-12-10,2013-09-19,0,1.0,2.0,0.0,0.0,0,,...,1,0.0,0,0,0,0,0,0,1,0


#### TEMP - testing

In [411]:
df_collapsed[(df_collapsed['INCMIN']>0) & (df_collapsed['NEW_INC_SANCTION_EXISTS']!=1)][['ID_VARIABLE',  'NEW_INC_SANCTION_EXISTS', 'INCMIN', 'ADJ_JPMIN']]

Unnamed: 0,ID_VARIABLE,NEW_INC_SANCTION_EXISTS,INCMIN,ADJ_JPMIN


In [412]:
df_collapsed[(df_collapsed['NEW_INC_SANCTION_EXISTS']==1) & (df_collapsed['ADJ_JPMIN'].isna()) ][['ID_VARIABLE', 'NEW_DOS',  'INCMIN', 'ADJ_JPMIN', 'LATEST_JPMIN', 'NEW_INC_SANCTION_EXISTS']]

Unnamed: 0,ID_VARIABLE,NEW_DOS,INCMIN,ADJ_JPMIN,LATEST_JPMIN,NEW_INC_SANCTION_EXISTS
127,1000076,2010-12-06,27.000000,,,1
342,1000204,2006-09-27,0.098684,,,1
579,1000351,2005-12-14,9.000000,,,1
665,1000401,2001-07-12,5.000000,,,1
1839,1001140,2005-06-16,0.164474,,,1
...,...,...,...,...,...,...
1483701,1914216,2015-01-09,120.000000,,,1
1484141,1914495,2006-11-30,0.526316,,,1
1485947,1915606,2005-10-03,3.157895,,,1
1486179,1915744,2005-11-16,0.065789,,,1


In [416]:
df_collapsed.shape

(1486949, 21)

### **STEP 3:** Calculate the AT_RISK_DT using the following logic

In [431]:
# check cases where INC_SANCTION_EXISTS is 1 but there are no INCMIN or JP_MIN values filled out
# all of these seem to have long mandatory minimums 
df[(df.NEW_INC_SANCTION_EXISTS==1) & (df['INCMIN'].isna()) & (df['ADJ_JPMIN'].isna())]['MND_MINIMUM'].unique()

array(['LIFE', nan, 'DEATH', '420 Months', '360 Months'], dtype=object)

In [446]:
# but there are cases with NA values in the MND_MINIMUM column. these are likely life/death cases. if we filter out life/death using OFN_LIFE_DEATH and JP_LIFE_DEATH, no further rows remain
df[(df.NEW_INC_SANCTION_EXISTS==1) & (df['INCMIN'].isna()) & (df['ADJ_JPMIN'].isna()) & (df['MND_MINIMUM'].isna()) & (df['OFN_LIFE_DEATH']==0) & (df['JP_LIFE_DEATH']==0)]

Unnamed: 0,UNNAMED: 0.1,UNNAMED: 0,INC_END,MOSTSERIOUS,MND_MINIMUM,OFF_AGE,PRS_MANUAL,JPS_DRUG_DEPENDENT,INC_TYPE,JPO_ID,...,INC_REL_NUMERIC,CHARGE_COUNT,NUM_DOS,MAX_DOS,MIN_DOS,TIME_SERVED,LATEST_JPMIN,ADJ_JPMIN,NEW_DOS,NEW_INC_SANCTION_EXISTS


In [433]:
df[(df.NEW_INC_SANCTION_EXISTS==1) & (df['INCMIN'].isna()) & (df['ADJ_JPMIN'].isna())]['JP_LIFE_DEATH'].unique()

array([0, 1])

In [427]:
df[(df['ID_VARIABLE']==1001381) & (df['NEW_DOS']==pd.to_datetime('2019-09-13'))][['ID_VARIABLE', 'MND_MINIMUM', 'OFN_LIFE_DEATH', 'JP_LIFE_DEATH', 'OGS', 'DOS', 'JPR_ID', 'INCMIN', 'INC_END', 'JP_MIN', 'ADJ_JPMIN', 'INC_SANCTION_EXISTS', 'NEW_INC_SANCTION_EXISTS']]

Unnamed: 0,ID_VARIABLE,MND_MINIMUM,OFN_LIFE_DEATH,JP_LIFE_DEATH,OGS,DOS,JPR_ID,INCMIN,INC_END,JP_MIN,ADJ_JPMIN,INC_SANCTION_EXISTS,NEW_INC_SANCTION_EXISTS
2486179,1001381,LIFE,1,1,0.0,2019-09-13,6043557,,29 Dec 99,,,Y,1


In [439]:
pd.to_datetime('29 Dec 99').date()

datetime.date(1999, 12, 29)

In [450]:
def create_at_risk_date(row):
    #need to account for REALLY large JP_MIN values
    
    # Because of this error message OverflowError: Python int too large to convert to C long
    # 25 is more years than we have in our data, so their at_risk date also get set to some value far in the future
    upper_limit = 25.0 * 365.0
    
    num_days_in_month = 30.0
    
    #if offense has a life or death flag, set their at_risk_date abritarily large
    if row['OFN_LIFE_DEATH'] == 1 or row['JP_LIFE_DEATH'] == 1:
        at_risk_date = pd.to_datetime('2035-12-31')

    #if they were not incarcerated, then their at risk date is just their date of sentence
    elif row["NEW_INC_SANCTION_EXISTS"] == 0:
        at_risk_date = row['NEW_DOS'] #(was previously DOF but should be DOS)
    
    #if they were incarcerated, look at the below logic to determine their at-risk date
    else: # this case already accounts for NEW_INC_SANCTION_EXISTS == 1

        if row["ADJ_JPMIN"] < upper_limit:

            if row['ADJ_JPMIN']>=0:

                #!!! AZ: updated all of the DOS vars to NEW_DOS. if ADJ_JPMIN is adjusted JPMIN that already subtracted time served
                #  (in other words, equals the latest JPMIN minus time already served equals time remaining from the latest DOS)
               # then we should use NEW_DOS to do this calculation 
                at_risk_date = row['NEW_DOS'] + pd.Timedelta(days = row['ADJ_JPMIN'])
            
            elif row['INCMIN']>=0:
                at_risk_date = row['NEW_DOS'] + pd.Timedelta(days = row['INCMIN'] * num_days_in_month)

            # in the case that INC_SANCTION_EXISTS but both JP_MIN and INCMIN are null, we treat these as LIFE/DEATH cases, as shown in the previous analysis
            else: #!!! AZ changed here: based on the analysis above, we can treat these cases with INC_SANCTION = 1 but JPMIN and INCMIN as life/death cases
                at_risk_date = pd.to_datetime('2035-12-31')

        else:
            at_risk_date = pd.to_datetime('2035-12-31')

    return at_risk_date


#apply the function to the data (row by row)
df_collapsed["AT_RISK_DT"] = df_collapsed.apply(create_at_risk_date, axis = 1)

 #adjust so that the times do not include minutes and seconds
df_collapsed["AT_RISK_DT"] = pd.to_datetime(df_collapsed["AT_RISK_DT"]).dt.date

# #inspect the results
df_collapsed[['ID_VARIABLE', "INCMIN", "ADJ_JPMIN", "NEW_INC_SANCTION_EXISTS", "NEW_DOS", "NEW_DOF", "AT_RISK_DT"]]



Unnamed: 0,ID_VARIABLE,INCMIN,ADJ_JPMIN,NEW_INC_SANCTION_EXISTS,NEW_DOS,NEW_DOF,AT_RISK_DT
0,1000001,0.526316,16.0,1,2010-02-18,2009-06-25,2010-03-06
1,1000002,4.000000,120.0,1,2017-01-31,2015-09-01,2017-05-31
2,1000003,,0.0,0,2002-05-08,2001-09-07,2002-05-08
3,1000003,3.000000,92.0,1,2009-03-04,2009-03-04,2009-06-04
4,1000004,,0.0,0,2013-12-10,2013-09-19,2013-12-10
...,...,...,...,...,...,...,...
1486944,1916193,,0.0,0,2002-01-07,2001-05-03,2002-01-07
1486945,1916194,,0.0,0,2016-11-14,2015-03-30,2016-11-14
1486946,1916195,,0.0,0,2009-06-04,2009-05-16,2009-06-04
1486947,1916196,1.000000,31.0,1,2014-03-03,2013-07-05,2014-04-03


**Note:** In the above at_risk_date calculation code, there is an "upper_limit" because the largest JP_MIN value is 230,000+ days, which is the equivalent of about 631 years. This person would not recidivate in our dataset and Python throws a "OverflowError: Python int too large to convert to C long" for these individuals. So, in order to allow the code to run, those with jp_min values equivalent to more days than we have data for, will just get an at-risk date very far into the future.

In [454]:
# !!! AZ: i think this issue is fixed below
# #correct the at_risk_dt calculation for some rows:
#df_collapsed.loc[(~(df_collapsed["ADJ_JPMIN"]).isnull()) & (~(df_collapsed['NEW_DOS'].isnull())), "AT_RISK_DT"]# = df_collapsed['MAX_DOS']  + pd.to_timedelta(df_collapsed['ADJ_JPMIN'], unit='d')

# #will another line here work to resolve this issue?
# df_subset_mult = df_collapsed.loc[(~(df_collapsed["ADJ_JPMIN"]).isnull()) & (~(df_collapsed['MAX_DOS'].isnull()))]  #, "AT_RISK_DT"] # = pd.to_datetime(df_collapsed["AT_RISK_DT"])
# df_subset_rest = df_collapsed.loc[(~(df_collapsed["ADJ_JPMIN"]).isnull()) & ((df_collapsed['MAX_DOS'].isnull()))]  #, "AT_RISK_DT"] # = pd.to_datetime(df_collapsed["AT_RISK_DT"])



In [None]:
# df_subset_mult[['ID_VARIABLE', 'JPR_ID', "MAX_DOS","ADJ_JPMIN", "NEW_DOF", "AT_RISK_DT"]]

# df_subset_mult["AT_RISK_DT"] = pd.to_datetime(df_subset_mult["AT_RISK_DT"])

# df_subset_mult[['ID_VARIABLE', 'JPR_ID', "MAX_DOS","ADJ_JPMIN", "NEW_DOF", "AT_RISK_DT"]]


In [None]:
# #concatenate the dataframes back together
# df_collapsed = pd.concat([df_subset_mult, df_subset_rest])

# df_collapsed.loc[(~(df_collapsed["ADJ_JPMIN"]).isnull()) & (~(df_collapsed['MAX_DOS'].isnull()))].head()[['ID_VARIABLE', 'JPR_ID', "MAX_DOS","ADJ_JPMIN", "NEW_DOF", "AT_RISK_DT"]]



## Populate Next DOF

In [456]:
#sort the data
df_collapsed = df_collapsed.sort_values(by = ["ID_VARIABLE", "NEW_DOF"])

#shift the data up by one to create the new vaariable "NEXT_DOF"
df_collapsed['NEXT_DOF'] = df_collapsed.groupby(['ID_VARIABLE'])['NEW_DOF'].shift(-1).dt.date

df_collapsed[:20][["ID_VARIABLE", "NEW_DOS", "NEW_DOF", "NEXT_DOF", "AT_RISK_DT", "NEW_INC_SANCTION_EXISTS"]]

Unnamed: 0,ID_VARIABLE,NEW_DOS,NEW_DOF,NEXT_DOF,AT_RISK_DT,NEW_INC_SANCTION_EXISTS
0,1000001,2010-02-18,2009-06-25,NaT,2010-03-06,1
1,1000002,2017-01-31,2015-09-01,NaT,2017-05-31,1
2,1000003,2002-05-08,2001-09-07,2009-03-04,2002-05-08,0
3,1000003,2009-03-04,2009-03-04,NaT,2009-06-04,1
4,1000004,2013-12-10,2013-09-19,2018-07-09,2013-12-10,0
5,1000004,2018-09-26,2018-07-09,NaT,2018-09-26,0
6,1000005,2008-08-11,2006-08-14,NaT,2009-02-10,1
7,1000006,2006-08-30,2005-10-08,NaT,2007-11-30,1
8,1000007,2004-03-02,2003-04-18,NaT,2004-03-02,0
9,1000008,2011-05-13,2011-01-16,NaT,2011-11-13,1


## Check for "Free Time" 
(i.e.: Do we have enough data for an individual to see if they recidivated in 3 years or not?)

**Procedure Below:**
1. Subset just to those whose at_risk date < max DOS df[["DOS"]].max()
2. Then, we also want to remove those whose last next_dof is null and whose last dof > 2017
3. Essentially, we want to subset (whatever grouping variable we're using) to just those entries where next_dof is null and FOR THIS SAME ROW, if the dof >= pd.todatetime("2017-01-01") -- remove these entries



In [460]:
#subset to those whose at_risk_date < the largest sentencing date that we have

before_length = len(df_collapsed)

#what is the maximum sentence date?
last_day = pd.to_datetime(df_collapsed[["NEW_DOS"]].max())[0]  
df_collapsed = df_collapsed[df_collapsed["AT_RISK_DT"] <= last_day]

after_length = len(df_collapsed) 

print("There are {:,} id_var, dos combos where the at risk date is after the last date of sentence available.".format(before_length - after_length))


  result = libops.scalar_compare(x.ravel(), y, op)


There are 0 id_var, dos combos where the at risk date is after the last date of sentence available.


Here, I calculate a "LAST_DOF" variable, which will then be used to subset the data to only those whose latest offense was before 2017

In [None]:
# !!! AZ: I am very confused here, shouldn't we subset based on risk window end date? 

# df_collapsed["LAST_DOF"] = df_collapsed.loc[df_collapsed["NEXT_DOF"].isnull(), "NEW_DOF"]

# df_collapsed[["ID_VARIABLE", "DOS", "NEW_DOF", "NEXT_DOF", "LAST_DOF"]]

In [None]:
#subset the data to only those whose last_dof is before 2017
#before_length = len(df_collapsed)

# last_day = pd.to_datetime("2017-01-01") 

# #subset the dataset to either where the LAST_DOF is null OR LAST_DOF < last_day
# df_collapsed = df_collapsed.loc[(df_collapsed["LAST_DOF"].isnull()) | (df_collapsed["LAST_DOF"] < last_day)]

# # after_length = len(df_collapsed) 
# # print("There are {:,} id_var, dos combos whose's last dof is not in scope.".format(before_length - after_length))

# df_collapsed[["ID_VARIABLE", "DOS", "NEW_DOF", "NEXT_DOF", "LAST_DOF"]]


# Create The Time To Recidivite and Recidivism Variables

In [462]:
#subtract the next_dof and at_risk_dt variables  -- update this 
df_collapsed['TIME_TO_RECIDIVATE'] = pd.to_datetime(df_collapsed['NEXT_DOF']) - pd.to_datetime(df_collapsed['AT_RISK_DT'])#update to this level 
    
#update the time to recidivate column to JUST be the number of days as an integer/float
df_collapsed['TIME_TO_RECIDIVATE'] = df_collapsed['TIME_TO_RECIDIVATE'].dt.days

df_collapsed[["ID_VARIABLE", "NEW_DOS", "NEW_DOF", "NEXT_DOF", "TIME_TO_RECIDIVATE"]]


Unnamed: 0,ID_VARIABLE,NEW_DOS,NEW_DOF,NEXT_DOF,TIME_TO_RECIDIVATE
0,1000001,2010-02-18,2009-06-25,NaT,
1,1000002,2017-01-31,2015-09-01,NaT,
2,1000003,2002-05-08,2001-09-07,2009-03-04,2492.0
3,1000003,2009-03-04,2009-03-04,NaT,
4,1000004,2013-12-10,2013-09-19,2018-07-09,1672.0
...,...,...,...,...,...
1486944,1916193,2002-01-07,2001-05-03,NaT,
1486945,1916194,2016-11-14,2015-03-30,NaT,
1486946,1916195,2009-06-04,2009-05-16,NaT,
1486947,1916196,2014-03-03,2013-07-05,NaT,


In [463]:
#number of days in  years
three_years_in_days = float(3) * 365.0  
five_years_in_days = float(5) * 365.0  

#ID_VARIABLE, DOS-LEVEL RECIDIVISM -- does not count times where the next_dof < at_risk_dt as instances of recidivism

df_collapsed["RECIDIVISM_3Y"] = np.where(
    (df_collapsed['TIME_TO_RECIDIVATE'] > 0) & (df_collapsed['TIME_TO_RECIDIVATE'] <= three_years_in_days), 1, 0)

df_collapsed["RECIDIVISM_5Y"] = np.where(
    (df_collapsed['TIME_TO_RECIDIVATE'] > 0) & (df_collapsed['TIME_TO_RECIDIVATE'] <= five_years_in_days), 1, 0)

df_collapsed[["ID_VARIABLE", "NEW_DOS", "ADJ_JPMIN", "NEW_DOF", "NEXT_DOF", "AT_RISK_DT", "TIME_TO_RECIDIVATE", "RECIDIVISM_3Y", "RECIDIVISM_5Y"]]


Unnamed: 0,ID_VARIABLE,NEW_DOS,ADJ_JPMIN,NEW_DOF,NEXT_DOF,AT_RISK_DT,TIME_TO_RECIDIVATE,RECIDIVISM_3Y,RECIDIVISM_5Y
0,1000001,2010-02-18,16.0,2009-06-25,NaT,2010-03-06,,0,0
1,1000002,2017-01-31,120.0,2015-09-01,NaT,2017-05-31,,0,0
2,1000003,2002-05-08,0.0,2001-09-07,2009-03-04,2002-05-08,2492.0,0,0
3,1000003,2009-03-04,92.0,2009-03-04,NaT,2009-06-04,,0,0
4,1000004,2013-12-10,0.0,2013-09-19,2018-07-09,2013-12-10,1672.0,0,1
...,...,...,...,...,...,...,...,...,...
1486944,1916193,2002-01-07,0.0,2001-05-03,NaT,2002-01-07,,0,0
1486945,1916194,2016-11-14,0.0,2015-03-30,NaT,2016-11-14,,0,0
1486946,1916195,2009-06-04,0.0,2009-05-16,NaT,2009-06-04,,0,0
1486947,1916196,2014-03-03,31.0,2013-07-05,NaT,2014-04-03,,0,0


In [465]:
# !!! AZ add here 
# subset to cases with at_risk_date <= 2017-01-01, AFTER we get recidivsm info (e.g. we don't want to lose convictions in 2018 or 2019 as they might count as recidivism events for earlier convictions)
starting_len = len(df_collapsed)
df_collapsed = df_collapsed[df_collapsed['AT_RISK_DT']<=pd.to_datetime('2017-01-01')]
ending_len = len(df_collapsed)
print("There are {:d} cases where the at-risk-date is less than 3 years from the end of the data range, and therefore does not have a long enough observation period for recidivism calculations".format(starting_len - ending_len))

  result = libops.scalar_compare(x.ravel(), y, op)


There are 234894 cases where the at-risk-date is less than 3 years from the end of the data range, and therefore does not have a long enough observation period for recidivism calculations


In [466]:
print(df_collapsed.shape)

(1216238, 26)


## Export The Results to CSV 
(PA_SENTENCING/Project/data/recidivism_dataset.csv)

In [None]:
# #create a new date of sentence variable

# #if max_dos is null, then there is only one DOS associated with a given JPR_ID OTHERWISE, the new_dos becomes the max dos (meaning there were multiple dates of sentence associated with that JPR_ID)

# df_collapsed.loc[df_collapsed["MAX_DOS"].notna(), "NEW_DOS"] = df_collapsed["MAX_DOS"]

# df_collapsed.loc[df_collapsed["MAX_DOS"].isnull(), "NEW_DOS"] = df_collapsed["DOS"]

# df_collapsed[["MAX_DOS","DOS", "NEW_DOS"]]

In [467]:
df_collapsed.columns

Index(['ID_VARIABLE', 'NEW_DOS', 'NEW_DOF', 'PRS', 'OGS', 'PRS8', 'ADJ_JPMIN',
       'LATEST_JPMIN', 'CTY_PHL', 'INCMIN', 'NEW_INC_SANCTION_EXISTS',
       'CHARGE_COUNT', 'INC_REL_NUMERIC', 'OFN_LIFE_DEATH', 'JP_LIFE_DEATH',
       'SEXCRIME', 'SEXTIER_NUMERIC', 'FIREARMS', 'VIOLENCE', 'DRUGOFFENSE',
       'DUI', 'AT_RISK_DT', 'NEXT_DOF', 'TIME_TO_RECIDIVATE', 'RECIDIVISM_3Y',
       'RECIDIVISM_5Y'],
      dtype='object')

In [468]:
#Export the Results to a CSV
#subset the dataset before exporting it
# df_collapsed_subset = df_collapsed[["ID_VARIABLE", "NEW_DOS", "NEW_DOF", "PRS", 
# "PRS8", "NEW_INC_SANCTION_EXISTS", "ADJ_JPMIN", "LATEST_JPMIN", "AT_RISK_DT", "COUNTY", 
# "NEXT_DOF", "TIME_TO_RECIDIVATE", "RECIDIVISM_3Y", "RECIDIVISM_5Y", "OGS",
# "MS_SANCTION","SEXCRIME","DUI","DRUGOFFENSE","FIREARMS","VIOLENCE",'JPS_WALSHACTASSESSMENT', 'CTY_PHL', 'CHARGE_COUNT', 'INC_REL_NUMERIC']]


#get the demographics dataset
cleaned_demographics = pd.read_csv(os.path.join(pa_sentencing_path, "Project", "data", "demographic_dataset.csv"))

#merged the recidivism dataset with the cleaned demographics dataset
result = pd.merge(df_collapsed, cleaned_demographics, how="left", on=["ID_VARIABLE", "ID_VARIABLE"])


# #export the dataframe with the recidivism variables to a new dataframe
#output_path = os.path.join(pa_sentencing_path, "Project", "data", "recidivism_dataset.csv")

#changes the output path to include a flag for including the PRS score 8 values
output_path = os.path.join(pa_sentencing_path, "Project", "data", "recidivism_dataset_w_additional_analysis_3.csv")



result.to_csv(output_path,index=False) #export the final results

In [None]:
# #check to see what the data looks like
test = pd.read_csv(output_path)


print(test.RECIDIVISM_3Y.value_counts())
