# Load Data

In [1]:
#this jupyter notebook is essentially the same as the "recidivism-check" notebook, just cleaned up a bit (hence the name)
#import required libraries
import os
import pandas as pd
import numpy as np
import datetime

#get the folder path for this data
pa_sentencing_path = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))

## **WILL NEED TO EDIT THE BELOW TO MAKE IT GENERALIZABLE FOR THE PA SENTENCING COMISSION**

In [2]:
#read in trimmed version WITH 8th edition PRS score implementation
psc_trimmed = pd.read_csv(os.path.join(pa_sentencing_path, "Project", "data", "PSC8_CRIMECAT_MERGED_w_prs8.csv"))


  psc_trimmed = pd.read_csv(os.path.join(pa_sentencing_path, "Project", "data", "PSC8_CRIMECAT_MERGED_w_prs8.csv"))


# Description of the Dataset

Because the original dataset provided by the PA Sentencing Commission included **547** columns, and took anywhere from 20-30 minutes to load, we selected the columns of the original dataset that we most relevant for our analysis and created a new, "trimmed" dataset. This new trimmed dataset (loaded in above) has **93** columns. In the next cell, we display the names of the columns in the dataset.

In [3]:
print("The names of the columns in the trimmed dataset are: \n {}".format(psc_trimmed.columns.tolist()))

The names of the columns in the trimmed dataset are: 
 ['Unnamed: 0.1', 'Unnamed: 0', 'INC_END', 'MOSTSERIOUS', 'MND_MINIMUM', 'OFF_AGE', 'PRS_MANUAL', 'JPS_DRUG_DEPENDENT', 'INC_TYPE', 'JPO_ID', 'PROB_SANCTION_EXISTS', 'JPR_ID', 'DOSAGE', 'IP_START', 'GUILTY_NO_PENALTY', 'OFN_LABEL', 'STAT_MIN', 'F2TOT', 'DOB2', 'SGR_LVL', 'OFF_SEX', 'F3TOT', 'DOS', 'F1TOT', 'JP_MIN', 'INC_RELATIONSHIP', 'OFN_TITLE', 'OGS', 'JMIN', 'F1F2', 'REASON_THREE', 'JPR_LASTUPDATE', 'OFN_COUNT', 'MS_OFFBODY', 'OFN_GRADE', 'PCS_OFF_ID', 'JPS_DA_EVAL', 'SGR_LVL_OGS_PRS', 'INCMAX', 'REASON_ONE', 'M1TOT', 'PMIN', 'JPS_SVP', 'PRS_OTHER_MISD', 'DOF', 'GRADE', 'GLEPOCH', 'PRS8', 'IP_SANCTION_EXISTS', 'INC_RELATEDTO', 'IP_END', 'INC_RELATEDOTN', 'PRS_LAPSING', 'SIP', 'CONFORMITY', 'INC_SANCTION_EXISTS', 'DOB', 'MS_OFFINJP', 'INC_START', 'CONFORM', 'OFF_RACE', 'PRS', 'LABEL', 'PRS_NONLAPSING', 'JP_CC_BUG', 'MORE_REASONS', 'MS_SANCTION', 'STATE_IP', 'MS_SENTJP', 'OFN_LIFE_DEATH', 'OTN', 'DISPOSITION', 'COUNTY', 'DOFAGE',

In [4]:
#DELETE

# Copying a this column needed from the main dataset
# !!! AZ: I don't think we actually need ofn-id, i used it in my code to do a count so really any other column is fine! 
# usecols = ['OFN_ID']
# main_df = pd.read_csv('../../../Project/data/Main.csv', usecols=usecols) 

In [5]:
# DELETE

#ofn_df = main_df.copy()


In [6]:
df = psc_trimmed.copy()  #copy the trimmed dataset into a new dataframe

df.columns = df.columns.str.upper() #change all column names to uppercase

In [7]:
df.head() #inspect the dataset

Unnamed: 0,UNNAMED: 0.1,UNNAMED: 0,INC_END,MOSTSERIOUS,MND_MINIMUM,OFF_AGE,PRS_MANUAL,JPS_DRUG_DEPENDENT,INC_TYPE,JPO_ID,...,F2TOT.1,F3TOT.1,M1TOT.1,M2TOT.1,F1F2.1,MOSTSERIOUS.1,OFN_TITLE.1,OFN_SECTION,OFN_SUBSECTION,JPS_WALSHACTASSESSMENT
0,0,0,,7.0,,36.0,,N,,938145,...,0.0,0.0,0.0,0.0,0.0,7.0,18,6301,(a)(1)*,
1,1,1,02 Dec 02,7.0,48 Hours,18.0,,N,County Facility,942201,...,0.0,0.0,0.0,0.0,0.0,7.0,75,3731,,
2,2,2,27 Apr 01,7.0,48 Hours,36.0,,N,County Facility,971901,...,0.0,0.0,0.0,0.0,0.0,7.0,75,3731,,
3,3,3,,7.0,,22.0,,N,,919472,...,0.0,0.0,0.0,0.0,0.0,7.0,18,2701,(b),
4,4,4,01 Jan 03,6.0,30 Days,40.0,,N,County Facility,870391,...,0.0,0.0,0.0,0.0,0.0,6.0,75,3731,,


# Table of Contents  EDIT!!!!!

1. Data Cleaning
    1. Convert Dates 
    2. [Combine REVOC and RFEL Categories](#combining-revoc-into-rfel)
    3. Clean DOF
    1. [Clean Missing PRS Scores in the 7th Edition](#clean-missing-prs-score)
    4. Clean Missing PRS Scores in the 8th Edition
    1. [Clean JP_CC_BUG Issue](#clean-jp-cc-bug)
2. Adding New Columns
    1. Crime Categories
    2. Handing Philadelphia Cases Vs. Other Counties
2. [At-Risk Date Calculation](#implement-at-risk-date-calculation-logic)
    1. Group Data at JPR_ID Level
        1. [Address Mutiple Dates of Sentencing](#multiple-dos-for-one-jprid)
        2. [Create Adjusted JP_MIN Value]
        3. [Check INC_SANCTION EXISTS](#incsanctionexists-check)
    2. Group Data at the ID_VARIABLE, DOS LEVEL
    3. Implement At-Risk Date Logic
3. Calculate Recidivism
    1. [Calculate Next Date of Offense](#populate-next-dof)
    2. [Check for Free Time](#check-for-"free-time")
    3. [Calculate Time to Recidivate & 3-Year and 5-Year Recidivism Variables](#create-time-to-recidivate-and-recidivsm-variables)

Note: the links aove may not work well in VSCode but wold work better in Jupyter Notebooks via Anaconda
    


# Data Cleaning

## Convert Dates

In [8]:
## Convert Dates#convert date strings to datetime variable
df[['DOF','DOS']] = df[['DOF','DOS']].apply(pd.to_datetime,format="%d %b %y")

In [9]:
# extracting out the just the year from the date to be used later 
df['DOF_YEAR'] = pd.DatetimeIndex(df['DOF']).year
df['DOS_YEAR'] = pd.DatetimeIndex(df['DOS']).year

In [10]:
#checking the range of values for the DOF and DOS variables
print("The minimum date of offense in the dataset is: {}".format(df[["DOF"]].min()[0]))
print("The maximum date of offense in the dataset is: {}".format(df[["DOF"]].max()[0]))
print("The minimum date of sentencing in the dataset is: {}".format(df[["DOS"]].min()[0]))
print("The maximum date of sentencing in the dataset is: {}".format(df[["DOS"]].max()[0]))

The minimum date of offense in the dataset is: 1984-11-14 00:00:00
The maximum date of offense in the dataset is: 2020-05-08 00:00:00
The minimum date of sentencing in the dataset is: 2001-01-01 00:00:00
The maximum date of sentencing in the dataset is: 2019-12-31 00:00:00


## Combine REVOC into RFEL for 7th Edition PRS Scores

In [11]:
# Values before conversion
df['PRS'].unique()

array(['0', '1', '2', '3', '5', '4', 'RFEL', 'REVOC', 'rfel', 'RFEl', nan,
       'Rfel', 'revoc'], dtype=object)

In [12]:
def refl_combine(x):
    if x in ['RFEL', 'REVOC', 'rfel', 'RFEl', 'Rfel', 'revoc']:
        return('RFEL/REVOC')
    else:
        return(x)   

In [13]:
df['PRS'] = df['PRS'].apply(refl_combine)

In [14]:
df['PRS8'].unique()

array([1., 2., 3., 4.])

In [15]:
df['PRS'].unique()

array(['0', '1', '2', '3', '5', '4', 'RFEL/REVOC', nan], dtype=object)

## Clean DOF

Note: group offense by ID_VAR, JPR_ID, MIN(DOF) to get the first DOF associated for a single JPR_ID

### Step 1: Get **minimum** value for the DOF across all of the charges associated with **one** JPR_ID, ID VARIABLE COMBO. 

Note: This is the procedure because we don't wan't to count a DOF as an instance of recidivism if it occurs BEFORE the date of sentencing. Additionally, we group here by id_var and jpr_id because multiple id variables can be associated with one JPR_ID.

In [16]:
#at the JPR_ID level we only want ONE DOF because becuase we don't want to take into account DOF's that occur
#BEFORE the DOS (associated with the JPR_ID) as an instance of recidivism. -- each JPR_ID should have only ONE DOS

#here we will group by id_Variable as well as jpr_id
df["NEW_DOF"] = df.groupby(["JPR_ID", "ID_VARIABLE"])["DOF"].transform("min")



In [17]:
#testing code
df_test = df.copy()

df_jprid = df_test[df_test["JPR_ID"] == 5499834]
df_test = df_test[df_test["ID_VARIABLE"] == 1468038]

df_test[["JPR_ID", "ID_VARIABLE", "DOS", "DOF", "NEW_DOF","OFN_LABEL"]]


Unnamed: 0,JPR_ID,ID_VARIABLE,DOS,DOF,NEW_DOF,OFN_LABEL
676640,465584,1468038,2006-09-07,2006-04-15,2005-12-31,DUI: High Rate of Alcohol - (BAC .10 - < .16) ...
676641,465584,1468038,2006-09-07,2005-12-31,2005-12-31,DUI: High Rate of Alcohol - (BAC .10 - < .16) ...
1964753,5499834,1468038,2015-03-24,2014-08-18,2014-08-18,Theft by unlaw taking-movable property ($200-$...
2104401,5646230,1468038,2016-05-23,2016-01-28,2016-01-28,Possession-drug paraphernalia


In [18]:
df.head()[["JPR_ID", "ID_VARIABLE","DOF", "NEW_DOF"]]

Unnamed: 0,JPR_ID,ID_VARIABLE,DOF,NEW_DOF
0,640001,1904581,2000-04-01,2000-04-01
1,642480,1157226,1999-12-31,1999-12-31
2,660434,1467650,2000-12-23,2000-12-23
3,628940,1746031,2000-06-26,2000-06-26
4,594048,1374131,2000-10-15,2000-10-15


In [19]:
dof_missing = df[df['NEW_DOF'].isnull()]

percent_missing = len(dof_missing)/len(df)
print("After cleaning, there are {:,} ({:%}) rows with missing DOFs in the dataset.".format(len(dof_missing), percent_missing))

After cleaning, there are 11,785 (0.454381%) rows with missing DOFs in the dataset.


### Step 2: Subset the data to just include those rows where NEW_DOF <= DOS

In [20]:
#make sure the sentencing 
before_length = len(df)
df = df[df.NEW_DOF <= df.DOS] 
after_length = len(df)


In [21]:
print("Before DOF <= DOS correction, there were {:,} rows and after cleaning there were {:,} rows. A change of {:,}.".format(before_length, after_length, before_length - after_length))


Before DOF <= DOS correction, there were 2,593,636 rows and after cleaning there were 2,581,813 rows. A change of 11,823.


## Clean Missing PRS Score 

In [22]:
before_length = len(df)
#subset to just the id variables with a PRS score missing
id_varswith_prsmissing= set(df[df.PRS.isnull()].ID_VARIABLE)

#remove id vars with missing PRS
df_prs_notaffected = df[~df.ID_VARIABLE.isin(id_varswith_prsmissing)]

#reassign to working dataframe
df = df_prs_notaffected 

after_length = len(df)
print("Before PRS correction there were {:,} rows and after cleaning there were {:,} rows. A change of {:,} rows and {} people.".format(before_length, after_length, before_length - after_length, len(id_varswith_prsmissing)))


Before PRS correction there were 2,581,813 rows and after cleaning there were 2,581,750 rows. A change of 63 rows and 18 people.


## Clean Missing PRS8 (8th Edition Sentencing Guidelines) Score

In [23]:
before_length = len(df)
#subset to just the id variables with a PRS score missing
id_varswith_prs8missing= set(df[df.PRS8.isnull()].ID_VARIABLE)

#remove id vars with missing PRS
df_prs8_notaffected = df[~df.ID_VARIABLE.isin(id_varswith_prs8missing)]

#reassign to working dataframe
df = df_prs8_notaffected 

after_length = len(df)
print("Before PRS8 correction there were {:,} rows and after cleaning there were {:,} rows. A change of {:,} rows and {} people.".format(before_length, after_length, before_length - after_length, len(id_varswith_prs8missing)))


Before PRS8 correction there were 2,581,750 rows and after cleaning there were 2,581,750 rows. A change of 0 rows and 0 people.


## Clean JP CC Bug

## Steps followed in cleaning JP_CC Bug
1. It is evident that there are JPR_ID's with DOS from 2016 to 2019 that were impacted by the JP_CC_BUG 
2. The first step was to extract out the ID variables that were impacted out by the bug. 
3. Next, we removed the Judicial proceedings of these JPR_ID's where the DOS is in 2017,2018, and 2019. However, the JPR_ID's associated with the first occurence of the JP_CC Bug is kept- In other words, the JPR_ID's where the DOS was in 2016 is kept. 


In [24]:
#confirming the years that impacted the JP_CC_BUG
set(df[df.JP_CC_BUG=='Y'].DOS_YEAR)


{2016, 2017, 2018, 2019}

In [25]:
# Obtaining the id variables with jp_bug
id_varswith_jpbug= set(df[df.JP_CC_BUG=='Y'].ID_VARIABLE) #pull out both id_variable and DOS

In [26]:
# assigning all the rows associated with the jp bugs to a seperate dataframe 
df_with_jpbug=  df[df.ID_VARIABLE.isin(id_varswith_jpbug)]  #want to remove the charges that come after the DOS associated with the JP_CC_BUG row (want to eliminate the problemative date of sentencing)

In [27]:
# Removing the JPR'IDS's that have DOS in 2017,2018 and 2019
df_jp_bug_cleaned = df_with_jpbug[df.DOS_YEAR<2017]

  df_jp_bug_cleaned = df_with_jpbug[df.DOS_YEAR<2017]


In [28]:
# Isolating the rows associated with id_vars in the original dataframe that is not associated with the bug
df_jpbug_notaffected = df[~df.ID_VARIABLE.isin(id_varswith_jpbug)]

In [29]:
# Rejoining the rows affected by the JP_CC_bug after cleaning them to the rows not affected by the bug
df_cleaned_1 = pd.concat([df_jpbug_notaffected,df_jp_bug_cleaned])  #new working df

df = df_cleaned_1

In [30]:
after_length = len(df)

print("After the JP_CC_BUG correction there are {:,} rows. ".format(after_length))


After the JP_CC_BUG correction there are 2,574,269 rows. 


# Adding New Columns

## Crime Categories

In [31]:
# Sex crimes
df['SEXCRIME'] = [True if (a == 'Tier I' or a == 'Tier II' or a == 'Tier III') else False for a in df['JPS_WALSHACTASSESSMENT']]

In [32]:
df.JPS_WALSHACTASSESSMENT.unique()

array([nan, 'Tier I', 'Tier II', 'Tier III'], dtype=object)

## Create a Numeric column for the Categorical Sex Crime tiers in JPS_WALSHACTASSESSMENT

In [33]:
def numeric_sextier(x):
    if x =='Tier I':
        return(1)
    elif x=='Tier II':
        return(2)
    elif x=='Tier III':
        return(3)
    else:
        return(x)      

In [34]:
df['SEXTIER_NUMERIC']= df['JPS_WALSHACTASSESSMENT'].apply(numeric_sextier)

In [35]:
df['OFN_SECTION_61'] = [True if a[:2]=='61' else False for a in df['OFN_SECTION']]
df['FIREARMS'] = np.where((df['OFN_SECTION_61'] == True) & (df['OFN_TITLE'] ==
18), True, False)


In [36]:
def conditions(s):
    if (s['OFN_TITLE']==18):
        if (s['OFN_SECTION']=='2502') & (s['OFN_SUBSECTION'] == 'C'):
            return True
        elif (s['OFN_SECTION']=='2502') & (s['OFN_SUBSECTION'] == '(c)'):
            return True
        elif (s['OFN_SECTION']=='2503'):
            return True
        elif (s['OFN_SECTION']=='2506'):
            return True
        elif (s['OFN_SECTION']=='2507') & (s['OFN_SUBSECTION'] == 'C'):
            return True
        elif (s['OFN_SECTION']=='2507') & (s['OFN_SUBSECTION'] == 'C1'):
            return True
        elif (s['OFN_SECTION']=='2507') & (s['OFN_SUBSECTION'] == 'C1I'):
            return True
        elif (s['OFN_SECTION']=='2507') & (s['OFN_SUBSECTION'] == 'C1II'):
            return True
        elif (s['OFN_SECTION']=='2507') & (s['OFN_SUBSECTION'] == 'D'):
            return True
        elif (s['OFN_SECTION']=='2604') & (s['OFN_SUBSECTION'] == 'C'):
            return True
        elif (s['OFN_SECTION']=='2604') & (s['OFN_SUBSECTION'] == 'C1'):
            return True
        elif (s['OFN_SECTION']=='2606'):
            return True
        elif (s['OFN_SECTION']=='2702') & (s['OFN_SUBSECTION'] == '(a)(1)'):
            return True
        elif (s['OFN_SECTION']=='2702') & (s['OFN_SUBSECTION'] == '(a)(1)*'):
            return True
        elif (s['OFN_SECTION']=='2702') & (s['OFN_SUBSECTION'] == 'A1'):
            return True
        elif (s['OFN_SECTION']=='2702') & (s['OFN_SUBSECTION'] == '(a)(2)'):
            return True
        elif (s['OFN_SECTION']=='2702') & (s['OFN_SUBSECTION'] == '(a)(2)*'):
            return True
        elif (s['OFN_SECTION']=='2702') & (s['OFN_SUBSECTION'] == 'A2'):
            return True
        elif (s['OFN_SECTION']=='2702.1'):
            return True
        elif (s['OFN_SECTION']=='2716') & (s['OFN_SUBSECTION'] == 'B'):
            return True
        elif (s['OFN_SECTION']=='2716') & (s['OFN_SUBSECTION'] == 'B1'):
            return True
        elif (s['OFN_SECTION']=='2716') & (s['OFN_SUBSECTION'] == '(b)'):
            return True
        elif (s['OFN_SECTION']=='2716') & (s['OFN_SUBSECTION'] == '(b)1'):
            return True
        elif (s['OFN_SECTION']=='2717') & (s['OFN_GRADE'] == 'F-1'):
            return True
        elif (s['OFN_SECTION']=='2718') & (s['OFN_GRADE'] == 'F-1'):
            return True
        elif (s['OFN_SECTION']=='2901'):
            return True
        elif (s['OFN_SECTION']=='3002') & (s['OFN_GRADE'] == 'F-1'):
            return True
        elif (s['OFN_SECTION']=='3011') & (s['OFN_GRADE'] == 'F-1'):
            return True
        elif (s['OFN_SECTION']=='3012') & (s['OFN_GRADE'] == 'F-1'):
            return True
        elif (s['OFN_SECTION']=='3121'):
            return True
        elif (s['OFN_SECTION']=='3123'):
            return True
        elif (s['OFN_SECTION']=='3124.1'):
            return True
        elif (s['OFN_SECTION']=='3125'):
            return True
        elif (s['OFN_SECTION']=='4302'):
            return True
        elif (s['OFN_SECTION']=='3301'):
            if (s['OFN_SUBSECTION'] == 'A'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A1'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A1I'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A1II'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A2'):
                return True
            elif (s['OFN_SUBSECTION'] == '(a)(1)*'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A.1'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A.11'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A.11I'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A.11II'):
                return True
        elif (s['OFN_SECTION']=='3311') & (s['OFN_GRADE'] == 'F-1'):     
            return True
        elif (s['OFN_SECTION']=='3502') & (s['OFN_SUBSECTION'] == 'A1'):
            return True
        elif (s['OFN_SECTION']=='3701'):
            if (s['OFN_SUBSECTION'] == '(a)(1)(i)'):
                return True
            elif (s['OFN_SUBSECTION'] == '(a)(1)(ii)'):
                return True
            elif (s['OFN_SUBSECTION'] == '(a)(1)(iii)'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A1I'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A1II'):
                return True
            elif (s['OFN_SUBSECTION'] == 'A1III'):
                return True
        elif (s['OFN_SECTION']=='3702'):
            return True
        else:
            return False
    else:
        return False

In [37]:
df['VIOLENCE'] = df.apply(conditions, axis=1)

In [38]:
# Drug
df['DRUGOFFENSE'] = np.where((df['OFN_TITLE'] == 35), True, False)

In [39]:
# DUI
df['DUI'] = np.where((df['OFN_TITLE'] == 75) & (df['OFN_SECTION']=='3802'), True, False)

# Handling Cases where sentencing was in Philly and another county 

In [40]:

# creating Philly indicator column
df['CTY_PHL'] = np.where(df['COUNTY']=='Philadelphia', 1, 0)

In [41]:
# !!! AZ EDIT 
df['CTY_PHL_CLEANED'] = df.groupby(by=['JPR_ID', 'ID_VARIABLE'])['CTY_PHL'].transform('max') # TJ EDIT = changed the order of these columns

# Adding Concurrent and Consecutive Sentencing Info

In [42]:
# !!! AZ changed the order per Matt's comment 

# Use the following order to map numeric values to INC_RELATIONSHIP (i.e., 0 for Concurrent and 1 for Consecutive). 
# By taking the max() value later, this will assign all cases with at least one consecutive sentence in the consecutive category. 
order = [np.nan, 'Concurrent', 'Consecutive']

In [43]:
df['INC_REL_NUMERIC'] = df['INC_RELATIONSHIP'].map(dict(zip(order, np.arange(len(order)))))

In [44]:
df[df['INC_RELATIONSHIP'].notna()][['INC_REL_NUMERIC', 'INC_RELATIONSHIP']].head()

Unnamed: 0,INC_REL_NUMERIC,INC_RELATIONSHIP
49,2.0,Consecutive
54,1.0,Concurrent
57,1.0,Concurrent
80,1.0,Concurrent
81,1.0,Concurrent


In [45]:
df[df['INC_RELATIONSHIP'].isna()][['INC_REL_NUMERIC', 'INC_RELATIONSHIP']].head()

Unnamed: 0,INC_REL_NUMERIC,INC_RELATIONSHIP
0,0.0,
1,0.0,
2,0.0,
3,0.0,
4,0.0,


In [46]:
# !!! AZ Edit 
df['INC_REL_NUMERIC_CLEANED'] = df.groupby(by=['ID_VARIABLE', 'JPR_ID'])['INC_REL_NUMERIC'].transform('max')

In [47]:
# !!! AZ Edit

# get the number of charges sentenced per JP on each DOS
df['CHARGE_COUNT'] = df.groupby(by=['ID_VARIABLE', 'JPR_ID', 'DOS'])['JPO_ID'].transform('count')

In [48]:
df[(df['ID_VARIABLE']==1884187) & (df['JPR_ID']==5199873)][['CHARGE_COUNT', 'JPR_ID', 'DOS']].sort_values(by='DOS')

Unnamed: 0,CHARGE_COUNT,JPR_ID,DOS
1591799,6,5199873,2012-12-06
1591800,6,5199873,2012-12-06
1591801,6,5199873,2012-12-06
1591802,6,5199873,2012-12-06
1591803,6,5199873,2012-12-06
1591804,6,5199873,2012-12-06
1885851,6,5199873,2014-10-09
1885850,6,5199873,2014-10-09
1885849,6,5199873,2014-10-09
1885848,6,5199873,2014-10-09


In [49]:
# FOR NOW: we will take the charge count associated with most severe sentence (largest JP_MIN) and highest OGS for a given DOS, after dealing with NANs
# TEAM: review later when incorporating into pipeline as to whether this logic makes sense 
#charge_counts['OGS'] = charge_counts['OGS'].fillna(0)
#charge_counts = charge_counts.sort_values(by=['ID_VARIABLE', 'DOS', 'JP_MIN', 'OGS', 'CHARGE_COUNT'], ascending=[True, True, False, False, False])

# Implement At Risk Date Calculation Logic

## Step 1: Address Issue with Multiple DOS for one JPR_ID

In [50]:
dos_vals = df.groupby(["JPR_ID"])['DOS'].agg(lambda x: set(x)) #> 1 #how many rows have two UNIQUE DOS for the same JPR_ID (set)

In [51]:
num_dos = df.groupby(["JPR_ID"])['DOS'].agg(lambda x: len(set(x))) #how many dates of sentencing does each jpr_id have?
#num_dos[2847193]

In [52]:
#more_than_one_dos.reset_index()
more_than_one_dos = list(num_dos[num_dos > 1].index)

print("There are {:,} JPR_IDS in the dataset with more than one date of sentence.".format(len(more_than_one_dos)))

There are 359 JPR_IDS in the dataset with more than one date of sentence.


In [53]:
#subset the data JUST to those ID's in the more_than_one dos bucket
only_one_dos = df.loc[~df["JPR_ID"].isin(more_than_one_dos)]
more_than_one_dos_df = df.loc[df["JPR_ID"].isin(more_than_one_dos)]


In [54]:
more_than_one_dos_df = more_than_one_dos_df.sort_values(["JPR_ID", "DOS"]) #sort by jpr_id AND DOS

more_than_one_dos_df.head()[["JPR_ID", "DOS", "OFN_LABEL", "PRS", "JP_MIN", "MS_SENTJP", "INC_SANCTION_EXISTS"]]


Unnamed: 0,JPR_ID,DOS,OFN_LABEL,PRS,JP_MIN,MS_SENTJP,INC_SANCTION_EXISTS
569776,92399,2005-01-06,Murder of The Second Degree,0,30346.0,,Y
2131297,92399,2016-06-27,Murder of The Second Degree,0,,Yes,Y
620698,117010,2005-02-01,Aggravated Assault - Cause or Att B.I. w/Deadl...,5,8127.0,,Y
620699,117010,2005-02-01,Burglary - Home: Person Present,5,8127.0,,Y
620700,117010,2005-02-01,Murder Inchoate - Attempt with S.B.I.,5,8127.0,,Y


In [55]:
#create a new column with the NEW_DOS value
more_than_one_dos_df["MAX_DOS"] = more_than_one_dos_df.groupby("JPR_ID")["DOS"].transform("max") #take the latest date of sentencing
more_than_one_dos_df["MIN_DOS"] = more_than_one_dos_df.groupby("JPR_ID")["DOS"].transform("min") #take the earliest date of sentencing

# #create a new time served column
more_than_one_dos_df["TIME_SERVED"] = more_than_one_dos_df["MAX_DOS"] - more_than_one_dos_df["MIN_DOS"]
more_than_one_dos_df["TIME_SERVED"] = more_than_one_dos_df["TIME_SERVED"].dt.days

#finds the JP_MIN associated with the latest DOS (because the data is already sorted by JPR_ID and DOS)
more_than_one_dos_df["LATEST_JPMIN"] = more_than_one_dos_df.groupby("JPR_ID")["JP_MIN"].transform("last")

#calculate an adjusted JP_MIN from the logic provided by Miranda
more_than_one_dos_df['ADJ_JPMIN'] = more_than_one_dos_df["LATEST_JPMIN"] - more_than_one_dos_df["TIME_SERVED"]

# # more_than_one_dos_df[:20][["JPR_ID", "DOS", "OFN_LABEL", "PRS", "JP_MIN", "ADJ_JPMIN", "INC_SANCTION_EXISTS", "NEW_DOS"]]
more_than_one_dos_df[:20][["JPR_ID", "ID_VARIABLE", "DOS", "OFN_LABEL", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN"]]


Unnamed: 0,JPR_ID,ID_VARIABLE,DOS,OFN_LABEL,MIN_DOS,JP_MIN,MAX_DOS,TIME_SERVED,LATEST_JPMIN,ADJ_JPMIN
569776,92399,1877126,2005-01-06,Murder of The Second Degree,2005-01-06,30346.0,2016-06-27,4190,30346.0,26156.0
2131297,92399,1877126,2016-06-27,Murder of The Second Degree,2005-01-06,,2016-06-27,4190,30346.0,26156.0
620698,117010,1325462,2005-02-01,Aggravated Assault - Cause or Att B.I. w/Deadl...,2005-02-01,8127.0,2007-10-17,988,5570.0,4582.0
620699,117010,1325462,2005-02-01,Burglary - Home: Person Present,2005-02-01,8127.0,2007-10-17,988,5570.0,4582.0
620700,117010,1325462,2005-02-01,Murder Inchoate - Attempt with S.B.I.,2005-02-01,8127.0,2007-10-17,988,5570.0,4582.0
620701,117010,1325462,2005-02-01,Aggravated Assault - Cause or Att B.I. w/Deadl...,2005-02-01,8127.0,2007-10-17,988,5570.0,4582.0
895508,117010,1325462,2007-10-17,Murder Inchoate - Attempt with S.B.I.,2005-02-01,5570.0,2007-10-17,988,5570.0,4582.0
895509,117010,1325462,2007-10-17,Aggravated Assault - Cause or Att B.I. w/Deadl...,2005-02-01,5570.0,2007-10-17,988,5570.0,4582.0
895510,117010,1325462,2007-10-17,Aggravated Assault - Cause or Att B.I. w/Deadl...,2005-02-01,5570.0,2007-10-17,988,5570.0,4582.0
895511,117010,1325462,2007-10-17,Burglary - Home: Person Present,2005-02-01,5570.0,2007-10-17,988,5570.0,4582.0


In [56]:
# Using the earlist DOS, we want to get the number of charges sentenced for the initial hearing of each JP
# !!! AZ Edit
more_than_one_dos_df['CHARGE_COUNT'] = more_than_one_dos_df.sort_values(by=['ID_VARIABLE', 'JPR_ID', 'DOS'], ascending=[True, True, True]).groupby(by=['ID_VARIABLE', 'JPR_ID'])['CHARGE_COUNT'].transform('last')

In [57]:
# using the latest JP_MIN information (i.e. the most 'final' incarceration length), get the inc_relationship associated with that JP 
# we also want to take the max(IND_REL_NUMERIC)
# !!!! AZ edit
more_than_one_dos_df['INC_REL_NUMERIC_CLEANED'] = more_than_one_dos_df.sort_values(by=['ID_VARIABLE', 'JPR_ID', 'DOS', 'INC_REL_NUMERIC_CLEANED'], ascending=[True, True, False, True]).groupby(by=['ID_VARIABLE', 'JPR_ID'])['INC_REL_NUMERIC_CLEANED'].transform('last')

In [58]:
#combine the data back together

df_combo_dos = pd.concat([only_one_dos, more_than_one_dos_df])

df = df_combo_dos

## **Step 2:** Create a New JP_MIN variable that takes the Max(JP_MIN) for a given JPR_ID, for cases that were not sentenced multiple times (EDIT & EXAMINE)

In [64]:
#inspect the results

#impute missing values for the ADJ_JPMIN values that are currently null -- if there are multiple JP_MINS for an instance that DOESNT have multiple sentencing dates, 
# just take the maximum value of the JP_MIN available and set it equal to the adj_jpmin value

# !!! AZ shouldn't this also be grouping by ID_VAR and JPR_ID? -- TJ ADDRESSED THIS

df.loc[df["ADJ_JPMIN"].isnull(), "ADJ_JPMIN"] =  df.groupby(["JPR_ID", "ID_VARIABLE"])["JP_MIN"].transform("max")

#examine if for a jpr_id, individiuals have the same jp_min in the same jpr_id
# 1. first just subset to those jpr_ids with multiple id_variables


#2. inspect if the jp_min values are the same

#df.sort_values(["JPR_ID"])[:20][["JPR_ID", "DOS", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN"]]


#[["JPR_ID", "DOS", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN"]]





In [63]:
#IN THE BELOW, WE CHECK TO SEE HOW MANY CASES THERE ARE WHERE THERE ARE MULTIPLE ID_VARS FOR A GIVEN JPR_ID 
# THIS NUMBER IS NECESSARY TO KNOW BC IT SERVES AS THE JUSTIFICATION FOR GROUPING AT THE JPR_ID, ID_VARIABLE LEVEL IN OUR DATASET
num_id_vars = df.groupby(["JPR_ID"])['ID_VARIABLE'].agg(lambda x: len(set(x)))
more_than_one_id_var = list(num_id_vars[num_id_vars > 1].index)
#more_than_one_id_var_df = df.loc[df["JPR_ID"].isin(more_than_one_id_var)]

len(more_than_one_id_var) #TJ EDIT


11

In [65]:
df[["JPR_ID", "DOS", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN"]]

df.loc[df["MAX_DOS"].isnull()][["JPR_ID", "DOS", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN"]]


Unnamed: 0,JPR_ID,DOS,MIN_DOS,JP_MIN,MAX_DOS,TIME_SERVED,LATEST_JPMIN,ADJ_JPMIN
0,640001,2001-06-12,NaT,,NaT,,,
1,642480,2001-12-03,NaT,2.0,NaT,,,2.0
2,660434,2001-04-26,NaT,2.0,NaT,,,2.0
3,628940,2001-05-22,NaT,,NaT,,,
4,594048,2001-01-03,NaT,183.0,NaT,,,183.0
...,...,...,...,...,...,...,...,...
2189215,5614645,2016-03-29,NaT,,NaT,,,
2189216,5614645,2016-03-29,NaT,,NaT,,,
2189217,5640865,2016-06-22,NaT,2.0,NaT,,,2.0
2189219,5663216,2016-12-08,NaT,90.0,NaT,,,90.0


## Step 3: Ensure that the INC_SANCTION_EXISTS Flag Is Aggregated Correctly

In [66]:
#figure out how many different values INC_SANCTION_EXISTS takes on for each JPR_ID
#if at least 1 charge is = Y (JPR_ID might have Yes and NO) -- then inc_sanction_exists for the ENTIRE JPR_ID should be "Y"

num_inc_sanc_vals = df.groupby(["JPR_ID"])['INC_SANCTION_EXISTS'].agg(lambda x: len(set(x))) #INC_SANCTION_EXISTS values does each jpr_id have?

#give back the list of JPR_IDs that have more than one inc_sanction_exists value
more_than_one_inc_sanc = list(num_inc_sanc_vals[num_inc_sanc_vals > 1].index)


In [67]:
only_one_inc_sanc = df.loc[~df["JPR_ID"].isin(more_than_one_inc_sanc)]
more_than_one_inc_sanc_df = df.loc[df["JPR_ID"].isin(more_than_one_inc_sanc)]


In [68]:
#more_than_one_inc_sanc_df[["JPR_ID", "DOS", "INC_SANCTION_EXISTS"]]

more_than_one_inc_sanc_df["NEW_INC_SANCTION_EXISTS"] = "Y"
more_than_one_inc_sanc_df[["JPR_ID", "DOS", "INC_SANCTION_EXISTS", "NEW_INC_SANCTION_EXISTS"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  more_than_one_inc_sanc_df["NEW_INC_SANCTION_EXISTS"] = "Y"


Unnamed: 0,JPR_ID,DOS,INC_SANCTION_EXISTS,NEW_INC_SANCTION_EXISTS
18,665877,2001-10-24,N,Y
19,665877,2001-10-24,Y,Y
34,675242,2001-11-07,Y,Y
35,675242,2001-11-07,N,Y
41,622266,2001-11-19,N,Y
...,...,...,...,...
2523585,5934987,2019-04-15,N,Y
2523586,5934987,2019-04-15,N,Y
2414646,5935437,2018-12-07,N,Y
2414647,5935437,2018-12-07,N,Y


In [69]:
inc_sanc_combined = pd.concat([only_one_inc_sanc, more_than_one_inc_sanc_df])

In [70]:
inc_sanc_combined[["JPR_ID", "DOS", "INC_SANCTION_EXISTS", "NEW_INC_SANCTION_EXISTS"]]

In [None]:
df = inc_sanc_combined

df.loc[df["NEW_INC_SANCTION_EXISTS"].isnull(), "NEW_INC_SANCTION_EXISTS"] =  df["INC_SANCTION_EXISTS"]

df[["JPR_ID", "DOS", "INC_SANCTION_EXISTS", "NEW_INC_SANCTION_EXISTS"]]

In [None]:
print("Note: As shown below, there are no entries before we collapse at the ID_VAR, DOS-LEVEL with the adj_jpmin & new_inc_sanction_exists mismatch")
df.loc[(df["ADJ_JPMIN"] > 0) & (df["NEW_INC_SANCTION_EXISTS"] == "N")][["ID_VARIABLE", "DOS", "NEW_DOF", "NEW_INC_SANCTION_EXISTS", "ADJ_JPMIN"]] #, "OFN_LIFE_DEATH", "JP_LIFE_DEATH"]] 


# Aggregate Select Columns at the JPR_ID Level

## Get MAX PRS8 Score at the JPR_ID level

In [None]:
# !!! AZ shouldn't this be grouping by ID_VAR and JPR_ID level?? 
# original code: df['PRS8'] = df.groupby(["JPR_ID"])['PRS8'].transform("max")

# new code:
df['PRS8'] = df.groupby(["ID_VARIABLE", "JPR_ID"])['PRS8'].transform("max")

## Collapsing the The Sex Crime Tier Numeric Variable at the JPR_ID Level

In [None]:
df['SEXTIER_NUMERIC'] = df.groupby(["JPR_ID", "ID_VARIABLE"])['SEXTIER_NUMERIC'].transform("max") #may need to be at the jpr_id, id_variable level

In [None]:
# !!! AZ edit: get CTY at JPR_ID level 
df['CTY_PHL'] = df.groupby(by=['ID_VARIABLE', 'JPR_ID'])['CTY_PHL'].transform('max')

# Collapse the data at the ID_VARIABLE, DOS-LEVEL 



### Adjust the following columns to be at the id var, dos level

In [None]:
# !!! AZ Edit - COMMENTED OUT FOR NOW BY TJ
df['LATEST_JPMIN'] = np.where(df['LATEST_JPMIN'].isna(), df['JP_MIN'], df['LATEST_JPMIN']) # TJ edit: why is this added here?

In [None]:
df_collapsed = df.copy()


# #get the max values of the OGS and JP_MIN values -- possibly further adjustments need to be at this level
df_collapsed['OGS'] = df_collapsed.groupby(["ID_VARIABLE", "DOS"])["OGS"].transform(max)

#for the same id_variable, DOS pairing, get the "MIN" NEXT_DOF
df_collapsed['NEW_DOF'] = df_collapsed.groupby(["ID_VARIABLE", "DOS"])["NEW_DOF"].transform(min)

#added this in on 4/11/22
df_collapsed["ADJ_JPMIN"] = df_collapsed.groupby(["ID_VARIABLE", "DOS"])["ADJ_JPMIN"].transform(max)

#get the max PRS8 score at the id_variable, date of sentencing level
df_collapsed["PRS8"] = df_collapsed.groupby(["ID_VARIABLE", "DOS"])["PRS8"].transform(max)

#collapse the sex tier numeric variable at the id var, dos level
df_collapsed['SEXTIER_NUMERIC'] = df_collapsed.groupby(["ID_VARIABLE", "DOS"])['SEXTIER_NUMERIC'].transform(max)


# !!! AZ Edit: 
#for charge counts and inc relationship, use the data observed for the most serious sanction (e.g. max latest_jp_min)
#we use latest JPMIN here instead of adjusted JPMIN to consider the actual total length of sentence, rather than the remaining sentence that needs to be served, for cases with multiple DOS per JP

df_collapsed['CHARGE_COUNT'] = df_collapsed.sort_values(by=['ID_VARIABLE', 'DOS', 'LATEST_JPMIN'], ascending=[True, True, False]).groupby(['ID_VARIABLE', 'DOS'])['CHARGE_COUNT'].transform('first')

# note that skipna is set to False here since we want to keep N
df_collapsed['INC_REL_NUMERIC_CLEANED'] = df_collapsed.sort_values(by=['ID_VARIABLE', 'DOS', 'LATEST_JPMIN'], ascending=[True, True, False]).groupby(['ID_VARIABLE', 'DOS'])['INC_REL_NUMERIC_CLEANED'].transform('first')


In [None]:
# test case !!! AZ
#df_collapsed[df_collapsed["ID_VARIABLE"]==1884187][['ID_VARIABLE', 'JPR_ID', 'DOS', 'JP_MIN', 'LATEST_JPMIN', 'CHARGE_COUNT', 'INC_REL_NUMERIC_CLEANED', 'INC_RELATIONSHIP']]

In [None]:
#df_collapsed[df_collapsed["ID_VARIABLE"]==1001234][['ID_VARIABLE', 'JPR_ID', 'DOS', 'JP_MIN', 'LATEST_JPMIN', 'CHARGE_COUNT', 'INC_REL_NUMERIC_CLEANED', 'INC_RELATIONSHIP']]

In [None]:
# note the last two rows above have INC_REL as 0 even though there are multiple charges with JP_MIN
# in the orig dataset this is also missing
#psc_trimmed[psc_trimmed["ID_VARIABLE"]==1001234][['ID_VARIABLE', 'JPR_ID', 'DOS', 'JP_MIN', 'INC_RELATIONSHIP']]

## Actually Collpase the Dataset Now 

In [None]:
#collapse data to be at the id variable, DOS level (need to ungroup the data for the at_risk date calculation to work)
df_collapsed = df_collapsed.copy().groupby(["ID_VARIABLE", "DOS"]).first().reset_index()

#inspect the results
df_collapsed[["ID_VARIABLE", "DOS", "NEW_DOF", "NEW_INC_SANCTION_EXISTS", "ADJ_JPMIN"]] #, "OFN_LIFE_DEATH", "JP_LIFE_DEATH"]] 



### **STEP 3:** Calculate the AT_RISK_DT using the following logic

In [None]:
def create_at_risk_date(row):
    #need to account for REALLY large JP_MIN values
    
    # Because of this error message OverflowError: Python int too large to convert to C long
    # 25 is more years than we have in our data, so their at_risk date also get set to some value far in the future
    upper_limit = 25.0 * 365.0
    
    num_days_in_month = 30.0
    
    #if offense has a life or death flag, set their at_risk_date abritarily large
    if row['OFN_LIFE_DEATH'] == "Y":
        at_risk_date = pd.to_datetime('2035-12-31')
    
    if row['JP_LIFE_DEATH'] == "Y":
        at_risk_date = pd.to_datetime('2035-12-31')

    #if they were not incarcerated, then their at risk date is just their date of offense
    if row["NEW_INC_SANCTION_EXISTS"] == "N":
        at_risk_date = row['DOS'] #(was previously DOF but should be DOS)
    
    #if they were incarcerated, look at the below logic to determine their at-risk date
    else:

        if row["ADJ_JPMIN"] < upper_limit:

            if row["NEW_INC_SANCTION_EXISTS"] == "Y" and pd.notna(row['ADJ_JPMIN']):
                at_risk_date = row['DOS'] + pd.Timedelta(days = row['ADJ_JPMIN'])
            
            elif row["NEW_INC_SANCTION_EXISTS"] == "Y" and pd.notna(row['INCMIN']):
                at_risk_date = row['DOS'] + pd.Timedelta(days = row['INCMIN'] * num_days_in_month)

            else:
                at_risk_date = row['INC_END']

        else:
            at_risk_date = pd.to_datetime('2035-12-31')

    return at_risk_date


#apply the function to the data (row by row)
df_collapsed["AT_RISK_DT"] = df_collapsed.apply(create_at_risk_date, axis = 1)

 #adjust so that the times do not include minutes and seconds
df_collapsed["AT_RISK_DT"] = pd.to_datetime(df_collapsed["AT_RISK_DT"]).dt.date

# #inspect the results
df_collapsed[['ID_VARIABLE', 'JPR_ID',"JP_MIN", "INCMIN", "INC_END", "ADJ_JPMIN", "INC_SANCTION_EXISTS", "DOS", "NEW_DOF", "AT_RISK_DT"]]



**Note:** In the above at_risk_date calculation code, there is an "upper_limit" because the largest JP_MIN value is 230,000+ days, which is the equivalent of about 631 years. This person would not recidivate in our dataset and Python throws a "OverflowError: Python int too large to convert to C long" for these individuals. So, in order to allow the code to run, those with jp_min values equivalent to more days than we have data for, will just get an at-risk date very far into the future.

In [None]:
# #correct the at_risk_dt calculation for some rows:
df_collapsed.loc[(~(df_collapsed["ADJ_JPMIN"]).isnull()) & (~(df_collapsed['MAX_DOS'].isnull())), "AT_RISK_DT"] = df_collapsed['MAX_DOS']  + pd.to_timedelta(df_collapsed['ADJ_JPMIN'], unit='d')

#will another line here work to resolve this issue?
df_subset_mult = df_collapsed.loc[(~(df_collapsed["ADJ_JPMIN"]).isnull()) & (~(df_collapsed['MAX_DOS'].isnull()))]  #, "AT_RISK_DT"] # = pd.to_datetime(df_collapsed["AT_RISK_DT"])
df_subset_rest = df_collapsed.loc[(~(df_collapsed["ADJ_JPMIN"]).isnull()) & ((df_collapsed['MAX_DOS'].isnull()))]  #, "AT_RISK_DT"] # = pd.to_datetime(df_collapsed["AT_RISK_DT"])



In [None]:
df_subset_mult[['ID_VARIABLE', 'JPR_ID', "MAX_DOS","ADJ_JPMIN", "NEW_DOF", "AT_RISK_DT"]]

df_subset_mult["AT_RISK_DT"] = pd.to_datetime(df_subset_mult["AT_RISK_DT"])

df_subset_mult[['ID_VARIABLE', 'JPR_ID', "MAX_DOS","ADJ_JPMIN", "NEW_DOF", "AT_RISK_DT"]]


In [None]:
#concatenate the dataframes back together
df_collapsed = pd.concat([df_subset_mult, df_subset_rest])

df_collapsed.loc[(~(df_collapsed["ADJ_JPMIN"]).isnull()) & (~(df_collapsed['MAX_DOS'].isnull()))].head()[['ID_VARIABLE', 'JPR_ID', "MAX_DOS","ADJ_JPMIN", "NEW_DOF", "AT_RISK_DT"]]



## Populate Next DOF

In [None]:
#sort the data
df_collapsed = df_collapsed.sort_values(by = ["ID_VARIABLE", "NEW_DOF"])

#shift the data up by one to create the new vaariable "NEXT_DOF"
df_collapsed['NEXT_DOF'] = df_collapsed.groupby(['ID_VARIABLE'])['NEW_DOF'].shift(-1).dt.date

df_collapsed[:20][["ID_VARIABLE", "JPR_ID", "DOS", "NEW_DOF", "NEXT_DOF", "AT_RISK_DT", "INC_SANCTION_EXISTS"]]

## Check for "Free Time" 
(i.e.: Do we have enough data for an individual to see if they recidivated in 3 years or not?)

**Procedure Below:**
1. Subset just to those whose at_risk date < max DOS df[["DOS"]].max()
2. Then, we also want to remove those whose last next_dof is null and whose last dof > 2017
3. Essentially, we want to subset (whatever grouping variable we're using) to just those entries where next_dof is null and FOR THIS SAME ROW, if the dof >= pd.todatetime("2017-01-01") -- remove these entries



In [None]:
#subset to those whose at_risk_date < the largest sentencing date that we have

before_length = len(df_collapsed)

#what is the maximum sentence date?
last_day = pd.to_datetime(df_collapsed[["DOS"]].max())[0]  
df_collapsed = df_collapsed[df_collapsed["AT_RISK_DT"] <= last_day]

after_length = len(df_collapsed) 

print("There are {:,} id_var, dos combos where the at risk date is after the last date of sentence available.".format(before_length - after_length))


Here, I calculate a "LAST_DOF" variable, which will then be used to subset the data to only those whose latest offense was before 2017

In [None]:
df_collapsed["LAST_DOF"] = df_collapsed.loc[df_collapsed["NEXT_DOF"].isnull(), "NEW_DOF"]

df_collapsed[["ID_VARIABLE", "DOS", "NEW_DOF", "NEXT_DOF", "LAST_DOF"]]

In [None]:
#subset the data to only those whose last_dof is before 2017
#before_length = len(df_collapsed)

last_day = pd.to_datetime("2017-01-01") 

#subset the dataset to either where the LAST_DOF is null OR LAST_DOF < last_day
df_collapsed = df_collapsed.loc[(df_collapsed["LAST_DOF"].isnull()) | (df_collapsed["LAST_DOF"] < last_day)]

# after_length = len(df_collapsed) 
# print("There are {:,} id_var, dos combos whose's last dof is not in scope.".format(before_length - after_length))

df_collapsed[["ID_VARIABLE", "DOS", "NEW_DOF", "NEXT_DOF", "LAST_DOF"]]


# Create The Time To Recidivite and Recidivism Variables

In [None]:
#subtract the next_dof and at_risk_dt variables  -- update this 
df_collapsed['TIME_TO_RECIDIVATE'] = pd.to_datetime(df_collapsed['NEXT_DOF']) - pd.to_datetime(df_collapsed['AT_RISK_DT'])#update to this level 
    
#update the time to recidivate column to JUST be the number of days as an integer/float
df_collapsed['TIME_TO_RECIDIVATE'] = df_collapsed['TIME_TO_RECIDIVATE'].dt.days

df_collapsed[["ID_VARIABLE", "DOS", "NEW_DOF", "NEXT_DOF", "TIME_TO_RECIDIVATE"]]


In [None]:
#number of days in  years
three_years_in_days = float(3) * 365.0  
five_years_in_days = float(5) * 365.0  

#ID_VARIABLE, DOS-LEVEL RECIDIVISM -- does not count times where the next_dof < at_risk_dt as instances of recidivism

df_collapsed["RECIDIVISM_3Y"] = np.where(
    (df_collapsed['TIME_TO_RECIDIVATE'] > 0) & (df_collapsed['TIME_TO_RECIDIVATE'] <= three_years_in_days), 1, 0)

df_collapsed["RECIDIVISM_5Y"] = np.where(
    (df_collapsed['TIME_TO_RECIDIVATE'] > 0) & (df_collapsed['TIME_TO_RECIDIVATE'] <= five_years_in_days), 1, 0)

df_collapsed[["ID_VARIABLE", "DOS", "ADJ_JPMIN", "NEW_DOF", "NEXT_DOF", "AT_RISK_DT", "TIME_TO_RECIDIVATE", "RECIDIVISM_3Y", "RECIDIVISM_5Y"]]


## Export The Results to CSV 
(PA_SENTENCING/Project/data/recidivism_dataset.csv)

In [None]:
#create a new date of sentence variable

#if max_dos is null, then there is only one DOS associated with a given JPR_ID OTHERWISE, the new_dos becomes the max dos (meaning there were multiple dates of sentence associated with that JPR_ID)

df_collapsed.loc[df_collapsed["MAX_DOS"].notna(), "NEW_DOS"] = df_collapsed["MAX_DOS"]

df_collapsed.loc[df_collapsed["MAX_DOS"].isnull(), "NEW_DOS"] = df_collapsed["DOS"]

df_collapsed[["MAX_DOS","DOS", "NEW_DOS"]]

In [None]:
df_collapsed.rename(columns={'INC_REL_NUMERIC_CLEANED': 'INC_REL_NUMERIC'}, inplace=True)

In [None]:
#Export the Results to a CSV
#subset the dataset before exporting it
df_collapsed_subset = df_collapsed[["ID_VARIABLE", "NEW_DOS", "NEW_DOF", "PRS", 
"PRS8", "NEW_INC_SANCTION_EXISTS", "ADJ_JPMIN", "LATEST_JPMIN", "AT_RISK_DT", "COUNTY", 
"NEXT_DOF", "TIME_TO_RECIDIVATE", "RECIDIVISM_3Y", "RECIDIVISM_5Y", "OGS",
"MS_SANCTION","SEXCRIME","DUI","DRUGOFFENSE","FIREARMS","VIOLENCE",'JPS_WALSHACTASSESSMENT', 'CTY_PHL', 'CHARGE_COUNT', 'INC_REL_NUMERIC']]


#get the demographics dataset
cleaned_demographics = pd.read_csv(os.path.join(pa_sentencing_path, "Project", "data", "demographic_dataset.csv"))

#merged the recidivism dataset with the cleaned demographics dataset
result = pd.merge(df_collapsed_subset, cleaned_demographics, how="left", on=["ID_VARIABLE", "ID_VARIABLE"])


# #export the dataframe with the recidivism variables to a new dataframe
#output_path = os.path.join(pa_sentencing_path, "Project", "data", "recidivism_dataset.csv")

#changes the output path to include a flag for including the PRS score 8 values
output_path = os.path.join(pa_sentencing_path, "Project", "data", "recidivism_dataset_w_additional_analysis_2.csv")



result.to_csv(output_path,index=False) #export the final results

In [None]:
# #check to see what the data looks like
test = pd.read_csv(output_path)


print(test.RECIDIVISM_3Y.value_counts())
