## Load Data

In [95]:
#this jupyter notebook is essentially the same as the "recidivism-check" notebook, just cleaned up a bit (hence the name)
#import required libraries
import os
import pandas as pd
import numpy as np
import datetime
import sqlite3

# Project root: walk three directory levels up from the notebook's working directory.
pa_sentencing_path = os.getcwd()
for _ in range(3):
    pa_sentencing_path = os.path.dirname(pa_sentencing_path)

In [96]:

# Earlier input file, kept for reference only (presumably the version without the
# 8th-edition PRS columns -- confirm before switching back to it):
#psc_trimmed = pd.read_csv(os.path.join(pa_sentencing_path, "Project", "data", "PSC_data_trimmed_v1.csv"))

# Read the trimmed PSC extract WITH the 8th-edition PRS score implementation (PRS8).
# The path is <project root>/Project/data/trimmed_w_prs8.csv.
psc_trimmed = pd.read_csv(os.path.join(pa_sentencing_path, "Project", "data", "trimmed_w_prs8.csv"))


  psc_trimmed = pd.read_csv(os.path.join(pa_sentencing_path, "Project", "data", "trimmed_w_prs8.csv"))


## Dask Method to Load Data

In [None]:
# from dask import dataframe as dd
# from dask.distributed import Client

# pa_sentencing_path = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))

# #read in the correct data file (need to read in this file because of the additional columns it has)
# psc_trimmed = dd.read_csv(os.path.join(pa_sentencing_path, "Project", "data", "PSC_data_trimmed_v1.csv"),
# dtype={'GUILTY_NO_PENALTY': 'object',
#        'INC_RELATEDOTN': 'object',
#        'INC_RELATIONSHIP': 'object',
#        'OFN_COUNT': 'object',
#        'OFN_LIFE_DEATH': 'object',
#        'PRS': 'object',
#        'PRS_LAPSING': 'object',
#        'PRS_MANUAL': 'object',
#        'PRS_NONLAPSING': 'object',
#        'REASON_ONE': 'object',
#        'REASON_THREE': 'object',
#        'REASON_TWO': 'object',
#        'SGR_LVL': 'object',
#        'STAT_MIN': 'object'},low_memory=False)

In [None]:
# Work on an independent copy so the frame loaded above stays untouched and can be
# compared against later. (Use `df_tbl_db.copy()` instead when reading from the database.)
df = psc_trimmed.copy()

# Normalise every column name to upper case so later cells can rely on one spelling.
df.columns = df.columns.map(str.upper)


In [None]:
df.head() #inspect the dataset

In [None]:
# Potential issue: the file carries two PRS8 columns ("PRS8" and "PRS8.1").
# Print the value distribution of each so they can be compared by eye;
# the author confirmed from this output that they hold the same variable.
for prs8_column in ("PRS8", "PRS8.1"):
    print(df[prs8_column].value_counts())

In [None]:
#testing the issue with the NEW DOF (id_var == 1468038) 
# df_test = df.copy()

# df_test = df_test[df_test["ID_VARIABLE"] == 1468038]

# df_test[["JPR_ID", "ID_VARIABLE", "DOS", "DOF", "OFN_LABEL"]]



# Table of Contents

1. Data Quality Checks & Data Cleaning
    1. Consistency Issues
        1. [Combine REVOC and RFEL Categories](#combining-revoc-into-rfel)
        2. [Clean Dates & Create New DOF](#get-minimum-value-for-the-dof-across-all-of-the-charges-associated-with-one-jprid)
    2. Accuracy Issues
        1. [Missing PRS Scores](#clean-missing-prs-score)
        1. [Address JP_CC_BUG Issue](#clean-jp-cc-bug)
2. [At-Risk Date Calculation](#implement-at-risk-date-calculation-logic)
    1. Group Data at JPR_ID Level
        1. [Address Multiple Dates of Sentencing](#multiple-dos-for-one-jprid)
        2. [Create Adjusted JP_MIN Value]
        3. [Check INC_SANCTION EXISTS](#incsanctionexists-check)
    2. Group Data at the ID_VARIABLE, DOS LEVEL
    3. Implement At-Risk Date Logic
3. Calculate Recidivism
    1. [Calculate Next Date of Offense](#populate-next-dof)
    2. [Check for Free Time](#check-for-"free-time")
    3. [Calculate Time to Recidivate & 3-Year and 5-Year Recidivism Variables](#create-time-to-recidivate-and-recidivsm-variables)

Note: the links above may not work well in VSCode but should work better in Jupyter Notebook via Anaconda.
    


## Getting the Dimensions of the Original Dataset

In [None]:
# Dimensions of the original data: the column count comes from a 10-row peek at
# Main.csv (cheap to read), the row count from the trimmed frame loaded above.
main_csv_path = os.path.join(pa_sentencing_path, "Project", "data", "Main.csv")
psc_main_fewrows = pd.read_csv(main_csv_path, nrows=10)

n_columns_main = len(psc_main_fewrows.columns)
n_rows_trimmed = len(psc_trimmed)

print(n_columns_main,'Total Number of columns  in the original dataset')
print(n_rows_trimmed,'Total Rows in the original dataset')

# Combining REVOC into RFEL

In [None]:
# Values before conversion
df['PRS'].unique()

In [None]:
# Function to combine REVOC and spelling variants of the PRS categories into RFEL
def refl_combine(x):
    """Collapse any spelling of REVOC / RFEL into the single category 'RFEL'.

    The comparison ignores letter case and surrounding whitespace, so variants
    such as 'Revoc' or ' rfel ' are caught as well as the ones listed originally.

    Parameters
    ----------
    x : any
        A single PRS value. Non-string values (numbers, NaN, None) are never
        treated as a match.

    Returns
    -------
    'RFEL' when `x` is a string spelling REVOC or RFEL; otherwise `x` unchanged.
    """
    if isinstance(x, str) and x.strip().upper() in ('REVOC', 'RFEL'):
        return 'RFEL'
    return x

In [None]:
# Recode every PRS value element-wise (REVOC and misspellings become RFEL).
df['PRS'] = df['PRS'].map(refl_combine)

In [None]:
df['PRS'].unique()

## Convert Dates

In [None]:
# Sanity check before parsing: the dates carry two-digit years, so look at the last
# two characters of every DOS string and see which year suffixes occur -- in
# particular whether '20' (i.e. 2020) appears.
dos_as_text = df['DOS'].astype("str")
date_list = dos_as_text.tolist()
year_string = [text[-2:] for text in date_list]

year_suffixes = set(year_string)
print(year_suffixes)
'20' in year_suffixes

In [None]:
# Parse the offense and sentencing date strings (e.g. "18 Aug 14") into datetimes.
for date_column in ('DOF', 'DOS'):
    df[date_column] = pd.to_datetime(df[date_column], format="%d %b %y")

In [None]:
#testing code 
# df_test = df.copy()

# df_test = df_test[df_test["ID_VARIABLE"] == 1468038]

# df_test[["JPR_ID", "ID_VARIABLE", "DOS", "DOF", "OFN_LABEL"]]


In [None]:
# Keep just the calendar year of each date; DOS_YEAR is used by the JP_CC_BUG cleaning later on.
df['DOF_YEAR'] = df['DOF'].dt.year
df['DOS_YEAR'] = df['DOS'].dt.year

In [None]:
# Check the range of the DOF and DOS variables.
# Take min/max on the column itself so the result is a scalar Timestamp; the previous
# `df[["DOF"]].min()[0]` indexed a label-indexed Series by position, which pandas
# has deprecated.
earliest_dof, latest_dof = df["DOF"].min(), df["DOF"].max()
earliest_dos, latest_dos = df["DOS"].min(), df["DOS"].max()

print("The minimum date of offense in the dataset is: {}".format(earliest_dof))
print("The maximum date of offense in the dataset is: {}".format(latest_dof))
print("The minimum date of sentencing in the dataset is: {}".format(earliest_dos))
print("The maximum date of sentencing in the dataset is: {}".format(latest_dos))

Note: As shown in the above code chunk, there **isn't** anomalous behavior in the date ranges (e.g., a date in the year 1909 or 2090) for the date of offense (DOF) or date of sentence (DOS) variables -- therefore, an additional date correction was **not** applied in this case.

## Clean DOS > DOF

Note: group offense by ID_VAR, JPR_ID, MIN(DOF) to get the first DOF associated for a single JPR_ID

In [None]:
 #count how many values of DOF are missing in the original dataset
# Rows whose date of offense is missing in the original data.
dof_is_missing = df['DOF'].isna()
dof_missing = df.loc[dof_is_missing]

n_dof_missing = len(dof_missing)
print("There are {:,} rows with missing DOFs in the dataset.".format(n_dof_missing))

### Get **minimum** value for the DOF across all of the charges associated with **one** JPR_ID. 

Note: This is the procedure because we don't want to count a DOF as an instance of recidivism if it occurs BEFORE the date of sentencing.

In [None]:
# Each JPR_ID should contribute only ONE date of offense: offenses committed BEFORE the
# sentencing date tied to that JPR_ID must not be counted as recidivism, so every charge
# in the proceeding is assigned the earliest DOF of the group.

# Earlier version, grouped by JPR_ID only:
#df["NEW_DOF"] = df.groupby(["JPR_ID"])["DOF"].transform("min")

# Current version: group by JPR_ID *and* ID_VARIABLE, then broadcast the group's minimum
# DOF back onto every row (transform keeps the original row count and index).
df["NEW_DOF"] = df.groupby(["JPR_ID", "ID_VARIABLE"])["DOF"].transform("min")

# Debugging helpers (min/max DOF per JPR_ID), currently disabled:
# df["MIN_DOF"] = df.groupby(["JPR_ID"])["DOF"].transform("min")
# df["MAX_DOF"] = df.groupby(["JPR_ID"])["DOF"].transform("max")

In [None]:
# Debugging cell: inspect one hard-coded proceeding (JPR_ID 5499834) and one hard-coded
# person (ID_VARIABLE 1468038) to check how NEW_DOF was assigned.
# NOTE(review): leftover investigation code -- nothing below feeds the pipeline
# (`df_test` / `df_jprid` are only displayed), so it can be removed once the issue is closed.
df_test = df.copy()

df_jprid = df_test[df_test["JPR_ID"] == 5499834]
df_test = df_test[df_test["ID_VARIABLE"] == 1468038]

# this person's rows are where the issue seems to occur

df_test[["JPR_ID", "ID_VARIABLE", "DOS", "DOF", "NEW_DOF","OFN_LABEL"]]


In [None]:
#still part of the testing of the code
#df_jprid[["JPR_ID", "ID_VARIABLE", "DOS", "DOF", "MIN_DOF", "MAX_DOF", "NEW_DOF","OFN_LABEL"]]


In [None]:
#testing the code
# 'test_date = "2014-08-18"
# print(type(test_date))

# test_date_conversion = pd.to_datetime(test_date) #, format="%d %b %y")
# print(test_date_conversion, type(test_date_conversion))

# test_date_conversion.min


In [None]:
df.head()[["JPR_ID", "DOF", "NEW_DOF"]]

In [None]:
# How many rows still lack a date of offense after NEW_DOF was filled per group?
new_dof_is_missing = df['NEW_DOF'].isnull()
dof_missing = df[new_dof_is_missing]

n_missing, n_rows = len(dof_missing), len(df)
percent_missing = n_missing/n_rows
print("After cleaning, there are {:,} ({:%}) rows with missing DOFs in the dataset.".format(n_missing, percent_missing))

### **Step 2**: Subset the data to just include those rows where NEW_DOF <= DOS

In [None]:
# Keep only rows whose (group-minimum) offense date is on or before the sentencing date.
# Rows where either date is missing compare as False and are therefore dropped as well.
before_length = len(df)

offense_not_after_sentence = df["NEW_DOF"] <= df["DOS"]
df = df[offense_not_after_sentence]

after_length = len(df)
rows_removed = before_length - after_length

print("Before DOF <= DOS correction there were {:,} rows and after cleaning there were {:,} rows. A change of {:,}.".format(before_length, after_length, rows_removed))


## Clean Missing PRS Score 

In [None]:
before_length = len(df)

# People (ID_VARIABLE) who have at least one row with a missing PRS score
prs_is_missing = df["PRS"].isnull()
id_varswith_prsmissing = set(df.loc[prs_is_missing, "ID_VARIABLE"])

# Drop ALL rows belonging to those people, not just the rows with the missing score
person_affected = df["ID_VARIABLE"].isin(id_varswith_prsmissing)
df_prs_notaffected = df[~person_affected]

# Continue with the cleaned frame as the working dataframe
df = df_prs_notaffected

after_length = len(df)
rows_removed = before_length - after_length
people_removed = len(id_varswith_prsmissing)
print("Before PRS correction there were {:,} rows and after cleaning there were {:,} rows. A change of {:,} rows and {} people.".format(before_length, after_length, rows_removed, people_removed))


## Clean Missing PRS8 (8th Edition Sentencing Guidelines) Score

In [None]:
before_length = len(df)

# People (ID_VARIABLE) who have at least one row with a missing PRS8 score
prs8_is_missing = df["PRS8"].isnull()
id_varswith_prs8missing = set(df.loc[prs8_is_missing, "ID_VARIABLE"])

# Drop ALL rows belonging to those people, not just the rows with the missing score
person_affected = df["ID_VARIABLE"].isin(id_varswith_prs8missing)
df_prs8_notaffected = df[~person_affected]

# Continue with the cleaned frame as the working dataframe
df = df_prs8_notaffected

after_length = len(df)
rows_removed = before_length - after_length
people_removed = len(id_varswith_prs8missing)
print("Before PRS8 correction there were {:,} rows and after cleaning there were {:,} rows. A change of {:,} rows and {} people.".format(before_length, after_length, rows_removed, people_removed))


## Clean JP CC Bug

## Steps followed in cleaning JP_CC Bug
1. It is evident that there are JPR_ID's with DOS from 2016 to 2019 that were impacted by the JP_CC_BUG 
2. The first step was to extract out the ID variables that were impacted out by the bug. 
3. Next, we removed the judicial proceedings of these JPR_IDs where the DOS is in 2017, 2018, or 2019. However, the JPR_IDs associated with the first occurrence of the JP_CC bug are kept -- in other words, the JPR_IDs where the DOS was in 2016 are kept.


In [None]:
#confirming the years that impacted the JP_CC_BUG
set(df[df.JP_CC_BUG=='Y'].DOS_YEAR)


In [None]:
# Every person (ID_VARIABLE) who has at least one row flagged JP_CC_BUG == 'Y'.
flagged_by_bug = df["JP_CC_BUG"] == 'Y'
id_varswith_jpbug = set(df.loc[flagged_by_bug, "ID_VARIABLE"])

In [None]:
# All rows of the affected people go into their own frame; the charges sentenced
# after the problematic date of sentencing are removed from it in the next step.
belongs_to_affected_person = df["ID_VARIABLE"].isin(id_varswith_jpbug)
df_with_jpbug = df.loc[belongs_to_affected_person]

In [None]:
# For the affected people, drop the proceedings sentenced in 2017, 2018 and 2019
# (i.e. keep DOS_YEAR < 2017).
# The mask is built from `df_with_jpbug` itself: the previous version used `df.DOS_YEAR`,
# a mask over a different (larger) frame that pandas had to re-align by index.
sentenced_before_2017 = df_with_jpbug["DOS_YEAR"] < 2017
df_jp_bug_cleaned = df_with_jpbug[sentenced_before_2017]

In [None]:
# Rows of everyone who was never touched by the JP_CC_BUG.
person_has_bug = df["ID_VARIABLE"].isin(id_varswith_jpbug)
df_jpbug_notaffected = df.loc[~person_has_bug]

In [None]:
# Stack the untouched rows and the cleaned rows of the affected people back together;
# the result becomes the new working dataframe.
jp_bug_parts = [df_jpbug_notaffected, df_jp_bug_cleaned]
df_cleaned_1 = pd.concat(jp_bug_parts)

df = df_cleaned_1

In [None]:
after_length = len(df)

print("After the JP_CC_BUG correction there are {:,} rows. ".format(after_length))


## Implement At Risk Date Calculation Logic

### **STEP 1:** 
#### 1. Fix issues where one JPR_ID has more than one date of sentence: NEW_DOS = min(DOS) and JP_MIN = the latest (most recent) JP_MIN -- the one associated with max(DOS), the most recent date of sentence &
#### 2. make sure that the row that we subset at includes a inc_sanction_exists == yes if at LEAST one of the charges in the list is equal to Y 

#### Multiple DOS for one JPR_ID

In [None]:
dos_vals = df.groupby(["JPR_ID"])['DOS'].agg(lambda x: set(x)) #> 1 #how many rows have two UNIQUE DOS for the same JPR_ID (set)

In [None]:
# Number of distinct sentencing dates recorded for each JPR_ID
# (dropna=False so a missing date is counted as its own value).
num_dos = df.groupby("JPR_ID")["DOS"].nunique(dropna=False)
#num_dos[2847193]

In [None]:
# JPR_IDs that carry two or more distinct sentencing dates
more_than_one_dos = num_dos.loc[num_dos > 1].index.tolist()

n_multi_dos = len(more_than_one_dos)
print("There are {:,} JPR_IDS in the dataset with more than one date of sentence.".format(n_multi_dos))

In [None]:
#example of multiple dates of sentencing
#df.loc[df["JPR_ID"] == 662328][["JPR_ID", "ID_VARIABLE", "DOS", "OFN_LABEL", "PRS", "JP_MIN", "ADJ_JPMIN", "MS_SENTJP", "INC_SANCTION_EXISTS"]]

In [None]:
# Split the data in two: proceedings with a single sentencing date, and those with several.
has_multiple_dos = df["JPR_ID"].isin(more_than_one_dos)
only_one_dos = df.loc[~has_multiple_dos]
more_than_one_dos_df = df.loc[has_multiple_dos]


In [None]:
# Order the multi-DOS rows by proceeding and then chronologically; the "last" lookup
# in the next cell depends on this ordering.
more_than_one_dos_df = more_than_one_dos_df.sort_values(by=["JPR_ID", "DOS"])

preview_columns = ["JPR_ID", "DOS", "OFN_LABEL", "PRS", "JP_MIN", "MS_SENTJP", "INC_SANCTION_EXISTS"]
more_than_one_dos_df[preview_columns].head()


In [None]:
# Latest and earliest sentencing date of each proceeding, broadcast onto all of its rows
dos_by_proceeding = more_than_one_dos_df.groupby("JPR_ID")["DOS"]
more_than_one_dos_df["MAX_DOS"] = dos_by_proceeding.transform("max")  # latest date of sentencing
more_than_one_dos_df["MIN_DOS"] = dos_by_proceeding.transform("min")  # earliest date of sentencing

# Time served = whole days between the first and the last sentencing date
dos_span = more_than_one_dos_df["MAX_DOS"] - more_than_one_dos_df["MIN_DOS"]
more_than_one_dos_df["TIME_SERVED"] = dos_span.dt.days

# JP_MIN that goes with the latest DOS: the rows are already sorted by JPR_ID and DOS,
# so the group's last value is taken.
# NOTE(review): groupby "last" skips missing values, so if the latest-DOS row has no JP_MIN
# this picks up the most recent non-missing one instead -- confirm that is intended.
more_than_one_dos_df["LATEST_JPMIN"] = more_than_one_dos_df.groupby("JPR_ID")["JP_MIN"].transform("last")

# Adjusted JP_MIN (logic provided by Miranda): latest minimum sentence minus time already served
more_than_one_dos_df['ADJ_JPMIN'] = more_than_one_dos_df["LATEST_JPMIN"] - more_than_one_dos_df["TIME_SERVED"]

inspect_columns = ["JPR_ID", "ID_VARIABLE", "DOS", "OFN_LABEL", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN"]
more_than_one_dos_df[inspect_columns].head(20)


In [None]:
# Put the single-DOS and multi-DOS rows back into one frame. The single-DOS rows have
# no MAX_DOS / MIN_DOS / TIME_SERVED / LATEST_JPMIN / ADJ_JPMIN yet, so those columns
# come out as missing for them.
dos_parts = [only_one_dos, more_than_one_dos_df]
df_combo_dos = pd.concat(dos_parts)

df = df_combo_dos

### **STEP 2:** Create a New JP_MIN variable that takes the Max(JP_MIN) for a given JPR_ID

In [None]:
#Fix Issues with the missing JP_MIN
# num_missing_jp_min = len(df.loc[pd.isna(df["JP_MIN"])]) #[["JPR_ID", "JP_MIN"]]
# print("There are {:,} entries in the dataset missing a JP_MIN value.".format(num_missing_jp_min))

# df["ADJ_JPMIN"] = df.groupby(["JPR_ID"])["JP_MIN"].transform("max")

# num_missing_jp_min = len(df.loc[pd.isna(df["ADJ_JPMIN"])]) #[["JPR_ID", "JP_MIN"]]
# print("There are {:,} entries in the dataset missing a  ADJ_JPMIN value.".format(num_missing_jp_min))

#when should we calculated a consolidated JP_MIN? before or after grouping at the id_variable, dos level


In [None]:
# Fill in ADJ_JPMIN where it is still missing (proceedings with a single sentencing date):
# when such a proceeding has several JP_MIN values, use the largest JP_MIN of the JPR_ID.
max_jpmin_per_proceeding = df.groupby("JPR_ID")["JP_MIN"].transform("max")
df["ADJ_JPMIN"] = df["ADJ_JPMIN"].fillna(max_jpmin_per_proceeding)

# To inspect the result:
#df.sort_values(["JPR_ID"])[:20][["JPR_ID", "DOS", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN"]]


In [None]:
# Show the rows without a MAX_DOS, i.e. proceedings that have a single sentencing date.
jpmin_columns = ["JPR_ID", "DOS", "MIN_DOS", "JP_MIN", "MAX_DOS", "TIME_SERVED", "LATEST_JPMIN", "ADJ_JPMIN"]

df.loc[df["MAX_DOS"].isnull(), jpmin_columns]


#### INC_SANCTION_EXISTS Check

In [None]:
# Rule: if at least one charge of a JPR_ID has INC_SANCTION_EXISTS == "Y" (the JPR_ID may
# mix "Y" and "N"), the ENTIRE JPR_ID should be treated as "Y".
inc_sanction_by_proceeding = df.groupby(["JPR_ID"])['INC_SANCTION_EXISTS']

# how many different INC_SANCTION_EXISTS values does each JPR_ID have?
num_inc_sanc_vals = inc_sanction_by_proceeding.agg(lambda x: len(set(x)))

# does the JPR_ID have at least one charge flagged "Y"?
has_inc_sanction = inc_sanction_by_proceeding.agg(lambda x: (x == "Y").any()).astype(bool)

# JPR_IDs with mixed values AND at least one "Y". Later cells set these proceedings to "Y".
# Requiring a "Y" matters: counting distinct values alone also selected proceedings whose
# values were e.g. "N" plus a missing entry, and those were then wrongly promoted to "Y".
more_than_one_inc_sanc = list(num_inc_sanc_vals[(num_inc_sanc_vals > 1) & has_inc_sanction].index)


In [None]:
#more_than_one_inc_sanc

#df.loc[df["JPR_ID"] == 2286][["JPR_ID", "DOS", "INC_SANCTION_EXISTS"]]

In [None]:
# Split into proceedings whose INC_SANCTION_EXISTS needs harmonising and all the others.
needs_harmonising = df["JPR_ID"].isin(more_than_one_inc_sanc)
only_one_inc_sanc = df.loc[~needs_harmonising]
more_than_one_inc_sanc_df = df.loc[needs_harmonising]


In [None]:
# Every row of the selected proceedings gets NEW_INC_SANCTION_EXISTS = "Y".
# `assign` returns a new frame, so nothing is written onto a slice of `df`
# (the in-place column assignment here triggered pandas' SettingWithCopyWarning).
more_than_one_inc_sanc_df = more_than_one_inc_sanc_df.assign(NEW_INC_SANCTION_EXISTS="Y")
more_than_one_inc_sanc_df[["JPR_ID", "DOS", "INC_SANCTION_EXISTS", "NEW_INC_SANCTION_EXISTS"]]

In [None]:
inc_sanc_combined = pd.concat([only_one_inc_sanc, more_than_one_inc_sanc_df])



In [None]:
inc_sanc_combined[["JPR_ID", "DOS", "INC_SANCTION_EXISTS", "NEW_INC_SANCTION_EXISTS"]]

In [None]:
df = inc_sanc_combined

# Rows that were not harmonised have no NEW_INC_SANCTION_EXISTS yet: carry over their
# original INC_SANCTION_EXISTS value.
df["NEW_INC_SANCTION_EXISTS"] = df["NEW_INC_SANCTION_EXISTS"].fillna(df["INC_SANCTION_EXISTS"])

df[["JPR_ID", "DOS", "INC_SANCTION_EXISTS", "NEW_INC_SANCTION_EXISTS"]]

In [None]:
print("Note: As shown below, there are no entries before we collapse at the ID_VAR, DOS-LEVEL with the adj_jpmin & new_inc_sanction_exists mismatch")

# A mismatch is a positive adjusted minimum sentence on a row that says no incarceration exists.
jpmin_without_sanction = (df["ADJ_JPMIN"] > 0) & (df["NEW_INC_SANCTION_EXISTS"] == "N")
df.loc[jpmin_without_sanction, ["ID_VARIABLE", "DOS", "NEW_DOF", "NEW_INC_SANCTION_EXISTS", "ADJ_JPMIN"]] #, "OFN_LIFE_DEATH", "JP_LIFE_DEATH"]]


## Get MAX PRS8 Score at the JPR_ID level

In [None]:
df['PRS8'] = df.groupby(["JPR_ID"])['PRS8'].transform("max")


## Placeholder: Need to Add Flags for the Following categories (during the collapsing process)
* Crimes of violence
    * There is some public opinion that the existing definition in the guidelines may be too narrow. The public prefers a broader consideration of violent offenses (e.g.  Any violence against a person)
        * Not sure if there's an easy way to identify these
* Sex offenses
    * There are 3 tiers of sex offenses
        * Look at recidivism rates for the general categorization and for each of the three tiers
* Firearms (VUFA - violation of the uniform fire offense types)

# Collapse the data at the ID_VARIABLE, DOS-LEVEL 

 *** Changed this to be at the id_variable, new-dos level (on 4/11/22)


### Create a NEW_DOS Variable To Account for Individuals with Multiple Dates of Sentencing

In [None]:
# #create the new date of sentencing variable 4/11/22
# df_collapsed = df.copy()

# df_collapsed.loc[df_collapsed["MAX_DOS"].notna(), "NEW_DOS"] = df_collapsed["MAX_DOS"]

# df_collapsed.loc[df_collapsed["MAX_DOS"].isnull(), "NEW_DOS"] = df_collapsed["DOS"]


In [None]:
df_collapsed = df.copy()

# One output row per person (ID_VARIABLE) and sentencing date (DOS).
collapse_keys = ["ID_VARIABLE", "DOS"]

# Aggregation names are passed as strings ("max"/"min"): handing the Python builtins
# max/min to transform is deprecated in pandas.

# highest offense gravity score of the group
df_collapsed['OGS'] = df_collapsed.groupby(collapse_keys)["OGS"].transform("max")

# earliest offense date of the group
df_collapsed['NEW_DOF'] = df_collapsed.groupby(collapse_keys)["NEW_DOF"].transform("min")

# largest adjusted minimum sentence of the group (added 4/11/22)
df_collapsed["ADJ_JPMIN"] = df_collapsed.groupby(collapse_keys)["ADJ_JPMIN"].transform("max")

# highest PRS8 score of the group
df_collapsed["PRS8"] = df_collapsed.groupby(collapse_keys)["PRS8"].transform("max")

# Incarceration flag: if ANY row of the group is "Y", the collapsed row must be "Y".
# Without this, the collapse below keeps whichever value happens to come first, so a
# group can end up "N" while carrying the positive ADJ_JPMIN of another row.
group_has_incarceration = (
    df_collapsed["NEW_INC_SANCTION_EXISTS"].eq("Y")
    .groupby([df_collapsed["ID_VARIABLE"], df_collapsed["DOS"]])
    .transform("any")
)
df_collapsed.loc[group_has_incarceration, "NEW_INC_SANCTION_EXISTS"] = "Y"

# Collapse to the ID_VARIABLE, DOS level (the at-risk date calculation needs ungrouped data).
# NOTE(review): GroupBy.first() takes the first NON-MISSING value of each column
# independently, so a collapsed row can combine values from different source rows.
df_collapsed = df_collapsed.groupby(collapse_keys).first().reset_index()

#inspect the results
df_collapsed[["ID_VARIABLE", "DOS", "NEW_DOF", "NEW_INC_SANCTION_EXISTS", "ADJ_JPMIN"]] #, "OFN_LIFE_DEATH", "JP_LIFE_DEATH"]]


In [None]:
#CHECK INC_SANCTION_EXIST ADJ_JPMIN > 0 and INC_SANCTION_EXISTS = 'N'

#df[(df['col1'] >= 1) & (df['col1'] <=1 )]
#df_collapsed.loc[(df_collapsed["ADJ_JPMIN"] > 0) & (df_collapsed["NEW_INC_SANCTION_EXISTS"] == "N")][["ID_VARIABLE", "DOS", "NEW_DOF", "NEW_INC_SANCTION_EXISTS", "ADJ_JPMIN"]] #, "OFN_LIFE_DEATH", "JP_LIFE_DEATH"]] 


#df.loc[df['TeamID']==12]


### **STEP 3:** Calculate the AT_RISK_DT using the following logic

In [None]:
def create_at_risk_date(row):
    """Return the date from which the person in `row` is at risk of re-offending.

    Rules, in order of precedence:

    1. Life/death flag on the offense or the judicial proceeding -> far-future date.
       (Previously this value was always overwritten by the rules below.)
    2. NEW_INC_SANCTION_EXISTS == "N" (not incarcerated) -> the sentencing date, DOS.
    3. ADJ_JPMIN of 25 years or more -> far-future date. Such values make the date
       arithmetic overflow, and 25 years is longer than the data covers.
    4. Incarcerated ("Y") with ADJ_JPMIN present -> DOS + ADJ_JPMIN days.
    5. Incarcerated ("Y") with only INCMIN present -> DOS + INCMIN * 30 days
       (INCMIN is treated as months), capped at the same 25-year limit.
       (Previously unreachable: a missing ADJ_JPMIN went straight to the far-future date.)
    6. Otherwise -> INC_END.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row) providing 'OFN_LIFE_DEATH', 'JP_LIFE_DEATH',
        'NEW_INC_SANCTION_EXISTS', 'DOS', 'ADJ_JPMIN' and, when the fallbacks are
        needed, 'INCMIN' and 'INC_END'.

    Returns
    -------
    The at-risk date: a pandas Timestamp, or whatever `row['INC_END']` holds for rule 6.
    """
    # 25 years, in days: anything at or above this is treated as "never at risk in our data"
    upper_limit = 25.0 * 365.0
    num_days_in_month = 30.0
    far_future = pd.to_datetime('2035-12-31')

    # life or death flag: at-risk date is arbitrarily far in the future; return right
    # away so the incarceration logic below cannot overwrite it
    if row['OFN_LIFE_DEATH'] == "Y" or row['JP_LIFE_DEATH'] == "Y":
        return far_future

    # not incarcerated: at risk from the date of sentencing
    if row["NEW_INC_SANCTION_EXISTS"] == "N":
        return row['DOS']

    adj_jpmin = row["ADJ_JPMIN"]

    # extremely long minimum sentences (guards against the OverflowError)
    if pd.notna(adj_jpmin) and adj_jpmin >= upper_limit:
        return far_future

    if row["NEW_INC_SANCTION_EXISTS"] == "Y":
        if pd.notna(adj_jpmin):
            return row['DOS'] + pd.Timedelta(days = adj_jpmin)

        if pd.notna(row['INCMIN']):
            incmin_days = row['INCMIN'] * num_days_in_month
            if incmin_days >= upper_limit:
                return far_future
            return row['DOS'] + pd.Timedelta(days = incmin_days)

    # nothing usable to compute from: fall back to the recorded end of incarceration
    return row['INC_END']


# To try the function on a small sample first:
# test = df[:2000]
# test["AT_RISK_DT"] = test.apply(create_at_risk_date, axis = 1)

# apply the function to the collapsed data (row by row)
df_collapsed["AT_RISK_DT"] = df_collapsed.apply(create_at_risk_date, axis = 1)

# keep only the calendar date (no hours/minutes/seconds)
df_collapsed["AT_RISK_DT"] = pd.to_datetime(df_collapsed["AT_RISK_DT"]).dt.date

# inspect the results
# "DOS" is shown here rather than "NEW_DOS": NEW_DOS is only created in the export section
# at the end of the notebook, so selecting it at this point raises a KeyError on a fresh run.
df_collapsed[['ID_VARIABLE', 'JPR_ID',"JP_MIN", "INCMIN", "INC_END", "ADJ_JPMIN", "INC_SANCTION_EXISTS", "DOS", "NEW_DOF", "AT_RISK_DT"]]



**Note:** In the above at_risk_date calculation code, there is an "upper_limit" because the largest JP_MIN value is 230,000+ days, which is the equivalent of about 631 years. This person would not recidivate in our dataset, and Python throws an "OverflowError: Python int too large to convert to C long" for these individuals. So, in order to allow the code to run, those with JP_MIN values equivalent to more days than we have data for will just get an at-risk date very far into the future.

In [None]:
# Correct AT_RISK_DT for rows that have a MAX_DOS (MAX_DOS is only filled for proceedings
# with several sentencing dates): at-risk date = MAX_DOS + ADJ_JPMIN days.

# AT_RISK_DT holds plain `date` objects after the `.dt.date` step; convert it to a real
# datetime column first. Assigning Timestamps into the object column is what produced the
# mixed column / raw nanosecond integers (e.g. 1556150400000000000) seen before.
df_collapsed["AT_RISK_DT"] = pd.to_datetime(df_collapsed["AT_RISK_DT"])

has_adj_jpmin = df_collapsed["ADJ_JPMIN"].notna()
has_max_dos = df_collapsed["MAX_DOS"].notna()
multi_dos_rows = has_adj_jpmin & has_max_dos

# compute the new date only for the rows being corrected
df_collapsed.loc[multi_dos_rows, "AT_RISK_DT"] = (
    df_collapsed.loc[multi_dos_rows, "MAX_DOS"]
    + pd.to_timedelta(df_collapsed.loc[multi_dos_rows, "ADJ_JPMIN"], unit='d')
)

# the two subsets that later cells put back together
df_subset_mult = df_collapsed.loc[multi_dos_rows]
df_subset_rest = df_collapsed.loc[has_adj_jpmin & ~has_max_dos]

#df_collapsed.loc[multi_dos_rows].head()[['ID_VARIABLE', 'JPR_ID', "MAX_DOS","ADJ_JPMIN", "NEW_DOF", "AT_RISK_DT"]]



In [None]:
# Make sure AT_RISK_DT of the multi-DOS subset is a proper datetime column.
# `assign` builds a new frame instead of writing onto a slice of df_collapsed, which
# avoids pandas' SettingWithCopyWarning.
df_subset_mult = df_subset_mult.assign(AT_RISK_DT=pd.to_datetime(df_subset_mult["AT_RISK_DT"]))

df_subset_mult[['ID_VARIABLE', 'JPR_ID', "MAX_DOS","ADJ_JPMIN", "NEW_DOF", "AT_RISK_DT"]]



In [None]:
# Stack the multi-DOS subset and the remaining subset back into one frame.
# NOTE(review): both subsets only contain rows with a non-missing ADJ_JPMIN, so any row
# whose ADJ_JPMIN is missing is dropped from df_collapsed here -- confirm this is intended.
at_risk_parts = [df_subset_mult, df_subset_rest]
df_collapsed = pd.concat(at_risk_parts)

corrected_rows = df_collapsed["ADJ_JPMIN"].notna() & df_collapsed['MAX_DOS'].notna()
df_collapsed.loc[corrected_rows, ['ID_VARIABLE', 'JPR_ID', "MAX_DOS","ADJ_JPMIN", "NEW_DOF", "AT_RISK_DT"]].head()



In [None]:
#OverflowError: Python int too large to convert to C long

largest_jpmin =  df_collapsed["ADJ_JPMIN"].max()
largest_jpmin_in_years = largest_jpmin/365.0
print("The largest JP_MIN value is {:,} days, which is {} years. This causes Python to throw the following error: OverflowError: Python int too large to convert to C long.".format(largest_jpmin, largest_jpmin_in_years))



## Populate Next DOF

In [None]:
# Order each person's rows chronologically by offense date
df_collapsed = df_collapsed.sort_values(by = ["ID_VARIABLE", "NEW_DOF"])

# NEXT_DOF = the offense date of the same person's following row (shift up by one
# within each ID_VARIABLE); a person's last row gets a missing NEXT_DOF
df_collapsed['NEXT_DOF'] = df_collapsed.groupby(['ID_VARIABLE'])['NEW_DOF'].shift(-1).dt.date

# "DOS" is shown here rather than "NEW_DOS": NEW_DOS is only created in the export section
# at the end of the notebook, so selecting it at this point raises a KeyError on a fresh run.
df_collapsed[["ID_VARIABLE", "JPR_ID", "DOS", "NEW_DOF", "NEXT_DOF", "AT_RISK_DT", "INC_SANCTION_EXISTS"]].head(20)

## Check for "Free Time" 
(i.e.: Do we have enough data for an individual to see if they recidivated in 3 years or not?)

**Procedure Below:**
1. Subset just to those whose at_risk date < max DOS df[["DOS"]].max()
2. Then, we also want to remove those whose last next_dof is null and whose last dof > 2017
3. Essentially, we want to subset (whatever grouping variable we're using) to just those entries where next_dof is null and FOR THIS SAME ROW, if the dof >= pd.todatetime("2017-01-01") -- remove these entries



In [None]:
# Keep only rows whose at-risk date is on or before the latest sentencing date in the data.

before_length = len(df_collapsed)

# the maximum sentence date, as a scalar Timestamp (no positional `[0]` lookup needed)
last_day = df_collapsed["DOS"].max()

# AT_RISK_DT can hold a mix of `date` objects and Timestamps at this point; normalise it to
# datetimes for the comparison, because ordering a plain `date` against a Timestamp is not
# supported by recent pandas. Rows with a missing at-risk date compare as False and are dropped.
at_risk_dt = pd.to_datetime(df_collapsed["AT_RISK_DT"])
df_collapsed = df_collapsed[at_risk_dt <= last_day]

after_length = len(df_collapsed)

print("There are {:,} id_var, dos combos where the at risk date is after the last date of sentence available.".format(before_length - after_length))


Here, I calculate a "LAST_DOF" variable, which will then be used to subset the data to only those whose latest offense was before 2017

In [None]:
# LAST_DOF = the offense date on rows that have no following offense (NEXT_DOF missing);
# every other row gets a missing LAST_DOF.
is_final_offense = df_collapsed["NEXT_DOF"].isnull()
df_collapsed["LAST_DOF"] = df_collapsed["NEW_DOF"].where(is_final_offense)

df_collapsed[["ID_VARIABLE", "DOS", "NEW_DOF", "NEXT_DOF", "LAST_DOF"]]

In [None]:
# Drop rows that are a person's final offense AND whose offense date is 2017 or later.
last_day = pd.to_datetime("2017-01-01")

# keep a row when it has no LAST_DOF (a later offense exists) or its LAST_DOF is before the cut-off
not_final_offense = df_collapsed["LAST_DOF"].isnull()
final_offense_in_scope = df_collapsed["LAST_DOF"] < last_day
df_collapsed = df_collapsed.loc[not_final_offense | final_offense_in_scope]

df_collapsed[["ID_VARIABLE", "DOS", "NEW_DOF", "NEXT_DOF", "LAST_DOF"]]


## CREATE TIME TO RECIDIVATE AND RECIDIVSM VARIABLES

In [None]:
# Time to recidivate = whole days from the at-risk date to the next offense date
# (missing when there is no next offense).
next_offense_date = pd.to_datetime(df_collapsed['NEXT_DOF'])
at_risk_date = pd.to_datetime(df_collapsed['AT_RISK_DT'])

df_collapsed['TIME_TO_RECIDIVATE'] = (next_offense_date - at_risk_date).dt.days

df_collapsed[["ID_VARIABLE", "DOS", "NEW_DOF", "NEXT_DOF", "TIME_TO_RECIDIVATE"]]


In [None]:
# Observation windows expressed in days
three_years_in_days = 3 * 365.0
five_years_in_days = 5 * 365.0

# Recidivism at the ID_VARIABLE, DOS level: the next offense must fall strictly AFTER the
# at-risk date (days > 0) and within the window. Rows without a next offense get 0.
days_to_next_offense = df_collapsed['TIME_TO_RECIDIVATE']
offended_after_at_risk = days_to_next_offense > 0

within_three_years = offended_after_at_risk & (days_to_next_offense <= three_years_in_days)
within_five_years = offended_after_at_risk & (days_to_next_offense <= five_years_in_days)

df_collapsed["RECIDIVISM_3Y"] = np.where(within_three_years, 1, 0)
df_collapsed["RECIDIVISM_5Y"] = np.where(within_five_years, 1, 0)

df_collapsed[["ID_VARIABLE", "DOS", "ADJ_JPMIN", "NEW_DOF", "NEXT_DOF", "AT_RISK_DT", "TIME_TO_RECIDIVATE", "RECIDIVISM_3Y", "RECIDIVISM_5Y"]]


## Export The Results to CSV 
(PA_SENTENCING/Project/data/recidivism_dataset.csv)

In [None]:
# New date-of-sentence variable.
# MAX_DOS is only present when a JPR_ID had several sentencing dates; in that case NEW_DOS
# is the latest one (MAX_DOS). Otherwise the proceeding has a single date and NEW_DOS is DOS.
df_collapsed["NEW_DOS"] = df_collapsed["MAX_DOS"].fillna(df_collapsed["DOS"])

df_collapsed[["MAX_DOS","DOS", "NEW_DOS"]]

In [None]:
# Export the results to a CSV

# keep only the columns that belong in the recidivism dataset
export_columns = ["ID_VARIABLE", "NEW_DOS", "NEW_DOF", "PRS", "PRS8", "NEW_INC_SANCTION_EXISTS", "ADJ_JPMIN","AT_RISK_DT", "COUNTY", "NEXT_DOF", "TIME_TO_RECIDIVATE", "RECIDIVISM_3Y", "RECIDIVISM_5Y", "OGS"]
df_collapsed_subset = df_collapsed[export_columns]

# get the demographics dataset
cleaned_demographics = pd.read_csv(os.path.join(pa_sentencing_path, "Project", "data", "demographic_dataset.csv"))

# Attach the demographics to every recidivism row (left join on the person identifier).
# The key is given once; it was previously listed twice in `on`.
# NOTE(review): if the demographics file has more than one row per ID_VARIABLE this join
# duplicates recidivism rows -- verify the file is unique per person.
result = pd.merge(df_collapsed_subset, cleaned_demographics, how="left", on="ID_VARIABLE")

# previous output location (without the PRS8 values):
#output_path = os.path.join(pa_sentencing_path, "Project", "data", "recidivism_dataset.csv")

# output file name flags that the PRS8 values are included
output_path = os.path.join(pa_sentencing_path, "Project", "data", "recidivism_dataset_w_prs8.csv")

# export the final results (the row index is written as the first, unnamed column)
result.to_csv(output_path)

In [None]:
# Read the exported file back in as a quick check that the export worked.
test = pd.read_csv(output_path)

# Optional extra checks:
# print("num rows:", len(test.index))
# test.head()

# Distribution of the 3-year recidivism flag in the exported data.
print(test.RECIDIVISM_3Y.value_counts())
