In [1]:
import pandas as pd
import numpy as np
# Load the full recidivism dataset from disk
raw_data_path = 'data/Recidivism_Full_Dataset.csv'
df = pd.read_csv(raw_data_path)

### Looking at just Males since many values are missing for females

In [2]:
# Take out the 3167 female observations so only male prisoners are left in the dataframe
# Note that some data such as 'Gang_Affiliated' do not have data for female offenders
# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# column assignments in later cells operate on a real frame and do not raise
# SettingWithCopyWarning
df = df[df['Gender']=='M'].copy()

### Clean Drug Test Data

In [3]:
# A NaN drug-test result is taken to mean the offender was not required to take drug tests.
# Record that in its own indicator column (0 = not required, 1 = required).
# This must run before the loop below, which overwrites the NaNs with 0.
drug_tests_missing = df['DrugTests_Other_Positive'].isna()
df['Required_DrugTests'] = np.where(drug_tests_missing, 0, 1)

# Collapse every drug-test column to a binary flag: 1 if the value is > 0, 0 otherwise
# (NaN compares False against 0, so missing values become 0)
drug_test_columns = [
    'DrugTests_THC_Positive',
    'DrugTests_Cocaine_Positive',
    'DrugTests_Meth_Positive',
    'DrugTests_Other_Positive',
]
for drug_column in drug_test_columns:
    df[drug_column] = np.where(df[drug_column] > 0, 1, 0)



### Clean Employment Data

In [4]:
# Clean Employment Data

# If an individual was employed 0% of days, then they had 0 jobs per year;
# this fills some of the NaN values in Jobs_Per_Year before the median is computed
df.loc[df['Percent_Days_Employed'] == 0, 'Jobs_Per_Year'] = 0

# Fill the remaining NaN values in Percent_Days_Employed and Jobs_Per_Year with each column's median.
# The result is assigned back to the column: calling fillna(inplace=True) on df[col] acts on a
# temporary Series, which triggers a chained-assignment warning on pandas 2.x and leaves df
# unchanged under Copy-on-Write.
for employment_column in ['Percent_Days_Employed', 'Jobs_Per_Year']:
    df[employment_column] = df[employment_column].fillna(df[employment_column].median())

### Clean Supervision Data

In [5]:
# Supervision_Risk_Score_First and Supervision_Level_First can each be imputed from the other,
# because certain scores are always associated with certain levels

# Step 1: where the level is missing, derive it from the score band the risk score falls in
level_missing = df['Supervision_Level_First'].isna()
risk_score = df['Supervision_Risk_Score_First']
score_band_by_level = {
    'Standard': (1, 5),
    'High': (6, 8),
    'Specialized': (9, 10),
}
df['Supervision_Level_First'] = np.select(
    [risk_score.between(low, high) & level_missing for low, high in score_band_by_level.values()],
    list(score_band_by_level),
    default=df['Supervision_Level_First'],
)

# Step 2: where the score is missing, assign a fixed score for the (possibly just-imputed) level
score_missing = df['Supervision_Risk_Score_First'].isna()
supervision_level = df['Supervision_Level_First']
score_by_level = {
    'Standard': 3,
    'High': 7,
    'Specialized': 9,
}
df['Supervision_Risk_Score_First'] = np.select(
    [(supervision_level == level_name) & score_missing for level_name in score_by_level],
    list(score_by_level.values()),
    default=df['Supervision_Risk_Score_First'],
)

In [6]:
# Impute remaining NaN values in 'Supervision_Risk_Score_First' with the median.
# The filled Series is assigned back to the column: fillna(inplace=True) on df[col] acts on a
# temporary Series, which warns on pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Supervision_Risk_Score_First'] = df['Supervision_Risk_Score_First'].fillna(
    df['Supervision_Risk_Score_First'].median()
)

# Impute remaining NaN values in 'Supervision_Level_First' with the mode
# (mode() can return several values on a tie; [0] takes the first one)
df['Supervision_Level_First'] = df['Supervision_Level_First'].fillna(
    df['Supervision_Level_First'].mode()[0]
)

### Clean Numerical Data

In [7]:
# Function to replace specific strings with other strings and then convert to integer
def replace_and_convert_to_int(df, column, replacements):
    """Replace selected values in ``df[column]`` and cast the column to nullable integers.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean.
    column : str
        Name of the column to rewrite.
    replacements : dict
        Maps each value to be replaced to its substitute. The replacements are
        applied one after another, in the dict's iteration order.

    Notes
    -----
    After the replacements, anything that still cannot be parsed as a number
    becomes ``<NA>`` (``errors='coerce'``), and the column ends up with the
    nullable ``Int64`` dtype.
    """
    cleaned = df[column]
    for target, substitute in replacements.items():
        cleaned = cleaned.replace(target, substitute)
    # Int64 (capital I) is pandas' nullable integer dtype, so missing values survive the cast
    df[column] = pd.to_numeric(cleaned, errors='coerce').astype('Int64')

# Each of these count columns is top-coded with an "N or more" label.
# Every row is (column, top-code label, numeric string that replaces the label);
# the columns are processed in the order listed.
top_coded_columns = [
    ('Dependents', '3 or more', '3'),

    ('Prior_Arrest_Episodes_Felony', '10 or more', '10'),
    ('Prior_Arrest_Episodes_Misd', '6 or more', '6'),
    ('Prior_Arrest_Episodes_Violent', '3 or more', '3'),
    ('Prior_Arrest_Episodes_Property', '5 or more', '5'),
    ('Prior_Arrest_Episodes_Drug', '5 or more', '5'),
    ('Prior_Arrest_Episodes_PPViolationCharges', '5 or more', '5'),

    ('Prior_Conviction_Episodes_Felony', '3 or more', '3'),
    ('Prior_Conviction_Episodes_Misd', '4 or more', '4'),
    ('Prior_Conviction_Episodes_Prop', '3 or more', '3'),
    ('Prior_Conviction_Episodes_Drug', '2 or more', '2'),

    ('Delinquency_Reports', '4 or more', '4'),
    ('Program_Attendances', '10 or more', '10'),
    ('Program_UnexcusedAbsences', '3 or more', '3'),
    ('Residence_Changes', '3 or more', '3'),
]

# Swap the label for its number, then convert the column to nullable integers
for count_column, top_label, top_value in top_coded_columns:
    replace_and_convert_to_int(df, count_column, {top_label: top_value})

In [8]:
# 'Prison_Years' has several distinct labels, so it is handled here rather than with
# replace_and_convert_to_int. Each label is mapped to an ordinal code 1-4 (a rank, not a
# number of years); any other value becomes <NA> when coerced to numeric.
prison_years_codes = {
    'Less than 1 year': '1',
    '1-2 years': '2',
    'Greater than 2 to 3 years': '3',
    'More than 3 years': '4',
}
df['Prison_Years'] = pd.to_numeric(
    df['Prison_Years'].replace(prison_years_codes),
    errors='coerce',
).astype('Int64')

In [9]:
#Currently, each age range is a string with the lower and upper bounds separated by a dash
#Replace each string with the numeric midpoint of its range; the open-ended '48 or older' bucket is assigned 55

# Function to calculate the midpoint of an age range
def calculate_midpoint(age_range):
    """Convert an age-range label into a single representative age.

    Parameters
    ----------
    age_range : str, number, or missing value
        Either a bounded range written as ``'<lower>-<upper>'`` or the
        open-ended label ``'48 or older'``. Missing values (NaN/None) and
        values that are already numeric are accepted as well, so the
        function can be applied to a column containing NaNs and can be
        re-applied to a column that has already been converted.

    Returns
    -------
    int or float
        55 for ``'48 or older'``; the arithmetic mean of the two bounds for a
        bounded range; NaN for a missing value; the input itself if it is
        already numeric.

    Raises
    ------
    ValueError
        If a string is neither ``'48 or older'`` nor two integers separated
        by a single dash.
    """
    # Missing values have no range to parse; propagate them as NaN instead of
    # failing on .split()
    if pd.isna(age_range):
        return float('nan')

    # Already converted (e.g. the cell was run twice): leave the number as is
    if isinstance(age_range, (int, float, np.number)):
        return age_range

    label = age_range.strip()

    # The top bucket has no upper bound, so 55 is used as its representative age
    if label == "48 or older":
        return 55

    lower, upper = label.split('-')
    return (int(lower) + int(upper)) / 2
    
# Convert every age-range label in 'Age_at_Release' to its midpoint.
# The column is overwritten in place; no separate midpoint column is created.
df['Age_at_Release'] = df['Age_at_Release'].map(calculate_midpoint)


### Clean Categorical Data

In [10]:
# Label missing 'Prison_Offense' entries as 'Unknown' so they form their own category
offense_missing = df['Prison_Offense'].isna()
df['Prison_Offense'] = df['Prison_Offense'].mask(offense_missing, 'Unknown')

### Drop Columns

In [11]:
# 'ID' only numbers the rows, so it carries no information.
# 'Avg_Days_per_DrugTest' is superseded by the binary 'Required_DrugTests' column.
columns_to_drop = ['ID', 'Avg_Days_per_DrugTest']
df = df.drop(columns=columns_to_drop)

### Save to csv

In [12]:
# Write the cleaned frame to disk, leaving out the row index
cleaned_data_path = 'data/Recidivism_Data_Cleaned.csv'
df.to_csv(cleaned_data_path, index=False)