# Data preparation

In this module, we start processing the dataset.

In [5]:

import os
import pandas as pd

In [8]:
# Folder holding the raw input files (a relative path).
DATA_FOLDER = '../data'

# Build the CSV path first, then read it into the raw frame.
compas_csv_path = os.path.join(DATA_FOLDER, 'compas-scores-two-years.csv')
df_compas = pd.read_csv(compas_csv_path)

## First dataset reduction

We drop the fields that contain administrative information, the fields that are duplicated by other fields (for example `age` and `dob`, of which we keep only one), and the fields that do not contain information that can contribute to prediction (for example description fields).

In [10]:
# Columns carried over from the raw frame; the order here is the column
# order of the reduced frame.
columns_to_keep = [
    "sex",
    "age",
    "race",
    "juv_fel_count",
    "decile_score",
    "juv_misd_count",
    "juv_other_count",
    "priors_count",
    "days_b_screening_arrest",
    "c_jail_in",
    "c_jail_out",
    "c_offense_date",
    "c_arrest_date",
    "c_days_from_compas",
    "c_charge_degree",
    "is_recid",
    "r_charge_degree",
    "r_days_from_arrest",
    "r_jail_in",
    "r_jail_out",
    "is_violent_recid",
    "vr_charge_degree",
    "v_decile_score",
    "in_custody",
    "out_custody",
    "start",
    "end",
    "event",
    "two_year_recid",
]

# Take an independent copy so later edits never touch df_compas.
df_reduced = df_compas.loc[:, columns_to_keep].copy()



## Processing

We combine each pair of related fields into a single field by taking the difference between them:

- `c_jail_in`, `c_jail_out` will become `c_jail_time`
- `c_offense_date`, `c_arrest_date` will become `arrest_offense_time`
- `r_jail_in`, `r_jail_out` will become `r_jail_time`
- `in_custody`, `out_custody` will become `days_in_custody`
- `start`, `end` will become `end_start`

In [None]:
def _elapsed_days(frame, start_col, end_col):
    """Return the absolute number of whole days between two date columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame holding both columns.
    start_col, end_col : str
        Names of the columns to subtract (``end_col - start_col``).

    Returns
    -------
    pd.Series
        ``abs((end - start).dt.days)``; rows where either value is missing
        or cannot be parsed as a date are NaN.
    """
    # No rigid `format=` here: combined with errors='coerce', a strict
    # '%Y-%m-%d %H:%M:%S' turns date-only strings into NaT on pandas >= 2.0,
    # which would silently blank the derived column. Letting pandas infer the
    # format per column handles both date-only and date-time columns.
    # NOTE(review): confirm against the CSV which columns are date-only.
    start = pd.to_datetime(frame[start_col], errors='coerce')
    end = pd.to_datetime(frame[end_col], errors='coerce')
    return (end - start).dt.days.abs()


# New column -> (start column, end column). Insertion order is the order in
# which the new columns are appended to the frame.
DATE_INTERVALS = {
    'c_jail_time': ('c_jail_in', 'c_jail_out'),
    'arrest_offense_time': ('c_offense_date', 'c_arrest_date'),
    'r_jail_time': ('r_jail_in', 'r_jail_out'),
    'days_in_custody': ('in_custody', 'out_custody'),
}

for _new_col, (_start_col, _end_col) in DATE_INTERVALS.items():
    df_reduced[_new_col] = _elapsed_days(df_reduced, _start_col, _end_col)

# "start", "end" will become end_start. These two columns are subtracted
# directly, without any date parsing.
df_reduced['end_start'] = (df_reduced['end'] - df_reduced['start']).abs()

# Remove the source columns now that they are folded into the new features.
_consumed_columns = [
    col for pair in DATE_INTERVALS.values() for col in pair
] + ['start', 'end']
df_reduced = df_reduced.drop(columns=_consumed_columns)
