# Data preparation

In this module, we will start processing the dataset

In [104]:

import os
import pandas as pd

In [105]:
DATA_FOLDER = '../data'

df_compas = pd.read_csv(os.path.join(DATA_FOLDER, 'compas-scores-two-years.csv'))

## First dataset reduction

We reduce the fields that contain administrative information, the fields that are 'replicated' in other fields (for example age and dob, here we only keep one) and fields that to not contain information that can contribute to prediction (for example description fields)

In [106]:
columns_to_keep = [ "sex", "age", "race", "juv_fel_count", "decile_score",
                   "juv_misd_count", "juv_other_count", "priors_count",
                   "days_b_screening_arrest", "c_jail_in", "c_jail_out",
                   "c_days_from_compas", "c_charge_degree", "v_decile_score",
                   "in_custody", "out_custody", "start", "end", "event", "two_year_recid" ]


df_reduced = df_compas[columns_to_keep].copy()



## Processing

We process fields that can be combined into one through difference

-  "c_jail_in",  "c_jail_out" will become c_jail_time
- "r_jail_in", "r_jail_out" will become r_jail_time
- "in_custody", "out_custody" will become custody_time
- "start", "end" will become end_start

In [107]:
# "c_jail_in",  "c_jail_out" will become c_jail_time
df_reduced['c_jail_in'] = pd.to_datetime(
    df_reduced['c_jail_in'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df_reduced['c_jail_out'] = pd.to_datetime(
    df_reduced['c_jail_out'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df_reduced['c_jail_time'] = abs((df_reduced['c_jail_out'] - df_reduced['c_jail_in']).dt.days)

# "in_custody", "out_custody" will become days_in_custody
df_reduced['in_custody'] = pd.to_datetime(
    df_reduced['in_custody'], format='%Y-%m-%d', errors='coerce')
df_reduced['out_custody'] = pd.to_datetime(
    df_reduced['out_custody'], format='%Y-%m-%d', errors='coerce')
df_reduced['days_in_custody'] = abs((df_reduced['out_custody'] - df_reduced['in_custody']).dt.days)

# "start", "end" will become end_start
df_reduced['end_start'] = abs((df_reduced['end'] - df_reduced['start']))

#remove all the columns that are not needed
df_reduced = df_reduced.drop(['c_jail_in', 'c_jail_out', 
                              'in_custody', 'out_custody',
                              'start', 'end'], axis=1)

# save the dataset
df_reduced.to_csv(os.path.join(DATA_FOLDER, 'df_reduced.csv'), index=False)


In [108]:
# gather information about the dataset - data types and missing values
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   sex                      7214 non-null   object 
 1   age                      7214 non-null   int64  
 2   race                     7214 non-null   object 
 3   juv_fel_count            7214 non-null   int64  
 4   decile_score             7214 non-null   int64  
 5   juv_misd_count           7214 non-null   int64  
 6   juv_other_count          7214 non-null   int64  
 7   priors_count             7214 non-null   int64  
 8   days_b_screening_arrest  6907 non-null   float64
 9   c_days_from_compas       7192 non-null   float64
 10  c_charge_degree          7214 non-null   object 
 11  v_decile_score           7214 non-null   int64  
 12  event                    7214 non-null   int64  
 13  two_year_recid           7214 non-null   int64  
 14  c_jail_time             

In [109]:
# knn imputation of missing values in days_b_screening_arrest, c_jail_time, days_in_custody

from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
df_reduced['days_b_screening_arrest'] = imputer.fit_transform(df_reduced[['days_b_screening_arrest']])
df_reduced['c_jail_time'] = imputer.fit_transform(df_reduced[['c_jail_time']])
df_reduced['days_in_custody'] = imputer.fit_transform(df_reduced[['days_in_custody']])

df_reduced.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   sex                      7214 non-null   object 
 1   age                      7214 non-null   int64  
 2   race                     7214 non-null   object 
 3   juv_fel_count            7214 non-null   int64  
 4   decile_score             7214 non-null   int64  
 5   juv_misd_count           7214 non-null   int64  
 6   juv_other_count          7214 non-null   int64  
 7   priors_count             7214 non-null   int64  
 8   days_b_screening_arrest  7214 non-null   float64
 9   c_days_from_compas       7192 non-null   float64
 10  c_charge_degree          7214 non-null   object 
 11  v_decile_score           7214 non-null   int64  
 12  event                    7214 non-null   int64  
 13  two_year_recid           7214 non-null   int64  
 14  c_jail_time             

In [110]:
# encoding of categorical features

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

# transform feature 'sex'
df_reduced['sex'] = df_reduced['sex'].astype('category')
ohe_sex = pd.DataFrame(ohe.fit_transform(df_reduced[['sex']]).toarray(), columns=ohe.get_feature_names_out(['sex']))
df_reduced = pd.concat([df_reduced, ohe_sex], axis=1)

# transform feature 'race'
df_reduced['race'] = df_reduced['race'].astype('category')
ohe_race = pd.DataFrame(ohe.fit_transform(df_reduced[['race']]).toarray(), columns=ohe.get_feature_names_out(['race']))
df_reduced = pd.concat([df_reduced, ohe_race], axis=1)

# transform feature 'c_charge_degree'
df_reduced['c_charge_degree'] = df_reduced['c_charge_degree'].astype('category')
ohe_c_charge_degree = pd.DataFrame(ohe.fit_transform(df_reduced[['c_charge_degree']]).toarray(), columns=ohe.get_feature_names_out(['c_charge_degree']))
df_reduced = pd.concat([df_reduced, ohe_c_charge_degree], axis=1)

# remove original categorical features
df_reduced = df_reduced.drop(['sex', 'race', 'c_charge_degree'], axis=1)

df_reduced.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      7214 non-null   int64  
 1   juv_fel_count            7214 non-null   int64  
 2   decile_score             7214 non-null   int64  
 3   juv_misd_count           7214 non-null   int64  
 4   juv_other_count          7214 non-null   int64  
 5   priors_count             7214 non-null   int64  
 6   days_b_screening_arrest  7214 non-null   float64
 7   c_days_from_compas       7192 non-null   float64
 8   v_decile_score           7214 non-null   int64  
 9   event                    7214 non-null   int64  
 10  two_year_recid           7214 non-null   int64  
 11  c_jail_time              7214 non-null   float64
 12  days_in_custody          7214 non-null   float64
 13  end_start                7214 non-null   int64  
 14  sex_Female              

In [111]:
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      7214 non-null   int64  
 1   juv_fel_count            7214 non-null   int64  
 2   decile_score             7214 non-null   int64  
 3   juv_misd_count           7214 non-null   int64  
 4   juv_other_count          7214 non-null   int64  
 5   priors_count             7214 non-null   int64  
 6   days_b_screening_arrest  7214 non-null   float64
 7   c_days_from_compas       7192 non-null   float64
 8   v_decile_score           7214 non-null   int64  
 9   event                    7214 non-null   int64  
 10  two_year_recid           7214 non-null   int64  
 11  c_jail_time              7214 non-null   float64
 12  days_in_custody          7214 non-null   float64
 13  end_start                7214 non-null   int64  
 14  sex_Female              

In [112]:
labels = ['two_year_recid', 'decile_score', 'v_decile_score']

df_features = df_reduced.copy().drop(labels, axis=1)

from sklearn.preprocessing import StandardScaler

features_scaled = StandardScaler().fit(df_features).transform(df_features)

pd.DataFrame(features_scaled, columns=df_features.columns).describe()


Unnamed: 0,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,event,c_jail_time,days_in_custody,...,sex_Female,sex_Male,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,c_charge_degree_F,c_charge_degree_M
count,7214.0,7214.0,7214.0,7214.0,7214.0,7214.0,7192.0,7214.0,7214.0,7214.0,...,7214.0,7214.0,7214.0,7214.0,7214.0,7214.0,7214.0,7214.0,7214.0,7214.0
mean,-4.5307690000000006e-17,2.8071070000000004e-17,3.8413040000000006e-17,2.2161370000000002e-17,5.1709860000000007e-17,3.939799e-18,-1.234953e-18,1.5266720000000002e-17,-3.4473240000000004e-17,-1.231187e-18,...,6.697658e-17,-1.418328e-16,5.2202340000000007e-17,2.265384e-17,-8.987666e-17,-2.6593640000000002e-17,1.280435e-17,-1.18194e-17,-1.009573e-16,-1.083445e-17
std,1.000069,1.000069,1.000069,1.000069,1.000069,1.000069,1.00007,1.000069,1.000069,1.000069,...,1.000069,1.000069,1.000069,1.000069,1.000069,1.000069,1.000069,1.000069,1.000069,1.000069
min,-1.414692,-0.1418546,-0.1874142,-0.2180649,-0.7112398,-5.626061,-0.1750935,-0.7876518,-0.3661522,-0.2350366,...,-0.4896243,-2.042382,-1.024986,-0.06675016,-0.7180155,-0.3112116,-0.05001389,-0.2348215,-1.353233,-0.7389709
25%,-0.8258674,-0.1418546,-0.1874142,-0.2180649,-0.7112398,-0.05803639,-0.1720606,-0.7876518,-0.3661522,-0.2298454,...,-0.4896243,0.4896243,-1.024986,-0.06675016,-0.7180155,-0.3112116,-0.05001389,-0.2348215,-1.353233,-0.7389709
50%,-0.321161,-0.1418546,-0.1874142,-0.2180649,-0.3015884,-0.05803639,-0.1720606,-0.7876518,-0.3461385,-0.2246541,...,-0.4896243,0.4896243,0.9756228,-0.06675016,-0.7180155,-0.3112116,-0.05001389,-0.2348215,0.7389709,-0.7389709
75%,0.6041343,-0.1418546,-0.1874142,-0.2180649,0.3128887,-0.04455449,-0.1690277,1.269597,-0.06594646,-0.07929927,...,-0.4896243,0.4896243,0.9756228,-0.06675016,1.392728,-0.3112116,-0.05001389,-0.2348215,0.7389709,1.353233
max,5.146493,42.0577,26.60539,33.67679,7.072137,14.20581,28.59198,1.269597,15.62481,31.09412,...,2.042382,0.4896243,0.9756228,14.98124,1.392728,3.213248,19.99444,4.258554,0.7389709,1.353233
