# Catalan Juvenile Justice - Data Processing 

In [163]:
import torch
import pandas as pd
import numpy as np
from collections import Counter

### Load dataset

First, the data is loaded...

In [164]:
DATA_PATH = "../../data" 
df = pd.read_csv(f"{DATA_PATH}/raw/catalan-juvenile-recidivism-subset.csv", header=0)

# Display dataframe head
df.head()

Unnamed: 0,id,V1_sex,V4_area_origin,V6_province,V8_age,V9_age_at_program_end,V11_criminal_record,V12_n_criminal_record,V13_n_crime_cat,V15_main_crime_cat,...,V28_days_from_crime_to_program,V29_program_duration,V115_RECID2015_recid,V10_date_of_birth_year,V10_date_of_birth_month,V22_main_crime_date_year,V22_main_crime_date_month,V30_program_start_year,V30_program_start_month,V31_program_end_month
0,691,male,Spain,Lleida,15,17,1,1-2,3+,Against People,...,587.0,132,1,1992,11,2008,8,2010,4,8
1,956,female,Spain,Barcelona,14,15,1,1-2,2,Against People,...,107.0,258,1,1995,7,2009,8,2009,12,8
2,2560,male,Maghreb,Barcelona,15,16,1,3-5,3+,Against People,...,135.0,85,1,1994,1,2009,11,2010,3,6
3,2018,male,Maghreb,Girona,14,15,1,5+,2,Against People,...,176.0,271,1,1994,4,2009,1,2009,6,3
4,1650,male,Spain,Tarragona,17,20,1,5+,3+,Against People,...,261.0,533,1,1990,9,2008,9,2009,5,11


Next, an overview of the columns contained is provided.

In [165]:
# Column overview
Counter(df.columns)
df.columns


Index(['id', 'V1_sex', 'V4_area_origin', 'V6_province', 'V8_age',
       'V9_age_at_program_end', 'V11_criminal_record', 'V12_n_criminal_record',
       'V13_n_crime_cat', 'V15_main_crime_cat', 'V16_violent_crime',
       'V17_crime_classification', 'V19_committed_crime',
       'V20_n_juvenile_records', 'V21_n_crime', 'V23_territory_of_execution',
       'V24_finished_program', 'V26_finished_measure_grouped',
       'V27_program_duration_cat', 'V28_days_from_crime_to_program',
       'V29_program_duration', 'V115_RECID2015_recid',
       'V10_date_of_birth_year', 'V10_date_of_birth_month',
       'V22_main_crime_date_year', 'V22_main_crime_date_month',
       'V30_program_start_year', 'V30_program_start_month',
       'V31_program_end_month'],
      dtype='object')

The attributes are of different types (nominal, ordinal, numerical) from which some of them are sensitive attributes (in terms of being fairness-related attributes). An overview of a potential mapping is provided below:

In [145]:
numerical_cols = [
    'V20_n_juvenile_records',
    'V28_days_from_crime_to_program',
    'V29_program_duration',
    'V22_main_crime_date_year',
    'V22_main_crime_date_month',
    'V30_program_start_year',
    'V30_program_start_month',
    'V31_program_end_month',
    'V10_date_of_birth_month',
]

ordinal_cols = [
    'V12_n_criminal_record',
    'V13_n_crime_cat',
    'V21_n_crime',
    'V27_program_duration_cat',   
]

cat_cols = [
    'V1_sex',
    'V4_area_origin',
    'V6_province',
    'V11_criminal_record',
    'V15_main_crime_cat',
    'V16_violent_crime',
    'V17_crime_classification',
    'V19_committed_crime',
    'V23_territory_of_execution',
    'V24_finished_program',
    'V26_finished_measure_grouped',
    'V8_age',
    'V10_date_of_birth_year_grouped',
    'V9_age_at_program_end_grouped',
]

target = 'V115_RECID2015_recid'

# Check that shapes add up (disregarding the id columns)
assert df.columns.__len__() - 1 == len(numerical_cols) + len(ordinal_cols) + len(cat_cols) + 1

For the ordinal attributes we need to map the string content to numerical ordered data. An overview of the mappings are provided below.

In [146]:
df['V10_date_of_birth_year_grouped'] = pd.cut(df['V10_date_of_birth_year'], bins=[1982, 1990, 1993, 1996])
df['V9_age_at_program_end_grouped'] = pd.cut(df['V9_age_at_program_end'], bins=[14, 16, 18, 27])


In [147]:
# Ordinal mappings
mappings = {
    'V12_n_criminal_record': {'0': 0, '1-2': 1, '3-5': 2, '5+': 3},
    'V13_n_crime_cat': {'1': 1, '2': 2, '3+': 3},
    'V27_program_duration_cat': {'<6 months': 0, '6 months < 1 year': 1, '>1 year': 2},
    'V10_date_of_birth_year_grouped':{ '(1990, 1993]': 0, '(1993, 1996]': 1, '(1982, 1990]': 2},
    'V9_age_at_program_end_grouped':{ '(14, 16]': 0, '(16, 18]': 1, '(18, 27]': 2}
}

Now, we have all information for modifying the dataframe to only contain numerical input, which will be necessary when training a Neural Network. The categorical data is one-hot-encoded along wit the binary target attribute, whereas the ordinal attributes are numericalized using the previously defined mappings: 

In [148]:
# Replace ordinal values with the specified mappings
for col in ordinal_cols:
    if col in mappings.keys():
        reverse_mapping = {v: k for k, v in mappings[col].items()}
        df[col] = df[col].astype('category').replace(mappings[col])
        df[col] = df[col].cat.rename_categories(reverse_mapping)
        df[col] = df[col].cat.codes

# Categorical attributes
df = pd.get_dummies(
    df,
    columns=cat_cols
)

df = df.drop(columns='V10_date_of_birth_year')
df = df.drop(columns='V9_age_at_program_end')

df[target] = df[target].astype('category').cat.rename_categories({0: 'No', 1: 'Yes'})

For a better workflow in the dataloader, the dataset is split into `data`, `labels` and `sensitive_data`. 

In [149]:
labels          = df[target].cat.codes # pd.get_dummies(pd.DataFrame(df[target]))
data            = df.drop(columns=[target])

In [150]:
df.head()

Unnamed: 0,id,V12_n_criminal_record,V13_n_crime_cat,V20_n_juvenile_records,V21_n_crime,V27_program_duration_cat,V28_days_from_crime_to_program,V29_program_duration,V115_RECID2015_recid,V10_date_of_birth_month,...,V8_age_14,V8_age_15,V8_age_16,V8_age_17,"V10_date_of_birth_year_grouped_(1982, 1990]","V10_date_of_birth_year_grouped_(1990, 1993]","V10_date_of_birth_year_grouped_(1993, 1996]","V9_age_at_program_end_grouped_(14, 16]","V9_age_at_program_end_grouped_(16, 18]","V9_age_at_program_end_grouped_(18, 27]"
0,691,1,2,1.0,4,1,587.0,132,Yes,11,...,0,1,0,0,0,1,0,0,1,0
1,956,1,1,1.0,2,0,107.0,258,Yes,7,...,1,0,0,0,0,0,1,1,0,0
2,2560,2,2,4.0,3,1,135.0,85,Yes,1,...,0,1,0,0,0,0,1,1,0,0
3,2018,3,1,7.0,2,0,176.0,271,Yes,4,...,1,0,0,0,0,0,1,1,0,0
4,1650,3,2,6.0,3,2,261.0,533,Yes,9,...,0,0,0,1,1,0,0,0,0,1


In [151]:
# Standardize
mu, sigma = data[numerical_cols].mean(axis=0), data[numerical_cols].std(axis=0)
data[numerical_cols] = (data[numerical_cols] - mu) / sigma

Now, the processed datafile is constructed as a dictionary and saved as a `torch`-pickle. 

In [152]:
# Setup datafile structure
datafile = {
    'data': {
        'columns': np.array(data.columns),
        'content': data.to_numpy(),
    },
    'labels': {
        'name': labels.name,
        'content': labels.to_numpy().reshape(-1, 1), 
        'mean': mu,
        'variance': sigma**2,
    }
}

# Save datafile
torch.save(datafile, f"{DATA_PATH}/processed/catalan_dataset_including_sensitive.pth")


In [153]:
print(f"Data shape: \t\t{datafile['data']['content'].shape}")
print(f"Label shape: \t\t{datafile['labels']['content'].shape}")

Data shape: 		(4652, 92)
Label shape: 		(4652, 1)


In [154]:
data.columns

Index(['id', 'V12_n_criminal_record', 'V13_n_crime_cat',
       'V20_n_juvenile_records', 'V21_n_crime', 'V27_program_duration_cat',
       'V28_days_from_crime_to_program', 'V29_program_duration',
       'V10_date_of_birth_month', 'V22_main_crime_date_year',
       'V22_main_crime_date_month', 'V30_program_start_year',
       'V30_program_start_month', 'V31_program_end_month', 'V1_sex_female',
       'V1_sex_male', 'V4_area_origin_Europe', 'V4_area_origin_Latin America',
       'V4_area_origin_Maghreb', 'V4_area_origin_Other',
       'V4_area_origin_Spain', 'V6_province_Barcelona', 'V6_province_Girona',
       'V6_province_Lleida', 'V6_province_Tarragona', 'V11_criminal_record_0',
       'V11_criminal_record_1', 'V15_main_crime_cat_Against People',
       'V15_main_crime_cat_Against Property', 'V15_main_crime_cat_Other',
       'V16_violent_crime_0', 'V16_violent_crime_1',
       'V17_crime_classification_0', 'V17_crime_classification_1',
       'V19_committed_crime_Atemptat contra l'

### Test custom `torch` DataLoader

In [155]:
from src.data.dataloader import CatalanJuvenileJustice

dataset = CatalanJuvenileJustice(
    data_path=f"{DATA_PATH}/processed/catalan_dataset_including_sensitive.pth"
)

train_loader, val_loader, test_loader = dataset.get_loaders(
    batch_size=128, shuffle=True, num_workers=1, 
    test_size=0.2, val_size=0.2, split_type='random',
)


try:
    # Checking that the dataloader works...
    for i, batch in enumerate(train_loader, 0):
        data_, label_ = batch['data'], batch['label']

    print("Successfully looped through the DataLoader!")

except KeyError as e:
    raise e

92
Successfully looped through the DataLoader!


### Test seed functionality

In [156]:
def set_seed(seed: int):
    torch.manual_seed(seed)

First in a setting for two dataloaders using the same seed which should give same splits...

In [157]:
set_seed(41)
train_loader1, val_loader1, test_loader1 = dataset.get_loaders(
    batch_size=128, shuffle=True, num_workers=1, 
    test_size=0.2, val_size=0.2, split_type='random',
)

set_seed(41)
train_loader2, val_loader2, test_loader2 = dataset.get_loaders(
    batch_size=128, shuffle=True, num_workers=1, 
    test_size=0.2, val_size=0.2, split_type='random',
)

# Should give same splits
assert train_loader1.dataset.indices == train_loader2.dataset.indices
assert val_loader1.dataset.indices == val_loader2.dataset.indices
assert test_loader1.dataset.indices == test_loader2.dataset.indices

Next, when using different seeds, we should get different splits...

In [158]:
set_seed(0)
train_loader1, val_loader1, test_loader1 = dataset.get_loaders(
    batch_size=128, shuffle=True, num_workers=1, 
    test_size=0.2, val_size=0.2, split_type='random',
)

set_seed(41)
train_loader2, val_loader2, test_loader2 = dataset.get_loaders(
    batch_size=128, shuffle=True, num_workers=1, 
    test_size=0.2, val_size=0.2, split_type='random',
)

# Should give non-identical splits
assert train_loader1.dataset.indices != train_loader2.dataset.indices
assert val_loader1.dataset.indices != val_loader2.dataset.indices
assert test_loader1.dataset.indices != test_loader2.dataset.indices

## Train model (temporary)

In [18]:
from src.models.model import FullyConnected, get_loss_function, get_optimizer

model = FullyConnected(channels_in = dataset.n_attributes, channels_out = 2)
criterion = get_loss_function(type='NLL')
optimizer = get_optimizer(model, type='Adam')


TypeError: 'NotImplementedType' object is not callable

In [19]:
print(model)

FullyConnected(
  (net): Sequential(
    (0): Linear(in_features=84, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=2, bias=True)
    (5): Softmax(dim=1)
  )
)


In [None]:
batch = next(iter(train_loader))
inputs, labels = batch['data'], batch['label']

probs, logits = model(inputs)
loss = criterion(logits, labels.flatten())

In [None]:
# Label distribution
counts = Counter(df[target].cat.codes)
sorted({k: np.round(v / sum(counts.values()), 4) for k, v in counts.items()}.items(), key=lambda x: x[0])

[(0, 0.6574), (1, 0.3426)]