In [144]:
import os
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import seaborn as sns

In [145]:
RESULTS_FOLDER = "../results"
DATA_FOLDER = "../data"
TEMP_FOLDER = "../tmp"

## Load the training dataset

Load the train dataset in a pandas dataframe

In [146]:
df_train_path = os.path.join(DATA_FOLDER, 'train_dataset.csv')
df_train = pd.read_csv(df_train_path)

## Create additional features

We create the following additional features:

- `history_of_violence` - sum of all violence-related crimes in the past
- `socioeconomic_stability` - 1 / (1 + `priors_count`). If no priors count this will be equal to 1 (good stability), otherwise it will start getting smaller with each increase of priors


In [147]:
df_train["history_of_violence"] = (
    df_train["juv_fel_count"] +
    df_train["juv_misd_count"] +
    df_train["juv_other_count"] +
    df_train["priors_count"]
)

# Socioeconomic stability proxy
df_train["socioeconomic_stability"] = (1 / (1 + df_train["priors_count"])) 

## Prepare data for model training

- Select features to be used for training
    - `age`
    - `priors_count`
    - `history_of_violence`
    - `days_b_screening_arrest`
    - `socioeconomic_stability`
    - `c_charge_degree_F`
    - `c_charge_degree_M`
- Scale all features, mean 0 and std dev 1


- Select the label for training
    - `two_year_recid` * 10 to put the scale between 0 and 10



In [148]:

X_train = df_train[[ 
    "age", "priors_count", "history_of_violence", 
    "socioeconomic_stability", "c_charge_degree_F", "c_charge_degree_M"
]]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

y_train = df_train["two_year_recid"] * 10


In [None]:
print(X_train_scaled.shape)
print(pd.DataFrame(X_train_scaled).info())
print(pd.DataFrame(X_train_scaled).describe())
print(pd.DataFrame(X_train_scaled).head())

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load dataset

df_train_path = os.path.join(DATA_FOLDER, 'train_dataset.csv')
df_train = pd.read_csv(df_train_path)

# Feature engineering
df_train["history_of_violence"] = (
    df_train["juv_fel_count"] +
    df_train["juv_misd_count"] +
    df_train["juv_other_count"] +
    df_train["priors_count"]
)

# Socioeconomic stability proxy
df_train["socioeconomic_stability"] = (1 / (1 + df_train["priors_count"]))

# Select features and target
X_train = df_train[[ 
    "age", "priors_count", "history_of_violence", 
    "socioeconomic_stability", "c_charge_degree_F", "c_charge_degree_M"
]]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

y_train = (df_train["two_year_recid"] * 10)

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_dev_tensor = torch.tensor(X_val, dtype=torch.float32)
y_dev_tensor = torch.tensor(y_val.values, dtype=torch.long)

# Define the neural network model
class RiskScoreModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(RiskScoreModel, self).__init__()
        self.hidden1 = nn.Linear(input_size, 64)
        self.bn1 = nn.BatchNorm1d(64)  # BatchNorm after the first hidden layer
        self.hidden2 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)  # BatchNorm after the second hidden layer
        self.output = nn.Linear(32, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        x = self.relu(self.bn1(self.hidden1(x)))  # Apply BN and ReLU
        x = self.relu(self.bn2(self.hidden2(x)))  # Apply BN and ReLU
        x = self.softmax(self.output(x))  # Apply Softmax
        return x

# Model parameters
N_INPUT = X_train.shape[1]
N_OUTPUT = 11  # 10 classes
model = RiskScoreModel(N_INPUT, N_OUTPUT)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop
epochs = 50
batch_size = 32
num_batches = len(X_train_tensor) // batch_size

for epoch in range(epochs):
    model.train()
    for i in range(num_batches):
        start = i * batch_size
        end = start + batch_size
        X_batch = X_train_tensor[start:end]
        y_batch = y_train_tensor[start:end]

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
    
    val_accuracies = []  
    
    # # Validation step
    # model.eval()
    # with torch.no_grad():
    #     val_outputs = model(X_val_tensor)
    #     val_loss = criterion(val_outputs, y_val_tensor)
    #     val_preds = torch.argmax(val_outputs, dim=1)
    #     val_accuracy = (val_preds == y_val_tensor).float().mean()
    #     val_accuracies.append(val_accuracy.item())

    # print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}, Val Accuracy: {val_accuracy.item():.4f}")


    # Validation step
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_train_tensor)
        val_loss = criterion(val_outputs, y_train_tensor)
        val_preds = torch.argmax(val_outputs, dim=1)
        val_accuracy = (val_preds == y_train_tensor ).float().mean()
        val_accuracies.append(val_accuracy.item())

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}, Val Accuracy: {val_accuracy.item():.4f}")




# Evaluate the model on validation data
model.eval()
with torch.no_grad():
    val_outputs = model(X_dev_tensor)
    val_preds = torch.argmax(val_outputs, dim=1)
    print(classification_report(y_dev_tensor, val_preds))

import matplotlib.pyplot as plt

# Plot accuracy
plt.figure(figsize=(10, 5))

plt.plot(val_accuracies, label="Validation Accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.title("Training and Validation Accuracy")
plt.legend()
plt.show()

In [167]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch.nn.functional as F


# Load dataset

df_train_path = os.path.join(DATA_FOLDER, 'train_dataset.csv')
df_train = pd.read_csv(df_train_path)

df_dev_path = os.path.join(DATA_FOLDER, 'dev_dataset.csv')
df_dev = pd.read_csv(df_dev_path)

print(df_train.shape)
print(df_dev.shape)


def feature_engineering(df):
    df["history_of_violence"] = (
        df["juv_fel_count"] +
        df["juv_misd_count"] +
        df["juv_other_count"] +
        df["priors_count"]
    )

    # Socioeconomic stability proxy
    df["socioeconomic_stability"] = (1 / (1 + df["priors_count"]))

    X = df[[ "age", "priors_count", "history_of_violence",
                "socioeconomic_stability", "c_charge_degree_F", "c_charge_degree_M"]]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled


X_train_scaled = feature_engineering(df_train)
X_dev_scaled = feature_engineering(df_dev)

# prepare targets for train and validation
y_train = (df_train["two_year_recid"] * 10).clip(0,9)
y_dev = (df_dev["two_year_recid"] * 10).clip(0,9)

print(X_train_scaled.shape, y_train.shape)
print(X_dev_scaled.shape, y_dev.shape)


# convert to torch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_dev_tensor = torch.tensor(X_dev_scaled, dtype=torch.float32)
y_dev_tensor = torch.tensor(y_dev.values, dtype=torch.long)


N_INPUT = X_train_tensor.shape[1]
N_OUTPUT = 10


# define the model
model = nn.Sequential(
            nn.Linear(N_INPUT, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Linear(32, N_OUTPUT),
            nn.Softmax(dim=1)
        )

parameters = [p for layer in layers for p in layer.parameters()]
print("number of parameters:", sum(p.nelement() for p in parameters))
for p in parameters:
    p.requires_grad = True

epochs = 50
batch_size = 32

number_of_samples = X_train_tensor.size(0)  

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)


for epoch in range(epochs):
    # Shuffle the data at the start of each epoch
    indices = torch.randperm(number_of_samples)
    X_train_shuffled = X_train_tensor[indices]
    y_train_shuffled = y_train_tensor[indices]
    
    # Loop through mini-batches
    for i in range(0, number_of_samples, batch_size):
        # Get the mini-batch
        X_batch = X_train_shuffled[i:i+batch_size]
        y_batch = y_train_shuffled[i:i+batch_size]


        # forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    scheduler.step()


    # Validation step
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_dev_tensor)
        val_loss = criterion(val_outputs, y_dev_tensor)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")

        
        # for layer in layers:
        #     layer.out.retain_grad()
        # optimizer.zero_grad()
        # loss.backward()
        # optimizer.step()
        
        # epoch_train_loss += loss.item()
        
        
    # Validation step
    # x = X_dev_tensor
    # for layer in layers:
    #     x = layer(x)
    # loss = F.cross_entropy(x, y_dev_tensor)
    # print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")    

(5771, 27)
(722, 27)
(5771, 6) (5771,)
(722, 6) (722,)
number of parameters: 3050
Epoch 1/50, Loss: 2.0245, Val Loss: 1.8206
Epoch 2/50, Loss: 1.9582, Val Loss: 1.7868
Epoch 3/50, Loss: 1.7638, Val Loss: 1.7891
Epoch 4/50, Loss: 1.7892, Val Loss: 1.7908
Epoch 5/50, Loss: 2.1527, Val Loss: 1.7892
Epoch 6/50, Loss: 1.7947, Val Loss: 1.7901
Epoch 7/50, Loss: 1.7311, Val Loss: 1.7891
Epoch 8/50, Loss: 1.9237, Val Loss: 1.7903
Epoch 9/50, Loss: 1.7339, Val Loss: 1.7933
Epoch 10/50, Loss: 1.6219, Val Loss: 1.7932
Epoch 11/50, Loss: 1.7866, Val Loss: 1.7874
Epoch 12/50, Loss: 1.7372, Val Loss: 1.7913
Epoch 13/50, Loss: 1.8281, Val Loss: 1.7883
Epoch 14/50, Loss: 1.5565, Val Loss: 1.7837
Epoch 15/50, Loss: 1.7112, Val Loss: 1.7922
Epoch 16/50, Loss: 1.6870, Val Loss: 1.7861
Epoch 17/50, Loss: 1.5614, Val Loss: 1.7889
Epoch 18/50, Loss: 1.6341, Val Loss: 1.7881
Epoch 19/50, Loss: 1.6426, Val Loss: 1.7851
Epoch 20/50, Loss: 1.6223, Val Loss: 1.7871
Epoch 21/50, Loss: 1.5869, Val Loss: 1.7832
Epo

In [170]:
risk_metrics = torch.argmax(outputs, dim=1)+1

print(risk_metrics)

tensor([10, 10,  1, 10,  1, 10,  1, 10, 10,  1,  1])


In [None]:
def compare_with_two_year_recid(pred_type, y_pred):
    '''
    categorize the predicted scores into low, medium, and high risk groups
    '''
    
    def categorize_score(score):
        if score <= 4:
            return "Low"
        elif 5 <= score <= 7:
            return "Medium"
        else:
            return "High"

    # Add the predictions to the dataframe by mapping the categorize_score function to the predictions
    # prediction values will be low, medium, or high
    df_train[f"Predicted_{pred_type}_Risk_Group"] = pd.Categorical(
        pd.Series(y_pred).map(categorize_score),
        categories=["Low", "Medium", "High"],
        ordered=True
    )

    # groups based on predicted risk group and actual recidivism
    # size() returns the number of rows in each group
    # unstack() pivots the table so that the predicted risk group is the index and the two_year_recid is the column
    predicted_grouped = df_train.groupby(
        [f"Predicted_{pred_type}_Risk_Group", "two_year_recid"]).size().unstack(fill_value=0)

    # save to csv
    predicted_file_path = os.path.join(RESULTS_FOLDER, f"predicted_vs_recid_{pred_type}.csv")
    predicted_grouped.to_csv(predicted_file_path)

    # as above but add race to the grouping
    race_comparison = df_train.groupby(
        ["race", f"Predicted_{pred_type}_Risk_Group", "two_year_recid"]).size().unstack(fill_value=0)

    # save to csv
    race_comparison_file_path = os.path.join(RESULTS_FOLDER, f"predicted_risk_by_race_{pred_type}_summary.csv")
    race_comparison.to_csv(race_comparison_file_path)