In [1]:
import os
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import seaborn as sns

In [2]:
RESULTS_FOLDER = "../results"
DATA_FOLDER = "../data"
TEMP_FOLDER = "../tmp"

## Load the training dataset

Load the train dataset in a pandas dataframe

In [3]:
df_train_path = os.path.join(DATA_FOLDER, 'train_dataset.csv')
df_train = pd.read_csv(df_train_path)

## Create additional features

We create the following additional features:

- `history_of_violence` - sum of all violence-related crimes in the past
- `socioeconomic_stability` - 1 / (1 + `priors_count`). If no priors count this will be equal to 1 (good stability), otherwise it will start getting smaller with each increase of priors


In [4]:
df_train["history_of_violence"] = (
    df_train["juv_fel_count"] +
    df_train["juv_misd_count"] +
    df_train["juv_other_count"] +
    df_train["priors_count"]
)

# Socioeconomic stability proxy
df_train["socioeconomic_stability"] = (1 / (1 + df_train["priors_count"])) 

## Prepare data for model training

- Select features to be used for training
    - `age`
    - `priors_count`
    - `history_of_violence`
    - `days_b_screening_arrest`
    - `socioeconomic_stability`
    - `c_charge_degree_F`
    - `c_charge_degree_M`
- Scale all features, mean 0 and std dev 1


- Select the label for training
    - `two_year_recid` * 10 to put the scale between 0 and 10



In [5]:

X_train = df_train[[ 
    "age", "priors_count", "history_of_violence", 
    "socioeconomic_stability", "c_charge_degree_F", "c_charge_degree_M"
]]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

y_train = df_train["two_year_recid"] * 10


In [10]:
print(X_train_scaled.shape)
print(pd.DataFrame(X_train_scaled).info())
print(pd.DataFrame(X_train_scaled).describe())
print(pd.DataFrame(X_train_scaled).head())

(5771, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5771 entries, 0 to 5770
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       5771 non-null   float64
 1   1       5771 non-null   float64
 2   2       5771 non-null   float64
 3   3       5771 non-null   float64
 4   4       5771 non-null   float64
 5   5       5771 non-null   float64
dtypes: float64(6)
memory usage: 270.6 KB
None
                  0             1             2             3             4  \
count  5.771000e+03  5.771000e+03  5.771000e+03  5.771000e+03  5.771000e+03   
mean  -1.274323e-16  3.078075e-17 -4.432427e-17 -1.280479e-16 -1.083482e-16   
std    1.000087e+00  1.000087e+00  1.000087e+00  1.000087e+00  1.000087e+00   
min   -1.408046e+00 -7.109982e-01 -7.163197e-01 -1.307391e+00 -1.365291e+00   
25%   -8.143374e-01 -7.109982e-01 -7.163197e-01 -9.133677e-01 -1.365291e+00   
50%   -3.054445e-01 -2.990746e-01 -3.310558e-01 -4.477038e-01  7.32

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define a custom dataset
class COMPASDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)

# Define the Neural Network
class RiskScoreNN(nn.Module):
    def __init__(self, input_size):
        super(RiskScoreNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(32, 10)  # 10 neurons for 10 risk scores
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.softmax(self.fc3(x))
        return x

# # Load and preprocess the dataset
# df = pd.read_csv("compas-dataset.csv")  # Replace with your dataset file
# features = df.drop(columns=["two_year_recid", "decile_score"])  # Drop label and COMPAS score
# labels = df["two_year_recid"]  # Use the two_year_recid label

# # Standardize features
# scaler = StandardScaler()
# features = scaler.fit_transform(features)

# # Split data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Create DataLoaders
train_dataset = COMPASDataset(X_train_scaled, y_train)
# test_dataset = COMPASDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize model, loss function, and optimizer
input_size = X_train_scaled.shape[1]
model = RiskScoreNN(input_size)
criterion = nn.CrossEntropyLoss()  # For multiclass classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(train_loader)}")

# # Evaluation
# model.eval()
# correct = 0
# total = 0
# compas_correct = 0
# with torch.no_grad():
#     for inputs, targets in test_loader:
#         outputs = model(inputs)
#         _, predicted = torch.max(outputs, 1)
#         correct += (predicted == targets).sum().item()
#         total += targets.size(0)

#         # Compare with COMPAS decile_score (optional)
#         compas_scores = df.loc[test_dataset.indices, "decile_score"]  # Assuming decile_score exists
#         compas_pred = (compas_scores > 5).astype(int)  # Example threshold
#         compas_correct += (compas_pred == targets.numpy()).sum()

# print(f"Model Accuracy: {correct / total * 100:.2f}%")
# print(f"COMPAS Accuracy: {compas_correct / total * 100:.2f}%")


ModuleNotFoundError: No module named 'torch'

In [None]:
import torch

N_INPUT = 6
# number of neurons in the hidden layer of the MLP
N_HIDDEN = 64

# define network parameters
GEN = torch.Generator(device=device).manual_seed(2147483647)

# input layer
C = torch.randn((vocab_size, N_INPUT),
                 generator=GEN, device=device)

# hidden layer
w1 = torch.randn((N_INPUT*N_BLOCK, N_HIDDEN),
                 generator=GEN, device=device) * (5/3) / ((N_BLOCK * N_INPUT)**0.5)

# useless because of batchnorm
b1 = torch.randn((N_HIDDEN,1),
                 generator=GEN, device=device) * 0.1

# output layer
w2 = torch.randn((N_HIDDEN, vocab_size),
                 generator=GEN, device=device) * 0.1
b2 = torch.randn( (vocab_size, 1),
                 generator=GEN, device=device) * 0.1

# batch normalization
bn_gain = torch.randn((1, N_HIDDEN), generator=GEN, device=device) * 0.1 + 1.0
bn_bias = torch.randn((1, N_HIDDEN), generator=GEN, device=device) * 0.1

# network params
parameters = [C, w1, b1, w2, b2, bn_gain, bn_bias]
# print(sum(p.nelement() for p in parameters))

for p in parameters:
    p.requires_grad = True

# network definition
BATCH_SIZE = 32

N_BATCH = torch.randint(0, X_train.shape[0], (BATCH_SIZE, ), generator=GEN, device=device)
x_batch, y_batch = X_train[N_BATCH], Y_train[N_BATCH]

In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# # Load your dataset
# # Assuming you have a CSV file, replace 'dataset.csv' with your file path
# df_train = pd.read_csv('dataset.csv')

# # Preprocess features and target
# # Assuming 'two_year_recid' is your target column and 'decile_score' is used for comparison
# target_column = 'two_year_recid'
# features = df_train.drop(columns=[target_column, 'decile_score'])  # Drop unused columns
# target = df_train[target_column]

# # Create decile bins if necessary (e.g., scale target into 0-9)
# # Assuming target is binary, you can scale it into deciles based on conditions or continuous scores
# # For simplicity, assume it is already in 0-9 deciles.

# # Train-Test Split
# X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# # Preprocessing pipeline
# numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns
# categorical_features = X_train.select_dtypes(include=['object']).columns

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numeric_features),
#         ('cat', OneHotEncoder(), categorical_features)
#     ])

# Build the TensorFlow model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')  # 10 neurons for deciles
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Combine preprocessing and training in a pipeline
pipeline = Pipeline(steps=[
    ('model', tf.keras.wrappers.scikit_learn.KerasClassifier(
        build_fn=lambda: model,
        epochs=20,
        batch_size=32,
        verbose=1
    ))
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
loss, accuracy = model.evaluate(preprocessor.transform(X_test), y_test)
print(f"Test Accuracy: {accuracy:.2f}")

# Predict and compare with COMPAS
y_pred = model.predict(preprocessor.transform(X_test))
print(f"Predicted deciles: {np.argmax(y_pred, axis=1)}")


AttributeError: module 'keras._tf_keras.keras' has no attribute 'wrappers'