# 0. Load libraries

In [23]:
# Core libraries
import numpy as np
import pandas as pd

# Preprocessing and model selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures

# Pipeline utilities
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Machine Learning Models
from sklearn.ensemble import RandomForestClassifier


import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier



import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# 1. Data Info

In [2]:
df = pd.read_csv("../data/compas-scores-two-years.csv")
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,8/14/13,Male,4/18/47,69,Greater than 45,Other,...,1,Low,8/14/13,7/7/14,7/14/14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,1/27/13,Male,1/22/82,34,25 - 45,African-American,...,1,Low,1/27/13,1/26/13,2/5/13,0,9,159,1,1
2,4,ed philo,ed,philo,4/14/13,Male,5/14/91,24,Less than 25,African-American,...,3,Low,4/14/13,6/16/13,6/16/13,4,0,63,0,1
3,5,marcu brown,marcu,brown,1/13/13,Male,1/21/93,23,Less than 25,African-American,...,6,Medium,1/13/13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,3/26/13,Male,1/22/73,43,25 - 45,Other,...,1,Low,3/26/13,,,2,0,1102,0,0


# 2. Data Cleaning

In [3]:
# Function to find duplicated columns
def find_duplicated_columns(df):
    duplicated_columns = []
    for i in range(len(df.columns)):
        for j in range(i+1, len(df.columns)):
            if df.iloc[:,i].equals(df.iloc[:,j]):
                duplicated_columns.append((df.columns[i], df.columns[j]))
    return duplicated_columns

In [4]:
# Find duplicated columns in the dataset
duplicated_columns = find_duplicated_columns(df)
duplicated_columns

[('compas_screening_date', 'screening_date'),
 ('compas_screening_date', 'v_screening_date'),
 ('decile_score', 'decile_score.1'),
 ('priors_count', 'priors_count.1'),
 ('screening_date', 'v_screening_date')]

In [5]:
# Merge the deplicated columns
df['screening_date'] = df['compas_screening_date'] 
df['decile_score'] = df['decile_score'] 
df['priors_count'] = df['priors_count'] 

# Excluding time components in c_jail_in and c_jail_out
df['c_jail_in'] = pd.to_datetime(df['c_jail_in']).dt.date
df['c_jail_out'] = pd.to_datetime(df['c_jail_out']).dt.date

# Drop some irrelevant columns
data_drop = df.drop(columns=['compas_screening_date', 
                               'v_screening_date', 
                               'decile_score.1', 
                               'priors_count.1', 
                               'violent_recid', 
                               'type_of_assessment', 
                               'v_type_of_assessment'])

  df['c_jail_in'] = pd.to_datetime(df['c_jail_in']).dt.date
  df['c_jail_out'] = pd.to_datetime(df['c_jail_out']).dt.date


In [189]:
# Save to CSV
data_drop.to_csv('../data/data_cleaned.csv', index=False)

# 3. Feature Engineering

In [190]:
# Partition
def partition(df, age_cat):
    partitions = {}
    unique_values = df[age_cat].unique() 
    for value in unique_values: 
        partitions[value] = df[df[age_cat] == value]
    return partitions 

In [191]:
df_path = '../data/data_cleaned.csv'
df = pd.read_csv(df_path)

In [192]:
age_cat_partitions = partition(df, 'age_cat')
less_than_25_df = age_cat_partitions.get('Less than 25', pd.DataFrame())
twenty_five_to_45_df = age_cat_partitions.get('25 - 45', pd.DataFrame())
greater_than_45_df = age_cat_partitions.get('Greater than 45', pd.DataFrame())

print("Less than 25 DataFrame Shape:", less_than_25_df.shape)
print("25 - 45 DataFrame Shape:", twenty_five_to_45_df.shape)
print("Greater than 45 DataFrame Shape:", greater_than_45_df.shape)

Less than 25 DataFrame Shape: (1529, 46)
25 - 45 DataFrame Shape: (4109, 46)
Greater than 45 DataFrame Shape: (1576, 46)


In [193]:
# Delta Function
def calculate_delta(partitions):
    delta_results = {
        'Less than 25': {'male': 0, 'female': 0},
        '25 - 45': {'male': 0, 'female': 0},
        'Greater than 45': {'male': 0, 'female': 0},
    }
    
    for age_cat, df_partition in partitions.items():
        # Calculate the probabilities
        P_plus_male = df_partition[df_partition['sex'] == 'Male']['two_year_recid'].mean() # calculate P(+|e_i, gender)
        P_plus_female = df_partition[df_partition['sex'] == 'Female']['two_year_recid'].mean()
        
        # Assuming the number of people of each gender in each partition as G_i
        G_male = len(df_partition[df_partition['sex'] == 'Male'])
        G_female = len(df_partition[df_partition['sex'] == 'Female'])
        
        # Calculate delta using the formula provided
        delta_male = G_male * abs((P_plus_male - P_plus_female) / 2) # G_male
        delta_female = G_female * abs((P_plus_female - P_plus_male) / 2) # G_female
        
        delta_results[age_cat]['male'] = delta_male
        delta_results[age_cat]['female'] = delta_female
    
    return delta_results

In [194]:
delta_results = calculate_delta(age_cat_partitions)

# Print the delta values to check them
for age_cat, deltas in delta_results.items():
    print(f"Age Category: {age_cat}")
    print(f"  Delta Male: {deltas['male']}")
    print(f"  Delta Female: {deltas['female']}\n")

Age Category: Less than 25
  Delta Male: 132.03993055555554
  Delta Female: 30.642626913779207

Age Category: 25 - 45
  Delta Male: 145.10346964064436
  Delta Female: 35.46290127195639

Age Category: Greater than 45
  Delta Male: 62.50666666666667
  Delta Female: 14.695924764890282



In [195]:
delta_results

{'Less than 25': {'male': 132.03993055555554, 'female': 30.642626913779207},
 '25 - 45': {'male': 145.10346964064436, 'female': 35.46290127195639},
 'Greater than 45': {'male': 62.50666666666667, 'female': 14.695924764890282}}

## Algo 1: Local Massaging:
less than 25

In [196]:
# Prepare data
X_columns = [
    'race', 'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count',
    'priors_count', 'days_b_screening_arrest', 'c_days_from_compas', 'c_charge_degree',
    'is_recid', 'r_days_from_arrest', 'is_violent_recid', 'score_text', 'v_decile_score',
    'v_score_text', 'start', 'end', 'event' #, 'two_year_recid'
]

y_column = ['two_year_recid']

# Get partition X and y
less_than_25_X_1 = less_than_25_df[columns_to_select]
less_than_25_y_1 = less_than_25_df[y_column]


# Get delta
delta_male_1 = delta_results['Less than 25']['male']
delta_female_1 = delta_results['Less than 25']['female']



# 1. H1: Build ranker (logistic regrewssion)

import pandas as pd
import xgboost as xgb

# Assuming less_than_25_df is your DataFrame containing the features and target variable

# Convert text columns to category
for col in X_columns:
    if less_than_25_df[col].dtype == 'object':
        less_than_25_df[col] = less_than_25_df[col].astype('category')

# Prepare X and y
X = less_than_25_df[X_columns]
y = less_than_25_df[y_column].values.ravel()  # Ensure y is in the correct shape

# Since we're using categories, tell XGBoost to treat these columns as categorical
categorical_columns = [X.columns.get_loc(c) for c in X.select_dtypes(['category']).columns]
dtrain = xgb.DMatrix(X, label=y, enable_categorical=True, feature_names=X_columns, feature_types='c')

# Specify parameters for XGBoost
params = {
    'objective': 'binary:logistic',  # Objective for binary classification
    'eval_metric': 'logloss',  # Evaluation metric
    'learning_rate': 0.1,  # Learning rate
    'max_depth': 6,  # Depth of the trees
    'min_child_weight': 1,  # Minimum sum of instance weight (hessian) needed in a child
    'subsample': 0.8,  # Subsample ratio of the training instances
    'colsample_bytree': 0.8,  # Subsample ratio of columns when constructing each tree
    # 'n_estimators': 100,  # Number of trees
}

# Training the model
bst = xgb.train(params, dtrain, num_boost_round=100)

# Predicting probabilities for y
less_than_25_y_hat_1 = bst.predict(dtrain)

less_than_25_y_hat_1 = pd.DataFrame(less_than_25_y_hat_1, columns=['two_year_recid_prob'])
less_than_25_y_hat_1 = less_than_25_y_hat_1.set_index(less_than_25_y_1.index)

# Join all tables
less_than_25_df = pd.concat([less_than_25_X_1, less_than_25_y_1, less_than_25_y_hat_1], axis=1)

# Add sex back for future category
less_than_25_df = less_than_25_df.join(df['sex'], how='left')


# Female
num_rows = int(round(delta_female_1, 0))

# Get the indices of the rows that meet your conditions and you want to update
indices_to_update = less_than_25_df[(less_than_25_df['sex'] == 'Female') & (less_than_25_df['two_year_recid'] == 0)].sort_values(by='two_year_recid_prob', ascending=False).head(num_rows).index

# Use loc to update the 'two_year_recid' values of those indices in the original DataFrame
less_than_25_df.loc[indices_to_update, 'two_year_recid'] = 1

# Male
num_rows = int(round(delta_male_1, 0))

# Get the indices of the rows that meet your conditions and you want to update
indices_to_update = less_than_25_df[(less_than_25_df['sex'] == 'Male') & (less_than_25_df['two_year_recid'] == 1)].sort_values(by='two_year_recid_prob', ascending=True).head(num_rows).index

# Use loc to update the 'two_year_recid' values of those indices in the original DataFrame
less_than_25_df.loc[indices_to_update, 'two_year_recid'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  less_than_25_df[col] = less_than_25_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  less_than_25_df[col] = less_than_25_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  less_than_25_df[col] = less_than_25_df[col].astype('category')
A value is tryin

In [197]:
# 25-45

# Get partition X and y
less_than_25_X_1 = twenty_five_to_45_df[columns_to_select]
less_than_25_y_1 = twenty_five_to_45_df[y_column]


# Get delta
delta_male_1 = delta_results['25 - 45']['male']
delta_female_1 = delta_results['25 - 45']['female']



# 1. H1: Build ranker (logistic regrewssion)

import pandas as pd
import xgboost as xgb

# Assuming less_than_25_df is your DataFrame containing the features and target variable

# Convert text columns to category
for col in X_columns:
    if twenty_five_to_45_df[col].dtype == 'object':
        twenty_five_to_45_df[col] = twenty_five_to_45_df[col].astype('category')

# Prepare X and y
X = twenty_five_to_45_df[X_columns]
y = twenty_five_to_45_df[y_column].values.ravel()  # Ensure y is in the correct shape

# Since we're using categories, tell XGBoost to treat these columns as categorical
categorical_columns = [X.columns.get_loc(c) for c in X.select_dtypes(['category']).columns]
dtrain = xgb.DMatrix(X, label=y, enable_categorical=True, feature_names=X_columns, feature_types='c')

# Specify parameters for XGBoost
params = {
    'objective': 'binary:logistic',  # Objective for binary classification
    'eval_metric': 'logloss',  # Evaluation metric
    'learning_rate': 0.1,  # Learning rate
    'max_depth': 6,  # Depth of the trees
    'min_child_weight': 1,  # Minimum sum of instance weight (hessian) needed in a child
    'subsample': 0.8,  # Subsample ratio of the training instances
    'colsample_bytree': 0.8,  # Subsample ratio of columns when constructing each tree
    # 'n_estimators': 100,  # Number of trees
}

# Training the model
bst = xgb.train(params, dtrain, num_boost_round=100)

# Predicting probabilities for y
less_than_25_y_hat_1 = bst.predict(dtrain)

less_than_25_y_hat_1 = pd.DataFrame(less_than_25_y_hat_1, columns=['two_year_recid_prob'])
less_than_25_y_hat_1 = less_than_25_y_hat_1.set_index(less_than_25_y_1.index)

# Join all tables
twenty_five_to_45_df = pd.concat([less_than_25_X_1, less_than_25_y_1, less_than_25_y_hat_1], axis=1)

# Add sex back for future category
twenty_five_to_45_df = twenty_five_to_45_df.join(df['sex'], how='left')


# Female
num_rows = int(round(delta_female_1, 0))

# Get the indices of the rows that meet your conditions and you want to update
indices_to_update = twenty_five_to_45_df[(twenty_five_to_45_df['sex'] == 'Female') & (twenty_five_to_45_df['two_year_recid'] == 0)].sort_values(by='two_year_recid_prob', ascending=False).head(num_rows).index

# Use loc to update the 'two_year_recid' values of those indices in the original DataFrame
twenty_five_to_45_df.loc[indices_to_update, 'two_year_recid'] = 1

# Male
num_rows = int(round(delta_male_1, 0))

# Get the indices of the rows that meet your conditions and you want to update
indices_to_update = twenty_five_to_45_df[(twenty_five_to_45_df['sex'] == 'Male') & (twenty_five_to_45_df['two_year_recid'] == 1)].sort_values(by='two_year_recid_prob', ascending=True).head(num_rows).index

# Use loc to update the 'two_year_recid' values of those indices in the original DataFrame
twenty_five_to_45_df.loc[indices_to_update, 'two_year_recid'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twenty_five_to_45_df[col] = twenty_five_to_45_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twenty_five_to_45_df[col] = twenty_five_to_45_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twenty_five_to_45_df[col] = twenty_five_to_45_df[col].astyp

In [198]:
# greater than 45

# to be continued:


Unnamed: 0,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,c_charge_degree,is_recid,r_days_from_arrest,is_violent_recid,score_text,v_decile_score,v_score_text,start,end,event,two_year_recid,two_year_recid_prob,sex
1,African-American,0,3,0,0,0,-1.0,1.0,F,1,,1,Low,1,Low,9,159,1,1,0.998814,Male
4,Other,0,1,0,0,2,,76.0,F,0,,0,Low,1,Low,0,1102,0,0,0.000487,Male
5,Other,0,1,0,0,0,0.0,0.0,M,0,,0,Low,1,Low,1,853,0,0,0.000256,Male
6,Caucasian,0,6,0,0,14,-1.0,1.0,F,1,0.0,0,Medium,2,Low,5,40,1,1,0.999537,Male
7,Other,0,4,0,0,3,-1.0,1.0,F,0,,0,Low,3,Low,0,265,0,0,0.001361,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7202,Hispanic,0,2,0,0,0,0.0,0.0,F,0,,0,Low,3,Low,173,322,0,0,0.002238,Male
7203,Hispanic,0,3,0,0,0,-1.0,1.0,F,0,,0,Low,1,Low,0,887,0,0,0.000261,Male
7204,African-American,0,4,0,0,2,-1.0,1.0,M,0,,0,Low,3,Low,5,825,0,0,0.000331,Male
7207,African-American,0,2,0,0,0,-1.0,1.0,M,1,1.0,0,Low,2,Low,0,529,1,1,0.993684,Male


In [39]:
def preprocess_dataset(file_path, features):
    data = pd.read_csv(file_path)
    
    # Generate interaction features
    data['race_x_priors_count'] = data['race'].astype(str) + '_' + data['priors_count'].astype(str)
    data['sex_age_cat_interaction'] = data['sex'].astype(str) + "_" + data['age_cat'].astype(str)
    data['race_decile_interaction'] = data['race'].astype(str) + "_" + data['decile_score'].astype(str)
    data['charge_priors_interaction'] = data['c_charge_degree'].astype(str) + "_" + data['priors_count'].astype(str)
    data['sex_race_interaction'] = data['sex'].astype(str) + "_" + data['race'].astype(str)
    data['age_priors_interaction'] = data['age_cat'].astype(str) + "_" + data['priors_count'].astype(str)
    
    # Update features list to include the new interaction features
    interaction_features = [
        'race_x_priors_count', 'sex_age_cat_interaction', 'race_decile_interaction',
        'charge_priors_interaction', 'sex_race_interaction', 'age_priors_interaction'
    ]
    all_features = features + interaction_features
    
    # Identify categorical and numerical features
    categorical_features = [feature for feature in all_features if data[feature].dtype == 'O']
    numerical_features = [feature for feature in all_features if feature not in categorical_features]
    
    # Preprocessing transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'
    )

    # Create preprocessing pipeline
    preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
    
    # Apply preprocessing to the dataset with the features
    data_processed = preprocessing_pipeline.fit_transform(data)
    
    # Get feature names after preprocessing
    try:
        categorical_features_names = preprocessing_pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out()
    except AttributeError:  # For older versions of sklearn
        categorical_features_names = preprocessing_pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names(categorical_features)
    processed_feature_names = numerical_features + list(categorical_features_names)
    
    remainder_columns = [col for col in data.columns if col not in all_features]
    feature_names = processed_feature_names + remainder_columns
    
    # Convert the numpy array returned by ColumnTransformer back to a dataframe
    data_processed_df = pd.DataFrame(data_processed, columns=feature_names)
    
    for col in remainder_columns:
        if data[col].dtype in ['int64', 'int32']:
            data_processed_df[col] = data_processed_df[col].astype(data[col].dtype)
    
    data_processed_df.index = data.index
    
    return data_processed_df

In [40]:
# File path and features list
file_path = '../data/data_cleaned.csv'
features = ['sex', 'race', 'age_cat', 'decile_score', 'v_decile_score', 'priors_count', 'c_charge_degree']

# Process the dataset
data_processed_df = preprocess_dataset(file_path, features)

# Save the processed dataset to a new CSV file
data_processed_df.to_csv('../data/featured_data.csv', index=False)



# 3. Feature Selection

In [43]:
# XGB
def xgboost_feature_importance(file_path, target):
    # Load the dataset
    data = pd.read_csv(file_path)
    
    # Dynamically identify numeric and categorical features excluding the target
    numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.drop(target).tolist()
    categorical_features = data.select_dtypes(include=['object', 'category']).columns.tolist()

    # Define preprocessing for numeric and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(sparse=True, handle_unknown='ignore'), categorical_features),
        ])

    # Splitting data into features (X) and target (y)
    X = data.drop(columns=[target])
    y = data[target]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Apply preprocessing
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)

    # Initialize and train the XGBoost classifier
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    model.fit(X_train_transformed, y_train)

    feature_names = numeric_features + \
                    preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features).tolist()

    # Extract feature importance
    feature_importance = model.feature_importances_
    
    # Create a DataFrame for feature importances
    importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    importances_df = importances_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

    return importances_df

# Example usage
file_path = '../data/featured_data.csv'
target = 'two_year_recid'
feature_importance_df = xgboost_feature_importance(file_path, target)

# Display the top 10 most important features
print(feature_importance_df.head(20))



                                        Feature  Importance
0                                      is_recid    0.830077
1                             r_charge_desc_nan    0.076068
2                                           end    0.009828
3                         vr_charge_degree_(M1)    0.005974
4                                         event    0.005766
5                                  first_joshua    0.004266
6        race_x_priors_count_African-American_8    0.003999
7    race_decile_interaction_African-American_8    0.003773
8    race_decile_interaction_African-American_2    0.003451
9              age_priors_interaction_25 - 45_5    0.003176
10                                        start    0.002898
11                charge_priors_interaction_F_6    0.002755
12                        r_charge_desc_Battery    0.002562
13                charge_priors_interaction_M_2    0.002509
14  c_charge_desc_Driving While License Revoked    0.002506
15                charge_priors_interact

# 4. Partitioning

# 5. Fairness Intervention (Local Massaging and Preferential Sampling)