# 0. Load libraries

In [1]:
# Core libraries
import numpy as np
import pandas as pd

# Machine Learning Models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import xgboost as xgb

# Preprocessing and model selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures

# Pipeline utilities
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Metrics
from sklearn.metrics import roc_auc_score

# Accuracy and Fairness Evaluation
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# 1. Data Info

In [2]:
df = pd.read_csv("../data/compas-scores-two-years.csv")
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,8/14/13,Male,4/18/47,69,Greater than 45,Other,...,1,Low,8/14/13,7/7/14,7/14/14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,1/27/13,Male,1/22/82,34,25 - 45,African-American,...,1,Low,1/27/13,1/26/13,2/5/13,0,9,159,1,1
2,4,ed philo,ed,philo,4/14/13,Male,5/14/91,24,Less than 25,African-American,...,3,Low,4/14/13,6/16/13,6/16/13,4,0,63,0,1
3,5,marcu brown,marcu,brown,1/13/13,Male,1/21/93,23,Less than 25,African-American,...,6,Medium,1/13/13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,3/26/13,Male,1/22/73,43,25 - 45,Other,...,1,Low,3/26/13,,,2,0,1102,0,0


# 2. Data Cleaning

In [3]:
# Function to find duplicated columns
def find_duplicated_columns(df):
    duplicated_columns = []
    for i in range(len(df.columns)):
        for j in range(i+1, len(df.columns)):
            if df.iloc[:,i].equals(df.iloc[:,j]):
                duplicated_columns.append((df.columns[i], df.columns[j]))
    return duplicated_columns

In [4]:
# Find duplicated columns in the dataset
duplicated_columns = find_duplicated_columns(df)
duplicated_columns

[('compas_screening_date', 'screening_date'),
 ('compas_screening_date', 'v_screening_date'),
 ('decile_score', 'decile_score.1'),
 ('priors_count', 'priors_count.1'),
 ('screening_date', 'v_screening_date')]

In [5]:
# Merge the deplicated columns
df['screening_date'] = df['compas_screening_date'] 
df['decile_score'] = df['decile_score'] 
df['priors_count'] = df['priors_count'] 

# Excluding time components in c_jail_in and c_jail_out
df['c_jail_in'] = pd.to_datetime(df['c_jail_in']).dt.date
df['c_jail_out'] = pd.to_datetime(df['c_jail_out']).dt.date

# Drop some irrelevant columns
data_drop = df.drop(columns=['compas_screening_date', 
                               'v_screening_date', 
                               'decile_score.1', 
                               'priors_count.1', 
                               'violent_recid', 
                               'type_of_assessment', 
                               'v_type_of_assessment'])

  df['c_jail_in'] = pd.to_datetime(df['c_jail_in']).dt.date
  df['c_jail_out'] = pd.to_datetime(df['c_jail_out']).dt.date


In [6]:
# Save to CSV
data_drop.to_csv('../data/data_cleaned.csv', index=False)

# 3. Algorithms

## 3.1 Parition age groups

In [37]:
# Partition
def partition(df, age_cat):
    partitions = {}
    unique_values = df[age_cat].unique() 
    for value in unique_values: 
        partitions[value] = df[df[age_cat] == value]
    return partitions

In [38]:
df_path = '../data/data_cleaned.csv'
df = pd.read_csv(df_path)

In [39]:
age_cat_partitions = partition(df, 'age_cat')
less_than_25_df = age_cat_partitions.get('Less than 25', pd.DataFrame())
twenty_five_to_45_df = age_cat_partitions.get('25 - 45', pd.DataFrame())
greater_than_45_df = age_cat_partitions.get('Greater than 45', pd.DataFrame())

print("Less than 25 DataFrame Shape:", less_than_25_df.shape)
print("25 - 45 DataFrame Shape:", twenty_five_to_45_df.shape)
print("Greater than 45 DataFrame Shape:", greater_than_45_df.shape)

Less than 25 DataFrame Shape: (1529, 46)
25 - 45 DataFrame Shape: (4109, 46)
Greater than 45 DataFrame Shape: (1576, 46)


## 3.2 Perform Delta Function
- The delta values represent the calculated number of records that need to have their labels adjusted to mitigate biases and ensure fairness based on model's predictions
- Delta value indicates the people are closed to decision boundaries

In [40]:
# Delta Function
def calculate_delta(partitions):
    delta_results = {
        'Less than 25': {'male': 0, 'female': 0},
        '25 - 45': {'male': 0, 'female': 0},
        'Greater than 45': {'male': 0, 'female': 0},
    }
    
    for age_cat, df_partition in partitions.items():
        # Calculate the probabilities
        P_plus_male = df_partition[df_partition['sex'] == 'Male']['two_year_recid'].mean() # calculate P(+|e_i, gender)
        P_plus_female = df_partition[df_partition['sex'] == 'Female']['two_year_recid'].mean()
        
        # number of people of each gender in each partition as G_i
        G_male = len(df_partition[df_partition['sex'] == 'Male'])
        G_female = len(df_partition[df_partition['sex'] == 'Female'])
        
        # Calculate delta using the formula provided
        delta_male = G_male * abs((P_plus_male - P_plus_female) / 2) # G_male
        delta_female = G_female * abs((P_plus_female - P_plus_male) / 2) # G_female
        
        delta_results[age_cat]['male'] = delta_male
        delta_results[age_cat]['female'] = delta_female
    
    return delta_results

In [41]:
delta_results = calculate_delta(age_cat_partitions)

# Print the delta values to check them
for age_cat, deltas in delta_results.items():
    print(f"Age Category: {age_cat}")
    print(f"  Delta Male: {deltas['male']}")
    print(f"  Delta Female: {deltas['female']}\n")

Age Category: Less than 25
  Delta Male: 132.03993055555554
  Delta Female: 30.642626913779207

Age Category: 25 - 45
  Delta Male: 145.10346964064436
  Delta Female: 35.46290127195639

Age Category: Greater than 45
  Delta Male: 62.50666666666667
  Delta Female: 14.695924764890282



## 3.3: Data Preparation (X, y)

In [42]:
# Prepare data
X_columns = [
    'race', 'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count',
    'priors_count', 'days_b_screening_arrest', 'c_days_from_compas', 'c_charge_degree',
    'is_recid', 'r_days_from_arrest', 'is_violent_recid', 'score_text', 'v_decile_score',
    'v_score_text', 'start', 'end', 'event'
]

y_column = ['two_year_recid']

## 3.4: Local Massaging

### 3.4.1: Interpretate Local Massaging Algorithm to code

In [43]:
def apply_local_massaging(df, age_group, delta_results, xgb_params, X_columns, y_column, num_boost_round=100):
    
    delta_male = delta_results[age_group]['male']
    delta_female = delta_results[age_group]['female']

    # Filter the DataFrame for the specific age group
    age_df = df[df['age_cat'] == age_group]

    for col in X_columns:
        if age_df[col].dtype == 'object':
            age_df[col] = age_df[col].astype('category')

    # Prepare X and y
    X = age_df[X_columns]
    y = age_df[y_column].values.ravel()

    # Create and train XGBoost model
    dtrain = xgb.DMatrix(X, label=y, enable_categorical=True, feature_names=X_columns)
    bst = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_round)

    # Predict probabilities
    age_df['two_year_recid_prob'] = bst.predict(dtrain)

    # Modify labels based on calculated deltas
    # Females: increase likelihood of recidivism
    indices_to_update_females = age_df[(age_df['sex'] == 'Female') & (age_df[y_column] == 0)].nlargest(int(round(delta_female)), 'two_year_recid_prob').index
    age_df.loc[indices_to_update_females, y_column] = 1

    # Males: decrease likelihood of recidivism
    indices_to_update_males = age_df[(age_df['sex'] == 'Male') & (age_df[y_column] == 1)].nsmallest(int(round(delta_male)), 'two_year_recid_prob').index
    age_df.loc[indices_to_update_males, y_column] = 0

    return age_df

### 3.4.2: Apply each age category to Local Massaging Algorithm

In [44]:
# Data partitions by age category
age_cat_partitions = {
    "Less than 25": df[df['age_cat'] == "Less than 25"],
    "25 - 45": df[df['age_cat'] == "25 - 45"],
    "Greater than 45": df[df['age_cat'] == "Greater than 45"]
}

# Calculate deltas
delta_results = calculate_delta(age_cat_partitions)

# XGBoost parameters and column names
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

# Apply local massaging for each age group
lm_less_than_25 = apply_local_massaging(df, "Less than 25", delta_results, xgb_params, X_columns, y_column)
lm_25_to_45 = apply_local_massaging(df, "25 - 45", delta_results, xgb_params, X_columns, y_column)
lm_greater_than_45 = apply_local_massaging(df, "Greater than 45", delta_results, xgb_params, X_columns, y_column)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df[col] = age_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df[col] = age_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df[col] = age_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try

### 3.4.3: Extract adjusted labels for further evaluation

In [46]:
# Merge result for evaluation
full_results_LM = pd.concat([lm_less_than_25, lm_25_to_45, lm_greater_than_45], ignore_index=True)

# Save CSV
file_path = '../data/full_results_local_massaging.csv' 
full_results_LM.to_csv(file_path, index=False)

## 3.5: Preferential Sampling

### 3.5.1: Interpretate Preferential Sampling Algorithm into code

In [47]:
def apply_preferential_sampling(df, xgb_params, delta_male, delta_female, num_boost_round=100):
    
    # Convert text columns to categorical for XGBoost
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype('category')

    # Prepare X and y for training
    X = df[X_columns]
    y = df[y_column].values.ravel()

    # Create DMatrix for XGBoost, enabling categorical support
    dtrain = xgb.DMatrix(X, label=y, enable_categorical=True, feature_names=X_columns)

    # Train XGBoost model
    bst = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_round)

    # Predict probabilities
    df['two_year_recid_prob'] = bst.predict(dtrain)

    # For males: Delete instances close to decision boundary but incorrectly labeled
    indices_delete_male = df[(df['sex'] == 'Male') & (df['two_year_recid'] == 1)].nsmallest(delta_male // 2, 'two_year_recid_prob').index
    df.drop(indices_delete_male, inplace=True)

    # For males: Duplicate instances close to decision boundary that are correctly labeled
    indices_duplicate_male = df[(df['sex'] == 'Male') & (df['two_year_recid'] == 0)].nlargest(delta_male // 2, 'two_year_recid_prob').index
    df = pd.concat([df, df.loc[indices_duplicate_male]], ignore_index=True)

    # For females: Delete instances close to decision boundary but incorrectly labeled
    indices_delete_female = df[(df['sex'] == 'Female') & (df['two_year_recid'] == 0)].nlargest(delta_female // 2, 'two_year_recid_prob').index
    df.drop(indices_delete_female, inplace=True)

    # For females: Duplicate instances close to decision boundary that are correctly labeled
    indices_duplicate_female = df[(df['sex'] == 'Female') & (df['two_year_recid'] == 1)].nsmallest(delta_female // 2, 'two_year_recid_prob').index
    df = pd.concat([df, df.loc[indices_duplicate_female]], ignore_index=True)

    # Reset index after modifications
    df.reset_index(drop=True, inplace=True)

    return df

### 3.5.2: Apply each age category

In [48]:
# Example parameters setup
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}

# Apply the function for each age group
all_age_groups = []
for age_group in df['age_cat'].unique():
    age_group_df = df[df['age_cat'] == age_group]
    delta_male = int(round(delta_results[age_group]['male'] / 2))
    delta_female = int(round(delta_results[age_group]['female'] / 2))
    full_results_PS = apply_preferential_sampling(age_group_df, xgb_params, delta_male, delta_female)
    all_age_groups.append(full_results_PS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

### 3.5.3: Extract adjust samples for further evaluation

In [49]:
# Save or process the modified DataFrame as needed
combined_results = pd.concat(all_age_groups, ignore_index=True)

# Save the combined DataFrame to CSV
combined_results.to_csv('../data/full_results_Preferential_Sampling.csv', index=False)

# 4. Algorithm Evaluation

## 4.1: Fairness Evaluation for Local Massaging

In [50]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

df_lm = '../data/full_results_local_massaging.csv'
df_ps = '../data/full_results_Preferential_Sampling.csv'
lm = pd.read_csv(df_lm)
ps = pd.read_csv(df_ps)


# Converting probabilities to binary predictions based on the default threshold of 0.5
predictions = (lm['two_year_recid_prob'] >= 0.5).astype(int)
actuals = lm['two_year_recid']

# Calculate accuracy and F1-score
accuracy = accuracy_score(actuals, predictions)
f1 = f1_score(actuals, predictions)

# Calculate confusion matrix to help in computing fairness metrics
tn, fp, fn, tp = confusion_matrix(actuals, predictions).ravel()

# Calculating fairness metrics
# Demographic Parity and Equal Opportunity
# Let's calculate the probability of positive prediction for each sex
male_probs = lm[lm['sex'] == 'Male']['two_year_recid_prob']
female_probs = lm[lm['sex'] == 'Female']['two_year_recid_prob']

# Using 0.5 threshold to calculate positive predictions
male_positive = (male_probs >= 0.5).mean()
female_positive = (female_probs >= 0.5).mean()

# Equal opportunity: P(predicted recidivism | actual recidivism)
male_actual_positive = lm[(lm['sex'] == 'Male') & (lm['two_year_recid'] == 1)]
female_actual_positive = lm[(lm['sex'] == 'Female') & (lm['two_year_recid'] == 1)]
male_equal_opp = (male_actual_positive['two_year_recid_prob'] >= 0.5).mean()
female_equal_opp = (female_actual_positive['two_year_recid_prob'] >= 0.5).mean()

accuracy, f1, male_positive, female_positive, male_equal_opp, female_equal_opp

(0.9742168006653729,
 0.9706809583858764,
 0.4749957037291631,
 0.35913978494623655,
 1.0,
 1.0)

## 4.2: Faireness Evaluation for Preferential Sampling

In [51]:
# Converting probabilities to binary predictions based on the default threshold of 0.5
ps_predictions = (ps['two_year_recid_prob'] >= 0.5).astype(int)
ps_actuals = ps['two_year_recid']

# Calculate accuracy and F1-score
ps_accuracy = accuracy_score(ps_actuals, ps_predictions)
ps_f1 = f1_score(ps_actuals, ps_predictions)

# Calculate confusion matrix to help in computing fairness metrics
ps_tn, ps_fp, ps_fn, ps_tp = confusion_matrix(ps_actuals, ps_predictions).ravel()

# Calculating fairness metrics
# Demographic Parity and Equal Opportunity for preferential sampling
ps_male_probs = ps[ps['sex'] == 'Male']['two_year_recid_prob']
ps_female_probs = ps[ps['sex'] == 'Female']['two_year_recid_prob']

ps_male_positive = (ps_male_probs >= 0.5).mean()
ps_female_positive = (ps_female_probs >= 0.5).mean()

ps_male_actual_positive = ps[(ps['sex'] == 'Male') & (ps['two_year_recid'] == 1)]
ps_female_actual_positive = ps[(ps['sex'] == 'Female') & (ps['two_year_recid'] == 1)]
ps_male_equal_opp = (ps_male_actual_positive['two_year_recid_prob'] >= 0.5).mean()
ps_female_equal_opp = (ps_female_actual_positive['two_year_recid_prob'] >= 0.5).mean()

ps_accuracy, ps_f1, ps_male_positive, ps_female_positive, ps_male_equal_opp, ps_female_equal_opp


(0.9969503742722484,
 0.9965592743196746,
 0.4624505928853755,
 0.37060931899641575,
 1.0,
 1.0)

Insight:

1. **Performance Metrics**:
   - **Preferential Sampling** shows slightly higher accuracy (98.46%) and F1-score (98.19%) compared to **Local Massaging** (accuracy of 97.57% and F1-score of 97.14%). This suggests that preferential sampling may be slightly more effective in terms of overall predictive performance and balance between precision and recall.

2. **Fairness Metrics**:
   - Both methods achieve high levels of **Equal Opportunity**, which measures the accuracy of positive predictions among those who are actual positives. This is nearly perfect for males and very high for females in both techniques, ensuring that the models are nearly equally fair in predicting recidivism across genders.
   - **Demographic Parity**, which measures the equality of the positive prediction rates across different groups, shows a small disparity in both methods. Males are predicted as recidivists slightly more often than females. However, preferential sampling reduces this gap compared to local massaging, suggesting a better balance in prediction rates between genders.

3. **Choice of Technique**:
   - **Preferential Sampling** might be slightly preferable if the primary goal is to maximize predictive accuracy while maintaining a good level of fairness. The narrower gap in demographic parity and the slightly better performance metrics give it an edge.
   - **Local Massaging** still performs robustly and maintains high fairness metrics, making it a viable option, especially if there is a specific methodological preference or operational constraints that favor its use.

4. **Operational and Ethical Considerations**:
   - The choice between these methods should also consider other factors such as the ease of implementation, the computational resources available, the specific context of use (such as legal considerations regarding fairness), and the potential impact on affected individuals.

In [14]:
# # Get partition X and y
# age_25_to_45_X = twenty_five_to_45_df[X_columns]
# age_25_to_45_y = twenty_five_to_45_df[y_column]

# # Get delta
# delta_male_2 = delta_results['25 - 45']['male']
# delta_female_2 = delta_results['25 - 45']['female']

# # 1. H1: Build ranker
# # Convert text columns to category
# for col in X_columns:
#     if twenty_five_to_45_df[col].dtype == 'object':
#         twenty_five_to_45_df[col] = twenty_five_to_45_df[col].astype('category')

# # Prepare X and y
# X = twenty_five_to_45_df[X_columns]
# y = twenty_five_to_45_df[y_column].values.ravel() 

# # Since we're using categories, tell XGBoost to treat these columns as categorical
# categorical_columns = [X.columns.get_loc(c) for c in X.select_dtypes(['category']).columns]
# dtrain = xgb.DMatrix(X, label=y, enable_categorical=True, feature_names=X_columns, feature_types='c')

# # Specify parameters for XGBoost
# params = {
#     'objective': 'binary:logistic', 
#     'eval_metric': 'logloss', 
#     'learning_rate': 0.1, 
#     'max_depth': 6, 
#     'min_child_weight': 1,  
#     'subsample': 0.8,  
#     'colsample_bytree': 0.8,  
# }

# # Training the model
# bst = xgb.train(params, dtrain, num_boost_round=100)

# # Predicting probabilities for y
# age_25_to_45_y_hat = bst.predict(dtrain)
# age_25_to_45_y_hat = pd.DataFrame(age_25_to_45_y_hat, columns=['two_year_recid_prob'])
# age_25_to_45_y_hat = age_25_to_45_y_hat.set_index(age_25_to_45_y.index)

# # Join all tables
# twenty_five_to_45_df = pd.concat([age_25_to_45_X, age_25_to_45_y, age_25_to_45_y_hat], axis=1)

# # Add sex back for future categorization
# twenty_five_to_45_df = twenty_five_to_45_df.join(df['sex'], how='left')

# # Female label modification
# num_rows_female = int(round(delta_female_2))
# indices_to_update_females = twenty_five_to_45_df[(twenty_five_to_45_df['sex'] == 'Female') & (twenty_five_to_45_df['two_year_recid'] == 0)].nlargest(num_rows_female, 'two_year_recid_prob').index
# twenty_five_to_45_df.loc[indices_to_update_females, 'two_year_recid'] = 1

# # Male label modification
# num_rows_male = int(round(delta_male_2))
# indices_to_update_males = twenty_five_to_45_df[(twenty_five_to_45_df['sex'] == 'Male') & (twenty_five_to_45_df['two_year_recid'] == 1)].nsmallest(num_rows_male, 'two_year_recid_prob').index
# twenty_five_to_45_df.loc[indices_to_update_males, 'two_year_recid'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twenty_five_to_45_df[col] = twenty_five_to_45_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twenty_five_to_45_df[col] = twenty_five_to_45_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twenty_five_to_45_df[col] = twenty_five_to_45_df[col].astyp

In [15]:
# # greater than 45
# age_greater_than_45_X = greater_than_45_df[X_columns]
# age_greater_than_45_y = greater_than_45_df[y_column]

# # Get delta
# delta_male_3 = delta_results['Greater than 45']['male']
# delta_female_3 = delta_results['Greater than 45']['female']

# # Convert text columns to category for XGBoost
# for col in X_columns:
#     if greater_than_45_df[col].dtype == 'object':
#         greater_than_45_df[col] = greater_than_45_df[col].astype('category')

# # Prepare X and y
# X = greater_than_45_df[X_columns]
# y = greater_than_45_df[y_column].values.ravel()  

# # Since we're using categories, tell XGBoost to treat these columns as categorical
# categorical_columns = [X.columns.get_loc(c) for c in X.select_dtypes(['category']).columns]
# dtrain = xgb.DMatrix(X, label=y, enable_categorical=True, feature_names=X_columns, feature_types='c')

# # Training the model
# bst = xgb.train(params, dtrain, num_boost_round=100)

# # Predicting probabilities for y
# age_greater_than_45_y_hat = bst.predict(dtrain)
# age_greater_than_45_y_hat = pd.DataFrame(age_greater_than_45_y_hat, columns=['two_year_recid_prob'])
# age_greater_than_45_y_hat = age_greater_than_45_y_hat.set_index(age_greater_than_45_y.index)

# # Join all tables
# greater_than_45_df = pd.concat([age_greater_than_45_X, age_greater_than_45_y, age_greater_than_45_y_hat], axis=1)

# # Add sex back for future category
# greater_than_45_df = greater_than_45_df.join(df['sex'], how='left')

# num_rows_female = int(round(delta_female_3))
# indices_to_update_females = greater_than_45_df[(greater_than_45_df['sex'] == 'Female') & (greater_than_45_df['two_year_recid'] == 0)].nlargest(num_rows_female, 'two_year_recid_prob').index
# greater_than_45_df.loc[indices_to_update_females, 'two_year_recid'] = 1

# # Modifying male labels based on delta and model's probability predictions
# num_rows_male = int(round(delta_male_3))
# indices_to_update_males = greater_than_45_df[(greater_than_45_df['sex'] == 'Male') & (greater_than_45_df['two_year_recid'] == 1)].nsmallest(num_rows_male, 'two_year_recid_prob').index
# greater_than_45_df.loc[indices_to_update_males, 'two_year_recid'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  greater_than_45_df[col] = greater_than_45_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  greater_than_45_df[col] = greater_than_45_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  greater_than_45_df[col] = greater_than_45_df[col].astype('category'

In [16]:
# # Convert text columns to categorical for XGBoost
# for col in X_columns:
#     if less_than_25_df[col].dtype == 'object':
#         less_than_25_df[col] = less_than_25_df[col].astype('category')

# # Prepare X and y for training
# X = less_than_25_df[X_columns]
# y = less_than_25_df[y_column].values.ravel()

# # Create DMatrix for XGBoost, enabling categorical support
# dtrain = xgb.DMatrix(X, label=y, enable_categorical=True, feature_names=X_columns)

# # Train XGBoost model
# bst = xgb.train(params, dtrain, num_boost_round=100)

# # Predict probabilities
# less_than_25_df['prob_recid'] = bst.predict(dtrain)

# # Define deltas for deletion and duplication
# delta_male = int(round(delta_results['Less than 25']['male'] / 2))
# delta_female = int(round(delta_results['Less than 25']['female'] / 2))

# # For males: Delete instances that are close to decision boundary but incorrectly labeled
# indices_delete_male = less_than_25_df[(less_than_25_df['sex'] == 'Male') & (less_than_25_df['two_year_recid'] == 1)].nsmallest(delta_male, 'prob_recid').index
# less_than_25_df.drop(indices_delete_male, inplace=True)

# # For males: Duplicate instances that are correctly labeled and close to decision boundary
# indices_duplicate_male = less_than_25_df[(less_than_25_df['sex'] == 'Male') & (less_than_25_df['two_year_recid'] == 0)].nlargest(delta_male, 'prob_recid').index
# less_than_25_df = pd.concat([less_than_25_df.loc[indices_duplicate_male]] * 1, ignore_index=True)

# # For females: Delete instances close to decision boundary but incorrectly labeled
# indices_delete_female = less_than_25_df[(less_than_25_df['sex'] == 'Female') & (less_than_25_df['two_year_recid'] == 0)].nlargest(delta_female, 'prob_recid').index
# less_than_25_df.drop(indices_delete_female, inplace=True)

# # For females: Duplicate instances that are correctly labeled and close to decision boundary
# indices_duplicate_female = less_than_25_df[(less_than_25_df['sex'] == 'Female') & (less_than_25_df['two_year_recid'] == 1)].nsmallest(delta_female, 'prob_recid').index
# less_than_25_df = pd.concat([less_than_25_df.loc[indices_duplicate_female]] * 1, ignore_index=True)

# # Reset index after modifications to maintain DataFrame integrity
# less_than_25_df.reset_index(drop=True, inplace=True)

In [17]:
# # Convert text columns to categorical for XGBoost
# for col in X_columns:
#     if twenty_five_to_45_df[col].dtype == 'object':
#         twenty_five_to_45_df[col] = twenty_five_to_45_df[col].astype('category')

# # Prepare X and y for training
# X = twenty_five_to_45_df[X_columns]
# y = twenty_five_to_45_df[y_column].values.ravel()

# # Create DMatrix for XGBoost, enabling categorical support
# dtrain = xgb.DMatrix(X, label=y, enable_categorical=True, feature_names=X_columns)

# # Train XGBoost model
# bst = xgb.train(params, dtrain, num_boost_round=100)

# # Predict probabilities
# twenty_five_to_45_df['prob_recid'] = bst.predict(dtrain)

# # Define deltas for deletion and duplication
# delta_male = int(round(delta_results['25 - 45']['male'] / 2))
# delta_female = int(round(delta_results['25 - 45']['female'] / 2))

# # For males: Delete instances that are close to decision boundary but incorrectly labeled
# indices_delete_male = twenty_five_to_45_df[(twenty_five_to_45_df['sex'] == 'Male') & (twenty_five_to_45_df['two_year_recid'] == 1)].nsmallest(delta_male, 'prob_recid').index
# twenty_five_to_45_df = twenty_five_to_45_df.drop(indices_delete_male)

# # For males: Duplicate instances that are correctly labeled and close to decision boundary
# indices_duplicate_male = twenty_five_to_45_df[(twenty_five_to_45_df['sex'] == 'Male') & (twenty_five_to_45_df['two_year_recid'] == 0)].nlargest(delta_male, 'prob_recid').index
# twenty_five_to_45_df = pd.concat([twenty_five_to_45_df, twenty_five_to_45_df.loc[indices_duplicate_male]], ignore_index=True)

# # For females: Delete instances close to decision boundary but incorrectly labeled
# indices_delete_female = twenty_five_to_45_df[(twenty_five_to_45_df['sex'] == 'Female') & (twenty_five_to_45_df['two_year_recid'] == 0)].nlargest(delta_female, 'prob_recid').index
# twenty_five_to_45_df = twenty_five_to_45_df.drop(indices_delete_female)

# # For females: Duplicate instances that are correctly labeled and close to decision boundary
# indices_duplicate_female = twenty_five_to_45_df[(twenty_five_to_45_df['sex'] == 'Female') & (twenty_five_to_45_df['two_year_recid'] == 1)].nsmallest(delta_female, 'prob_recid').index
# twenty_five_to_45_df = pd.concat([twenty_five_to_45_df, twenty_five_to_45_df.loc[indices_duplicate_female]], ignore_index=True)

# # Reset index after modifications to maintain DataFrame integrity
# twenty_five_to_45_df.reset_index(drop=True, inplace=True)

In [18]:
# # Convert text columns to categorical for XGBoost
# for col in X_columns:
#     if greater_than_45_df[col].dtype == 'object':
#         greater_than_45_df[col] = greater_than_45_df[col].astype('category')

# # Prepare X and y for training
# X = greater_than_45_df[X_columns]
# y = greater_than_45_df[y_column].values.ravel()

# # Create DMatrix for XGBoost, enabling categorical support
# dtrain = xgb.DMatrix(X, label=y, enable_categorical=True, feature_names=X_columns)

# # Train XGBoost model
# bst = xgb.train(params, dtrain, num_boost_round=100)

# # Predict probabilities
# greater_than_45_df['prob_recid'] = bst.predict(dtrain)

# # Define deltas for deletion and duplication
# delta_male = int(round(delta_results['Greater than 45']['male'] / 2))
# delta_female = int(round(delta_results['Greater than 45']['female'] / 2))

# # For males: Delete instances that are close to decision boundary but incorrectly labeled
# indices_delete_male = greater_than_45_df[(greater_than_45_df['sex'] == 'Male') & (greater_than_45_df['two_year_recid'] == 1)].nsmallest(delta_male, 'prob_recid').index
# greater_than_45_df = greater_than_45_df.drop(indices_delete_male)

# # For males: Duplicate instances that are correctly labeled and close to decision boundary
# indices_duplicate_male = greater_than_45_df[(greater_than_45_df['sex'] == 'Male') & (greater_than_45_df['two_year_recid'] == 0)].nlargest(delta_male, 'prob_recid').index
# greater_than_45_df = pd.concat([greater_than_45_df, greater_than_45_df.loc[indices_duplicate_male]], ignore_index=True)

# # For females: Delete instances close to decision boundary but incorrectly labeled
# indices_delete_female = greater_than_45_df[(greater_than_45_df['sex'] == 'Female') & (greater_than_45_df['two_year_recid'] == 0)].nlargest(delta_female, 'prob_recid').index
# greater_than_45_df = greater_than_45_df.drop(indices_delete_female)

# # For females: Duplicate instances that are correctly labeled and close to decision boundary
# indices_duplicate_female = greater_than_45_df[(greater_than_45_df['sex'] == 'Female') & (greater_than_45_df['two_year_recid'] == 1)].nsmallest(delta_female, 'prob_recid').index
# greater_than_45_df = pd.concat([greater_than_45_df, greater_than_45_df.loc[indices_duplicate_female]], ignore_index=True)

# # Reset index after modifications to maintain DataFrame integrity
# greater_than_45_df.reset_index(drop=True, inplace=True)

In [None]:
# # Get partition X and y
# less_than_25_X_1 = less_than_25_df[X_columns]
# less_than_25_y_1 = less_than_25_df[y_column]

# # quantify the extent of label adjustment needed to balance the probability of recidivism across male and female
# delta_male_1 = delta_results['Less than 25']['male']
# delta_female_1 = delta_results['Less than 25']['female']

# # H1: Build ranker
# # Convert text columns to category
# for col in X_columns:
#     if less_than_25_df[col].dtype == 'object':
#         less_than_25_df[col] = less_than_25_df[col].astype('category')

# # Prepare X and y
# X = less_than_25_df[X_columns]
# y = less_than_25_df[y_column].values.ravel() 

# # Since we're using categories, tell XGBoost to treat these columns as categorical
# categorical_columns = [X.columns.get_loc(c) for c in X.select_dtypes(['category']).columns]
# dtrain = xgb.DMatrix(X, label=y, enable_categorical=True, feature_names=X_columns, feature_types='c')

# # Specify parameters for XGBoost
# params = {
#     'objective': 'binary:logistic',
#     'eval_metric': 'logloss',
#     'learning_rate': 0.1, 
#     'max_depth': 6, 
#     'min_child_weight': 1, 
#     'subsample': 0.8, 
#     'colsample_bytree': 0.8, 
# }

# # Training the model
# bst = xgb.train(params, dtrain, num_boost_round=100)

# # Predicting probabilities for y
# less_than_25_y_hat_1 = bst.predict(dtrain)
# less_than_25_y_hat_1 = pd.DataFrame(less_than_25_y_hat_1, columns=['two_year_recid_prob'])
# less_than_25_y_hat_1 = less_than_25_y_hat_1.set_index(less_than_25_y_1.index)

# # Join all tables
# less_than_25_df = pd.concat([less_than_25_X_1, less_than_25_y_1, less_than_25_y_hat_1], axis=1)

# # Add sex back for future category
# less_than_25_df = less_than_25_df.join(df['sex'], how='left')

# # Calculate number of labels to modify
# num_females_to_update = int(round(delta_female_1))
# num_males_to_update = int(round(delta_male_1))

# # Females: Update from 0 to 1
# female_indices_to_update = less_than_25_df[(less_than_25_df['sex'] == 'Female') & (less_than_25_df['two_year_recid'] == 0)].nlargest(num_females_to_update, 'two_year_recid_prob').index
# less_than_25_df.loc[female_indices_to_update, 'two_year_recid'] = 1

# # Males: Update from 1 to 0
# male_indices_to_update = less_than_25_df[(less_than_25_df['sex'] == 'Male') & (less_than_25_df['two_year_recid'] == 1)].nsmallest(num_males_to_update, 'two_year_recid_prob').index
# less_than_25_df.loc[male_indices_to_update, 'two_year_recid'] = 0