In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer

from sklearn.ensemble import BaggingRegressor # default classifier is a DT
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from mlxtend.regressor import StackingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import SCORERS

import seaborn as sns
sns.set(rc={'figure.figsize':(6,6)})
import warnings
warnings.simplefilter("ignore")

%matplotlib inline

**The above code sets up the environment for building and evaluating machine learning models, including data preprocessing, model selection, and performance evaluation.**

# **Data Preprocessing**

In [2]:
df = pd.read_csv("compas-scores-two-years.csv")

In [4]:
pd.set_option('display.max_columns', 53)

In [5]:
drop_columns = ['compas_screening_date', 'juv_fel_count',
                'juv_misd_count','c_case_number','vr_charge_degree',
                'vr_offense_date','vr_charge_desc','c_arrest_date',
                'r_case_number', 'vr_case_number','start',
                'juv_other_count','days_b_screening_arrest','c_days_from_compas',
                'first','last','name','dob','c_jail_in','c_jail_out','c_offense_date',
                'c_charge_degree','c_charge_desc','r_charge_degree','r_days_from_arrest',
                'r_offense_date','r_charge_desc','r_jail_in','r_jail_out','violent_recid',
                'score_text','screening_date','v_score_text','v_screening_date','in_custody',
                'out_custody','id','end','type_of_assessment','v_type_of_assessment','is_recid','is_violent_recid','event',
               'decile_score.1','priors_count.1']
df = df.drop(drop_columns,axis=1)

**The above code drops the columns listed.**

In [6]:
df.head()

Unnamed: 0,sex,age,age_cat,race,decile_score,priors_count,v_decile_score,two_year_recid
0,Male,69,Greater than 45,Other,1,0,1,0
1,Male,34,25 - 45,African-American,3,0,1,1
2,Male,24,Less than 25,African-American,4,4,3,1
3,Male,23,Less than 25,African-American,8,1,6,0
4,Male,43,25 - 45,Other,1,2,1,0


In [7]:
df.count()

sex               7214
age               7214
age_cat           7214
race              7214
decile_score      7214
priors_count      7214
v_decile_score    7214
two_year_recid    7214
dtype: int64

In [8]:
df.race = df.race.replace({
    'Asian':'Other',
    'Hispanic':'Other',
    'Native American':'Other'
})

**The above code simplifies the analysis by focusing on two racial categories, African American and Caucasian, and grouping all other racial categories into a single 'Other' category.**

In [9]:
categorical_cols = df.select_dtypes(exclude=np.number).columns
numerical_cols = df.select_dtypes(np.number).columns

**The above code separates the columns of the DataFrame into two groups: categorical_cols containing categorical columns and numerical_cols containing numeric columns.**

In [10]:
numerical = pd.DataFrame(
    SimpleImputer(strategy="median").fit_transform(df[numerical_cols]),
    columns=numerical_cols
)
numerical.head()

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid
0,69.0,1.0,0.0,1.0,0.0
1,34.0,3.0,0.0,1.0,1.0
2,24.0,4.0,4.0,3.0,1.0
3,23.0,8.0,1.0,6.0,0.0
4,43.0,1.0,2.0,1.0,0.0


**The above code takes the numerical columns of the DataFrame, applies imputation using the median strategy to fill in missing values and creates a new DataFrame with the imputed values.**

In [11]:
categorical = pd.get_dummies(df[categorical_cols])
categorical.head()

Unnamed: 0,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other
0,0,1,0,1,0,0,0,1
1,0,1,1,0,0,1,0,0
2,0,1,0,0,1,1,0,0
3,0,1,0,0,1,1,0,0
4,0,1,1,0,0,0,0,1


**One-hot encoding is applied in the above code to convert categorical variables into a format suitable for training models, where binary columns indicate the presence or absence of each category.**

In [12]:
df_new = pd.merge(numerical, categorical, left_index=True, right_index=True)
df_new.head()

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other
0,69.0,1.0,0.0,1.0,0.0,0,1,0,1,0,0,0,1
1,34.0,3.0,0.0,1.0,1.0,0,1,1,0,0,1,0,0
2,24.0,4.0,4.0,3.0,1.0,0,1,0,0,1,1,0,0
3,23.0,8.0,1.0,6.0,0.0,0,1,0,0,1,1,0,0
4,43.0,1.0,2.0,1.0,0.0,0,1,1,0,0,0,0,1


**The above code combines the numerical and categorical features into a single DataFrame (df_new).**

# **Since the dataframe is clean, we will use ensemble methods-**

In [13]:
target_variable = 'two_year_recid'
independent_variables = df_new.drop(columns=target_variable).columns
X = df_new[independent_variables]
y = df_new[target_variable]

**The above code is preparing the data for training an ensemble machine learning model.The resulting feature matrix X contains the independent variables, and the target vector y contains the corresponding values of the target variable.**

In [14]:
def evaluate_model(estimator):
    cv_results = cross_validate(estimator, X, y, scoring='accuracy', n_jobs=-1, cv=10, return_train_score=True)
    return pd.DataFrame(cv_results).abs().mean().to_dict()

def display_results(results):
    results_df  = pd.DataFrame(results).T
    results_cols = results_df.columns
    for col in results_df:
        results_df[col] = results_df[col].apply(np.mean)
    return results_df

**Tha above used functions streamlines the process of evaluating machine learning models using cross-validation and displays the results.**

In [15]:
RESULTS = {}

In [16]:
RESULTS["tree"] = evaluate_model(DecisionTreeClassifier())
RESULTS["log_reg"] = evaluate_model(LogisticRegression())

pd.DataFrame.from_dict(RESULTS).T

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.020576,0.002901,0.614915,0.932492
log_reg,0.044517,0.003212,0.67923,0.680298


**The above code evaluates the performance of a Decision Tree Classifier and Logistic Regression. The results are stored in the RESULTS DataFrame, and the final DataFrame is transposed with models as rows and evaluation metrics as columns.**

In [17]:
#testing bagging classifier with 10 estimators
from sklearn.ensemble import BaggingRegressor, BaggingClassifier
estimator_bagging_10 = BaggingClassifier(n_estimators=10)
RESULTS["bagging_tree_10"] = evaluate_model(estimator_bagging_10)
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.020576,0.002901,0.614915,0.932492
log_reg,0.044517,0.003212,0.67923,0.680298
bagging_tree_10,0.194816,0.009458,0.631545,0.917229


In [18]:
#testing bagging classifier with 100 estimators
estimator_bagging_100 = BaggingClassifier(n_estimators=100)
RESULTS["bagging_tree_100"] = evaluate_model(estimator_bagging_100)
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.020576,0.002901,0.614915,0.932492
log_reg,0.044517,0.003212,0.67923,0.680298
bagging_tree_10,0.194816,0.009458,0.631545,0.917229
bagging_tree_100,1.55504,0.051343,0.635704,0.932462


In [19]:
#testing random forest classifier with 100 estimators
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
estimator_randomforest = RandomForestClassifier(n_estimators=100)

RESULTS["randomforest_100"] = evaluate_model(estimator_randomforest)
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.020576,0.002901,0.614915,0.932492
log_reg,0.044517,0.003212,0.67923,0.680298
bagging_tree_10,0.194816,0.009458,0.631545,0.917229
bagging_tree_100,1.55504,0.051343,0.635704,0.932462
randomforest_100,0.734032,0.031597,0.648867,0.932462


**The above codes tests the performance of different ensemble models (Bagging Classifier and Random Forest Classifier) with varying numbers of estimators.The DataFrame is displayed to compare the models based on various evaluation metrics.**

In [20]:
#testing extra tree classifier as base estimator for the bagging classifier
from sklearn.tree import ExtraTreeClassifier

estimator_bagging_random_tree = BaggingClassifier(n_estimators=100,
                                    base_estimator=ExtraTreeClassifier())
RESULTS["bagging_random_tree"] = evaluate_model(estimator_bagging_random_tree)
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.020576,0.002901,0.614915,0.932492
log_reg,0.044517,0.003212,0.67923,0.680298
bagging_tree_10,0.194816,0.009458,0.631545,0.917229
bagging_tree_100,1.55504,0.051343,0.635704,0.932462
randomforest_100,0.734032,0.031597,0.648867,0.932462
bagging_random_tree,0.723627,0.04291,0.645684,0.932492


In [21]:
#boosting
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier
estimator_adaboost = AdaBoostClassifier(n_estimators=100)

RESULTS["adaboost_100"]  = evaluate_model(estimator_adaboost)
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.020576,0.002901,0.614915,0.932492
log_reg,0.044517,0.003212,0.67923,0.680298
bagging_tree_10,0.194816,0.009458,0.631545,0.917229
bagging_tree_100,1.55504,0.051343,0.635704,0.932462
randomforest_100,0.734032,0.031597,0.648867,0.932462
bagging_random_tree,0.723627,0.04291,0.645684,0.932492
adaboost_100,0.766608,0.040223,0.682696,0.68766


In [22]:
#xgboost classifier
from xgboost import XGBRegressor, XGBClassifier
estimator_xgboost = XGBClassifier(n_estimators=500)

RESULTS["xgboost_500"] = evaluate_model(estimator_xgboost)

display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.020576,0.002901,0.614915,0.932492
log_reg,0.044517,0.003212,0.67923,0.680298
bagging_tree_10,0.194816,0.009458,0.631545,0.917229
bagging_tree_100,1.55504,0.051343,0.635704,0.932462
randomforest_100,0.734032,0.031597,0.648867,0.932462
bagging_random_tree,0.723627,0.04291,0.645684,0.932492
adaboost_100,0.766608,0.040223,0.682696,0.68766
xgboost_500,0.497165,0.01885,0.648734,0.886224


**This code extends the ensemble model evaluation by testing Bagging with ExtraTreeClassifier, AdaBoostClassifier, and XGBoostClassifier. The results provide a comprehensive view of the models' performance across various metrics.Ensemble methods like Bagging and Random Forest did not show a significant improvement over a single Decision Tree in the above scenario.**

# **The results table shows that logistic regression is the best model since it has the highest train score, which evaluates the performance of the train data, and the highest test score, which tests accuracy.**

In [28]:
#select xgboost as the ideal model for predictions
model = XGBClassifier()

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
model.get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [30]:
param_dist_random = {
    "max_depth": [3, 6],
    "n_estimators" : [300, 500, 600]
}

In [31]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist_random,
   scoring="roc_auc", n_jobs=-1,
    n_iter=50)

In [34]:
random_search.fit(X_train, y_train)

In [35]:
# Print the best score, best estimator, and best parameters
print("Best Score:", random_search.best_score_)
print("Best Estimator:", random_search.best_estimator_)
print("Best Parameters:", random_search.best_params_)

Best Score: 0.7219383320549382
Best Estimator: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=300, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
Best Parameters: {'n_estimators': 300, 'max_depth': 3}


**The above codes performs a randomized search for hyperparameter tuning on the XGBoost classifier using cross-validated performance metrics. The best hyperparameters are determined based on the scoring metric ROC and AUC.**

# Using the model with the best hyperparameters

In [36]:
model = XGBClassifier(n_estimators = 300, max_depth = 3)

In [37]:
#high auc, thus the model is accurate
cross_val_score(
    model, X, y, scoring = "roc_auc", cv = 5
).mean()

0.7295769781299619

In [38]:
#splitting the data to prepare for predictions
from sklearn.model_selection import train_test_split
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)

In [39]:
X_test_addback = X_test[['race_African-American','race_Caucasian','race_Other']]

drops = ['race_African-American','race_Caucasian','race_Other']
X_train = X_train.drop(drops, axis=1)
X_test = X_test.drop(drops, axis=1)

In [40]:
#added target and prediction to existing df
model.fit(X=X_train, y=y_train)
predictions = model.predict(X_test)

X_test["target"] = y_test.tolist()
X_test["prediction"] = predictions

In [41]:
X_rejoin = pd.concat([X_test, X_test_addback], axis = 1)

**The above code assesses the model using cross-validation and predictions are added to the test set for further examination.**

# Searching for bias indicators such as false positive and false negative rates and numbers


In [42]:
#used a confusion matrix to identiy cases of false positives
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[607, 216],
       [261, 359]])

In [43]:
X_rejoin['race_African-American'].sum()

731

In [44]:
X_rejoin['race_Caucasian'].sum()

505

In [45]:
#manipulate the existing df to only show false positive rows
false_positives = X_rejoin[(X_rejoin.target == 0.0) & (X_rejoin.prediction == 1.0)]
false_positives

Unnamed: 0,age,decile_score,priors_count,v_decile_score,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,target,prediction,race_African-American,race_Caucasian,race_Other
5952,26.0,5.0,2.0,4.0,0,1,1,0,0,0.0,1,0,1,0
509,35.0,2.0,12.0,1.0,1,0,1,0,0,0.0,1,1,0,0
676,22.0,9.0,3.0,8.0,0,1,0,0,1,0.0,1,1,0,0
5321,21.0,4.0,1.0,6.0,0,1,0,0,1,0.0,1,0,1,0
4264,29.0,2.0,5.0,3.0,0,1,1,0,0,0.0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,53.0,1.0,6.0,1.0,0,1,0,1,0,0.0,1,0,1,0
5292,56.0,4.0,13.0,1.0,0,1,0,1,0,0.0,1,0,1,0
2818,21.0,8.0,1.0,8.0,0,1,0,0,1,0.0,1,1,0,0
371,57.0,8.0,18.0,7.0,0,1,0,1,0,0.0,1,1,0,0


In [46]:
print('Total amount of False Positives are 216')
print('African American false positive count: ' + str(false_positives['race_African-American'].sum()))
print('African American false positive rate is : ' + str(false_positives['race_African-American'].sum() / X_rejoin['race_African-American'].sum()))
print('Caucasian false positive count: ' + str(false_positives['race_Caucasian'].sum()))
print('Caucasian false positive rate is : ' + str(false_positives['race_Caucasian'].sum() / X_rejoin['race_Caucasian'].sum()))


Total amount of False Positives are 216
African American false positive count: 123
African American false positive rate is : 0.16826265389876882
Caucasian false positive count: 65
Caucasian false positive rate is : 0.12871287128712872


In [47]:
#manipulating data to show false negative rates
false_negatives = X_rejoin[(X_rejoin.target == 1.0) & (X_rejoin.prediction == 0.0)]
false_negatives

Unnamed: 0,age,decile_score,priors_count,v_decile_score,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,target,prediction,race_African-American,race_Caucasian,race_Other
3238,57.0,4.0,8.0,2.0,0,1,0,1,0,1.0,0,1,0,0
3858,52.0,1.0,2.0,1.0,1,0,0,1,0,1.0,0,1,0,0
132,28.0,2.0,0.0,3.0,0,1,1,0,0,1.0,0,0,1,0
5157,22.0,7.0,0.0,6.0,1,0,0,0,1,1.0,0,0,1,0
1042,32.0,2.0,2.0,2.0,0,1,1,0,0,1.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5144,40.0,3.0,11.0,2.0,0,1,1,0,0,1.0,0,1,0,0
2609,32.0,3.0,0.0,2.0,1,0,1,0,0,1.0,0,1,0,0
730,45.0,4.0,0.0,2.0,0,1,0,1,0,1.0,0,1,0,0
7181,51.0,6.0,7.0,3.0,0,1,0,1,0,1.0,0,1,0,0


In [48]:
print('Total amount of False Negatives are 261')
print('African American false negative count: ' + str(false_negatives['race_African-American'].sum()))
print('African American false negative rate is : ' + str(false_negatives['race_African-American'].sum() / X_rejoin['race_African-American'].sum()))
print('Caucasian false negative count: ' + str(false_negatives['race_Caucasian'].sum()))
print('Caucasian false negative rate is : ' + str(false_negatives['race_Caucasian'].sum() / X_rejoin['race_Caucasian'].sum()))


Total amount of False Negatives are 261
African American false negative count: 115
African American false negative rate is : 0.1573187414500684
Caucasian false negative count: 103
Caucasian false negative rate is : 0.20396039603960395


# **Interpretation**

**Comparing false positive and false negative rates between the racial groups helps identify potential bias in the model.**

**The false positive rate is higher among African Americans (16.8% vs. 12.8%), while the false negative rate is higher among Caucasians (20.3% vs. 15.7%).**


**This indicates that the COMPAS algorithm is more likely to classify Caucasians as having a lower risk of recidivism and African Americans as having a higher risk.**